Merge tag 'for-5.15-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 31 Aug 2021 16:41:22 +0000 (09:41 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 31 Aug 2021 16:41:22 +0000 (09:41 -0700)
Pull btrfs updates from David Sterba:
 "The highlights of this round are integrations with fs-verity and
  idmapped mounts, the rest is usual mix of minor improvements, speedups
  and cleanups.

  There are some patches outside of btrfs, namely updates to some VFS
  interfaces, all straightforward and acked.

  Features:

   - fs-verity support, using the standard ioctls; backward compatible,
     with a read-only limitation on inodes that have fs-verity enabled
     (see the usage sketch after this list)

   - idmapped mount support (see the mount_setattr sketch after this
     list)

   - make mount with rescue=ibadroots more tolerant of partially damaged
     trees

   - allow raid0 on a single device and raid10 on two devices; degenerate
     cases, but they can be useful as an intermediate step during
     conversion to other profiles

   - zoned mode block group auto reclaim can be disabled via sysfs knob

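  The usage sketch referenced above: a minimal userspace example that
  assumes only the generic fs-verity UAPI (linux/fsverity.h); the
  enable_verity() helper name is hypothetical.

      /* sketch: enable fs-verity on a btrfs file via the standard ioctl */
      #include <fcntl.h>
      #include <sys/ioctl.h>
      #include <linux/fsverity.h>

      static int enable_verity(const char *path)
      {
              struct fsverity_enable_arg arg = {
                      .version = 1,
                      .hash_algorithm = FS_VERITY_HASH_ALG_SHA256,
                      .block_size = 4096,
              };
              /* the fd must be read-only and the file must have no writers */
              int fd = open(path, O_RDONLY);

              if (fd < 0)
                      return -1;
              return ioctl(fd, FS_IOC_ENABLE_VERITY, &arg);
      }

  Once enabled, the inode becomes read-only and reads are verified
  against the Merkle tree stored in the new item types listed under
  "Core" below.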
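
  And the mount_setattr sketch for idmapped mounts: the mounts are
  created with the generic mount_setattr(2) API, nothing btrfs-specific
  is needed. A minimal sketch, with a hypothetical helper name and the
  user namespace fd setup elided:

      /* sketch: attach an idmapped view of an existing btrfs mount */
      #define _GNU_SOURCE
      #include <fcntl.h>
      #include <unistd.h>
      #include <sys/syscall.h>
      #include <linux/mount.h>

      static int idmapped_mount(const char *src, const char *dst,
                                int userns_fd)
      {
              struct mount_attr attr = {
                      .attr_set = MOUNT_ATTR_IDMAP,
                      .userns_fd = userns_fd,
              };
              /* clone the source mount as a detached tree */
              int fd = syscall(SYS_open_tree, AT_FDCWD, src,
                               OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);

              if (fd < 0)
                      return -1;
              /* apply the idmapping, then attach the tree at dst */
              if (syscall(SYS_mount_setattr, fd, "", AT_EMPTY_PATH,
                          &attr, sizeof(attr)) < 0)
                      return -1;
              return syscall(SYS_move_mount, fd, "", AT_FDCWD, dst,
                             MOVE_MOUNT_F_EMPTY_PATH);
      }
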
  Performance improvements:

   - continue readahead of node siblings even if the target node is in
     memory; this can speed up full send (+11% on a sample test)

   - batching of delayed items can speed up creating many files

   - fsync/tree-log speedups
       - avoid unnecessary work (gains +2% throughput, -2% run time on a
         sample load)
       - reduced lock contention on renames (on dbench +4% throughput,
         up to -30% latency)

  Fixes:

   - various zoned mode fixes

   - preemptive flushing threshold tuning, avoiding excessive work on
     almost-full filesystems

  Core:

   - continued subpage support, in preparation for implementing the
     remaining features like compression and defragmentation; with some
     limitations, write is now enabled on 64K page systems with 4K
     sectors, though still considered experimental
       - no readahead on compressed reads
       - inline extents disabled
       - disabled raid56 profile conversion and mount

   - improved flushing logic, fixing early ENOSPC on some workloads

   - inode flags have been internally split into read-only and read-write
     incompat bit parts, used by fs-verity

   - new tree items for fs-verity
       - descriptor item
       - Merkle tree item

   - inode operations extended to be namespace-aware

   - cleanups and refactoring

  Generic code changes:

   - fs: new export filemap_fdatawrite_wbc (see the caller sketch below)

   - fs: removed sync_inode

   - block: bio_trim argument type fixups

   - vfs: add namespace-aware lookup"

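A hypothetical kernel-side sketch of the new filemap_fdatawrite_wbc()
export mentioned above, mirroring the 9p conversion visible in the
fs/9p/vfs_file.c hunk below (the helper name is made up):

    #include <linux/fs.h>
    #include <linux/writeback.h>

    /* sketch: kick off writeback of an inode's dirty pages under a
     * caller-supplied writeback_control, as the converted sync_inode()
     * callers now do
     */
    static int start_inode_writeback(struct inode *inode)
    {
            struct writeback_control wbc = {
                    .sync_mode = WB_SYNC_ALL,
                    .nr_to_write = LONG_MAX,
                    .range_start = 0,
                    .range_end = LLONG_MAX,
            };

            return filemap_fdatawrite_wbc(inode->i_mapping, &wbc);
    }

Note that filemap_fdatawrite_wbc() only starts data writeback under the
given control; unlike the removed sync_inode() it neither writes the
inode itself nor waits for completion.
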
* tag 'for-5.15-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (114 commits)
  btrfs: reset replace target device to allocation state on close
  btrfs: zoned: fix ordered extent boundary calculation
  btrfs: do not do preemptive flushing if the majority is global rsv
  btrfs: reduce the preemptive flushing threshold to 90%
  btrfs: tree-log: check btrfs_lookup_data_extent return value
  btrfs: avoid unnecessarily logging directories that had no changes
  btrfs: allow idmapped mount
  btrfs: handle ACLs on idmapped mounts
  btrfs: allow idmapped INO_LOOKUP_USER ioctl
  btrfs: allow idmapped SUBVOL_SETFLAGS ioctl
  btrfs: allow idmapped SET_RECEIVED_SUBVOL ioctls
  btrfs: relax restrictions for SNAP_DESTROY_V2 with subvolids
  btrfs: allow idmapped SNAP_DESTROY ioctls
  btrfs: allow idmapped SNAP_CREATE/SUBVOL_CREATE ioctls
  btrfs: check whether fsgid/fsuid are mapped during subvolume creation
  btrfs: allow idmapped permission inode op
  btrfs: allow idmapped setattr inode op
  btrfs: allow idmapped tmpfile inode op
  btrfs: allow idmapped symlink inode op
  btrfs: allow idmapped mkdir inode op
  ...

58 files changed:
block/bio.c
fs/9p/vfs_file.c
fs/btrfs/Makefile
fs/btrfs/acl.c
fs/btrfs/backref.c
fs/btrfs/backref.h
fs/btrfs/block-group.c
fs/btrfs/btrfs_inode.h
fs/btrfs/check-integrity.c
fs/btrfs/compression.c
fs/btrfs/compression.h
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/dir-item.c
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/file-item.c
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/lzo.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/qgroup.c
fs/btrfs/raid56.c
fs/btrfs/ref-verify.c
fs/btrfs/relocation.c
fs/btrfs/send.c
fs/btrfs/space-info.c
fs/btrfs/struct-funcs.c
fs/btrfs/subpage.c
fs/btrfs/subpage.h
fs/btrfs/super.c
fs/btrfs/sysfs.c
fs/btrfs/tests/qgroup-tests.c
fs/btrfs/tree-checker.c
fs/btrfs/tree-log.c
fs/btrfs/verity.c [new file with mode: 0644]
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/btrfs/zlib.c
fs/btrfs/zoned.c
fs/btrfs/zoned.h
fs/btrfs/zstd.c
fs/fs-writeback.c
fs/namei.c
include/linux/bio.h
include/linux/blk_types.h
include/linux/fs.h
include/linux/namei.h
include/trace/events/btrfs.h
include/uapi/linux/btrfs.h
include/uapi/linux/btrfs_tree.h
mm/filemap.c

diff --git a/block/bio.c b/block/bio.c
index 1319dd2..e16849f 100644
@@ -1544,12 +1544,15 @@ EXPORT_SYMBOL(bio_split);
  * @bio:       bio to trim
  * @offset:    number of sectors to trim from the front of @bio
  * @size:      size we want to trim @bio to, in sectors
+ *
+ * This function is typically used for bios that are cloned and submitted
+ * to the underlying device in parts.
  */
-void bio_trim(struct bio *bio, int offset, int size)
+void bio_trim(struct bio *bio, sector_t offset, sector_t size)
 {
-       /* 'bio' is a cloned bio which we need to trim to match
-        * the given offset and size.
-        */
+       if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS ||
+                        offset + size > bio->bi_iter.bi_size))
+               return;
 
        size <<= 9;
        if (offset == 0 && size == bio->bi_iter.bi_size)
@@ -1560,7 +1563,6 @@ void bio_trim(struct bio *bio, int offset, int size)
 
        if (bio_integrity(bio))
                bio_integrity_trim(bio);
-
 }
 EXPORT_SYMBOL_GPL(bio_trim);
 
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index c4a2dc4..aab5e65 100644
@@ -612,12 +612,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
        p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
 
        inode = file_inode(vma->vm_file);
-
-       if (!mapping_can_writeback(inode->i_mapping))
-               wbc.nr_to_write = 0;
-
-       might_sleep();
-       sync_inode(inode, &wbc);
+       filemap_fdatawrite_wbc(inode->i_mapping, &wbc);
 }
 
 
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index cec88a6..3dcf9bc 100644
@@ -36,6 +36,7 @@ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
 btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
+btrfs-$(CONFIG_FS_VERITY) += verity.o
 
 btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
        tests/extent-buffer-tests.o tests/btrfs-tests.o \
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index d95eb5c..c9f9789 100644
@@ -53,7 +53,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 }
 
 static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
-                        struct inode *inode, struct posix_acl *acl, int type)
+                          struct user_namespace *mnt_userns,
+                          struct inode *inode, struct posix_acl *acl, int type)
 {
        int ret, size = 0;
        const char *name;
@@ -114,12 +115,12 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
        umode_t old_mode = inode->i_mode;
 
        if (type == ACL_TYPE_ACCESS && acl) {
-               ret = posix_acl_update_mode(&init_user_ns, inode,
+               ret = posix_acl_update_mode(mnt_userns, inode,
                                            &inode->i_mode, &acl);
                if (ret)
                        return ret;
        }
-       ret = __btrfs_set_acl(NULL, inode, acl, type);
+       ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type);
        if (ret)
                inode->i_mode = old_mode;
        return ret;
@@ -140,14 +141,14 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
                return ret;
 
        if (default_acl) {
-               ret = __btrfs_set_acl(trans, inode, default_acl,
+               ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl,
                                      ACL_TYPE_DEFAULT);
                posix_acl_release(default_acl);
        }
 
        if (acl) {
                if (!ret)
-                       ret = __btrfs_set_acl(trans, inode, acl,
+                       ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl,
                                              ACL_TYPE_ACCESS);
                posix_acl_release(acl);
        }
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 78b202d..f735b87 100644
@@ -1211,7 +1211,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 again:
        head = NULL;
 
-       ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+       ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        BUG_ON(ret == 0);
@@ -1488,14 +1488,14 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
                         struct btrfs_fs_info *fs_info, u64 bytenr,
                         u64 time_seq, struct ulist **roots,
-                        bool ignore_offset, bool skip_commit_root_sem)
+                        bool skip_commit_root_sem)
 {
        int ret;
 
        if (!trans && !skip_commit_root_sem)
                down_read(&fs_info->commit_root_sem);
        ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
-                                       time_seq, roots, ignore_offset);
+                                       time_seq, roots, false);
        if (!trans && !skip_commit_root_sem)
                up_read(&fs_info->commit_root_sem);
        return ret;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index ff5f07f..ba45403 100644
@@ -47,7 +47,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
                         const u64 *extent_item_pos, bool ignore_offset);
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
                         struct btrfs_fs_info *fs_info, u64 bytenr,
-                        u64 time_seq, struct ulist **roots, bool ignore_offset,
+                        u64 time_seq, struct ulist **roots,
                         bool skip_commit_root_sem);
 char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
                        u32 name_len, unsigned long name_off,
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 9e7d9d0..a3b830b 100644
@@ -1561,7 +1561,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
                                div64_u64(zone_unusable * 100, bg->length));
                trace_btrfs_reclaim_block_group(bg);
                ret = btrfs_relocate_chunk(fs_info, bg->start);
-               if (ret)
+               if (ret && ret != -EAGAIN)
                        btrfs_err(fs_info, "error relocating chunk %llu",
                                  bg->start);
 
@@ -2105,11 +2105,22 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
                bg->used = em->len;
                bg->flags = map->type;
                ret = btrfs_add_block_group_cache(fs_info, bg);
+               /*
+                * We may have some valid block group cache added already, in
+                * that case we skip to the next one.
+                */
+               if (ret == -EEXIST) {
+                       ret = 0;
+                       btrfs_put_block_group(bg);
+                       continue;
+               }
+
                if (ret) {
                        btrfs_remove_free_space_cache(bg);
                        btrfs_put_block_group(bg);
                        break;
                }
+
                btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
                                        0, 0, &space_info);
                bg->space_info = space_info;
@@ -2212,6 +2223,14 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
        ret = check_chunk_block_group_mappings(info);
 error:
        btrfs_free_path(path);
+       /*
+        * We hit an error while reading the extent tree, and the
+        * rescue=ibadroots mount option is set.
+        * Try to fill the tree using dummy block groups so that the user can
+        * continue to mount and grab their data.
+        */
+       if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
+               ret = fill_dummy_bgs(info);
        return ret;
 }
 
@@ -2244,6 +2263,95 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
        return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
 }
 
+static int insert_dev_extent(struct btrfs_trans_handle *trans,
+                           struct btrfs_device *device, u64 chunk_offset,
+                           u64 start, u64 num_bytes)
+{
+       struct btrfs_fs_info *fs_info = device->fs_info;
+       struct btrfs_root *root = fs_info->dev_root;
+       struct btrfs_path *path;
+       struct btrfs_dev_extent *extent;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       int ret;
+
+       WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
+       WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       key.objectid = device->devid;
+       key.type = BTRFS_DEV_EXTENT_KEY;
+       key.offset = start;
+       ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
+       if (ret)
+               goto out;
+
+       leaf = path->nodes[0];
+       extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
+       btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
+       btrfs_set_dev_extent_chunk_objectid(leaf, extent,
+                                           BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+       btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
+
+       btrfs_set_dev_extent_length(leaf, extent, num_bytes);
+       btrfs_mark_buffer_dirty(leaf);
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * This function belongs to phase 2.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+static int insert_dev_extents(struct btrfs_trans_handle *trans,
+                                  u64 chunk_offset, u64 chunk_size)
+{
+       struct btrfs_fs_info *fs_info = trans->fs_info;
+       struct btrfs_device *device;
+       struct extent_map *em;
+       struct map_lookup *map;
+       u64 dev_offset;
+       u64 stripe_size;
+       int i;
+       int ret = 0;
+
+       em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+       if (IS_ERR(em))
+               return PTR_ERR(em);
+
+       map = em->map_lookup;
+       stripe_size = em->orig_block_len;
+
+       /*
+        * Take the device list mutex to prevent races with the final phase of
+        * a device replace operation that replaces the device object associated
+        * with the map's stripes, because the device object's id can change
+        * at any time during that final phase of the device replace operation
+        * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+        * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+        * resulting in persisting a device extent item with such ID.
+        */
+       mutex_lock(&fs_info->fs_devices->device_list_mutex);
+       for (i = 0; i < map->num_stripes; i++) {
+               device = map->stripes[i].dev;
+               dev_offset = map->stripes[i].physical;
+
+               ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
+                                      stripe_size);
+               if (ret)
+                       break;
+       }
+       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+       free_extent_map(em);
+       return ret;
+}
+
 /*
  * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
  * chunk allocation.
@@ -2278,8 +2386,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
                        if (ret)
                                btrfs_abort_transaction(trans, ret);
                }
-               ret = btrfs_finish_chunk_alloc(trans, block_group->start,
-                                       block_group->length);
+               ret = insert_dev_extents(trans, block_group->start,
+                                        block_group->length);
                if (ret)
                        btrfs_abort_transaction(trans, ret);
                add_block_group_free_space(trans, block_group);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index c652e19..76ee145 100644
@@ -51,6 +51,13 @@ enum {
         * the file range, inode's io_tree).
         */
        BTRFS_INODE_NO_DELALLOC_FLUSH,
+       /*
+        * Set when we are working on enabling verity for a file. Computing and
+        * writing the whole Merkle tree can take a while so we want to prevent
+        * races where two separate tasks attempt to simultaneously start verity
+        * on the same file.
+        */
+       BTRFS_INODE_VERITY_IN_PROGRESS,
 };
 
 /* in memory btrfs inode */
@@ -189,8 +196,10 @@ struct btrfs_inode {
         */
        u64 csum_bytes;
 
-       /* flags field from the on disk inode */
+       /* Backwards incompatible flags, lower half of inode_item::flags  */
        u32 flags;
+       /* Read-only compatibility flags, upper half of inode_item::flags */
+       u32 ro_flags;
 
        /*
         * Counters to keep track of the number of extent item's we may use due
@@ -348,6 +357,22 @@ struct btrfs_dio_private {
        u8 csums[];
 };
 
+/*
+ * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
+ * separate u32s. These two functions convert between the two representations.
+ */
+static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags)
+{
+       return (flags | ((u64)ro_flags << 32));
+}
+
+static inline void btrfs_inode_split_flags(u64 inode_item_flags,
+                                          u32 *flags, u32 *ro_flags)
+{
+       *flags = (u32)inode_item_flags;
+       *ro_flags = (u32)(inode_item_flags >> 32);
+}
+
 /* Array of bytes with variable length, hexadecimal format 0x1234 */
 #define CSUM_FMT                               "0x%*phN"
 #define CSUM_FMT_VALUE(size, bytes)            size, bytes
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 1695086..8681608 100644
@@ -243,47 +243,6 @@ struct btrfsic_state {
        u32 datablock_size;
 };
 
-static void btrfsic_block_init(struct btrfsic_block *b);
-static struct btrfsic_block *btrfsic_block_alloc(void);
-static void btrfsic_block_free(struct btrfsic_block *b);
-static void btrfsic_block_link_init(struct btrfsic_block_link *n);
-static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
-static void btrfsic_block_link_free(struct btrfsic_block_link *n);
-static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
-static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
-static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
-static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
-static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
-                                       struct btrfsic_block_hashtable *h);
-static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
-static struct btrfsic_block *btrfsic_block_hashtable_lookup(
-               struct block_device *bdev,
-               u64 dev_bytenr,
-               struct btrfsic_block_hashtable *h);
-static void btrfsic_block_link_hashtable_init(
-               struct btrfsic_block_link_hashtable *h);
-static void btrfsic_block_link_hashtable_add(
-               struct btrfsic_block_link *l,
-               struct btrfsic_block_link_hashtable *h);
-static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
-static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
-               struct block_device *bdev_ref_to,
-               u64 dev_bytenr_ref_to,
-               struct block_device *bdev_ref_from,
-               u64 dev_bytenr_ref_from,
-               struct btrfsic_block_link_hashtable *h);
-static void btrfsic_dev_state_hashtable_init(
-               struct btrfsic_dev_state_hashtable *h);
-static void btrfsic_dev_state_hashtable_add(
-               struct btrfsic_dev_state *ds,
-               struct btrfsic_dev_state_hashtable *h);
-static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
-static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
-               struct btrfsic_dev_state_hashtable *h);
-static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
-static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
-static int btrfsic_process_superblock(struct btrfsic_state *state,
-                                     struct btrfs_fs_devices *fs_devices);
 static int btrfsic_process_metablock(struct btrfsic_state *state,
                                     struct btrfsic_block *block,
                                     struct btrfsic_block_data_ctx *block_ctx,
@@ -313,14 +272,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
 static int btrfsic_read_block(struct btrfsic_state *state,
                              struct btrfsic_block_data_ctx *block_ctx);
-static void btrfsic_dump_database(struct btrfsic_state *state);
-static int btrfsic_test_for_metadata(struct btrfsic_state *state,
-                                    char **datav, unsigned int num_pages);
-static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
-                                         u64 dev_bytenr, char **mapped_datav,
-                                         unsigned int num_pages,
-                                         struct bio *bio, int *bio_is_patched,
-                                         int submit_bio_bh_rw);
 static int btrfsic_process_written_superblock(
                struct btrfsic_state *state,
                struct btrfsic_block *const block,
@@ -1558,10 +1509,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
                /* Pages must be unmapped in reverse order */
                while (num_pages > 0) {
                        num_pages--;
-                       if (block_ctx->datav[num_pages]) {
-                               kunmap_local(block_ctx->datav[num_pages]);
+                       if (block_ctx->datav[num_pages])
                                block_ctx->datav[num_pages] = NULL;
-                       }
                        if (block_ctx->pagev[num_pages]) {
                                __free_page(block_ctx->pagev[num_pages]);
                                block_ctx->pagev[num_pages] = NULL;
@@ -1638,7 +1587,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
                i = j;
        }
        for (i = 0; i < num_pages; i++)
-               block_ctx->datav[i] = kmap_local_page(block_ctx->pagev[i]);
+               block_ctx->datav[i] = page_address(block_ctx->pagev[i]);
 
        return block_ctx->len;
 }
@@ -2703,7 +2652,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
 
                bio_for_each_segment(bvec, bio, iter) {
                        BUG_ON(bvec.bv_len != PAGE_SIZE);
-                       mapped_datav[i] = kmap_local_page(bvec.bv_page);
+                       mapped_datav[i] = page_address(bvec.bv_page);
                        i++;
 
                        if (dev_state->state->print_mask &
@@ -2716,9 +2665,6 @@ static void __btrfsic_submit_bio(struct bio *bio)
                                              mapped_datav, segs,
                                              bio, &bio_is_patched,
                                              bio->bi_opf);
-               /* Unmap in reverse order */
-               for (--i; i >= 0; i--)
-                       kunmap_local(mapped_datav[i]);
                kfree(mapped_datav);
        } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
                if (dev_state->state->print_mask &
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 30d82cd..7869ad1 100644
@@ -172,10 +172,9 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
                /* Hash through the page sector by sector */
                for (pg_offset = 0; pg_offset < bytes_left;
                     pg_offset += sectorsize) {
-                       kaddr = kmap_atomic(page);
+                       kaddr = page_address(page);
                        crypto_shash_digest(shash, kaddr + pg_offset,
                                            sectorsize, csum);
-                       kunmap_atomic(kaddr);
 
                        if (memcmp(&csum, cb_sum, csum_size) != 0) {
                                btrfs_print_data_csum_error(inode, disk_start,
@@ -565,6 +564,16 @@ static noinline int add_ra_bio_pages(struct inode *inode,
        if (isize == 0)
                return 0;
 
+       /*
+        * For current subpage support, we only support 64K page size,
+        * which means maximum compressed extent size (128K) is just 2x page
+        * size.
+        * This makes readahead less effective, so disable readahead for
+        * subpage for now, until full compressed write is supported.
+        */
+       if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
+               return 0;
+
        end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 
        while (last_offset < compressed_end) {
@@ -673,6 +682,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        struct page *page;
        struct bio *comp_bio;
        u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
+       u64 file_offset;
        u64 em_len;
        u64 em_start;
        struct extent_map *em;
@@ -682,15 +692,17 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
        em_tree = &BTRFS_I(inode)->extent_tree;
 
+       file_offset = bio_first_bvec_all(bio)->bv_offset +
+                     page_offset(bio_first_page_all(bio));
+
        /* we need the actual starting offset of this extent in the file */
        read_lock(&em_tree->lock);
-       em = lookup_extent_mapping(em_tree,
-                                  page_offset(bio_first_page_all(bio)),
-                                  fs_info->sectorsize);
+       em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
        read_unlock(&em_tree->lock);
        if (!em)
                return BLK_STS_IOERR;
 
+       ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
        compressed_len = em->block_len;
        cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
        if (!cb)
@@ -721,8 +733,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
                goto fail1;
 
        for (pg_index = 0; pg_index < nr_pages; pg_index++) {
-               cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
-                                                             __GFP_HIGHMEM);
+               cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
                if (!cb->compressed_pages[pg_index]) {
                        faili = pg_index - 1;
                        ret = BLK_STS_RESOURCE;
@@ -1261,96 +1272,82 @@ void __cold btrfs_exit_compress(void)
 }
 
 /*
- * Copy uncompressed data from working buffer to pages.
+ * Copy decompressed data from working buffer to pages.
+ *
+ * @buf:               The decompressed data buffer
+ * @buf_len:           The decompressed data length
+ * @decompressed:      Number of bytes that are already decompressed inside the
+ *                     compressed extent
+ * @cb:                        The compressed extent descriptor
+ * @orig_bio:          The original bio that the caller wants to read for
+ *
+ * An easier to understand diagram is below:
+ *
+ *             |<- orig_bio ->|     |<- orig_bio->|
+ *     |<-------      full decompressed extent      ----->|
+ *     |<-----------    @cb range   ---->|
+ *     |                       |<-- @buf_len -->|
+ *     |<--- @decompressed --->|
+ *
+ * Note that @cb can be a subpage of the full decompressed extent, but
+ * @cb->start always matches the file offset of the start of the full
+ * decompressed extent.
  *
- * buf_start is the byte offset we're of the start of our workspace buffer.
+ * When reading a compressed extent, we have to read the whole compressed
+ * extent, while @orig_bio may only want part of the range.
+ * Thus this function ensures that only data covered by @orig_bio is
+ * copied to it.
  *
- * total_out is the last byte of the buffer
+ * Return 0 if we have copied all needed contents for @orig_bio.
+ * Return >0 if we need to continue decompressing.
  */
-int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
-                             unsigned long total_out, u64 disk_start,
-                             struct bio *bio)
+int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
+                             struct compressed_bio *cb, u32 decompressed)
 {
-       unsigned long buf_offset;
-       unsigned long current_buf_start;
-       unsigned long start_byte;
-       unsigned long prev_start_byte;
-       unsigned long working_bytes = total_out - buf_start;
-       unsigned long bytes;
-       struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter);
-
-       /*
-        * start byte is the first byte of the page we're currently
-        * copying into relative to the start of the compressed data.
-        */
-       start_byte = page_offset(bvec.bv_page) - disk_start;
-
-       /* we haven't yet hit data corresponding to this page */
-       if (total_out <= start_byte)
-               return 1;
-
-       /*
-        * the start of the data we care about is offset into
-        * the middle of our working buffer
-        */
-       if (total_out > start_byte && buf_start < start_byte) {
-               buf_offset = start_byte - buf_start;
-               working_bytes -= buf_offset;
-       } else {
-               buf_offset = 0;
-       }
-       current_buf_start = buf_start;
-
-       /* copy bytes from the working buffer into the pages */
-       while (working_bytes > 0) {
-               bytes = min_t(unsigned long, bvec.bv_len,
-                               PAGE_SIZE - (buf_offset % PAGE_SIZE));
-               bytes = min(bytes, working_bytes);
-
-               memcpy_to_page(bvec.bv_page, bvec.bv_offset, buf + buf_offset,
-                              bytes);
-               flush_dcache_page(bvec.bv_page);
+       struct bio *orig_bio = cb->orig_bio;
+       /* Offset inside the full decompressed extent */
+       u32 cur_offset;
+
+       cur_offset = decompressed;
+       /* The main loop to do the copy */
+       while (cur_offset < decompressed + buf_len) {
+               struct bio_vec bvec;
+               size_t copy_len;
+               u32 copy_start;
+               /* Offset inside the full decompressed extent */
+               u32 bvec_offset;
+
+               bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
+               /*
+                * cb->start may underflow, but subtracting that value can still
+                * give us the correct offset inside the full decompressed extent.
+                */
+               bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
 
-               buf_offset += bytes;
-               working_bytes -= bytes;
-               current_buf_start += bytes;
+               /* Haven't reached the bvec range, exit */
+               if (decompressed + buf_len <= bvec_offset)
+                       return 1;
 
-               /* check if we need to pick another page */
-               bio_advance(bio, bytes);
-               if (!bio->bi_iter.bi_size)
-                       return 0;
-               bvec = bio_iter_iovec(bio, bio->bi_iter);
-               prev_start_byte = start_byte;
-               start_byte = page_offset(bvec.bv_page) - disk_start;
+               copy_start = max(cur_offset, bvec_offset);
+               copy_len = min(bvec_offset + bvec.bv_len,
+                              decompressed + buf_len) - copy_start;
+               ASSERT(copy_len);
 
                /*
-                * We need to make sure we're only adjusting
-                * our offset into compression working buffer when
-                * we're switching pages.  Otherwise we can incorrectly
-                * keep copying when we were actually done.
+                * Extra range check to ensure we didn't go beyond
+                * @buf + @buf_len.
                 */
-               if (start_byte != prev_start_byte) {
-                       /*
-                        * make sure our new page is covered by this
-                        * working buffer
-                        */
-                       if (total_out <= start_byte)
-                               return 1;
+               ASSERT(copy_start - decompressed < buf_len);
+               memcpy_to_page(bvec.bv_page, bvec.bv_offset,
+                              buf + copy_start - decompressed, copy_len);
+               flush_dcache_page(bvec.bv_page);
+               cur_offset += copy_len;
 
-                       /*
-                        * the next page in the biovec might not be adjacent
-                        * to the last page, but it might still be found
-                        * inside this working buffer. bump our offset pointer
-                        */
-                       if (total_out > start_byte &&
-                           current_buf_start < start_byte) {
-                               buf_offset = start_byte - buf_start;
-                               working_bytes = total_out - start_byte;
-                               current_buf_start = buf_start + buf_offset;
-                       }
-               }
+               bio_advance(orig_bio, copy_len);
+               /* Finished the bio */
+               if (!orig_bio->bi_iter.bi_size)
+                       return 0;
        }
-
        return 1;
 }
 
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index c359f20..399be0b 100644
@@ -86,9 +86,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
                         unsigned long *total_out);
 int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
                     unsigned long start_byte, size_t srclen, size_t destlen);
-int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
-                             unsigned long total_out, u64 disk_start,
-                             struct bio *bio);
+int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
+                             struct compressed_bio *cb, u32 decompressed);
 
 blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
                                  unsigned int len, u64 disk_start,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c5c08c8..84627cb 100644
@@ -726,21 +726,21 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 
 /*
  * search for key in the extent_buffer.  The items start at offset p,
- * and they are item_size apart.  There are 'max' items in p.
+ * and they are item_size apart.
  *
  * the slot in the array is returned via slot, and it points to
  * the place where you would insert key if it is not found in
  * the array.
  *
- * slot may point to max if the key is bigger than all of the keys
+ * Slot may point to the total number of items if the key is bigger than
+ * all of the keys.
  */
 static noinline int generic_bin_search(struct extent_buffer *eb,
                                       unsigned long p, int item_size,
-                                      const struct btrfs_key *key,
-                                      int max, int *slot)
+                                      const struct btrfs_key *key, int *slot)
 {
        int low = 0;
-       int high = max;
+       int high = btrfs_header_nritems(eb);
        int ret;
        const int key_size = sizeof(struct btrfs_disk_key);
 
@@ -799,15 +799,11 @@ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
        if (btrfs_header_level(eb) == 0)
                return generic_bin_search(eb,
                                          offsetof(struct btrfs_leaf, items),
-                                         sizeof(struct btrfs_item),
-                                         key, btrfs_header_nritems(eb),
-                                         slot);
+                                         sizeof(struct btrfs_item), key, slot);
        else
                return generic_bin_search(eb,
                                          offsetof(struct btrfs_node, ptrs),
-                                         sizeof(struct btrfs_key_ptr),
-                                         key, btrfs_header_nritems(eb),
-                                         slot);
+                                         sizeof(struct btrfs_key_ptr), key, slot);
 }
 
 static void root_add_used(struct btrfs_root *root, u32 size)
@@ -1237,7 +1233,6 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
        u64 target;
        u64 nread = 0;
        u64 nread_max;
-       struct extent_buffer *eb;
        u32 nr;
        u32 blocksize;
        u32 nscan = 0;
@@ -1266,10 +1261,14 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
 
        search = btrfs_node_blockptr(node, slot);
        blocksize = fs_info->nodesize;
-       eb = find_extent_buffer(fs_info, search);
-       if (eb) {
-               free_extent_buffer(eb);
-               return;
+       if (path->reada != READA_FORWARD_ALWAYS) {
+               struct extent_buffer *eb;
+
+               eb = find_extent_buffer(fs_info, search);
+               if (eb) {
+                       free_extent_buffer(eb);
+                       return;
+               }
        }
 
        target = search;
@@ -2102,6 +2101,27 @@ again:
        return 0;
 }
 
+/*
+ * Execute search and call btrfs_previous_item to traverse backwards if the item
+ * was not found.
+ *
+ * Return 0 if found, 1 if not found and < 0 if error.
+ */
+int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
+                          struct btrfs_path *path)
+{
+       int ret;
+
+       ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+       if (ret > 0)
+               ret = btrfs_previous_item(root, path, key->objectid, key->type);
+
+       if (ret == 0)
+               btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
+
+       return ret;
+}
+
 /*
  * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node points to 'key'.
@@ -4358,16 +4378,6 @@ next:
        return 1;
 }
 
-/*
- * search the tree again to find a leaf with greater keys
- * returns 0 if it found something or 1 if there are no greater leaves.
- * returns < 0 on io errors.
- */
-int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
-{
-       return btrfs_next_old_leaf(root, path, 0);
-}
-
 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
                        u64 time_seq)
 {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e5e53e5..f07c82f 100644
@@ -281,7 +281,8 @@ struct btrfs_super_block {
 
 #define BTRFS_FEATURE_COMPAT_RO_SUPP                   \
        (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE |      \
-        BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID)
+        BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
+        BTRFS_FEATURE_COMPAT_RO_VERITY)
 
 #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET       0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR     0ULL
@@ -1012,8 +1013,6 @@ struct btrfs_fs_info {
                u64 zoned;
        };
 
-       /* Max size to emit ZONE_APPEND write command */
-       u64 max_zone_append_size;
        struct mutex zoned_meta_io_lock;
        spinlock_t treelog_bg_lock;
        u64 treelog_bg;
@@ -1484,20 +1483,20 @@ do {                                                                   \
 /*
  * Inode flags
  */
-#define BTRFS_INODE_NODATASUM          (1 << 0)
-#define BTRFS_INODE_NODATACOW          (1 << 1)
-#define BTRFS_INODE_READONLY           (1 << 2)
-#define BTRFS_INODE_NOCOMPRESS         (1 << 3)
-#define BTRFS_INODE_PREALLOC           (1 << 4)
-#define BTRFS_INODE_SYNC               (1 << 5)
-#define BTRFS_INODE_IMMUTABLE          (1 << 6)
-#define BTRFS_INODE_APPEND             (1 << 7)
-#define BTRFS_INODE_NODUMP             (1 << 8)
-#define BTRFS_INODE_NOATIME            (1 << 9)
-#define BTRFS_INODE_DIRSYNC            (1 << 10)
-#define BTRFS_INODE_COMPRESS           (1 << 11)
-
-#define BTRFS_INODE_ROOT_ITEM_INIT     (1 << 31)
+#define BTRFS_INODE_NODATASUM          (1U << 0)
+#define BTRFS_INODE_NODATACOW          (1U << 1)
+#define BTRFS_INODE_READONLY           (1U << 2)
+#define BTRFS_INODE_NOCOMPRESS         (1U << 3)
+#define BTRFS_INODE_PREALLOC           (1U << 4)
+#define BTRFS_INODE_SYNC               (1U << 5)
+#define BTRFS_INODE_IMMUTABLE          (1U << 6)
+#define BTRFS_INODE_APPEND             (1U << 7)
+#define BTRFS_INODE_NODUMP             (1U << 8)
+#define BTRFS_INODE_NOATIME            (1U << 9)
+#define BTRFS_INODE_DIRSYNC            (1U << 10)
+#define BTRFS_INODE_COMPRESS           (1U << 11)
+
+#define BTRFS_INODE_ROOT_ITEM_INIT     (1U << 31)
 
 #define BTRFS_INODE_FLAG_MASK                                          \
        (BTRFS_INODE_NODATASUM |                                        \
@@ -1514,6 +1513,10 @@ do {                                                                   \
         BTRFS_INODE_COMPRESS |                                         \
         BTRFS_INODE_ROOT_ITEM_INIT)
 
+#define BTRFS_INODE_RO_VERITY          (1U << 0)
+
+#define BTRFS_INODE_RO_FLAG_MASK       (BTRFS_INODE_RO_VERITY)
+
 struct btrfs_map_token {
        struct extent_buffer *eb;
        char *kaddr;
@@ -2781,10 +2784,11 @@ enum btrfs_flush_state {
        FLUSH_DELAYED_REFS      =       4,
        FLUSH_DELALLOC          =       5,
        FLUSH_DELALLOC_WAIT     =       6,
-       ALLOC_CHUNK             =       7,
-       ALLOC_CHUNK_FORCE       =       8,
-       RUN_DELAYED_IPUTS       =       9,
-       COMMIT_TRANS            =       10,
+       FLUSH_DELALLOC_FULL     =       7,
+       ALLOC_CHUNK             =       8,
+       ALLOC_CHUNK_FORCE       =       9,
+       RUN_DELAYED_IPUTS       =       10,
+       COMMIT_TRANS            =       11,
 };
 
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
@@ -2901,10 +2905,13 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
        return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
 }
 
-int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
                        u64 time_seq);
+
+int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
+                          struct btrfs_path *path);
+
 static inline int btrfs_next_old_item(struct btrfs_root *root,
                                      struct btrfs_path *p, u64 time_seq)
 {
@@ -2913,6 +2920,18 @@ static inline int btrfs_next_old_item(struct btrfs_root *root,
                return btrfs_next_old_leaf(root, p, time_seq);
        return 0;
 }
+
+/*
+ * Search the tree again to find a leaf with greater keys.
+ *
+ * Returns 0 if it found something or 1 if there are no greater leaves.
+ * Returns < 0 on error.
+ */
+static inline int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+       return btrfs_next_old_leaf(root, path, 0);
+}
+
 static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
 {
        return btrfs_next_old_item(root, p, 0);
@@ -3145,7 +3164,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
                              struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
                             struct btrfs_root *new_root,
-                            struct btrfs_root *parent_root);
+                            struct btrfs_root *parent_root,
+                            struct user_namespace *mnt_userns);
  void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
                               unsigned *bits);
 void btrfs_clear_delalloc_extent(struct inode *inode,
@@ -3194,10 +3214,10 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
                u64 start, u64 end, int *page_started, unsigned long *nr_written,
                struct writeback_control *wbc);
-int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
+int btrfs_writepage_cow_fixup(struct page *page);
 void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
                                          struct page *page, u64 start,
-                                         u64 end, int uptodate);
+                                         u64 end, bool uptodate);
 extern const struct dentry_operations btrfs_dentry_operations;
 extern const struct iomap_ops btrfs_dio_iomap_ops;
 extern const struct iomap_dio_ops btrfs_dio_ops;
@@ -3779,6 +3799,30 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
        return signal_pending(current);
 }
 
+/* verity.c */
+#ifdef CONFIG_FS_VERITY
+
+extern const struct fsverity_operations btrfs_verityops;
+int btrfs_drop_verity_items(struct btrfs_inode *inode);
+
+BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
+                  encryption, 8);
+BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item,
+                  size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
+                        struct btrfs_verity_descriptor_item, encryption, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
+                        struct btrfs_verity_descriptor_item, size, 64);
+
+#else
+
+static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+       return 0;
+}
+
+#endif
+
 /* Sanity test specific functions */
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 void btrfs_test_destroy_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 257c1e1..1e08eb2 100644
@@ -6,7 +6,6 @@
 
 #include <linux/slab.h>
 #include <linux/iversion.h>
-#include <linux/sched/mm.h>
 #include "misc.h"
 #include "delayed-inode.h"
 #include "disk-io.h"
@@ -672,176 +671,119 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
 }
 
 /*
- * This helper will insert some continuous items into the same leaf according
- * to the free space of the leaf.
+ * Insert a single delayed item or a batch of delayed items that have consecutive
+ * keys if they exist.
  */
-static int btrfs_batch_insert_items(struct btrfs_root *root,
-                                   struct btrfs_path *path,
-                                   struct btrfs_delayed_item *item)
+static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct btrfs_path *path,
+                                    struct btrfs_delayed_item *first_item)
 {
-       struct btrfs_delayed_item *curr, *next;
-       int free_space;
-       int total_size = 0;
-       struct extent_buffer *leaf;
-       char *data_ptr;
-       struct btrfs_key *keys;
-       u32 *data_size;
-       struct list_head head;
-       int slot;
+       LIST_HEAD(batch);
+       struct btrfs_delayed_item *curr;
+       struct btrfs_delayed_item *next;
+       const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+       int total_size;
        int nitems;
-       int i;
-       int ret = 0;
-
-       BUG_ON(!path->nodes[0]);
+       char *ins_data = NULL;
+       struct btrfs_key *ins_keys;
+       u32 *ins_sizes;
+       int ret;
 
-       leaf = path->nodes[0];
-       free_space = btrfs_leaf_free_space(leaf);
-       INIT_LIST_HEAD(&head);
+       list_add_tail(&first_item->tree_list, &batch);
+       nitems = 1;
+       total_size = first_item->data_len + sizeof(struct btrfs_item);
+       curr = first_item;
 
-       next = item;
-       nitems = 0;
+       while (true) {
+               int next_size;
 
-       /*
-        * count the number of the continuous items that we can insert in batch
-        */
-       while (total_size + next->data_len + sizeof(struct btrfs_item) <=
-              free_space) {
-               total_size += next->data_len + sizeof(struct btrfs_item);
-               list_add_tail(&next->tree_list, &head);
-               nitems++;
-
-               curr = next;
                next = __btrfs_next_delayed_item(curr);
-               if (!next)
+               if (!next || !btrfs_is_continuous_delayed_item(curr, next))
                        break;
 
-               if (!btrfs_is_continuous_delayed_item(curr, next))
+               next_size = next->data_len + sizeof(struct btrfs_item);
+               if (total_size + next_size > max_size)
                        break;
-       }
 
-       if (!nitems) {
-               ret = 0;
-               goto out;
+               list_add_tail(&next->tree_list, &batch);
+               nitems++;
+               total_size += next_size;
+               curr = next;
        }
 
-       keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
-       if (!keys) {
-               ret = -ENOMEM;
-               goto out;
-       }
+       if (nitems == 1) {
+               ins_keys = &first_item->key;
+               ins_sizes = &first_item->data_len;
+       } else {
+               int i = 0;
 
-       data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
-       if (!data_size) {
-               ret = -ENOMEM;
-               goto error;
+               ins_data = kmalloc(nitems * sizeof(u32) +
+                                  nitems * sizeof(struct btrfs_key), GFP_NOFS);
+               if (!ins_data) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               ins_sizes = (u32 *)ins_data;
+               ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32));
+               list_for_each_entry(curr, &batch, tree_list) {
+                       ins_keys[i] = curr->key;
+                       ins_sizes[i] = curr->data_len;
+                       i++;
+               }
        }
 
-       /* get keys of all the delayed items */
-       i = 0;
-       list_for_each_entry(next, &head, tree_list) {
-               keys[i] = next->key;
-               data_size[i] = next->data_len;
-               i++;
-       }
+       ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes,
+                                      nitems);
+       if (ret)
+               goto out;
 
-       /* insert the keys of the items */
-       setup_items_for_insert(root, path, keys, data_size, nitems);
+       list_for_each_entry(curr, &batch, tree_list) {
+               char *data_ptr;
 
-       /* insert the dir index items */
-       slot = path->slots[0];
-       list_for_each_entry_safe(curr, next, &head, tree_list) {
-               data_ptr = btrfs_item_ptr(leaf, slot, char);
-               write_extent_buffer(leaf, &curr->data,
-                                   (unsigned long)data_ptr,
-                                   curr->data_len);
-               slot++;
+               data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
+               write_extent_buffer(path->nodes[0], &curr->data,
+                                   (unsigned long)data_ptr, curr->data_len);
+               path->slots[0]++;
+       }
 
-               btrfs_delayed_item_release_metadata(root, curr);
+       /*
+        * Now release our path before releasing the delayed items and their
+        * metadata reservations, so that we don't block other tasks for more
+        * time than needed.
+        */
+       btrfs_release_path(path);
 
+       list_for_each_entry_safe(curr, next, &batch, tree_list) {
                list_del(&curr->tree_list);
+               btrfs_delayed_item_release_metadata(root, curr);
                btrfs_release_delayed_item(curr);
        }
-
-error:
-       kfree(data_size);
-       kfree(keys);
 out:
+       kfree(ins_data);
        return ret;
 }
 
-/*
- * This helper can just do simple insertion that needn't extend item for new
- * data, such as directory name index insertion, inode insertion.
- */
-static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
-                                    struct btrfs_root *root,
-                                    struct btrfs_path *path,
-                                    struct btrfs_delayed_item *delayed_item)
-{
-       struct extent_buffer *leaf;
-       unsigned int nofs_flag;
-       char *ptr;
-       int ret;
-
-       nofs_flag = memalloc_nofs_save();
-       ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
-                                     delayed_item->data_len);
-       memalloc_nofs_restore(nofs_flag);
-       if (ret < 0 && ret != -EEXIST)
-               return ret;
-
-       leaf = path->nodes[0];
-
-       ptr = btrfs_item_ptr(leaf, path->slots[0], char);
-
-       write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
-                           delayed_item->data_len);
-       btrfs_mark_buffer_dirty(leaf);
-
-       btrfs_delayed_item_release_metadata(root, delayed_item);
-       return 0;
-}
-
-/*
- * we insert an item first, then if there are some continuous items, we try
- * to insert those items into the same leaf.
- */
 static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
                                      struct btrfs_path *path,
                                      struct btrfs_root *root,
                                      struct btrfs_delayed_node *node)
 {
-       struct btrfs_delayed_item *curr, *prev;
        int ret = 0;
 
-do_again:
-       mutex_lock(&node->mutex);
-       curr = __btrfs_first_delayed_insertion_item(node);
-       if (!curr)
-               goto insert_end;
-
-       ret = btrfs_insert_delayed_item(trans, root, path, curr);
-       if (ret < 0) {
-               btrfs_release_path(path);
-               goto insert_end;
-       }
+       while (ret == 0) {
+               struct btrfs_delayed_item *curr;
 
-       prev = curr;
-       curr = __btrfs_next_delayed_item(prev);
-       if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
-               /* insert the continuous items into the same leaf */
-               path->slots[0]++;
-               btrfs_batch_insert_items(root, path, curr);
+               mutex_lock(&node->mutex);
+               curr = __btrfs_first_delayed_insertion_item(node);
+               if (!curr) {
+                       mutex_unlock(&node->mutex);
+                       break;
+               }
+               ret = btrfs_insert_delayed_item(trans, root, path, curr);
+               mutex_unlock(&node->mutex);
        }
-       btrfs_release_delayed_item(prev);
-       btrfs_mark_buffer_dirty(path->nodes[0]);
 
-       btrfs_release_path(path);
-       mutex_unlock(&node->mutex);
-       goto do_again;
-
-insert_end:
-       mutex_unlock(&node->mutex);
        return ret;
 }
 
@@ -914,7 +856,6 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
                                      struct btrfs_delayed_node *node)
 {
        struct btrfs_delayed_item *curr, *prev;
-       unsigned int nofs_flag;
        int ret = 0;
 
 do_again:
@@ -923,9 +864,7 @@ do_again:
        if (!curr)
                goto delete_fail;
 
-       nofs_flag = memalloc_nofs_save();
        ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
-       memalloc_nofs_restore(nofs_flag);
        if (ret < 0)
                goto delete_fail;
        else if (ret > 0) {
@@ -994,7 +933,6 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
        struct btrfs_key key;
        struct btrfs_inode_item *inode_item;
        struct extent_buffer *leaf;
-       unsigned int nofs_flag;
        int mod;
        int ret;
 
@@ -1007,9 +945,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
        else
                mod = 1;
 
-       nofs_flag = memalloc_nofs_save();
        ret = btrfs_lookup_inode(trans, root, path, &key, mod);
-       memalloc_nofs_restore(nofs_flag);
        if (ret > 0)
                ret = -ENOENT;
        if (ret < 0)
@@ -1066,9 +1002,7 @@ search:
        key.type = BTRFS_INODE_EXTREF_KEY;
        key.offset = -1;
 
-       nofs_flag = memalloc_nofs_save();
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-       memalloc_nofs_restore(nofs_flag);
        if (ret < 0)
                goto err_out;
        ASSERT(ret);
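
These delayed-node hunks drop the memalloc_nofs_save()/memalloc_nofs_restore() pairs that used to bracket the tree searches. For reference, a minimal sketch of the scoped-NOFS pattern being removed (the search call is representative, not tied to any one hunk):

        unsigned int nofs_flag;

        /*
         * Inside the save/restore window every allocation implicitly drops
         * __GFP_FS, i.e. behaves as GFP_NOFS, whatever flags are passed.
         */
        nofs_flag = memalloc_nofs_save();
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        memalloc_nofs_restore(nofs_flag);

The removals suggest these call sites no longer need the explicit scope; the diff alone does not show where that responsibility moved.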
@@ -1711,6 +1645,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
                                  struct btrfs_inode_item *inode_item,
                                  struct inode *inode)
 {
+       u64 flags;
+
        btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
        btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
        btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
@@ -1723,7 +1659,9 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
                                       inode_peek_iversion(inode));
        btrfs_set_stack_inode_transid(inode_item, trans->transid);
        btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
-       btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
+       flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+                                         BTRFS_I(inode)->ro_flags);
+       btrfs_set_stack_inode_flags(inode_item, flags);
        btrfs_set_stack_inode_block_group(inode_item, 0);
 
        btrfs_set_stack_timespec_sec(&inode_item->atime,
@@ -1781,7 +1719,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
                                   btrfs_stack_inode_sequence(inode_item));
        inode->i_rdev = 0;
        *rdev = btrfs_stack_inode_rdev(inode_item);
-       BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
+       btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
+                               &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
 
        inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
        inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
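
The btrfs_inode_combine_flags()/btrfs_inode_split_flags() helpers are not part of this excerpt. A plausible sketch, consistent with splitting the on-disk flags into read-write and read-only halves (assumption: the packing is low/high 32 bits):

        static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags)
        {
                /* rw-incompat flags in the low half, ro-incompat in the high */
                return flags | ((u64)ro_flags << 32);
        }

        static inline void btrfs_inode_split_flags(u64 inode_item_flags,
                                                   u32 *flags, u32 *ro_flags)
        {
                *flags = (u32)inode_item_flags;
                *ro_flags = (u32)(inode_item_flags >> 32);
        }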
index 98b63eb..f1274d5 100644 (file)
@@ -170,6 +170,25 @@ out_free:
        return 0;
 }
 
+static struct btrfs_dir_item *btrfs_lookup_match_dir(
+                       struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root, struct btrfs_path *path,
+                       struct btrfs_key *key, const char *name,
+                       int name_len, int mod)
+{
+       const int ins_len = (mod < 0 ? -1 : 0);
+       const int cow = (mod != 0);
+       int ret;
+
+       ret = btrfs_search_slot(trans, root, key, path, ins_len, cow);
+       if (ret < 0)
+               return ERR_PTR(ret);
+       if (ret > 0)
+               return ERR_PTR(-ENOENT);
+
+       return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+}
+
 /*
  * lookup a directory item based on name.  'dir' is the objectid
  * we're searching in, and 'mod' tells us if you plan on deleting the
@@ -181,23 +200,18 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
                                             const char *name, int name_len,
                                             int mod)
 {
-       int ret;
        struct btrfs_key key;
-       int ins_len = mod < 0 ? -1 : 0;
-       int cow = mod != 0;
+       struct btrfs_dir_item *di;
 
        key.objectid = dir;
        key.type = BTRFS_DIR_ITEM_KEY;
-
        key.offset = btrfs_name_hash(name, name_len);
 
-       ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
-       if (ret < 0)
-               return ERR_PTR(ret);
-       if (ret > 0)
+       di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+       if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
                return NULL;
 
-       return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+       return di;
 }
 
 int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
@@ -211,7 +225,6 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
        int slot;
        struct btrfs_path *path;
 
-
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -220,20 +233,20 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
        key.type = BTRFS_DIR_ITEM_KEY;
        key.offset = btrfs_name_hash(name, name_len);
 
-       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-
-       /* return back any errors */
-       if (ret < 0)
-               goto out;
+       di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0);
+       if (IS_ERR(di)) {
+               ret = PTR_ERR(di);
+               /* Nothing found, we're safe */
+               if (ret == -ENOENT) {
+                       ret = 0;
+                       goto out;
+               }
 
-       /* nothing found, we're safe */
-       if (ret > 0) {
-               ret = 0;
-               goto out;
+               if (ret < 0)
+                       goto out;
        }
 
        /* we found an item, look for our name in the item */
-       di = btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
        if (di) {
                /* our exact name was found */
                ret = -EEXIST;
@@ -274,21 +287,13 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
                            u64 objectid, const char *name, int name_len,
                            int mod)
 {
-       int ret;
        struct btrfs_key key;
-       int ins_len = mod < 0 ? -1 : 0;
-       int cow = mod != 0;
 
        key.objectid = dir;
        key.type = BTRFS_DIR_INDEX_KEY;
        key.offset = objectid;
 
-       ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
-       if (ret < 0)
-               return ERR_PTR(ret);
-       if (ret > 0)
-               return ERR_PTR(-ENOENT);
-       return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+       return btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
 }
 
 struct btrfs_dir_item *
@@ -345,21 +350,18 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
                                          const char *name, u16 name_len,
                                          int mod)
 {
-       int ret;
        struct btrfs_key key;
-       int ins_len = mod < 0 ? -1 : 0;
-       int cow = mod != 0;
+       struct btrfs_dir_item *di;
 
        key.objectid = dir;
        key.type = BTRFS_XATTR_ITEM_KEY;
        key.offset = btrfs_name_hash(name, name_len);
-       ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
-       if (ret < 0)
-               return ERR_PTR(ret);
-       if (ret > 0)
+
+       di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+       if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
                return NULL;
 
-       return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+       return di;
 }
 
 /*
index a59ab7b..2f9515d 100644 (file)
@@ -3392,11 +3392,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
                goto fail_alloc;
        }
 
-       /* For 4K sector size support, it's only read-only */
-       if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) {
-               if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) {
+       if (sectorsize != PAGE_SIZE) {
+               btrfs_warn(fs_info,
+               "read-write for sector size %u with page size %lu is experimental",
+                          sectorsize, PAGE_SIZE);
+       }
+       if (sectorsize != PAGE_SIZE) {
+               if (btrfs_super_incompat_flags(fs_info->super_copy) &
+                       BTRFS_FEATURE_INCOMPAT_RAID56) {
                        btrfs_err(fs_info,
-       "subpage sectorsize %u only supported read-only for page size %lu",
+               "RAID56 is not yet supported for sector size %u with page size %lu",
                                sectorsize, PAGE_SIZE);
                        err = -EINVAL;
                        goto fail_alloc;
index 268ce58..fc3da75 100644 (file)
@@ -153,7 +153,7 @@ search_again:
        else
                key.type = BTRFS_EXTENT_ITEM_KEY;
 
-       ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+       ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto out_free;
 
@@ -5950,9 +5950,9 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
  */
 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
 {
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_block_group *cache = NULL;
        struct btrfs_device *device;
-       struct list_head *devices;
        u64 group_trimmed;
        u64 range_end = U64_MAX;
        u64 start;
@@ -6016,9 +6016,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
                btrfs_warn(fs_info,
                        "failed to trim %llu block group(s), last error %d",
                        bg_failed, bg_ret);
-       mutex_lock(&fs_info->fs_devices->device_list_mutex);
-       devices = &fs_info->fs_devices->devices;
-       list_for_each_entry(device, devices, dev_list) {
+
+       mutex_lock(&fs_devices->device_list_mutex);
+       list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                        continue;
 
@@ -6031,7 +6031,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
 
                trimmed += group_trimmed;
        }
-       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+       mutex_unlock(&fs_devices->device_list_mutex);
 
        if (dev_failed)
                btrfs_warn(fs_info,
index 9e81d25..aaddd72 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/pagevec.h>
 #include <linux/prefetch.h>
 #include <linux/cleancache.h>
+#include <linux/fsverity.h>
 #include "misc.h"
 #include "extent_io.h"
 #include "extent-io-tree.h"
@@ -172,6 +173,8 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 
        bio->bi_private = NULL;
 
+       /* Caller should ensure the bio has at least some range added */
+       ASSERT(bio->bi_iter.bi_size);
        if (is_data_inode(tree->private_data))
                ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
                                            bio_flags);
@@ -2245,18 +2248,6 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
        return bitset;
 }
 
-/*
- * helper function to set a given page up to date if all the
- * extents in the tree for that page are up to date
- */
-static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
-{
-       u64 start = page_offset(page);
-       u64 end = start + PAGE_SIZE - 1;
-       if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
-               SetPageUptodate(page);
-}
-
 int free_io_failure(struct extent_io_tree *failure_tree,
                    struct extent_io_tree *io_tree,
                    struct io_failure_record *rec)
@@ -2688,7 +2679,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
               start + len <= page_offset(page) + PAGE_SIZE);
 
        if (uptodate) {
-               btrfs_page_set_uptodate(fs_info, page, start, len);
+               if (fsverity_active(page->mapping->host) &&
+                   !PageError(page) &&
+                   !PageUptodate(page) &&
+                   start < i_size_read(page->mapping->host) &&
+                   !fsverity_verify_page(page)) {
+                       btrfs_page_set_error(fs_info, page, start, len);
+               } else {
+                       btrfs_page_set_uptodate(fs_info, page, start, len);
+               }
        } else {
                btrfs_page_clear_uptodate(fs_info, page, start, len);
                btrfs_page_set_error(fs_info, page, start, len);
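
With this hunk, a read completion only marks a range uptodate after fs-verity approves it. fsverity_active() and fsverity_verify_page() are the generic fs/verity hooks; the i_size check skips the zero-padded tail past EOF. Stripped of the subpage bookkeeping, the pattern is roughly:

        /* Sketch of the generic shape; btrfs applies it per sub-range. */
        if (uptodate) {
                if (fsverity_active(inode) && !fsverity_verify_page(page))
                        SetPageError(page);     /* hash mismatch: fail the read */
                else
                        SetPageUptodate(page);
        }

Verification runs before the page is ever observable as uptodate, so userspace cannot read unverified data.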
@@ -2779,7 +2778,7 @@ next:
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
 {
        struct btrfs_inode *inode;
-       int uptodate = (err == 0);
+       const bool uptodate = (err == 0);
        int ret = 0;
 
        ASSERT(page && page->mapping);
@@ -2787,8 +2786,14 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
        btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
 
        if (!uptodate) {
-               ClearPageUptodate(page);
-               SetPageError(page);
+               const struct btrfs_fs_info *fs_info = inode->root->fs_info;
+               u32 len;
+
+               ASSERT(end + 1 - start <= U32_MAX);
+               len = end + 1 - start;
+
+               btrfs_page_clear_uptodate(fs_info, page, start, len);
+               btrfs_page_set_error(fs_info, page, start, len);
                ret = err < 0 ? err : -EIO;
                mapping_set_error(page->mapping, ret);
        }
@@ -3097,7 +3102,7 @@ readpage_ok:
                /* Update page status and unlock */
                end_page_read(page, uptodate, start, len);
                endio_readpage_release_extent(&processed, BTRFS_I(inode),
-                                             start, end, uptodate);
+                                             start, end, PageUptodate(page));
        }
        /* Release the last extent */
        endio_readpage_release_extent(&processed, NULL, 0, 0, false);
@@ -3153,11 +3158,13 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
        return bio;
 }
 
-struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
 {
        struct bio *bio;
        struct btrfs_io_bio *btrfs_bio;
 
+       ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+
        /* this will never fail when it's backed by a bioset */
        bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
        ASSERT(bio);
@@ -3181,20 +3188,22 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
  * @size:      portion of page that we want to write
  * @prev_bio_flags:  flags of previous bio to see if we can merge the current one
  * @bio_flags: flags of the current bio to see if we can merge them
- * @return:    true if page was added, false otherwise
  *
  * Attempt to add a page to bio considering stripe alignment etc.
  *
- * Return true if successfully page added. Otherwise, return false.
+ * Return >= 0 for the number of bytes added to the bio.
+ * Can return 0 if the current bio is already at a stripe/zone boundary.
+ * Return < 0 for error.
  */
-static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
-                              struct page *page,
-                              u64 disk_bytenr, unsigned int size,
-                              unsigned int pg_offset,
-                              unsigned long bio_flags)
+static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
+                             struct page *page,
+                             u64 disk_bytenr, unsigned int size,
+                             unsigned int pg_offset,
+                             unsigned long bio_flags)
 {
        struct bio *bio = bio_ctrl->bio;
        u32 bio_size = bio->bi_iter.bi_size;
+       u32 real_size;
        const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
        bool contig;
        int ret;
@@ -3203,29 +3212,36 @@ static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
        /* The limit should be calculated when bio_ctrl->bio is allocated */
        ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
        if (bio_ctrl->bio_flags != bio_flags)
-               return false;
+               return 0;
 
        if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
                contig = bio->bi_iter.bi_sector == sector;
        else
                contig = bio_end_sector(bio) == sector;
        if (!contig)
-               return false;
+               return 0;
 
-       if (bio_size + size > bio_ctrl->len_to_oe_boundary ||
-           bio_size + size > bio_ctrl->len_to_stripe_boundary)
-               return false;
+       real_size = min(bio_ctrl->len_to_oe_boundary,
+                       bio_ctrl->len_to_stripe_boundary) - bio_size;
+       real_size = min(real_size, size);
+
+       /*
+        * If real_size is 0, never call bio_add_*_page(): even with a size
+        * of 0, the bio would still run its endio function on the page!
+        */
+       if (real_size == 0)
+               return 0;
 
        if (bio_op(bio) == REQ_OP_ZONE_APPEND)
-               ret = bio_add_zone_append_page(bio, page, size, pg_offset);
+               ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
        else
-               ret = bio_add_page(bio, page, size, pg_offset);
+               ret = bio_add_page(bio, page, real_size, pg_offset);
 
-       return ret == size;
+       return ret;
 }
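
The return value is now the byte count actually accepted, clamped to the nearer of the stripe and ordered-extent boundaries. A worked example with hypothetical numbers:

        /*
         * len_to_stripe_boundary = 64K, len_to_oe_boundary = 16K,
         * bio_size = 12K already in the bio, size = 8K requested.
         *
         * real_size = min(16K, 64K) - 12K = 4K, then min(4K, 8K) = 4K.
         * Only 4K is added; the caller sees added < size, submits the bio
         * and retries the remaining 4K in a fresh one (see the loop in
         * submit_extent_page() below).
         */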
 
 static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
-                              struct btrfs_inode *inode)
+                              struct btrfs_inode *inode, u64 file_offset)
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_io_geometry geom;
@@ -3266,9 +3282,8 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
                return 0;
        }
 
-       ASSERT(fs_info->max_zone_append_size > 0);
        /* Ordered extent not yet created, so we're good */
-       ordered = btrfs_lookup_ordered_extent(inode, logical);
+       ordered = btrfs_lookup_ordered_extent(inode, file_offset);
        if (!ordered) {
                bio_ctrl->len_to_oe_boundary = U32_MAX;
                return 0;
@@ -3280,6 +3295,62 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
        return 0;
 }
 
+static int alloc_new_bio(struct btrfs_inode *inode,
+                        struct btrfs_bio_ctrl *bio_ctrl,
+                        struct writeback_control *wbc,
+                        unsigned int opf,
+                        bio_end_io_t end_io_func,
+                        u64 disk_bytenr, u32 offset, u64 file_offset,
+                        unsigned long bio_flags)
+{
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct bio *bio;
+       int ret;
+
+       /*
+        * For a compressed page range, its disk_bytenr is always the
+        * @disk_bytenr passed in, no matter whether any range has already
+        * been added to a previous bio.
+        */
+       if (bio_flags & EXTENT_BIO_COMPRESSED)
+               bio = btrfs_bio_alloc(disk_bytenr);
+       else
+               bio = btrfs_bio_alloc(disk_bytenr + offset);
+       bio_ctrl->bio = bio;
+       bio_ctrl->bio_flags = bio_flags;
+       bio->bi_end_io = end_io_func;
+       bio->bi_private = &inode->io_tree;
+       bio->bi_write_hint = inode->vfs_inode.i_write_hint;
+       bio->bi_opf = opf;
+       ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
+       if (ret < 0)
+               goto error;
+       if (wbc) {
+               struct block_device *bdev;
+
+               bdev = fs_info->fs_devices->latest_bdev;
+               bio_set_dev(bio, bdev);
+               wbc_init_bio(wbc, bio);
+       }
+       if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
+               struct btrfs_device *device;
+
+               device = btrfs_zoned_get_device(fs_info, disk_bytenr,
+                                               fs_info->sectorsize);
+               if (IS_ERR(device)) {
+                       ret = PTR_ERR(device);
+                       goto error;
+               }
+
+               btrfs_io_bio(bio)->device = device;
+       }
+       return 0;
+error:
+       bio_ctrl->bio = NULL;
+       bio->bi_status = errno_to_blk_status(ret);
+       bio_endio(bio);
+       return ret;
+}
+
 /*
  * @opf:       bio REQ_OP_* and REQ_* flags as one value
  * @wbc:       optional writeback control for io accounting
@@ -3305,61 +3376,67 @@ static int submit_extent_page(unsigned int opf,
                              bool force_bio_submit)
 {
        int ret = 0;
-       struct bio *bio;
-       size_t io_size = min_t(size_t, size, PAGE_SIZE);
        struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
-       struct extent_io_tree *tree = &inode->io_tree;
-       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       unsigned int cur = pg_offset;
 
        ASSERT(bio_ctrl);
 
        ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
               pg_offset + size <= PAGE_SIZE);
-       if (bio_ctrl->bio) {
-               bio = bio_ctrl->bio;
-               if (force_bio_submit ||
-                   !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size,
-                                       pg_offset, bio_flags)) {
-                       ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags);
+       if (force_bio_submit && bio_ctrl->bio) {
+               ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags);
+               bio_ctrl->bio = NULL;
+               if (ret < 0)
+                       return ret;
+       }
+
+       while (cur < pg_offset + size) {
+               u32 offset = cur - pg_offset;
+               int added;
+
+               /* Allocate new bio if needed */
+               if (!bio_ctrl->bio) {
+                       ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
+                                           end_io_func, disk_bytenr, offset,
+                                           page_offset(page) + cur,
+                                           bio_flags);
+                       if (ret < 0)
+                               return ret;
+               }
+               /*
+                * We must go through btrfs_bio_add_page() to ensure each
+                * page range won't cross stripe or ordered extent boundaries.
+                */
+               if (bio_flags & EXTENT_BIO_COMPRESSED)
+                       added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
+                                       size - offset, pg_offset + offset,
+                                       bio_flags);
+               else
+                       added = btrfs_bio_add_page(bio_ctrl, page,
+                                       disk_bytenr + offset, size - offset,
+                                       pg_offset + offset, bio_flags);
+
+               /* Metadata page range should never be split */
+               if (!is_data_inode(&inode->vfs_inode))
+                       ASSERT(added == 0 || added == size - offset);
+
+               /* We added at least one page, update the accounting */
+               if (wbc && added)
+                       wbc_account_cgroup_owner(wbc, page, added);
+
+               /* We have reached a boundary, submit right now */
+               if (added < size - offset) {
+                       /* The bio should contain some page(s) */
+                       ASSERT(bio_ctrl->bio->bi_iter.bi_size);
+                       ret = submit_one_bio(bio_ctrl->bio, mirror_num,
+                                       bio_ctrl->bio_flags);
                        bio_ctrl->bio = NULL;
                        if (ret < 0)
                                return ret;
-               } else {
-                       if (wbc)
-                               wbc_account_cgroup_owner(wbc, page, io_size);
-                       return 0;
                }
+               cur += added;
        }
-
-       bio = btrfs_bio_alloc(disk_bytenr);
-       bio_add_page(bio, page, io_size, pg_offset);
-       bio->bi_end_io = end_io_func;
-       bio->bi_private = tree;
-       bio->bi_write_hint = page->mapping->host->i_write_hint;
-       bio->bi_opf = opf;
-       if (wbc) {
-               struct block_device *bdev;
-
-               bdev = fs_info->fs_devices->latest_bdev;
-               bio_set_dev(bio, bdev);
-               wbc_init_bio(wbc, bio);
-               wbc_account_cgroup_owner(wbc, page, io_size);
-       }
-       if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
-               struct btrfs_device *device;
-
-               device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size);
-               if (IS_ERR(device))
-                       return PTR_ERR(device);
-
-               btrfs_io_bio(bio)->device = device;
-       }
-
-       bio_ctrl->bio = bio;
-       bio_ctrl->bio_flags = bio_flags;
-       ret = calc_bio_boundaries(bio_ctrl, inode);
-
-       return ret;
+       return 0;
 }
 
 static int attach_extent_buffer_page(struct extent_buffer *eb,
@@ -3488,7 +3565,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
        size_t pg_offset = 0;
        size_t iosize;
        size_t blocksize = inode->i_sb->s_blocksize;
-       unsigned long this_bio_flag = 0;
        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 
        ret = set_page_extent_mapped(page);
@@ -3519,6 +3595,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
        }
        begin_page_read(fs_info, page);
        while (cur <= end) {
+               unsigned long this_bio_flag = 0;
                bool force_bio_submit = false;
                u64 disk_bytenr;
 
@@ -3627,7 +3704,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
                /* the get_extent function already copied into the page */
                if (test_range_bit(tree, cur, cur_end,
                                   EXTENT_UPTODATE, 1, NULL)) {
-                       check_page_uptodate(tree, page);
                        unlock_extent(tree, cur, cur + iosize - 1);
                        end_page_read(page, true, cur, iosize);
                        cur = cur + iosize;
@@ -3722,14 +3798,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
                ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
                                delalloc_end, &page_started, nr_written, wbc);
                if (ret) {
-                       SetPageError(page);
-                       /*
-                        * btrfs_run_delalloc_range should return < 0 for error
-                        * but just in case, we use > 0 here meaning the IO is
-                        * started, so we don't want to return > 0 unless
-                        * things are going well.
-                        */
-                       return ret < 0 ? ret : -EIO;
+                       btrfs_page_set_error(inode->root->fs_info, page,
+                                            page_offset(page), PAGE_SIZE);
+                       return ret;
                }
                /*
                 * delalloc_end is already one less than the total length, so
@@ -3829,9 +3900,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
                                 int *nr_ret)
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
-       u64 start = page_offset(page);
-       u64 end = start + PAGE_SIZE - 1;
-       u64 cur = start;
+       u64 cur = page_offset(page);
+       u64 end = cur + PAGE_SIZE - 1;
        u64 extent_offset;
        u64 block_start;
        struct extent_map *em;
@@ -3841,7 +3911,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
        const unsigned int write_flags = wbc_to_write_flags(wbc);
        bool compressed;
 
-       ret = btrfs_writepage_cow_fixup(page, start, end);
+       ret = btrfs_writepage_cow_fixup(page);
        if (ret) {
                /* Fixup worker will requeue */
                redirty_page_for_writepage(wbc, page);
@@ -3865,7 +3935,16 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 
                if (cur >= i_size) {
                        btrfs_writepage_endio_finish_ordered(inode, page, cur,
-                                                            end, 1);
+                                                            end, true);
+                       /*
+                        * This range is beyond i_size, thus we don't need to
+                        * bother writing it back.
+                        * But we still need to clear the dirty subpage bit, or
+                        * the next time the page gets dirtied, we will try to
+                        * write back the sectors with subpage dirty bits,
+                        * causing writeback without an ordered extent.
+                        */
+                       btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
                        break;
                }
 
@@ -3915,7 +3994,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
                                nr++;
                        else
                                btrfs_writepage_endio_finish_ordered(inode,
-                                               page, cur, cur + iosize - 1, 1);
+                                               page, cur, cur + iosize - 1, true);
+                       btrfs_page_clear_dirty(fs_info, page, cur, iosize);
                        cur += iosize;
                        continue;
                }
@@ -3951,6 +4031,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
                cur += iosize;
                nr++;
        }
+       /*
+        * If we finish without a problem, we should not only clear the page
+        * dirty flag, but also empty the subpage dirty bits.
+        */
+       if (!ret)
+               btrfs_page_assert_not_dirty(fs_info, page);
        *nr_ret = nr;
        return ret;
 }
@@ -3981,7 +4067,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
        WARN_ON(!PageLocked(page));
 
-       ClearPageError(page);
+       btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
+                              page_offset(page), PAGE_SIZE);
 
        pg_offset = offset_in_page(i_size);
        if (page->index > end_index ||
@@ -4022,10 +4109,39 @@ done:
                set_page_writeback(page);
                end_page_writeback(page);
        }
-       if (PageError(page)) {
-               ret = ret < 0 ? ret : -EIO;
+       /*
+        * Here we used to have a check for PageError() and then set @ret and
+        * call end_extent_writepage().
+        *
+        * But in fact setting @ret here will cause different error paths
+        * between subpage and regular sectorsize.
+        *
+        * For regular page size, we never submit the current page, but only
+        * add it to the current bio.
+        * The bio submission can only happen when writing the next page.
+        * Thus if we hit the PageError() branch, @ret is already set to a
+        * non-zero value and will not get updated for regular sectorsize.
+        *
+        * But for the subpage case, it's possible that we submit part of the
+        * current page and thus get PageError() set by a submitted bio of the
+        * same page, while our @ret is still 0.
+        *
+        * So here we unify the behavior and don't set @ret.
+        * The error can still be properly passed to a higher layer, as the
+        * page will be marked with an error; here we just don't handle the
+        * IO failure.
+        *
+        * NOTE: This is just a hotfix for subpage.
+        * The proper fix is to end the ordered extent when we hit an error
+        * during writeback.
+        *
+        * But that needs a bigger refactoring, as we not only need to grab the
+        * submitted OE, but also need to know exactly at which bytenr we hit
+        * the error.
+        * Currently the full page based __extent_writepage_io() is not
+        * capable of that.
+        */
+       if (PageError(page))
                end_extent_writepage(page, ret, start, page_end);
-       }
        unlock_page(page);
        ASSERT(ret <= 0);
        return ret;
@@ -4984,7 +5100,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
                        ret = __extent_writepage(page, &wbc_writepages, &epd);
                else {
                        btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
-                                       page, start, start + PAGE_SIZE - 1, 1);
+                                       page, start, start + PAGE_SIZE - 1, true);
                        unlock_page(page);
                }
                put_page(page);
index 62027f5..53abdc2 100644 (file)
@@ -280,7 +280,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 struct bio *btrfs_bio_alloc(u64 first_byte);
 struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
 struct bio *btrfs_bio_clone(struct bio *bio);
-struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
 
 int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
                      u64 length, u64 logical, struct page *page,
index df6631e..2673c6b 100644 (file)
@@ -233,7 +233,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_path *path, u64 objectid,
                             u64 offset, int mod)
 {
-       int ret;
        struct btrfs_key file_key;
        int ins_len = mod < 0 ? -1 : 0;
        int cow = mod != 0;
@@ -241,8 +240,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
        file_key.objectid = objectid;
        file_key.offset = offset;
        file_key.type = BTRFS_EXTENT_DATA_KEY;
-       ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
-       return ret;
+
+       return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
 }
 
 /*
index ee34497..7ff5770 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/btrfs.h>
 #include <linux/uio.h>
 #include <linux/iversion.h>
+#include <linux/fsverity.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -1340,7 +1341,18 @@ static int prepare_uptodate_page(struct inode *inode,
                        unlock_page(page);
                        return -EIO;
                }
-               if (page->mapping != inode->i_mapping) {
+
+               /*
+                * Since btrfs_readpage() will unlock the page before it
+                * returns, there is a window where btrfs_releasepage() can be
+                * called to release the page.  Here we check both the inode
+                * mapping and PagePrivate() to make sure the page was not
+                * released.
+                *
+                * The private flag check is essential for subpage, as we need
+                * to store an extra bitmap in page->private.
+                */
+               if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
                        unlock_page(page);
                        return -EAGAIN;
                }
@@ -3604,7 +3616,13 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
 
 static int btrfs_file_open(struct inode *inode, struct file *filp)
 {
+       int ret;
+
        filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+
+       ret = fsverity_file_open(inode, filp);
+       if (ret)
+               return ret;
        return generic_file_open(inode, filp);
 }
 
@@ -3633,6 +3651,9 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t ret;
 
+       if (fsverity_active(inode))
+               return 0;
+
        if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
                return 0;
 
index 2131ae5..da0eee7 100644 (file)
@@ -344,19 +344,13 @@ fail:
 
 static void readahead_cache(struct inode *inode)
 {
-       struct file_ra_state *ra;
+       struct file_ra_state ra;
        unsigned long last_index;
 
-       ra = kzalloc(sizeof(*ra), GFP_NOFS);
-       if (!ra)
-               return;
-
-       file_ra_state_init(ra, inode->i_mapping);
+       file_ra_state_init(&ra, inode->i_mapping);
        last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 
-       page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
-
-       kfree(ra);
+       page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
 }
 
 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
@@ -2544,6 +2538,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        u64 offset = bytenr - block_group->start;
        u64 to_free, to_unusable;
+       const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
 
        spin_lock(&ctl->tree_lock);
        if (!used)
@@ -2573,9 +2568,9 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
        /* All the region is now unusable. Mark it as unused and reclaim */
        if (block_group->zone_unusable == block_group->length) {
                btrfs_mark_bg_unused(block_group);
-       } else if (block_group->zone_unusable >=
-                  div_factor_fine(block_group->length,
-                                  fs_info->bg_reclaim_threshold)) {
+       } else if (bg_reclaim_threshold &&
+                  block_group->zone_unusable >=
+                  div_factor_fine(block_group->length, bg_reclaim_threshold)) {
                btrfs_mark_bg_to_reclaim(block_group);
        }
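
Snapshotting the sysfs-tunable bg_reclaim_threshold once with READ_ONCE() keeps the decision coherent against concurrent writes, and a value of 0 now disables auto reclaim entirely. For the arithmetic, div_factor_fine(num, factor) computes num * factor / 100, so with a hypothetical 256M zone and a threshold of, say, 75:

        /*
         * Illustrative numbers only:
         *   div_factor_fine(SZ_256M, 75) == SZ_256M * 75 / 100 == 192M,
         * i.e. the group is queued for reclaim once 192M of it is unusable.
         */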
 
@@ -2652,8 +2647,11 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group,
                 * btrfs_pin_extent_for_log_replay() when replaying the log.
                 * Advance the pointer so as not to overwrite the tree-log nodes.
                 */
-               if (block_group->alloc_offset < offset + bytes)
-                       block_group->alloc_offset = offset + bytes;
+               if (block_group->start + block_group->alloc_offset <
+                   offset + bytes) {
+                       block_group->alloc_offset =
+                               offset + bytes - block_group->start;
+               }
                return 0;
        }
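
The fix converts both sides of the comparison to the same address space: alloc_offset is relative to the block group start, while offset is an absolute logical address. Worked example (hypothetical numbers):

        /*
         * block_group->start = 1G, alloc_offset = 4K,
         * log replay pins offset = 1G + 8K, bytes = 4K.
         *
         * Old check: 4K < 1G + 12K is always true, and alloc_offset was then
         * assigned the absolute 1G + 12K, corrupting the write pointer.
         * New check: 1G + 4K < 1G + 12K, and
         *   alloc_offset = (1G + 12K) - 1G = 12K (relative, as intended).
         */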
 
index bd5689f..2b7fe98 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/sched/mm.h>
 #include <linux/iomap.h>
 #include <asm/unaligned.h>
+#include <linux/fsverity.h>
 #include "misc.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -286,9 +287,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_SIZE);
 
-                       kaddr = kmap_atomic(cpage);
+                       kaddr = page_address(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
-                       kunmap_atomic(kaddr);
 
                        i++;
                        ptr += cur_size;
@@ -490,6 +490,9 @@ static noinline int add_async_extent(struct async_chunk *cow,
  */
 static inline bool inode_can_compress(struct btrfs_inode *inode)
 {
+       /* Subpage doesn't support compression yet */
+       if (inode->root->fs_info->sectorsize < PAGE_SIZE)
+               return false;
        if (inode->flags & BTRFS_INODE_NODATACOW ||
            inode->flags & BTRFS_INODE_NODATASUM)
                return false;
@@ -682,7 +685,11 @@ again:
                }
        }
 cont:
-       if (start == 0) {
+       /*
+        * Check cow_file_range() for why we don't even try to create an
+        * inline extent for the subpage case.
+        */
+       if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
                /* lets try to make an inline extent */
                if (ret || total_in < actual_end) {
                        /* we didn't compress the entire range, try
@@ -973,7 +980,7 @@ retry:
 
                        p->mapping = inode->vfs_inode.i_mapping;
                        btrfs_writepage_endio_finish_ordered(inode, p, start,
-                                                            end, 0);
+                                                            end, false);
 
                        p->mapping = NULL;
                        extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
@@ -1080,7 +1087,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
 
        inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
 
-       if (start == 0) {
+       /*
+        * Due to the page size limit, for subpage we can only trigger
+        * writeback for the dirty sectors of a page, which means data
+        * writeback is doing more writeback than what we want.
+        *
+        * This is especially unexpected for some call sites like fallocate,
+        * where we only increase i_size after everything is done.
+        * This means we can trigger inline extent creation even if we didn't
+        * want to.  So here we skip inline extent creation completely.
+        */
+       if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
                /* lets try to make an inline extent */
                ret = cow_file_range_inline(inode, start, end, 0,
                                            BTRFS_COMPRESS_NONE, NULL);
@@ -1290,11 +1307,6 @@ static noinline void async_cow_submit(struct btrfs_work *work)
        nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
                PAGE_SHIFT;
 
-       /* atomic_sub_return implies a barrier */
-       if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
-           5 * SZ_1M)
-               cond_wake_up_nomb(&fs_info->async_submit_wait);
-
        /*
         * ->inode could be NULL if async_chunk_start has failed to compress,
         * in which case we don't have anything to submit, yet we need to
@@ -1303,6 +1315,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)
         */
        if (async_chunk->inode)
                submit_compressed_extents(async_chunk);
+
+       /* atomic_sub_return implies a barrier */
+       if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+           5 * SZ_1M)
+               cond_wake_up_nomb(&fs_info->async_submit_wait);
 }
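
The relocated block also shows why the _nomb ("no memory barrier") wakeup is safe here: a waitqueue_active() check normally needs a barrier between the state update and the check, and atomic_sub_return() is fully ordered, so it already provides one. Schematically:

        atomic_sub_return(nr_pages, &counter);  /* fully ordered atomic op */
        cond_wake_up_nomb(&wq);                 /* barrier already implied */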
 
 static noinline void async_cow_free(struct btrfs_work *work)
@@ -1946,6 +1963,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
                ret = cow_file_range_async(inode, wbc, locked_page, start, end,
                                           page_started, nr_written);
        }
+       ASSERT(ret <= 0);
        if (ret)
                btrfs_cleanup_ordered_extents(inode, locked_page, start,
                                              end - start + 1);
@@ -2285,7 +2303,6 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
        struct extent_map *split_mid = NULL;
        struct extent_map *split_post = NULL;
        int ret = 0;
-       int modified;
        unsigned long flags;
 
        /* Sanity check */
@@ -2315,11 +2332,12 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
        ASSERT(em->len == len);
        ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
        ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+       ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+       ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+       ASSERT(!list_empty(&em->list));
 
        flags = em->flags;
        clear_bit(EXTENT_FLAG_PINNED, &em->flags);
-       clear_bit(EXTENT_FLAG_LOGGING, &flags);
-       modified = !list_empty(&em->list);
 
        /* First, replace the em with a new extent_map starting from em->start */
        split_pre->start = em->start;
@@ -2333,7 +2351,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
        split_pre->compress_type = em->compress_type;
        split_pre->generation = em->generation;
 
-       replace_extent_mapping(em_tree, em, split_pre, modified);
+       replace_extent_mapping(em_tree, em, split_pre, 1);
 
        /*
         * Now we only have an extent_map at:
@@ -2353,7 +2371,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
                split_mid->flags = flags;
                split_mid->compress_type = em->compress_type;
                split_mid->generation = em->generation;
-               add_extent_mapping(em_tree, split_mid, modified);
+               add_extent_mapping(em_tree, split_mid, 1);
        }
 
        if (post) {
@@ -2367,7 +2385,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
                split_post->flags = flags;
                split_post->compress_type = em->compress_type;
                split_post->generation = em->generation;
-               add_extent_mapping(em_tree, split_post, modified);
+               add_extent_mapping(em_tree, split_post, 1);
        }
 
        /* Once for us */
@@ -2770,7 +2788,7 @@ out_page:
  * to fix it up.  The async helper will wait for ordered extents, set
  * the delalloc bit and make it safe to write the page.
  */
-int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
+int btrfs_writepage_cow_fixup(struct page *page)
 {
        struct inode *inode = page->mapping->host;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3171,7 +3189,7 @@ static void finish_ordered_fn(struct btrfs_work *work)
 
 void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
                                          struct page *page, u64 start,
-                                         u64 end, int uptodate)
+                                         u64 end, bool uptodate)
 {
        trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
 
@@ -3257,25 +3275,44 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
                return 0;
        }
 
-       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+       /*
+        * For the subpage case, the above PageChecked is not safe as it's
+        * not subpage compatible.
+        * But for now only cow fixup and compressed read utilize the
+        * PageChecked flag, while in this context we can easily use
+        * io_bio->csum to determine if we really need to do csum
+        * verification.
+        *
+        * So for now, just exit if io_bio->csum is NULL, as that means it's a
+        * compressed read whose compressed data csum has already been
+        * verified.
+        */
+       if (io_bio->csum == NULL)
                return 0;
 
-       if (!root->fs_info->csum_root)
+       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
                return 0;
 
-       if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
-           test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
-               clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
+       if (!root->fs_info->csum_root)
                return 0;
-       }
 
        ASSERT(page_offset(page) <= start &&
               end <= page_offset(page) + PAGE_SIZE - 1);
        for (pg_off = offset_in_page(start);
             pg_off < offset_in_page(end);
             pg_off += sectorsize, bio_offset += sectorsize) {
+               u64 file_offset = pg_off + page_offset(page);
                int ret;
 
+               if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+                   test_range_bit(io_tree, file_offset,
+                                  file_offset + sectorsize - 1,
+                                  EXTENT_NODATASUM, 1, NULL)) {
+                       /* Skip the range without csum for data reloc inode */
+                       clear_extent_bits(io_tree, file_offset,
+                                         file_offset + sectorsize - 1,
+                                         EXTENT_NODATASUM);
+                       continue;
+               }
                ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
                                      page_offset(page) + pg_off);
                if (ret < 0) {
@@ -3520,7 +3557,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 
                /*
                 * If we have an inode with links, there are a couple of
-                * possibilities. Old kernels (before v3.12) used to create an
+                * possibilities:
+                *
+                * 1. We were halfway through creating fsverity metadata for the
+                * file. In that case, the orphan item represents incomplete
+                * fsverity metadata, which must be cleaned up by calling
+                * btrfs_drop_verity_items and deleting the orphan item.
+                *
+                * 2. Old kernels (before v3.12) used to create an
                 * orphan item for truncate indicating that there were possibly
                 * extent items past i_size that needed to be deleted. In v3.12,
                 * truncate was changed to update i_size in sync with the extent
@@ -3538,8 +3582,12 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                 * but either way, we can delete the orphan item.
                 */
                if (ret == -ENOENT || inode->i_nlink) {
-                       if (!ret)
+                       if (!ret) {
+                               ret = btrfs_drop_verity_items(BTRFS_I(inode));
                                iput(inode);
+                               if (ret)
+                                       goto out;
+                       }
                        trans = btrfs_start_transaction(root, 1);
                        if (IS_ERR(trans)) {
                                ret = PTR_ERR(trans);
@@ -3728,7 +3776,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
        rdev = btrfs_inode_rdev(leaf, inode_item);
 
        BTRFS_I(inode)->index_cnt = (u64)-1;
-       BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+       btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
+                               &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
 
 cache_index:
        /*
@@ -3859,6 +3908,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
                            struct inode *inode)
 {
        struct btrfs_map_token token;
+       u64 flags;
 
        btrfs_init_map_token(&token, leaf);
 
@@ -3894,7 +3944,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
        btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
        btrfs_set_token_inode_transid(&token, item, trans->transid);
        btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
-       btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+       flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+                                         BTRFS_I(inode)->ro_flags);
+       btrfs_set_token_inode_flags(&token, item, flags);
        btrfs_set_token_inode_block_group(&token, item, 0);
 }
 
@@ -5088,15 +5140,13 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
        int ret;
 
        /*
-        * Still need to make sure the inode looks like it's been updated so
-        * that any holes get logged if we fsync.
+        * If NO_HOLES is enabled, we don't need to do anything.
+        * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
+        * or btrfs_update_inode() will be called, which guarantees that the
+        * next fsync will know this inode was changed and needs to be logged.
         */
-       if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
-               inode->last_trans = fs_info->generation;
-               inode->last_sub_trans = root->log_transid;
-               inode->last_log_commit = root->last_log_commit;
+       if (btrfs_fs_incompat(fs_info, NO_HOLES))
                return 0;
-       }
 
        /*
         * 1 - for the one we're dropping
@@ -5342,7 +5392,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
        if (btrfs_root_readonly(root))
                return -EROFS;
 
-       err = setattr_prepare(&init_user_ns, dentry, attr);
+       err = setattr_prepare(mnt_userns, dentry, attr);
        if (err)
                return err;
 
@@ -5353,13 +5403,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
        }
 
        if (attr->ia_valid) {
-               setattr_copy(&init_user_ns, inode, attr);
+               setattr_copy(mnt_userns, inode, attr);
                inode_inc_iversion(inode);
                err = btrfs_dirty_inode(inode);
 
                if (!err && attr->ia_valid & ATTR_MODE)
-                       err = posix_acl_chmod(&init_user_ns, inode,
-                                             inode->i_mode);
+                       err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
        }
 
        return err;
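
The pattern across these idmapped-mount hunks is to thread the mount's user namespace into the VFS helpers instead of hardcoding &init_user_ns, so permission logic maps raw on-disk ids through the mount's idmapping. A sketch of such a check using the 5.15-era VFS helpers (illustrative, not code from this series):

        /* Does the caller own this inode, as seen through the idmapped mount? */
        static bool caller_owns_inode(struct user_namespace *mnt_userns,
                                      const struct inode *inode)
        {
                /* i_uid_into_mnt() maps the raw inode uid via the idmapping */
                return uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode));
        }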
@@ -5522,6 +5571,7 @@ void btrfs_evict_inode(struct inode *inode)
        trace_btrfs_inode_evict(inode);
 
        if (!root) {
+               fsverity_cleanup_inode(inode);
                clear_inode(inode);
                return;
        }
@@ -5604,6 +5654,7 @@ no_delete:
         * to retry these periodically in the future.
         */
        btrfs_remove_delayed_node(BTRFS_I(inode));
+       fsverity_cleanup_inode(inode);
        clear_inode(inode);
 }
 
@@ -6370,6 +6421,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
 
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
+                                    struct user_namespace *mnt_userns,
                                     struct inode *dir,
                                     const char *name, int name_len,
                                     u64 ref_objectid, u64 objectid,
@@ -6479,7 +6531,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        if (ret != 0)
                goto fail_unlock;
 
-       inode_init_owner(&init_user_ns, inode, dir, mode);
+       inode_init_owner(mnt_userns, inode, dir, mode);
        inode_set_bytes(inode, 0);
 
        inode->i_mtime = current_time(inode);
@@ -6664,9 +6716,9 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
        if (err)
                goto out_unlock;
 
-       inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                       dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
-                       mode, &index);
+       inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+                       dentry->d_name.name, dentry->d_name.len,
+                       btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                inode = NULL;
@@ -6728,9 +6780,9 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
        if (err)
                goto out_unlock;
 
-       inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                       dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
-                       mode, &index);
+       inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+                       dentry->d_name.name, dentry->d_name.len,
+                       btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                inode = NULL;
@@ -6873,8 +6925,9 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
        if (err)
                goto out_fail;
 
-       inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                       dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
+       inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+                       dentry->d_name.name, dentry->d_name.len,
+                       btrfs_ino(BTRFS_I(dir)), objectid,
                        S_IFDIR | mode, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
@@ -8206,8 +8259,8 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
        u64 start_sector;
        int async_submit = 0;
        u64 submit_len;
-       int clone_offset = 0;
-       int clone_len;
+       u64 clone_offset = 0;
+       u64 clone_len;
        u64 logical;
        int ret;
        blk_status_t status;
@@ -8255,9 +8308,9 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
                        status = errno_to_blk_status(ret);
                        goto out_err_em;
                }
-               ASSERT(geom.len <= INT_MAX);
 
-               clone_len = min_t(int, submit_len, geom.len);
+               clone_len = min(submit_len, geom.len);
+               ASSERT(clone_len <= UINT_MAX);
 
                /*
                 * This will never fail as it's passing GFP_NOFS and
@@ -8401,11 +8454,47 @@ static void btrfs_readahead(struct readahead_control *rac)
        extent_readahead(rac);
 }
 
+/*
+ * For releasepage() and invalidatepage() we have a race window where
+ * end_page_writeback() is called but the subpage spinlock is not yet released.
+ * If we continue to release/invalidate the page, we could cause a
+ * use-after-free of the subpage spinlock.  So this function spins until the
+ * subpage spinlock is released.
+ */
+static void wait_subpage_spinlock(struct page *page)
+{
+       struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+       struct btrfs_subpage *subpage;
+
+       if (fs_info->sectorsize == PAGE_SIZE)
+               return;
+
+       ASSERT(PagePrivate(page) && page->private);
+       subpage = (struct btrfs_subpage *)page->private;
+
+       /*
+        * This may look insane as we just acquire the spinlock and release it,
+        * without doing anything.  But we just want to make sure no one is
+        * still holding the subpage spinlock.
+        * Since the page is neither dirty nor under writeback, and we hold the
+        * page lock, the only possible holder of the spinlock is the endio
+        * function clearing page writeback.
+        *
+        * Acquiring the spinlock here therefore guarantees that all existing
+        * holders have exited and we're safe to release/invalidate the page.
+        */
+       spin_lock_irq(&subpage->lock);
+       spin_unlock_irq(&subpage->lock);
+}
+
 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
        int ret = try_release_extent_mapping(page, gfp_flags);
-       if (ret == 1)
+
+       if (ret == 1) {
+               wait_subpage_spinlock(page);
                clear_page_extent_mapped(page);
+       }
        return ret;
 }
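
The empty lock/unlock above is a general "drain the lock holders" idiom. A
minimal user-space sketch of the same idea, assuming pthreads in place of
spin_lock_irq()/spin_unlock_irq() and a hypothetical wait_for_holders():

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Returns only after every thread that held @lock when we were called
 * has released it.  No work is done inside the critical section; the
 * wait itself is the point, exactly as in wait_subpage_spinlock().
 */
static void wait_for_holders(void)
{
        pthread_mutex_lock(&lock);
        pthread_mutex_unlock(&lock);
}
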
 
@@ -8469,6 +8558,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
         * do double ordered extent accounting on the same page.
         */
        wait_on_page_writeback(page);
+       wait_subpage_spinlock(page);
 
        /*
         * For subpage case, we have call sites like
@@ -8557,7 +8647,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
                spin_unlock_irq(&inode->ordered_tree.lock);
 
                if (btrfs_dec_test_ordered_pending(inode, &ordered,
-                                       cur, range_end + 1 - cur, 1)) {
+                                                  cur, range_end + 1 - cur)) {
                        btrfs_finish_ordered_io(ordered);
                        /*
                         * The ordered extent has finished, now we're again
@@ -8938,7 +9028,8 @@ out:
  */
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
                             struct btrfs_root *new_root,
-                            struct btrfs_root *parent_root)
+                            struct btrfs_root *parent_root,
+                            struct user_namespace *mnt_userns)
 {
        struct inode *inode;
        int err;
@@ -8949,7 +9040,8 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
        if (err < 0)
                return err;
 
-       inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino,
+       inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
+                               ino, ino,
                                S_IFDIR | (~current_umask() & S_IRWXUGO),
                                &index);
        if (IS_ERR(inode))
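
With the common umask of 022, ~current_umask() & S_IRWXUGO evaluates to 0755,
so the subvolume root directory above is created as S_IFDIR | 0755, i.e.
rwxr-xr-x.
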
@@ -8993,6 +9085,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->defrag_bytes = 0;
        ei->disk_i_size = 0;
        ei->flags = 0;
+       ei->ro_flags = 0;
        ei->csum_bytes = 0;
        ei->index_cnt = (u64)-1;
        ei->dir_index = 0;
@@ -9174,6 +9267,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
        struct inode *inode = d_inode(path->dentry);
        u32 blocksize = inode->i_sb->s_blocksize;
        u32 bi_flags = BTRFS_I(inode)->flags;
+       u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
 
        stat->result_mask |= STATX_BTIME;
        stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
@@ -9186,13 +9280,15 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        if (bi_flags & BTRFS_INODE_NODUMP)
                stat->attributes |= STATX_ATTR_NODUMP;
+       if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
+               stat->attributes |= STATX_ATTR_VERITY;
 
        stat->attributes_mask |= (STATX_ATTR_APPEND |
                                  STATX_ATTR_COMPRESSED |
                                  STATX_ATTR_IMMUTABLE |
                                  STATX_ATTR_NODUMP);
 
-       generic_fillattr(&init_user_ns, inode, stat);
+       generic_fillattr(mnt_userns, inode, stat);
        stat->dev = BTRFS_I(inode)->root->anon_dev;
 
        spin_lock(&BTRFS_I(inode)->lock);
@@ -9280,8 +9376,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                /* force full log commit if subvolume involved. */
                btrfs_set_log_full_commit(trans);
        } else {
-               btrfs_pin_log_trans(root);
-               root_log_pinned = true;
                ret = btrfs_insert_inode_ref(trans, dest,
                                             new_dentry->d_name.name,
                                             new_dentry->d_name.len,
@@ -9298,8 +9392,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                /* force full log commit if subvolume involved. */
                btrfs_set_log_full_commit(trans);
        } else {
-               btrfs_pin_log_trans(dest);
-               dest_log_pinned = true;
                ret = btrfs_insert_inode_ref(trans, root,
                                             old_dentry->d_name.name,
                                             old_dentry->d_name.len,
@@ -9330,6 +9422,29 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                                BTRFS_I(new_inode), 1);
        }
 
+       /*
+        * Now pin the logs of the roots. We do it to ensure that no other task
+        * can sync the logs while we are in progress with the rename, because
+        * that could result in an inconsistency in case any of the inodes that
+        * are part of this rename operation were logged before.
+        *
+        * We pin the logs even if at this precise moment none of the inodes was
+        * logged before. This is because right after we checked for that, some
+        * other task fsyncing some other inode not involved with this rename
+        * operation could log that one of our inodes exists.
+        *
+        * We don't need to pin the logs before the above calls to
+        * btrfs_insert_inode_ref(), since those don't ever need to change a log.
+        */
+       if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+               btrfs_pin_log_trans(root);
+               root_log_pinned = true;
+       }
+       if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+               btrfs_pin_log_trans(dest);
+               dest_log_pinned = true;
+       }
+
        /* src is a subvolume */
        if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
                ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
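
The pinning contract described in the comment above can be sketched outside
the kernel. In this hedged sketch, pin_log()/unpin_log()/sync_log() stand in
for btrfs_pin_log_trans(), its btrfs_end_log_trans() counterpart and the log
sync, with a condition variable in place of the kernel's waitqueue:

#include <pthread.h>

static pthread_mutex_t log_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t log_cv = PTHREAD_COND_INITIALIZER;
static int log_pins;

static void pin_log(void)
{
        pthread_mutex_lock(&log_lock);
        log_pins++;
        pthread_mutex_unlock(&log_lock);
}

static void unpin_log(void)
{
        pthread_mutex_lock(&log_lock);
        if (--log_pins == 0)
                pthread_cond_broadcast(&log_cv);
        pthread_mutex_unlock(&log_lock);
}

/* A syncer cannot run while any rename still holds a pin. */
static void sync_log(void)
{
        pthread_mutex_lock(&log_lock);
        while (log_pins > 0)
                pthread_cond_wait(&log_cv, &log_lock);
        /* ... write out the log tree ... */
        pthread_mutex_unlock(&log_lock);
}
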
@@ -9411,8 +9526,7 @@ out_fail:
                if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
                    btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
                    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
-                   (new_inode &&
-                    btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
+                   btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
                        btrfs_set_log_full_commit(trans);
 
                if (root_log_pinned) {
@@ -9436,6 +9550,7 @@ out_notrans:
 
 static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
+                                    struct user_namespace *mnt_userns,
                                     struct inode *dir,
                                     struct dentry *dentry)
 {
@@ -9448,7 +9563,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
        if (ret)
                return ret;
 
-       inode = btrfs_new_inode(trans, root, dir,
+       inode = btrfs_new_inode(trans, root, mnt_userns, dir,
                                dentry->d_name.name,
                                dentry->d_name.len,
                                btrfs_ino(BTRFS_I(dir)),
@@ -9485,9 +9600,10 @@ out:
        return ret;
 }
 
-static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-                          struct inode *new_dir, struct dentry *new_dentry,
-                          unsigned int flags)
+static int btrfs_rename(struct user_namespace *mnt_userns,
+                       struct inode *old_dir, struct dentry *old_dentry,
+                       struct inode *new_dir, struct dentry *new_dentry,
+                       unsigned int flags)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
        struct btrfs_trans_handle *trans;
@@ -9582,8 +9698,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                /* force full log commit if subvolume involved. */
                btrfs_set_log_full_commit(trans);
        } else {
-               btrfs_pin_log_trans(root);
-               log_pinned = true;
                ret = btrfs_insert_inode_ref(trans, dest,
                                             new_dentry->d_name.name,
                                             new_dentry->d_name.len,
@@ -9607,6 +9721,25 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
                ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
        } else {
+               /*
+                * Now pin the log. We do it to ensure that no other task can
+                * sync the log while we are in progress with the rename, as
+                * that could result in an inconsistency in case any of the
+                * inodes that are part of this rename operation were logged
+                * before.
+                *
+                * We pin the log even if at this precise moment none of the
+                * inodes was logged before. This is because right after we
+                * checked for that, some other task fsyncing some other inode
+                * not involved with this rename operation could log that one of
+                * our inodes exists.
+                *
+                * We don't need to pin the logs before the above call to
+                * btrfs_insert_inode_ref(), since that does not need to change
+                * a log.
+                */
+               btrfs_pin_log_trans(root);
+               log_pinned = true;
                ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
                                        BTRFS_I(d_inode(old_dentry)),
                                        old_dentry->d_name.name,
@@ -9660,8 +9793,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        }
 
        if (flags & RENAME_WHITEOUT) {
-               ret = btrfs_whiteout_for_rename(trans, root, old_dir,
-                                               old_dentry);
+               ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
+                                               old_dir, old_dentry);
 
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
@@ -9711,7 +9844,8 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di
                return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
                                          new_dentry);
 
-       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+       return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
+                           new_dentry, flags);
 }
 
 struct btrfs_delalloc_work {
@@ -9808,11 +9942,7 @@ static int start_delalloc_inodes(struct btrfs_root *root,
                        btrfs_queue_work(root->fs_info->flush_workers,
                                         &work->work);
                } else {
-                       ret = sync_inode(inode, wbc);
-                       if (!ret &&
-                           test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-                                    &BTRFS_I(inode)->runtime_flags))
-                               ret = sync_inode(inode, wbc);
+                       ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
                        btrfs_add_delayed_iput(inode);
                        if (ret || wbc->nr_to_write <= 0)
                                goto out;
@@ -9947,9 +10077,10 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
        if (err)
                goto out_unlock;
 
-       inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                               dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
-                               objectid, S_IFLNK|S_IRWXUGO, &index);
+       inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+                               dentry->d_name.name, dentry->d_name.len,
+                               btrfs_ino(BTRFS_I(dir)), objectid,
+                               S_IFLNK | S_IRWXUGO, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                inode = NULL;
@@ -10273,7 +10404,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns,
                if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
                        return -EACCES;
        }
-       return generic_permission(&init_user_ns, inode, mask);
+       return generic_permission(mnt_userns, inode, mask);
 }
 
 static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
@@ -10298,7 +10429,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
        if (ret)
                goto out;
 
-       inode = btrfs_new_inode(trans, root, dir, NULL, 0,
+       inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
                        btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
        if (IS_ERR(inode)) {
                ret = PTR_ERR(inode);
index 0ba98e0..41524f9 100644
@@ -27,6 +27,7 @@
 #include <linux/uaccess.h>
 #include <linux/iversion.h>
 #include <linux/fileattr.h>
+#include <linux/fsverity.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "export.h"
@@ -103,9 +104,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
  * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
  * ioctl.
  */
-static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
+static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
 {
        unsigned int iflags = 0;
+       u32 flags = binode->flags;
+       u32 ro_flags = binode->ro_flags;
 
        if (flags & BTRFS_INODE_SYNC)
                iflags |= FS_SYNC_FL;
@@ -121,6 +124,8 @@ static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
                iflags |= FS_DIRSYNC_FL;
        if (flags & BTRFS_INODE_NODATACOW)
                iflags |= FS_NOCOW_FL;
+       if (ro_flags & BTRFS_INODE_RO_VERITY)
+               iflags |= FS_VERITY_FL;
 
        if (flags & BTRFS_INODE_NOCOMPRESS)
                iflags |= FS_NOCOMP_FL;
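
Seen from user space, this translation is what FS_IOC_GETFLAGS returns, so a
verity-enabled btrfs file now reports FS_VERITY_FL. A minimal sketch, with
error handling trimmed to perror():

#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
        unsigned int flags = 0;
        int fd;

        if (argc != 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
                perror(argv[1]);
                return 1;
        }
        printf("nodatacow: %d\n", !!(flags & FS_NOCOW_FL));
        printf("verity:    %d\n", !!(flags & FS_VERITY_FL));
        return 0;
}
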
@@ -148,10 +153,12 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
                new_fl |= S_NOATIME;
        if (binode->flags & BTRFS_INODE_DIRSYNC)
                new_fl |= S_DIRSYNC;
+       if (binode->ro_flags & BTRFS_INODE_RO_VERITY)
+               new_fl |= S_VERITY;
 
        set_mask_bits(&inode->i_flags,
-                     S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
-                     new_fl);
+                     S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
+                     S_VERITY, new_fl);
 }
 
 /*
@@ -200,7 +207,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 {
        struct btrfs_inode *binode = BTRFS_I(d_inode(dentry));
 
-       fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode->flags));
+       fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode));
        return 0;
 }
 
@@ -224,7 +231,7 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns,
                return -EOPNOTSUPP;
 
        fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
-       old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
+       old_fsflags = btrfs_inode_flags_to_fsflags(binode);
        ret = check_fsflags(old_fsflags, fsflags);
        if (ret)
                return ret;
@@ -492,8 +499,8 @@ int __pure btrfs_is_empty_uuid(u8 *uuid)
        return 1;
 }
 
-static noinline int create_subvol(struct inode *dir,
-                                 struct dentry *dentry,
+static noinline int create_subvol(struct user_namespace *mnt_userns,
+                                 struct inode *dir, struct dentry *dentry,
                                  const char *name, int namelen,
                                  struct btrfs_qgroup_inherit *inherit)
 {
@@ -638,7 +645,7 @@ static noinline int create_subvol(struct inode *dir,
                goto fail;
        }
 
-       ret = btrfs_create_subvol_root(trans, new_root, root);
+       ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns);
        btrfs_put_root(new_root);
        if (ret) {
                /* We potentially lose an unused inode item here */
@@ -830,7 +837,8 @@ free_pending:
  *     nfs_async_unlink().
  */
 
-static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
+static int btrfs_may_delete(struct user_namespace *mnt_userns,
+                           struct inode *dir, struct dentry *victim, int isdir)
 {
        int error;
 
@@ -840,12 +848,12 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
        BUG_ON(d_inode(victim->d_parent) != dir);
        audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 
-       error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
+       error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
        if (IS_APPEND(dir))
                return -EPERM;
-       if (check_sticky(&init_user_ns, dir, d_inode(victim)) ||
+       if (check_sticky(mnt_userns, dir, d_inode(victim)) ||
            IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) ||
            IS_SWAPFILE(d_inode(victim)))
                return -EPERM;
@@ -864,13 +872,16 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
 }
 
 /* copy of may_create in fs/namei.c */
-static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
+static inline int btrfs_may_create(struct user_namespace *mnt_userns,
+                                  struct inode *dir, struct dentry *child)
 {
        if (d_really_is_positive(child))
                return -EEXIST;
        if (IS_DEADDIR(dir))
                return -ENOENT;
-       return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
+       if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
+               return -EOVERFLOW;
+       return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
 }
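
The new fsuidgid_has_mapping() check refuses creation when the caller's
fsuid/fsgid have no representation in the filesystem. The idmapped mounts
these checks serve are created from user space with open_tree() plus
mount_setattr(); a rough sketch, assuming the libc headers expose
SYS_open_tree/SYS_mount_setattr and that userns_fd refers to a user
namespace carrying the desired mapping:

#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

/* Returns a detached, idmapped copy of @path to attach with move_mount(). */
static int make_idmapped_mount(const char *path, int userns_fd)
{
        struct mount_attr attr = {
                .attr_set = MOUNT_ATTR_IDMAP,
                .userns_fd = userns_fd,
        };
        int fd = syscall(SYS_open_tree, AT_FDCWD, path,
                         OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);

        if (fd < 0)
                return -1;
        if (syscall(SYS_mount_setattr, fd, "", AT_EMPTY_PATH,
                    &attr, sizeof(attr)) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}
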
 
 /*
@@ -879,6 +890,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
  * inside this filesystem so it's quite a bit simpler.
  */
 static noinline int btrfs_mksubvol(const struct path *parent,
+                                  struct user_namespace *mnt_userns,
                                   const char *name, int namelen,
                                   struct btrfs_root *snap_src,
                                   bool readonly,
@@ -893,12 +905,12 @@ static noinline int btrfs_mksubvol(const struct path *parent,
        if (error == -EINTR)
                return error;
 
-       dentry = lookup_one_len(name, parent->dentry, namelen);
+       dentry = lookup_one(mnt_userns, name, parent->dentry, namelen);
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto out_unlock;
 
-       error = btrfs_may_create(dir, dentry);
+       error = btrfs_may_create(mnt_userns, dir, dentry);
        if (error)
                goto out_dput;
 
@@ -920,7 +932,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
        if (snap_src)
                error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
        else
-               error = create_subvol(dir, dentry, name, namelen, inherit);
+               error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit);
 
        if (!error)
                fsnotify_mkdir(dir, dentry);
@@ -934,6 +946,7 @@ out_unlock:
 }
 
 static noinline int btrfs_mksnapshot(const struct path *parent,
+                                  struct user_namespace *mnt_userns,
                                   const char *name, int namelen,
                                   struct btrfs_root *root,
                                   bool readonly,
@@ -963,7 +976,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
 
        btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
 
-       ret = btrfs_mksubvol(parent, name, namelen,
+       ret = btrfs_mksubvol(parent, mnt_userns, name, namelen,
                             root, readonly, inherit);
 out:
        if (snapshot_force_cow)
@@ -1792,6 +1805,7 @@ out_drop:
 }
 
 static noinline int __btrfs_ioctl_snap_create(struct file *file,
+                               struct user_namespace *mnt_userns,
                                const char *name, unsigned long fd, int subvol,
                                bool readonly,
                                struct btrfs_qgroup_inherit *inherit)
@@ -1819,8 +1833,8 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
        }
 
        if (subvol) {
-               ret = btrfs_mksubvol(&file->f_path, name, namelen,
-                                    NULL, readonly, inherit);
+               ret = btrfs_mksubvol(&file->f_path, mnt_userns, name,
+                                    namelen, NULL, readonly, inherit);
        } else {
                struct fd src = fdget(fd);
                struct inode *src_inode;
@@ -1834,16 +1848,17 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
                        btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
                                   "Snapshot src from another FS");
                        ret = -EXDEV;
-               } else if (!inode_owner_or_capable(&init_user_ns, src_inode)) {
+               } else if (!inode_owner_or_capable(mnt_userns, src_inode)) {
                        /*
                         * Subvolume creation is not restricted, but snapshots
                         * are limited to own subvolumes only
                         */
                        ret = -EPERM;
                } else {
-                       ret = btrfs_mksnapshot(&file->f_path, name, namelen,
-                                            BTRFS_I(src_inode)->root,
-                                            readonly, inherit);
+                       ret = btrfs_mksnapshot(&file->f_path, mnt_userns,
+                                              name, namelen,
+                                              BTRFS_I(src_inode)->root,
+                                              readonly, inherit);
                }
                fdput(src);
        }
@@ -1867,8 +1882,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
                return PTR_ERR(vol_args);
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 
-       ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
-                                       subvol, false, NULL);
+       ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
+                                       vol_args->name, vol_args->fd, subvol,
+                                       false, NULL);
 
        kfree(vol_args);
        return ret;
@@ -1926,8 +1942,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
                }
        }
 
-       ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
-                                       subvol, readonly, inherit);
+       ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
+                                       vol_args->name, vol_args->fd, subvol,
+                                       readonly, inherit);
        if (ret)
                goto free_inherit;
 free_inherit:
@@ -1971,7 +1988,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
        u64 flags;
        int ret = 0;
 
-       if (!inode_owner_or_capable(&init_user_ns, inode))
+       if (!inode_owner_or_capable(file_mnt_user_ns(file), inode))
                return -EPERM;
 
        ret = mnt_want_write_file(file);
@@ -2382,23 +2399,16 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
        key.offset = (u64)-1;
 
        while (1) {
-               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+               ret = btrfs_search_backwards(root, &key, path);
                if (ret < 0)
                        goto out;
                else if (ret > 0) {
-                       ret = btrfs_previous_item(root, path, dirid,
-                                                 BTRFS_INODE_REF_KEY);
-                       if (ret < 0)
-                               goto out;
-                       else if (ret > 0) {
-                               ret = -ENOENT;
-                               goto out;
-                       }
+                       ret = -ENOENT;
+                       goto out;
                }
 
                l = path->nodes[0];
                slot = path->slots[0];
-               btrfs_item_key_to_cpu(l, &key, slot);
 
                iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
                len = btrfs_inode_ref_name_len(l, iref);
@@ -2429,7 +2439,8 @@ out:
        return ret;
 }
 
-static int btrfs_search_path_in_tree_user(struct inode *inode,
+static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns,
+                               struct inode *inode,
                                struct btrfs_ioctl_ino_lookup_user_args *args)
 {
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
@@ -2473,23 +2484,16 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
                key.type = BTRFS_INODE_REF_KEY;
                key.offset = (u64)-1;
                while (1) {
-                       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-                       if (ret < 0) {
+                       ret = btrfs_search_backwards(root, &key, path);
+                       if (ret < 0)
+                               goto out_put;
+                       else if (ret > 0) {
+                               ret = -ENOENT;
                                goto out_put;
-                       } else if (ret > 0) {
-                               ret = btrfs_previous_item(root, path, dirid,
-                                                         BTRFS_INODE_REF_KEY);
-                               if (ret < 0) {
-                                       goto out_put;
-                               } else if (ret > 0) {
-                                       ret = -ENOENT;
-                                       goto out_put;
-                               }
                        }
 
                        leaf = path->nodes[0];
                        slot = path->slots[0];
-                       btrfs_item_key_to_cpu(leaf, &key, slot);
 
                        iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
                        len = btrfs_inode_ref_name_len(leaf, iref);
@@ -2527,7 +2531,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
                                ret = PTR_ERR(temp_inode);
                                goto out_put;
                        }
-                       ret = inode_permission(&init_user_ns, temp_inode,
+                       ret = inode_permission(mnt_userns, temp_inode,
                                               MAY_READ | MAY_EXEC);
                        iput(temp_inode);
                        if (ret) {
@@ -2669,7 +2673,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
                return -EACCES;
        }
 
-       ret = btrfs_search_path_in_tree_user(inode, args);
+       ret = btrfs_search_path_in_tree_user(file_mnt_user_ns(file), inode, args);
 
        if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
                ret = -EFAULT;
@@ -2905,6 +2909,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
        struct btrfs_root *dest = NULL;
        struct btrfs_ioctl_vol_args *vol_args = NULL;
        struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
+       struct user_namespace *mnt_userns = file_mnt_user_ns(file);
        char *subvol_name, *subvol_name_ptr = NULL;
        int subvol_namelen;
        int err = 0;
@@ -2932,6 +2937,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                        if (err)
                                goto out;
                } else {
+                       struct inode *old_dir;
+
                        if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
                                err = -EINVAL;
                                goto out;
@@ -2968,6 +2975,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                                err = PTR_ERR(parent);
                                goto out_drop_write;
                        }
+                       old_dir = dir;
                        dir = d_inode(parent);
 
                        /*
@@ -2978,6 +2986,20 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                         */
                        destroy_parent = true;
 
+                       /*
+                        * On idmapped mounts, deletion via subvolid is
+                        * restricted to subvolumes that are immediate
+                        * ancestors of the inode referenced by the file
+                        * descriptor in the ioctl. Otherwise the idmapping
+                        * could potentially be abused to delete subvolumes
+                        * anywhere in the filesystem the user wouldn't be able
+                        * to delete without an idmapped mount.
+                        */
+                       if (old_dir != dir && mnt_userns != &init_user_ns) {
+                               err = -EOPNOTSUPP;
+                               goto free_parent;
+                       }
+
                        subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
                                                fs_info, vol_args2->subvolid);
                        if (IS_ERR(subvol_name_ptr)) {
@@ -3016,7 +3038,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
        err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
        if (err == -EINTR)
                goto free_subvol_name;
-       dentry = lookup_one_len(subvol_name, parent, subvol_namelen);
+       dentry = lookup_one(mnt_userns, subvol_name, parent, subvol_namelen);
        if (IS_ERR(dentry)) {
                err = PTR_ERR(dentry);
                goto out_unlock_dir;
@@ -3058,14 +3080,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                if (root == dest)
                        goto out_dput;
 
-               err = inode_permission(&init_user_ns, inode,
-                                      MAY_WRITE | MAY_EXEC);
+               err = inode_permission(mnt_userns, inode, MAY_WRITE | MAY_EXEC);
                if (err)
                        goto out_dput;
        }
 
        /* check if subvolume may be deleted by a user */
-       err = btrfs_may_delete(dir, dentry, 1);
+       err = btrfs_may_delete(mnt_userns, dir, dentry, 1);
        if (err)
                goto out_dput;
 
@@ -3103,7 +3124,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 {
        struct inode *inode = file_inode(file);
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct btrfs_ioctl_defrag_range_args *range;
+       struct btrfs_ioctl_defrag_range_args range = {0};
        int ret;
 
        ret = mnt_want_write_file(file);
@@ -3115,6 +3136,12 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
                goto out;
        }
 
+       /* Subpage defrag will be supported in later commits */
+       if (root->fs_info->sectorsize < PAGE_SIZE) {
+               ret = -ENOTTY;
+               goto out;
+       }
+
        switch (inode->i_mode & S_IFMT) {
        case S_IFDIR:
                if (!capable(CAP_SYS_ADMIN)) {
@@ -3135,33 +3162,24 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
                        goto out;
                }
 
-               range = kzalloc(sizeof(*range), GFP_KERNEL);
-               if (!range) {
-                       ret = -ENOMEM;
-                       goto out;
-               }
-
                if (argp) {
-                       if (copy_from_user(range, argp,
-                                          sizeof(*range))) {
+                       if (copy_from_user(&range, argp, sizeof(range))) {
                                ret = -EFAULT;
-                               kfree(range);
                                goto out;
                        }
                        /* compression requires us to start the IO */
-                       if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
-                               range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
-                               range->extent_thresh = (u32)-1;
+                       if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+                               range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
+                               range.extent_thresh = (u32)-1;
                        }
                } else {
                        /* the rest are all set to zero by the on-stack initializer */
-                       range->len = (u64)-1;
+                       range.len = (u64)-1;
                }
                ret = btrfs_defrag_file(file_inode(file), file,
-                                       range, BTRFS_OLDEST_GENERATION, 0);
+                                       &range, BTRFS_OLDEST_GENERATION, 0);
                if (ret > 0)
                        ret = 0;
-               kfree(range);
                break;
        default:
                ret = -EINVAL;
@@ -4404,25 +4422,20 @@ drop_write:
 static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
                                                void __user *arg)
 {
-       struct btrfs_ioctl_quota_rescan_args *qsa;
+       struct btrfs_ioctl_quota_rescan_args qsa = {0};
        int ret = 0;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
-       if (!qsa)
-               return -ENOMEM;
-
        if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
-               qsa->flags = 1;
-               qsa->progress = fs_info->qgroup_rescan_progress.objectid;
+               qsa.flags = 1;
+               qsa.progress = fs_info->qgroup_rescan_progress.objectid;
        }
 
-       if (copy_to_user(arg, qsa, sizeof(*qsa)))
+       if (copy_to_user(arg, &qsa, sizeof(qsa)))
                ret = -EFAULT;
 
-       kfree(qsa);
        return ret;
 }
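
This hunk and the defrag ioctl hunk above apply the same conversion: a small,
fixed-size argument struct moves from a kzalloc()/kfree() pair onto the
stack, removing an allocation-failure path. A kernel-style sketch of the
pattern, with a stand-in struct args:

struct args {
        u64 len;
        u32 flags;
};

/* Before: heap allocation that can fail and must be freed. */
struct args *a = kzalloc(sizeof(*a), GFP_KERNEL);
if (!a)
        return -ENOMEM;
/* ... fill *a, copy_to_user() ... */
kfree(a);

/* After: zero-initialized on the stack, no failure path, no cleanup. */
struct args a = {0};
/* ... fill a, copy_to_user() ... */
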
 
@@ -4436,6 +4449,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
 }
 
 static long _btrfs_ioctl_set_received_subvol(struct file *file,
+                                           struct user_namespace *mnt_userns,
                                            struct btrfs_ioctl_received_subvol_args *sa)
 {
        struct inode *inode = file_inode(file);
@@ -4447,7 +4461,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
        int ret = 0;
        int received_uuid_changed;
 
-       if (!inode_owner_or_capable(&init_user_ns, inode))
+       if (!inode_owner_or_capable(mnt_userns, inode))
                return -EPERM;
 
        ret = mnt_want_write_file(file);
@@ -4552,7 +4566,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
        args64->rtime.nsec = args32->rtime.nsec;
        args64->flags = args32->flags;
 
-       ret = _btrfs_ioctl_set_received_subvol(file, args64);
+       ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), args64);
        if (ret)
                goto out;
 
@@ -4586,7 +4600,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
        if (IS_ERR(sa))
                return PTR_ERR(sa);
 
-       ret = _btrfs_ioctl_set_received_subvol(file, sa);
+       ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), sa);
 
        if (ret)
                goto out;
@@ -5013,6 +5027,10 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_get_subvol_rootref(file, argp);
        case BTRFS_IOC_INO_LOOKUP_USER:
                return btrfs_ioctl_ino_lookup_user(file, argp);
+       case FS_IOC_ENABLE_VERITY:
+               return fsverity_ioctl_enable(file, (const void __user *)argp);
+       case FS_IOC_MEASURE_VERITY:
+               return fsverity_ioctl_measure(file, argp);
        }
 
        return -ENOTTY;
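
Both new cases dispatch to the generic fs-verity implementation, so the
standard uapi applies. A sketch of enabling verity on an open read-only
descriptor (the parameter values are common defaults, not btrfs-specific):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/fsverity.h>

/* @fd must be O_RDONLY and the file not open for writing anywhere. */
static int enable_verity(int fd)
{
        struct fsverity_enable_arg arg;

        memset(&arg, 0, sizeof(arg));
        arg.version = 1;
        arg.hash_algorithm = FS_VERITY_HASH_ALG_SHA256;
        arg.block_size = 4096;

        return ioctl(fd, FS_IOC_ENABLE_VERITY, &arg);
}
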
index cd042c7..c25dfd1 100644
@@ -14,6 +14,7 @@
 #include <linux/lzo.h>
 #include <linux/refcount.h>
 #include "compression.h"
+#include "ctree.h"
 
 #define LZO_LEN        4
 
@@ -140,18 +141,18 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
        *total_in = 0;
 
        in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-       data_in = kmap(in_page);
+       data_in = page_address(in_page);
 
        /*
         * store the size of all chunks of compressed data in
         * the first 4 bytes
         */
-       out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+       out_page = alloc_page(GFP_NOFS);
        if (out_page == NULL) {
                ret = -ENOMEM;
                goto out;
        }
-       cpage_out = kmap(out_page);
+       cpage_out = page_address(out_page);
        out_offset = LZO_LEN;
        tot_out = LZO_LEN;
        pages[0] = out_page;
@@ -209,19 +210,18 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
                                if (out_len == 0 && tot_in >= len)
                                        break;
 
-                               kunmap(out_page);
                                if (nr_pages == nr_dest_pages) {
                                        out_page = NULL;
                                        ret = -E2BIG;
                                        goto out;
                                }
 
-                               out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+                               out_page = alloc_page(GFP_NOFS);
                                if (out_page == NULL) {
                                        ret = -ENOMEM;
                                        goto out;
                                }
-                               cpage_out = kmap(out_page);
+                               cpage_out = page_address(out_page);
                                pages[nr_pages++] = out_page;
 
                                pg_bytes_left = PAGE_SIZE;
@@ -243,12 +243,11 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
                        break;
 
                bytes_left = len - tot_in;
-               kunmap(in_page);
                put_page(in_page);
 
                start += PAGE_SIZE;
                in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-               data_in = kmap(in_page);
+               data_in = page_address(in_page);
                in_len = min(bytes_left, PAGE_SIZE);
        }
 
@@ -258,164 +257,130 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
        }
 
        /* store the size of all chunks of compressed data */
-       sizes_ptr = kmap_local_page(pages[0]);
+       sizes_ptr = page_address(pages[0]);
        write_compress_length(sizes_ptr, tot_out);
-       kunmap_local(sizes_ptr);
 
        ret = 0;
        *total_out = tot_out;
        *total_in = tot_in;
 out:
        *out_pages = nr_pages;
-       if (out_page)
-               kunmap(out_page);
 
-       if (in_page) {
-               kunmap(in_page);
+       if (in_page)
                put_page(in_page);
-       }
 
        return ret;
 }
 
+/*
+ * Copy the compressed segment payload into @dest.
+ *
+ * The payload carries no padding, so we only need to handle page switching.
+ */
+static void copy_compressed_segment(struct compressed_bio *cb,
+                                   char *dest, u32 len, u32 *cur_in)
+{
+       u32 orig_in = *cur_in;
+
+       while (*cur_in < orig_in + len) {
+               struct page *cur_page;
+               u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
+                                         orig_in + len - *cur_in);
+
+               ASSERT(copy_len);
+               cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
+
+               memcpy(dest + *cur_in - orig_in,
+                       page_address(cur_page) + offset_in_page(*cur_in),
+                       copy_len);
+
+               *cur_in += copy_len;
+       }
+}
+
 int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 {
        struct workspace *workspace = list_entry(ws, struct workspace, list);
-       int ret = 0, ret2;
-       char *data_in;
-       unsigned long page_in_index = 0;
-       size_t srclen = cb->compressed_len;
-       unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
-       unsigned long buf_start;
-       unsigned long buf_offset = 0;
-       unsigned long bytes;
-       unsigned long working_bytes;
-       size_t in_len;
-       size_t out_len;
-       const size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
-       unsigned long in_offset;
-       unsigned long in_page_bytes_left;
-       unsigned long tot_in;
-       unsigned long tot_out;
-       unsigned long tot_len;
-       char *buf;
-       bool may_late_unmap, need_unmap;
-       struct page **pages_in = cb->compressed_pages;
-       u64 disk_start = cb->start;
-       struct bio *orig_bio = cb->orig_bio;
+       const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+       const u32 sectorsize = fs_info->sectorsize;
+       int ret;
+       /* Compressed data length, can be unaligned */
+       u32 len_in;
+       /* Offset inside the compressed data */
+       u32 cur_in = 0;
+       /* Bytes decompressed so far */
+       u32 cur_out = 0;
+
+       len_in = read_compress_length(page_address(cb->compressed_pages[0]));
+       cur_in += LZO_LEN;
 
-       data_in = kmap(pages_in[0]);
-       tot_len = read_compress_length(data_in);
        /*
-        * Compressed data header check.
+        * LZO header length check
         *
-        * The real compressed size can't exceed the maximum extent length, and
-        * all pages should be used (whole unused page with just the segment
-        * header is not possible).  If this happens it means the compressed
-        * extent is corrupted.
+        * The total length must not exceed the maximum extent length, and all
+        * sectors should be used.
+        * If either check fails, the compressed extent is corrupted.
         */
-       if (tot_len > min_t(size_t, BTRFS_MAX_COMPRESSED, srclen) ||
-           tot_len < srclen - PAGE_SIZE) {
-               ret = -EUCLEAN;
-               goto done;
+       if (len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) ||
+           round_up(len_in, sectorsize) < cb->compressed_len) {
+               btrfs_err(fs_info,
+                       "invalid lzo header, lzo len %u compressed len %u",
+                       len_in, cb->compressed_len);
+               return -EUCLEAN;
        }
 
-       tot_in = LZO_LEN;
-       in_offset = LZO_LEN;
-       in_page_bytes_left = PAGE_SIZE - LZO_LEN;
-
-       tot_out = 0;
-
-       while (tot_in < tot_len) {
-               in_len = read_compress_length(data_in + in_offset);
-               in_page_bytes_left -= LZO_LEN;
-               in_offset += LZO_LEN;
-               tot_in += LZO_LEN;
+       /* Go through each lzo segment */
+       while (cur_in < len_in) {
+               struct page *cur_page;
+               /* Length of the compressed segment */
+               u32 seg_len;
+               u32 sector_bytes_left;
+               size_t out_len = lzo1x_worst_compress(sectorsize);
 
                /*
-                * Segment header check.
-                *
-                * The segment length must not exceed the maximum LZO
-                * compression size, nor the total compressed size.
+                * We should always have enough space for one segment header
+                * inside the current sector.
                 */
-               if (in_len > max_segment_len || tot_in + in_len > tot_len) {
-                       ret = -EUCLEAN;
-                       goto done;
-               }
-
-               tot_in += in_len;
-               working_bytes = in_len;
-               may_late_unmap = need_unmap = false;
-
-               /* fast path: avoid using the working buffer */
-               if (in_page_bytes_left >= in_len) {
-                       buf = data_in + in_offset;
-                       bytes = in_len;
-                       may_late_unmap = true;
-                       goto cont;
-               }
-
-               /* copy bytes from the pages into the working buffer */
-               buf = workspace->cbuf;
-               buf_offset = 0;
-               while (working_bytes) {
-                       bytes = min(working_bytes, in_page_bytes_left);
-
-                       memcpy(buf + buf_offset, data_in + in_offset, bytes);
-                       buf_offset += bytes;
-cont:
-                       working_bytes -= bytes;
-                       in_page_bytes_left -= bytes;
-                       in_offset += bytes;
-
-                       /* check if we need to pick another page */
-                       if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
-                           || in_page_bytes_left == 0) {
-                               tot_in += in_page_bytes_left;
-
-                               if (working_bytes == 0 && tot_in >= tot_len)
-                                       break;
-
-                               if (page_in_index + 1 >= total_pages_in) {
-                                       ret = -EIO;
-                                       goto done;
-                               }
-
-                               if (may_late_unmap)
-                                       need_unmap = true;
-                               else
-                                       kunmap(pages_in[page_in_index]);
-
-                               data_in = kmap(pages_in[++page_in_index]);
-
-                               in_page_bytes_left = PAGE_SIZE;
-                               in_offset = 0;
-                       }
-               }
-
-               out_len = max_segment_len;
-               ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
-                                           &out_len);
-               if (need_unmap)
-                       kunmap(pages_in[page_in_index - 1]);
+               ASSERT(cur_in / sectorsize ==
+                      (cur_in + LZO_LEN - 1) / sectorsize);
+               cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
+               ASSERT(cur_page);
+               seg_len = read_compress_length(page_address(cur_page) +
+                                              offset_in_page(cur_in));
+               cur_in += LZO_LEN;
+
+               /* Copy the compressed segment payload into workspace */
+               copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in);
+
+               /* Decompress the data */
+               ret = lzo1x_decompress_safe(workspace->cbuf, seg_len,
+                                           workspace->buf, &out_len);
                if (ret != LZO_E_OK) {
-                       pr_warn("BTRFS: decompress failed\n");
+                       btrfs_err(fs_info, "failed to decompress");
                        ret = -EIO;
-                       break;
+                       goto out;
                }
 
-               buf_start = tot_out;
-               tot_out += out_len;
+               /* Copy the data into inode pages */
+               ret = btrfs_decompress_buf2page(workspace->buf, out_len, cb, cur_out);
+               cur_out += out_len;
 
-               ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
-                                                tot_out, disk_start, orig_bio);
-               if (ret2 == 0)
-                       break;
+               /* All data read, exit */
+               if (ret == 0)
+                       goto out;
+               ret = 0;
+
+               /* Check if the sector has enough space for a segment header */
+               sector_bytes_left = sectorsize - (cur_in % sectorsize);
+               if (sector_bytes_left >= LZO_LEN)
+                       continue;
+
+               /* Skip the padding zeros */
+               cur_in += sector_bytes_left;
        }
-done:
-       kunmap(pages_in[page_in_index]);
+out:
        if (!ret)
-               zero_fill_bio(orig_bio);
+               zero_fill_bio(cb->orig_bio);
        return ret;
 }
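
Flattened to a single buffer, the sector-based layout this rewrite parses
looks as follows. A hedged sketch: read_len() assumes a little-endian host,
bounds checks are reduced to the header check, and the decompress step is
elided:

#include <stdint.h>
#include <string.h>

#define LZO_LEN 4

/* On-disk lengths are little-endian; assume a little-endian host here. */
static uint32_t read_len(const uint8_t *p)
{
        uint32_t v;

        memcpy(&v, p, LZO_LEN);
        return v;
}

/* Walk the segments of one compressed extent laid out in @buf. */
static int walk_lzo_segments(const uint8_t *buf, uint32_t buf_len,
                             uint32_t sectorsize)
{
        uint32_t len_in, cur = LZO_LEN;

        if (buf_len < LZO_LEN || (len_in = read_len(buf)) > buf_len)
                return -1;      /* corrupted header */

        while (cur < len_in) {
                uint32_t seg_len = read_len(buf + cur);
                uint32_t left;

                cur += LZO_LEN;
                /* decompress buf + cur .. buf + cur + seg_len here */
                cur += seg_len;

                /* headers never straddle sectors: skip the tail padding */
                left = sectorsize - (cur % sectorsize);
                if (left < LZO_LEN)
                        cur += left;
        }
        return 0;
}
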
 
@@ -466,7 +431,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
        destlen = min_t(unsigned long, destlen, PAGE_SIZE);
        bytes = min_t(unsigned long, destlen, out_len - start_byte);
 
-       kaddr = kmap_local_page(dest_page);
+       kaddr = page_address(dest_page);
        memcpy(kaddr, workspace->buf + start_byte, bytes);
 
        /*
@@ -476,7 +441,6 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
         */
        if (bytes < destlen)
                memset(kaddr+bytes, 0, destlen-bytes);
-       kunmap_local(kaddr);
 out:
        return ret;
 }
index 5c0f848..edb65ab 100644
@@ -446,7 +446,6 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
  *              Will also be used to store the finished ordered extent.
  * @file_offset: File offset for the finished IO
  * @io_size:    Length of the finished IO range
- * @uptodate:   If the IO finishes without problem
  *
  * Return true if the ordered extent is finished in the range, and update
  * @cached.
@@ -457,7 +456,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
  */
 bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
                                    struct btrfs_ordered_extent **cached,
-                                   u64 file_offset, u64 io_size, int uptodate)
+                                   u64 file_offset, u64 io_size)
 {
        struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
        struct rb_node *node;
@@ -486,8 +485,6 @@ have_entry:
                       entry->bytes_left, io_size);
 
        entry->bytes_left -= io_size;
-       if (!uptodate)
-               set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
 
        if (entry->bytes_left == 0) {
                /*
index b2d88ab..4194e96 100644
@@ -177,7 +177,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
                                bool uptodate);
 bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
                                    struct btrfs_ordered_extent **cached,
-                                   u64 file_offset, u64 io_size, int uptodate);
+                                   u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
                             u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
                             int type);
index 0fa1211..db680f5 100644
@@ -1733,7 +1733,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
        ASSERT(trans != NULL);
 
        ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
-                                  false, true);
+                                  true);
        if (ret < 0) {
                trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
                btrfs_warn(trans->fs_info,
@@ -2651,7 +2651,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
                                /* Search commit root to find old_roots */
                                ret = btrfs_find_all_roots(NULL, fs_info,
                                                record->bytenr, 0,
-                                               &record->old_roots, false, false);
+                                               &record->old_roots, false);
                                if (ret < 0)
                                        goto cleanup;
                        }
@@ -2667,7 +2667,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
                         * current root. It's safe inside commit_transaction().
                         */
                        ret = btrfs_find_all_roots(trans, fs_info,
-                          record->bytenr, BTRFS_SEQ_LAST, &new_roots, false, false);
+                          record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
                        if (ret < 0)
                                goto cleanup;
                        if (qgroup_to_skip) {
@@ -3201,7 +3201,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
                        num_bytes = found.offset;
 
                ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
-                                          &roots, false, false);
+                                          &roots, false);
                if (ret < 0)
                        goto out;
                /* For rescan, just pass old_roots as NULL */
index 244d499..d8d268c 100644
@@ -1035,7 +1035,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
        for (i = 0; i < rbio->nr_pages; i++) {
                if (rbio->stripe_pages[i])
                        continue;
-               page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+               page = alloc_page(GFP_NOFS);
                if (!page)
                        return -ENOMEM;
                rbio->stripe_pages[i] = page;
@@ -1054,7 +1054,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
        for (; i < rbio->nr_pages; i++) {
                if (rbio->stripe_pages[i])
                        continue;
-               page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+               page = alloc_page(GFP_NOFS);
                if (!page)
                        return -ENOMEM;
                rbio->stripe_pages[i] = page;
@@ -1636,10 +1636,10 @@ struct btrfs_plug_cb {
 static int plug_cmp(void *priv, const struct list_head *a,
                    const struct list_head *b)
 {
-       struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
-                                                plug_list);
-       struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
-                                                plug_list);
+       const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
+                                                      plug_list);
+       const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
+                                                      plug_list);
        u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
        u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
 
@@ -2300,7 +2300,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
                        if (rbio->stripe_pages[index])
                                continue;
 
-                       page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+                       page = alloc_page(GFP_NOFS);
                        if (!page)
                                return -ENOMEM;
                        rbio->stripe_pages[index] = page;
@@ -2350,14 +2350,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
        if (!need_check)
                goto writeback;
 
-       p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+       p_page = alloc_page(GFP_NOFS);
        if (!p_page)
                goto cleanup;
        SetPageUptodate(p_page);
 
        if (has_qstripe) {
                /* RAID6, allocate and map temp space for the Q stripe */
-               q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+               q_page = alloc_page(GFP_NOFS);
                if (!q_page) {
                        __free_page(p_page);
                        goto cleanup;
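
Dropping __GFP_HIGHMEM from the stripe and parity page allocations means every page lands in the kernel's direct mapping, presumably so the RAID5/6 code can address pages without temporary mappings. A minimal kernel-style sketch of the difference; the helpers are the stock mm APIs, not code from this patch:

	/* With __GFP_HIGHMEM the page may have no kernel mapping, so every
	 * access needs a short-lived one: */
	static void zero_stripe_page_highmem(struct page *page)
	{
		void *kaddr = kmap_local_page(page);

		memset(kaddr, 0, PAGE_SIZE);
		kunmap_local(kaddr);
	}

	/* Without __GFP_HIGHMEM the page sits in the direct mapping: */
	static void zero_stripe_page_lowmem(struct page *page)
	{
		memset(page_address(page), 0, PAGE_SIZE);
	}
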
index 8e026de..d2062d5 100644
@@ -264,8 +264,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
        struct block_entry *be = NULL, *exist;
        struct root_entry *re = NULL;
 
-       re = kzalloc(sizeof(struct root_entry), GFP_KERNEL);
-       be = kzalloc(sizeof(struct block_entry), GFP_KERNEL);
+       re = kzalloc(sizeof(struct root_entry), GFP_NOFS);
+       be = kzalloc(sizeof(struct block_entry), GFP_NOFS);
        if (!be || !re) {
                kfree(re);
                kfree(be);
@@ -313,7 +313,7 @@ static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root,
        struct root_entry *re;
        struct ref_entry *ref = NULL, *exist;
 
-       ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL);
+       ref = kmalloc(sizeof(struct ref_entry), GFP_NOFS);
        if (!ref)
                return -ENOMEM;
 
@@ -358,7 +358,7 @@ static int add_shared_data_ref(struct btrfs_fs_info *fs_info,
        struct block_entry *be;
        struct ref_entry *ref;
 
-       ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
+       ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
        if (!ref)
                return -ENOMEM;
        be = add_block_entry(fs_info, bytenr, num_bytes, 0);
@@ -393,7 +393,7 @@ static int add_extent_data_ref(struct btrfs_fs_info *fs_info,
        u64 offset = btrfs_extent_data_ref_offset(leaf, dref);
        u32 num_refs = btrfs_extent_data_ref_count(leaf, dref);
 
-       ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
+       ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
        if (!ref)
                return -ENOMEM;
        be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
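
The GFP_KERNEL to GFP_NOFS switch matters because these ref-verify entries are allocated in the middle of extent tree updates, where a GFP_KERNEL allocation could recurse into filesystem reclaim and deadlock on state the caller already holds. A hedged sketch of the scoped alternative, memalloc_nofs_save(), which this patch does not use but which enforces the same constraint without annotating each call site:

	#include <linux/sched/mm.h>
	#include <linux/slab.h>

	static struct ref_entry *alloc_ref_scoped(void)
	{
		unsigned int nofs_flag;
		struct ref_entry *ref;

		nofs_flag = memalloc_nofs_save();
		/* GFP_KERNEL is implicitly degraded to GFP_NOFS in the scope. */
		ref = kzalloc(sizeof(*ref), GFP_KERNEL);
		memalloc_nofs_restore(nofs_flag);

		return ref;
	}
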
index fc83159..914d403 100644
@@ -24,6 +24,7 @@
 #include "block-group.h"
 #include "backref.h"
 #include "misc.h"
+#include "subpage.h"
 
 /*
  * Relocation overview
@@ -2781,10 +2782,70 @@ static noinline_for_stack int prealloc_file_extent_cluster(
        u64 num_bytes;
        int nr;
        int ret = 0;
+       u64 i_size = i_size_read(&inode->vfs_inode);
        u64 prealloc_start = cluster->start - offset;
        u64 prealloc_end = cluster->end - offset;
        u64 cur_offset = prealloc_start;
 
+       /*
+        * For the subpage case, the previous i_size may not be aligned to
+        * PAGE_SIZE. This means the range [i_size, PAGE_END + 1) was filled
+        * with zeros by the btrfs_do_readpage() call for the previously
+        * relocated file cluster.
+        *
+        * If the current cluster starts in the above range, btrfs_do_readpage()
+        * will skip the read, and relocate_one_page() will later write back
+        * the padding zeros as new data, causing data corruption.
+        *
+        * Here we have to manually invalidate the range [i_size, PAGE_END + 1).
+        */
+       if (!IS_ALIGNED(i_size, PAGE_SIZE)) {
+               struct address_space *mapping = inode->vfs_inode.i_mapping;
+               struct btrfs_fs_info *fs_info = inode->root->fs_info;
+               const u32 sectorsize = fs_info->sectorsize;
+               struct page *page;
+
+               ASSERT(sectorsize < PAGE_SIZE);
+               ASSERT(IS_ALIGNED(i_size, sectorsize));
+
+               /*
+                * Subpage can't handle page with DIRTY but without UPTODATE
+                * bit as it can lead to the following deadlock:
+                *
+                * btrfs_readpage()
+                * | Page already *locked*
+                * |- btrfs_lock_and_flush_ordered_range()
+                *    |- btrfs_start_ordered_extent()
+                *       |- extent_write_cache_pages()
+                *          |- lock_page()
+                *             We try to lock the page we already hold.
+                *
+                * Here we just write back the whole data reloc inode, so that
+                * the page is guaranteed to have no dirty range and we are
+                * safe to clear the uptodate bits.
+                *
+                * This shouldn't cause too much overhead, as we need to write
+                * the data back anyway.
+                */
+               ret = filemap_write_and_wait(mapping);
+               if (ret < 0)
+                       return ret;
+
+               clear_extent_bits(&inode->io_tree, i_size,
+                                 round_up(i_size, PAGE_SIZE) - 1,
+                                 EXTENT_UPTODATE);
+               page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
+               /*
+                * If the page was freed we don't need to do anything, as we
+                * will re-read the whole page anyway.
+                */
+               if (page) {
+                       btrfs_subpage_clear_uptodate(fs_info, page, i_size,
+                                       round_up(i_size, PAGE_SIZE) - i_size);
+                       unlock_page(page);
+                       put_page(page);
+               }
+       }
+
        BUG_ON(cluster->start != cluster->boundary[0]);
        ret = btrfs_alloc_data_chunk_ondemand(inode,
                                              prealloc_end + 1 - prealloc_start);
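
The invalidated tail is the sector-aligned gap between i_size and the end of its page, which only exists when sectorsize < PAGE_SIZE. A self-contained userspace sketch of the arithmetic, assuming a 64K page and a 4K-sector-aligned i_size (the values are illustrative):

	#include <stdio.h>
	#include <stdint.h>

	#define SUBPAGE_PAGE_SIZE 65536ULL	/* assumed 64K page */
	#define round_up(x, a)	((((x) + (a) - 1) / (a)) * (a))

	int main(void)
	{
		uint64_t i_size = 20480;	/* 4K-sector aligned, not page aligned */
		uint64_t start = i_size;
		uint64_t end = round_up(i_size, SUBPAGE_PAGE_SIZE) - 1;

		/* Matches the clear_extent_bits()/btrfs_subpage_clear_uptodate()
		 * range above: [20480, 65535], 45056 bytes of padding zeros. */
		printf("invalidate [%llu, %llu], %llu bytes\n",
		       (unsigned long long)start, (unsigned long long)end,
		       (unsigned long long)(end + 1 - start));
		return 0;
	}
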
@@ -2886,19 +2947,149 @@ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
 }
 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
 
-static int relocate_file_extent_cluster(struct inode *inode,
-                                       struct file_extent_cluster *cluster)
+static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
+                                   int cluster_nr)
+{
+       /* Last extent, use cluster end directly */
+       if (cluster_nr >= cluster->nr - 1)
+               return cluster->end;
+
+       /* Use next boundary start */
+       return cluster->boundary[cluster_nr + 1] - 1;
+}
+
+static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
+                            struct file_extent_cluster *cluster,
+                            int *cluster_nr, unsigned long page_index)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+       u64 offset = BTRFS_I(inode)->index_cnt;
+       const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
+       gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+       struct page *page;
        u64 page_start;
        u64 page_end;
+       u64 cur;
+       int ret;
+
+       ASSERT(page_index <= last_index);
+       page = find_lock_page(inode->i_mapping, page_index);
+       if (!page) {
+               page_cache_sync_readahead(inode->i_mapping, ra, NULL,
+                               page_index, last_index + 1 - page_index);
+               page = find_or_create_page(inode->i_mapping, page_index, mask);
+               if (!page)
+                       return -ENOMEM;
+       }
+       ret = set_page_extent_mapped(page);
+       if (ret < 0)
+               goto release_page;
+
+       if (PageReadahead(page))
+               page_cache_async_readahead(inode->i_mapping, ra, NULL, page,
+                                  page_index, last_index + 1 - page_index);
+
+       if (!PageUptodate(page)) {
+               btrfs_readpage(NULL, page);
+               lock_page(page);
+               if (!PageUptodate(page)) {
+                       ret = -EIO;
+                       goto release_page;
+               }
+       }
+
+       page_start = page_offset(page);
+       page_end = page_start + PAGE_SIZE - 1;
+
+       /*
+        * Start from the cluster boundary, as in the subpage case the cluster
+        * can start inside the page.
+        */
+       cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
+       while (cur <= page_end) {
+               u64 extent_start = cluster->boundary[*cluster_nr] - offset;
+               u64 extent_end = get_cluster_boundary_end(cluster,
+                                               *cluster_nr) - offset;
+               u64 clamped_start = max(page_start, extent_start);
+               u64 clamped_end = min(page_end, extent_end);
+               u32 clamped_len = clamped_end + 1 - clamped_start;
+
+               /* Reserve metadata for this range */
+               ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
+                                                     clamped_len);
+               if (ret)
+                       goto release_page;
+
+               /* Mark the range delalloc and dirty for later writeback */
+               lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+               ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
+                                               clamped_end, 0, NULL);
+               if (ret) {
+                       clear_extent_bits(&BTRFS_I(inode)->io_tree,
+                                       clamped_start, clamped_end,
+                                       EXTENT_LOCKED | EXTENT_BOUNDARY);
+                       btrfs_delalloc_release_metadata(BTRFS_I(inode),
+                                                       clamped_len, true);
+                       btrfs_delalloc_release_extents(BTRFS_I(inode),
+                                                      clamped_len);
+                       goto release_page;
+               }
+               btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);
+
+               /*
+                * Set the boundary if it's inside the page.
+                * Data relocation requires the destination extents to have the
+                * same size as the source.
+                * The EXTENT_BOUNDARY bit prevents the current extent from
+                * being merged with the previous one.
+                */
+               if (in_range(cluster->boundary[*cluster_nr] - offset,
+                            page_start, PAGE_SIZE)) {
+                       u64 boundary_start = cluster->boundary[*cluster_nr] -
+                                               offset;
+                       u64 boundary_end = boundary_start +
+                                          fs_info->sectorsize - 1;
+
+                       set_extent_bits(&BTRFS_I(inode)->io_tree,
+                                       boundary_start, boundary_end,
+                                       EXTENT_BOUNDARY);
+               }
+               unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+               btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
+               cur += clamped_len;
+
+               /* Crossed extent end, go to next extent */
+               if (cur >= extent_end) {
+                       (*cluster_nr)++;
+                       /* Just finished the last extent of the cluster, exit. */
+                       if (*cluster_nr >= cluster->nr)
+                               break;
+               }
+       }
+       unlock_page(page);
+       put_page(page);
+
+       balance_dirty_pages_ratelimited(inode->i_mapping);
+       btrfs_throttle(fs_info);
+       if (btrfs_should_cancel_balance(fs_info))
+               ret = -ECANCELED;
+       return ret;
+
+release_page:
+       unlock_page(page);
+       put_page(page);
+       return ret;
+}
+
+static int relocate_file_extent_cluster(struct inode *inode,
+                                       struct file_extent_cluster *cluster)
+{
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 offset = BTRFS_I(inode)->index_cnt;
        unsigned long index;
        unsigned long last_index;
-       struct page *page;
        struct file_ra_state *ra;
-       gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
-       int nr = 0;
+       int cluster_nr = 0;
        int ret = 0;
 
        if (!cluster->nr)
@@ -2919,109 +3110,14 @@ static int relocate_file_extent_cluster(struct inode *inode,
        if (ret)
                goto out;
 
-       index = (cluster->start - offset) >> PAGE_SHIFT;
        last_index = (cluster->end - offset) >> PAGE_SHIFT;
-       while (index <= last_index) {
-               ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
-                               PAGE_SIZE);
-               if (ret)
-                       goto out;
-
-               page = find_lock_page(inode->i_mapping, index);
-               if (!page) {
-                       page_cache_sync_readahead(inode->i_mapping,
-                                                 ra, NULL, index,
-                                                 last_index + 1 - index);
-                       page = find_or_create_page(inode->i_mapping, index,
-                                                  mask);
-                       if (!page) {
-                               btrfs_delalloc_release_metadata(BTRFS_I(inode),
-                                                       PAGE_SIZE, true);
-                               btrfs_delalloc_release_extents(BTRFS_I(inode),
-                                                       PAGE_SIZE);
-                               ret = -ENOMEM;
-                               goto out;
-                       }
-               }
-               ret = set_page_extent_mapped(page);
-               if (ret < 0) {
-                       btrfs_delalloc_release_metadata(BTRFS_I(inode),
-                                                       PAGE_SIZE, true);
-                       btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
-                       unlock_page(page);
-                       put_page(page);
-                       goto out;
-               }
-
-               if (PageReadahead(page)) {
-                       page_cache_async_readahead(inode->i_mapping,
-                                                  ra, NULL, page, index,
-                                                  last_index + 1 - index);
-               }
-
-               if (!PageUptodate(page)) {
-                       btrfs_readpage(NULL, page);
-                       lock_page(page);
-                       if (!PageUptodate(page)) {
-                               unlock_page(page);
-                               put_page(page);
-                               btrfs_delalloc_release_metadata(BTRFS_I(inode),
-                                                       PAGE_SIZE, true);
-                               btrfs_delalloc_release_extents(BTRFS_I(inode),
-                                                              PAGE_SIZE);
-                               ret = -EIO;
-                               goto out;
-                       }
-               }
-
-               page_start = page_offset(page);
-               page_end = page_start + PAGE_SIZE - 1;
-
-               lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
-
-               if (nr < cluster->nr &&
-                   page_start + offset == cluster->boundary[nr]) {
-                       set_extent_bits(&BTRFS_I(inode)->io_tree,
-                                       page_start, page_end,
-                                       EXTENT_BOUNDARY);
-                       nr++;
-               }
-
-               ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start,
-                                               page_end, 0, NULL);
-               if (ret) {
-                       unlock_page(page);
-                       put_page(page);
-                       btrfs_delalloc_release_metadata(BTRFS_I(inode),
-                                                        PAGE_SIZE, true);
-                       btrfs_delalloc_release_extents(BTRFS_I(inode),
-                                                      PAGE_SIZE);
-
-                       clear_extent_bits(&BTRFS_I(inode)->io_tree,
-                                         page_start, page_end,
-                                         EXTENT_LOCKED | EXTENT_BOUNDARY);
-                       goto out;
-
-               }
-               set_page_dirty(page);
-
-               unlock_extent(&BTRFS_I(inode)->io_tree,
-                             page_start, page_end);
-               unlock_page(page);
-               put_page(page);
-
-               index++;
-               btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
-               balance_dirty_pages_ratelimited(inode->i_mapping);
-               btrfs_throttle(fs_info);
-               if (btrfs_should_cancel_balance(fs_info)) {
-                       ret = -ECANCELED;
-                       goto out;
-               }
-       }
-       WARN_ON(nr != cluster->nr);
+       for (index = (cluster->start - offset) >> PAGE_SHIFT;
+            index <= last_index && !ret; index++)
+               ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
        if (btrfs_is_zoned(fs_info) && !ret)
                ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+       if (ret == 0)
+               WARN_ON(cluster_nr != cluster->nr);
 out:
        kfree(ra);
        return ret;
index 6ac37ae..72f9b86 100644
@@ -1198,7 +1198,7 @@ struct backref_ctx {
 static int __clone_root_cmp_bsearch(const void *key, const void *elt)
 {
        u64 root = (u64)(uintptr_t)key;
-       struct clone_root *cr = (struct clone_root *)elt;
+       const struct clone_root *cr = elt;
 
        if (root < cr->root->root_key.objectid)
                return -1;
@@ -1209,8 +1209,8 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt)
 
 static int __clone_root_cmp_sort(const void *e1, const void *e2)
 {
-       struct clone_root *cr1 = (struct clone_root *)e1;
-       struct clone_root *cr2 = (struct clone_root *)e2;
+       const struct clone_root *cr1 = e1;
+       const struct clone_root *cr2 = e2;
 
        if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
                return -1;
@@ -1307,7 +1307,7 @@ static int find_extent_clone(struct send_ctx *sctx,
        u64 flags = 0;
        struct btrfs_file_extent_item *fi;
        struct extent_buffer *eb = path->nodes[0];
-       struct backref_ctx *backref_ctx = NULL;
+       struct backref_ctx backref_ctx = {0};
        struct clone_root *cur_clone_root;
        struct btrfs_key found_key;
        struct btrfs_path *tmp_path;
@@ -1322,12 +1322,6 @@ static int find_extent_clone(struct send_ctx *sctx,
        /* We only use this path under the commit sem */
        tmp_path->need_commit_sem = 0;
 
-       backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL);
-       if (!backref_ctx) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
        if (data_offset >= ino_size) {
                /*
                 * There may be extents that lie behind the file's size.
@@ -1392,12 +1386,12 @@ static int find_extent_clone(struct send_ctx *sctx,
                cur_clone_root->found_refs = 0;
        }
 
-       backref_ctx->sctx = sctx;
-       backref_ctx->found = 0;
-       backref_ctx->cur_objectid = ino;
-       backref_ctx->cur_offset = data_offset;
-       backref_ctx->found_itself = 0;
-       backref_ctx->extent_len = num_bytes;
+       backref_ctx.sctx = sctx;
+       backref_ctx.found = 0;
+       backref_ctx.cur_objectid = ino;
+       backref_ctx.cur_offset = data_offset;
+       backref_ctx.found_itself = 0;
+       backref_ctx.extent_len = num_bytes;
 
        /*
         * The last extent of a file may be too large due to page alignment.
@@ -1405,7 +1399,7 @@ static int find_extent_clone(struct send_ctx *sctx,
         * __iterate_backrefs work.
         */
        if (data_offset + num_bytes >= ino_size)
-               backref_ctx->extent_len = ino_size - data_offset;
+               backref_ctx.extent_len = ino_size - data_offset;
 
        /*
         * Now collect all backrefs.
@@ -1416,12 +1410,12 @@ static int find_extent_clone(struct send_ctx *sctx,
                extent_item_pos = 0;
        ret = iterate_extent_inodes(fs_info, found_key.objectid,
                                    extent_item_pos, 1, __iterate_backrefs,
-                                   backref_ctx, false);
+                                   &backref_ctx, false);
 
        if (ret < 0)
                goto out;
 
-       if (!backref_ctx->found_itself) {
+       if (!backref_ctx.found_itself) {
                /* found a bug in backref code? */
                ret = -EIO;
                btrfs_err(fs_info,
@@ -1434,7 +1428,7 @@ static int find_extent_clone(struct send_ctx *sctx,
                    "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
                    data_offset, ino, num_bytes, logical);
 
-       if (!backref_ctx->found)
+       if (!backref_ctx.found)
                btrfs_debug(fs_info, "no clones found");
 
        cur_clone_root = NULL;
@@ -1458,7 +1452,6 @@ static int find_extent_clone(struct send_ctx *sctx,
 
 out:
        btrfs_free_path(tmp_path);
-       kfree(backref_ctx);
        return ret;
 }
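
Moving backref_ctx from kmalloc() to the stack removes an allocation-failure path and the kfree() on every exit; the `= {0}` initializer provides the zeroed state the code previously set field by field. A minimal standalone illustration of the pattern, with a stand-in struct rather than the real one:

	#include <stdio.h>
	#include <stdint.h>

	struct backref_ctx {		/* stand-in with a few of the fields */
		uint64_t cur_objectid;
		uint64_t extent_len;
		int found;
		int found_itself;
	};

	static void iterate(struct backref_ctx *ctx)
	{
		ctx->found++;		/* callees still take a pointer */
	}

	int main(void)
	{
		struct backref_ctx backref_ctx = {0};	/* zeroed, no ENOMEM path */

		backref_ctx.cur_objectid = 257;
		iterate(&backref_ctx);
		printf("found=%d\n", backref_ctx.found);
		return 0;
	}
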
 
index f79bf85..5ada02e 100644
@@ -493,6 +493,11 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
        long time_left;
        int loops;
 
+       delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+       ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+       if (delalloc_bytes == 0 && ordered_bytes == 0)
+               return;
+
        /* Calc the number of the pages we need flush for space reservation */
        if (to_reclaim == U64_MAX) {
                items = U64_MAX;
@@ -500,22 +505,21 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
                /*
                 * to_reclaim is set to however much metadata we need to
                 * reclaim, but reclaiming that much data doesn't really track
-                * exactly, so increase the amount to reclaim by 2x in order to
-                * make sure we're flushing enough delalloc to hopefully reclaim
-                * some metadata reservations.
+                * exactly.  What we really want to do is reclaim a full inode's
+                * worth of reservations, however that's not available to us
+                * here.  We will take a fraction of the delalloc bytes for our
+                * flushing loops and hope for the best.  Delalloc will expand
+                * the amount we write to cover an entire dirty extent, which
+                * will reclaim the metadata reservation for that range.  If
+                * it's not enough subsequent flush stages will be more
+                * aggressive.
                 */
+               to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
                items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
-               to_reclaim = items * EXTENT_SIZE_PER_ITEM;
        }
 
        trans = (struct btrfs_trans_handle *)current->journal_info;
 
-       delalloc_bytes = percpu_counter_sum_positive(
-                                               &fs_info->delalloc_bytes);
-       ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
-       if (delalloc_bytes == 0 && ordered_bytes == 0)
-               return;
-
        /*
         * If we are doing more ordered than delalloc we need to just wait on
         * ordered extents, otherwise we'll waste time trying to flush delalloc
@@ -528,9 +532,49 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
        while ((delalloc_bytes || ordered_bytes) && loops < 3) {
                u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
                long nr_pages = min_t(u64, temp, LONG_MAX);
+               int async_pages;
 
                btrfs_start_delalloc_roots(fs_info, nr_pages, true);
 
+               /*
+                * We need to make sure any outstanding async pages are now
+                * processed before we continue.  This is because things like
+                * sync_inode() try to be smart and skip writing if the inode is
+                * marked clean.  We don't use filemap_fdatawrite() for flushing
+                * because we want to control how many pages we write out at a
+                * time, thus this is the only safe way to make sure we've
+                * waited for outstanding compressed workers to have started
+                * their jobs and thus have ordered extents set up properly.
+                *
+                * This exists because we do not want to wait for each
+                * individual inode to finish its async work, we simply want to
+                * start the IO on everybody, and then come back here and wait
+                * for all of the async work to catch up.  Once we're done with
+                * that we know we'll have ordered extents for everything and we
+                * can decide if we wait for that or not.
+                *
+                * If we choose to replace this in the future, make absolutely
+                * sure that the proper waiting is being done in the async case,
+                * as there have been bugs in that area before.
+                */
+               async_pages = atomic_read(&fs_info->async_delalloc_pages);
+               if (!async_pages)
+                       goto skip_async;
+
+               /*
+                * We don't want to wait forever: if we wrote fewer pages in
+                * this loop than we have outstanding, only wait for that number
+                * of pages. Otherwise we can wait for all the async pages to
+                * finish before continuing.
+                */
+               if (async_pages > nr_pages)
+                       async_pages -= nr_pages;
+               else
+                       async_pages = 0;
+               wait_event(fs_info->async_submit_wait,
+                          atomic_read(&fs_info->async_delalloc_pages) <=
+                          async_pages);
+skip_async:
                loops++;
                if (wait_ordered && !trans) {
                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
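
The async_pages adjustment above deliberately under-waits: if this loop iteration queued fewer pages than are already in flight, the thread only waits until the global counter has drained by roughly its own contribution. A runnable sketch of just that computation (the function name is local to the sketch):

	#include <stdio.h>

	/* Mirror of the async_pages adjustment before wait_event(). */
	static long wait_target(long async_pages, long nr_pages)
	{
		if (async_pages > nr_pages)
			return async_pages - nr_pages;	/* wait for our share */
		return 0;				/* wait for everything */
	}

	int main(void)
	{
		printf("%ld\n", wait_target(1024, 256));	/* sleep until <= 768 */
		printf("%ld\n", wait_target(128, 256));		/* sleep until <= 0 */
		return 0;
	}
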
@@ -595,8 +639,11 @@ static void flush_space(struct btrfs_fs_info *fs_info,
                break;
        case FLUSH_DELALLOC:
        case FLUSH_DELALLOC_WAIT:
+       case FLUSH_DELALLOC_FULL:
+               if (state == FLUSH_DELALLOC_FULL)
+                       num_bytes = U64_MAX;
                shrink_delalloc(fs_info, space_info, num_bytes,
-                               state == FLUSH_DELALLOC_WAIT, for_preempt);
+                               state != FLUSH_DELALLOC, for_preempt);
                break;
        case FLUSH_DELAYED_REFS_NR:
        case FLUSH_DELAYED_REFS:
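
The fallthrough now covers three delalloc states with two knobs: only FLUSH_DELALLOC_FULL overrides num_bytes, and only plain FLUSH_DELALLOC skips waiting on ordered extents. A small runnable table of the resulting combinations, using stand-in enum values:

	#include <stdio.h>
	#include <stdint.h>
	#include <stdbool.h>

	enum flush_state {	/* stand-ins for the kernel enum values */
		FLUSH_DELALLOC,
		FLUSH_DELALLOC_WAIT,
		FLUSH_DELALLOC_FULL,
	};

	int main(void)
	{
		const uint64_t requested = 16ULL << 20;	/* example reservation */

		for (int s = FLUSH_DELALLOC; s <= FLUSH_DELALLOC_FULL; s++) {
			uint64_t num_bytes = (s == FLUSH_DELALLOC_FULL) ?
					     UINT64_MAX : requested;
			bool wait_ordered = (s != FLUSH_DELALLOC);

			printf("state=%d wait_ordered=%d num_bytes=%llu\n",
			       s, wait_ordered, (unsigned long long)num_bytes);
		}
		return 0;
	}
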
@@ -686,7 +733,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
 {
        u64 global_rsv_size = fs_info->global_block_rsv.reserved;
        u64 ordered, delalloc;
-       u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+       u64 thresh = div_factor_fine(space_info->total_bytes, 90);
        u64 used;
 
        /* If we're just plain full then async reclaim just slows us down. */
@@ -694,6 +741,20 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
             global_rsv_size) >= thresh)
                return false;
 
+       used = space_info->bytes_may_use + space_info->bytes_pinned;
+
+       /* The total flushable belongs to the global rsv, don't flush. */
+       if (global_rsv_size >= used)
+               return false;
+
+       /*
+        * 128MiB is 1/4 of the maximum global rsv size.  If we have less than
+        * that devoted to other reservations then there's no sense in flushing,
+        * we don't have a lot of things that need flushing.
+        */
+       if (used - global_rsv_size <= SZ_128M)
+               return false;
+
        /*
         * We have tickets queued, bail so we don't compete with the async
         * flushers.
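
The two new bailouts above mean preemptive flushing only runs when a meaningful amount of the flushable space belongs to something other than the global reserve. A runnable sketch of just those checks, with illustrative numbers:

	#include <stdio.h>
	#include <stdint.h>
	#include <stdbool.h>

	#define SZ_128M	(128ULL << 20)

	/* Just the two new bailouts from need_preemptive_reclaim(). */
	static bool other_rsv_worth_flushing(uint64_t global_rsv_size, uint64_t used)
	{
		if (global_rsv_size >= used)		/* all flushable is global rsv */
			return false;
		if (used - global_rsv_size <= SZ_128M)	/* too little to bother */
			return false;
		return true;
	}

	int main(void)
	{
		uint64_t rsv = 512ULL << 20;

		printf("%d\n", other_rsv_worth_flushing(rsv, rsv + (64ULL << 20)));	/* 0 */
		printf("%d\n", other_rsv_worth_flushing(rsv, rsv + (512ULL << 20)));	/* 1 */
		return 0;
	}
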
@@ -824,6 +885,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
        struct reserve_ticket *ticket;
        u64 tickets_id = space_info->tickets_id;
 
+       trace_btrfs_fail_all_tickets(fs_info, space_info);
+
        if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
                btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
                __btrfs_dump_space_info(fs_info, space_info);
@@ -904,6 +967,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
                                commit_cycles--;
                }
 
+               /*
+                * We do not want to empty the system of delalloc unless we're
+                * under heavy pressure, so allow one trip through the flushing
+                * logic before we start doing a FLUSH_DELALLOC_FULL.
+                */
+               if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
+                       flush_state++;
+
                /*
                 * We don't want to force a chunk allocation until we've tried
                 * pretty hard to reclaim space.  Think of the case where we
@@ -1067,7 +1138,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
  *   so if we now have space to allocate do the force chunk allocation.
  */
 static const enum btrfs_flush_state data_flush_states[] = {
-       FLUSH_DELALLOC_WAIT,
+       FLUSH_DELALLOC_FULL,
        RUN_DELAYED_IPUTS,
        COMMIT_TRANS,
        ALLOC_CHUNK_FORCE,
@@ -1156,6 +1227,7 @@ static const enum btrfs_flush_state evict_flush_states[] = {
        FLUSH_DELAYED_REFS,
        FLUSH_DELALLOC,
        FLUSH_DELALLOC_WAIT,
+       FLUSH_DELALLOC_FULL,
        ALLOC_CHUNK,
        COMMIT_TRANS,
 };
index 8260f8b..f429256 100644
@@ -73,7 +73,7 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token,         \
        }                                                               \
        token->kaddr = page_address(token->eb->pages[idx]);             \
        token->offset = idx << PAGE_SHIFT;                              \
-       if (oip + size <= PAGE_SIZE)                                    \
+       if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
                return get_unaligned_le##bits(token->kaddr + oip);      \
                                                                        \
        memcpy(lebytes, token->kaddr + oip, part);                      \
@@ -94,7 +94,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb,              \
        u8 lebytes[sizeof(u##bits)];                                    \
                                                                        \
        ASSERT(check_setget_bounds(eb, ptr, off, size));                \
-       if (oip + size <= PAGE_SIZE)                                    \
+       if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
                return get_unaligned_le##bits(kaddr + oip);             \
                                                                        \
        memcpy(lebytes, kaddr + oip, part);                             \
@@ -124,7 +124,7 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token,          \
        }                                                               \
        token->kaddr = page_address(token->eb->pages[idx]);             \
        token->offset = idx << PAGE_SHIFT;                              \
-       if (oip + size <= PAGE_SIZE) {                                  \
+       if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
                put_unaligned_le##bits(val, token->kaddr + oip);        \
                return;                                                 \
        }                                                               \
@@ -146,7 +146,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr,    \
        u8 lebytes[sizeof(u##bits)];                                    \
                                                                        \
        ASSERT(check_setget_bounds(eb, ptr, off, size));                \
-       if (oip + size <= PAGE_SIZE) {                                  \
+       if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
                put_unaligned_le##bits(val, kaddr + oip);               \
                return;                                                 \
        }                                                               \
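
The slow path these macros guard exists because a little-endian integer in an extent buffer can straddle two pages; `part` holds the bytes available on the first page and the remainder comes from the next. The new `INLINE_EXTENT_BUFFER_PAGES == 1` short-circuit reflects that on 64K-page systems a metadata buffer fits in one page, so the cross-page branch can be compiled out. A self-contained userspace sketch of the split read, with 4K pages assumed:

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	#define PG_SIZE 4096	/* assumed 4K pages */

	/* Read a le64 that may straddle pages[0] and pages[1]. */
	static uint64_t get_le64(const uint8_t *pages[2], size_t oip)
	{
		uint8_t lebytes[8];
		size_t part = PG_SIZE - oip;	/* bytes on the first page */
		uint64_t v = 0;

		if (oip + sizeof(lebytes) <= PG_SIZE) {
			memcpy(lebytes, pages[0] + oip, sizeof(lebytes));
		} else {
			memcpy(lebytes, pages[0] + oip, part);
			memcpy(lebytes + part, pages[1], sizeof(lebytes) - part);
		}
		for (int i = 7; i >= 0; i--)	/* little-endian decode */
			v = (v << 8) | lebytes[i];
		return v;
	}

	int main(void)
	{
		static uint8_t a[PG_SIZE], b[PG_SIZE];
		const uint8_t *pages[2] = { a, b };

		/* 0x1122334455667788 split across the boundary at offset 4092 */
		a[4092] = 0x88; a[4093] = 0x77; a[4094] = 0x66; a[4095] = 0x55;
		b[0] = 0x44; b[1] = 0x33; b[2] = 0x22; b[3] = 0x11;

		printf("0x%llx\n", (unsigned long long)get_le64(pages, 4092));
		return 0;
	}
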
index 640bcd2..cb10e56 100644
@@ -435,8 +435,10 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
 
        spin_lock_irqsave(&subpage->lock, flags);
        subpage->writeback_bitmap &= ~tmp;
-       if (subpage->writeback_bitmap == 0)
+       if (subpage->writeback_bitmap == 0) {
+               ASSERT(PageWriteback(page));
                end_page_writeback(page);
+       }
        spin_unlock_irqrestore(&subpage->lock, flags);
 }
 
@@ -559,3 +561,23 @@ IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
                         PageWriteback);
 IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
                         PageOrdered);
+
+/*
+ * Make sure that not only the page dirty bit is cleared, but also the subpage
+ * dirty bit.
+ */
+void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+                                struct page *page)
+{
+       struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+
+       if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
+               return;
+
+       ASSERT(!PageDirty(page));
+       if (fs_info->sectorsize == PAGE_SIZE)
+               return;
+
+       ASSERT(PagePrivate(page) && page->private);
+       ASSERT(subpage->dirty_bitmap == 0);
+}
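
The new ASSERT(PageWriteback(page)) documents the subpage invariant: the page-level flag may only be cleared once, when the last sector tracked in the bitmap finishes. A runnable userspace sketch of that rule, modelling sixteen 4K sectors in a 64K page as one bitmap bit each:

	#include <stdio.h>
	#include <stdint.h>
	#include <stdbool.h>

	/* Sixteen 4K sectors in a 64K page, one bit each. */
	static uint16_t writeback_bitmap;
	static bool page_writeback;

	static void subpage_clear_writeback(int first_sector, int nr)
	{
		writeback_bitmap &= ~(((1u << nr) - 1) << first_sector);
		if (writeback_bitmap == 0) {
			/* Mirrors ASSERT(PageWriteback(page)) before the clear. */
			if (!page_writeback)
				printf("bug: page flag already cleared\n");
			page_writeback = false;
		}
	}

	int main(void)
	{
		writeback_bitmap = 0x00ff;	/* 8 sectors under writeback */
		page_writeback = true;

		subpage_clear_writeback(0, 4);	/* partial: page flag stays */
		printf("bitmap=0x%04x writeback=%d\n",
		       (unsigned)writeback_bitmap, page_writeback);

		subpage_clear_writeback(4, 4);	/* last sectors: flag clears */
		printf("bitmap=0x%04x writeback=%d\n",
		       (unsigned)writeback_bitmap, page_writeback);
		return 0;
	}
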
index 4d7aca8..0120948 100644
@@ -126,4 +126,7 @@ DECLARE_BTRFS_SUBPAGE_OPS(ordered);
 bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len);
 
+void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+                                struct page *page);
+
 #endif
index d07b18b..537d90b 100644
@@ -1201,21 +1201,14 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
                key.type = BTRFS_ROOT_BACKREF_KEY;
                key.offset = (u64)-1;
 
-               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+               ret = btrfs_search_backwards(root, &key, path);
                if (ret < 0) {
                        goto err;
                } else if (ret > 0) {
-                       ret = btrfs_previous_item(root, path, subvol_objectid,
-                                                 BTRFS_ROOT_BACKREF_KEY);
-                       if (ret < 0) {
-                               goto err;
-                       } else if (ret > 0) {
-                               ret = -ENOENT;
-                               goto err;
-                       }
+                       ret = -ENOENT;
+                       goto err;
                }
 
-               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
                subvol_objectid = key.offset;
 
                root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -1248,21 +1241,14 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
                        key.type = BTRFS_INODE_REF_KEY;
                        key.offset = (u64)-1;
 
-                       ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+                       ret = btrfs_search_backwards(fs_root, &key, path);
                        if (ret < 0) {
                                goto err;
                        } else if (ret > 0) {
-                               ret = btrfs_previous_item(fs_root, path, dirid,
-                                                         BTRFS_INODE_REF_KEY);
-                               if (ret < 0) {
-                                       goto err;
-                               } else if (ret > 0) {
-                                       ret = -ENOENT;
-                                       goto err;
-                               }
+                               ret = -ENOENT;
+                               goto err;
                        }
 
-                       btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
                        dirid = key.offset;
 
                        inode_ref = btrfs_item_ptr(path->nodes[0],
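
Both hunks collapse the same three-step pattern (search from the maximal key, step back to the previous item of the wanted objectid/type, read its key) into one helper, which therefore has to hand the found key back to the caller, since key.offset is used directly afterwards. A hedged reconstruction of what btrfs_search_backwards() must do, inferred from the replaced code rather than quoted from the source:

	/* Reconstruction from the call sites above, not quoted from the source. */
	int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
				   struct btrfs_path *path)
	{
		int ret;

		ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
		if (ret > 0)
			ret = btrfs_previous_item(root, path, key->objectid,
						  key->type);
		if (ret == 0)
			btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);

		return ret;
	}
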
@@ -1353,6 +1339,9 @@ static int btrfs_fill_super(struct super_block *sb,
        sb->s_op = &btrfs_super_ops;
        sb->s_d_op = &btrfs_dentry_operations;
        sb->s_export_op = &btrfs_export_ops;
+#ifdef CONFIG_FS_VERITY
+       sb->s_vop = &btrfs_verityops;
+#endif
        sb->s_xattr = btrfs_xattr_handlers;
        sb->s_time_gran = 1;
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
@@ -2041,13 +2030,6 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
                        ret = -EINVAL;
                        goto restore;
                }
-               if (fs_info->sectorsize < PAGE_SIZE) {
-                       btrfs_warn(fs_info,
-       "read-write mount is not yet allowed for sectorsize %u page size %lu",
-                                  fs_info->sectorsize, PAGE_SIZE);
-                       ret = -EINVAL;
-                       goto restore;
-               }
 
                /*
                 * NOTE: when remounting with a change that does writes, don't
@@ -2096,16 +2078,15 @@ restore:
 }
 
 /* Used to sort the devices by max_avail(descending sort) */
-static inline int btrfs_cmp_device_free_bytes(const void *dev_info1,
-                                      const void *dev_info2)
+static int btrfs_cmp_device_free_bytes(const void *a, const void *b)
 {
-       if (((struct btrfs_device_info *)dev_info1)->max_avail >
-           ((struct btrfs_device_info *)dev_info2)->max_avail)
+       const struct btrfs_device_info *dev_info1 = a;
+       const struct btrfs_device_info *dev_info2 = b;
+
+       if (dev_info1->max_avail > dev_info2->max_avail)
                return -1;
-       else if (((struct btrfs_device_info *)dev_info1)->max_avail <
-                ((struct btrfs_device_info *)dev_info2)->max_avail)
+       else if (dev_info1->max_avail < dev_info2->max_avail)
                return 1;
-       else
        return 0;
 }
 
@@ -2381,7 +2362,7 @@ static struct file_system_type btrfs_root_fs_type = {
        .name           = "btrfs",
        .mount          = btrfs_mount_root,
        .kill_sb        = btrfs_kill_super,
-       .fs_flags       = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
+       .fs_flags       = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
 };
 
 MODULE_ALIAS_FS("btrfs");
@@ -2571,6 +2552,11 @@ static void __init btrfs_print_mod_info(void)
                        ", zoned=yes"
 #else
                        ", zoned=no"
+#endif
+#ifdef CONFIG_FS_VERITY
+                       ", fsverity=yes"
+#else
+                       ", fsverity=no"
 #endif
                        ;
        pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
index 9d1d140..25a6f58 100644
 #include "block-group.h"
 #include "qgroup.h"
 
+/*
+ * Structure name                        Path
+ * --------------------------------------------------------------------------
+ * btrfs_supported_static_feature_attrs  /sys/fs/btrfs/features
+ * btrfs_supported_feature_attrs         /sys/fs/btrfs/features and
+ *                                       /sys/fs/btrfs/<uuid>/features
+ * btrfs_attrs                           /sys/fs/btrfs/<uuid>
+ * devid_attrs                           /sys/fs/btrfs/<uuid>/devinfo/<devid>
+ * allocation_attrs                      /sys/fs/btrfs/<uuid>/allocation
+ * qgroup_attrs                          /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>
+ * space_info_attrs                      /sys/fs/btrfs/<uuid>/allocation/<bg-type>
+ * raid_attrs                            /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>
+ *
+ * When built with CONFIG_BTRFS_DEBUG:
+ *
+ * btrfs_debug_feature_attrs             /sys/fs/btrfs/debug
+ * btrfs_debug_mount_attrs               /sys/fs/btrfs/<uuid>/debug
+ * discard_debug_attrs                   /sys/fs/btrfs/<uuid>/debug/discard
+ */
+
 struct btrfs_feature_attr {
        struct kobj_attribute kobj_attr;
        enum btrfs_feature_set feature_set;
@@ -267,7 +287,17 @@ BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
 #ifdef CONFIG_BTRFS_DEBUG
 BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
 #endif
+#ifdef CONFIG_FS_VERITY
+BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
+#endif
 
+/*
+ * Features which depend on feature bits and may differ between filesystems.
+ *
+ * /sys/fs/btrfs/features      - all available features implemented by this version
+ * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or
+ *                               can be changed on a mounted filesystem.
+ */
 static struct attribute *btrfs_supported_feature_attrs[] = {
        BTRFS_FEAT_ATTR_PTR(mixed_backref),
        BTRFS_FEAT_ATTR_PTR(default_subvol),
@@ -284,17 +314,13 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
        BTRFS_FEAT_ATTR_PTR(raid1c34),
 #ifdef CONFIG_BTRFS_DEBUG
        BTRFS_FEAT_ATTR_PTR(zoned),
+#endif
+#ifdef CONFIG_FS_VERITY
+       BTRFS_FEAT_ATTR_PTR(verity),
 #endif
        NULL
 };
 
-/*
- * Features which depend on feature bits and may differ between each fs.
- *
- * /sys/fs/btrfs/features lists all available features of this kernel while
- * /sys/fs/btrfs/UUID/features shows features of the fs which are enabled or
- * can be changed online.
- */
 static const struct attribute_group btrfs_feature_attr_group = {
        .name = "features",
        .is_visible = btrfs_feature_visible,
@@ -366,6 +392,10 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
 {
        ssize_t ret = 0;
 
+       /* 4K sector size is also supported with 64K page size */
+       if (PAGE_SIZE == SZ_64K)
+               ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K);
+
        /* Only sectorsize == PAGE_SIZE is now supported */
        ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE);
 
@@ -374,6 +404,12 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
 BTRFS_ATTR(static_feature, supported_sectorsizes,
           supported_sectorsizes_show);
 
+/*
+ * Features which only depend on kernel version.
+ *
+ * These are listed in /sys/fs/btrfs/features along with
+ * btrfs_supported_feature_attrs.
+ */
 static struct attribute *btrfs_supported_static_feature_attrs[] = {
        BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
        BTRFS_ATTR_PTR(static_feature, supported_checksums),
@@ -383,12 +419,6 @@ static struct attribute *btrfs_supported_static_feature_attrs[] = {
        NULL
 };
 
-/*
- * Features which only depend on kernel version.
- *
- * These are listed in /sys/fs/btrfs/features along with
- * btrfs_feature_attr_group
- */
 static const struct attribute_group btrfs_static_feature_attr_group = {
        .name = "features",
        .attrs = btrfs_supported_static_feature_attrs,
@@ -547,6 +577,11 @@ static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
 BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show,
              btrfs_discard_max_discard_size_store);
 
+/*
+ * Per-filesystem debugging of discard (when mounted with discard=async).
+ *
+ * Path: /sys/fs/btrfs/<uuid>/debug/discard/
+ */
 static const struct attribute *discard_debug_attrs[] = {
        BTRFS_ATTR_PTR(discard, discardable_bytes),
        BTRFS_ATTR_PTR(discard, discardable_extents),
@@ -560,15 +595,19 @@ static const struct attribute *discard_debug_attrs[] = {
 };
 
 /*
- * Runtime debugging exported via sysfs
+ * Per-filesystem runtime debugging exported via sysfs.
  *
- * /sys/fs/btrfs/debug - applies to module or all filesystems
- * /sys/fs/btrfs/UUID  - applies only to the given filesystem
+ * Path: /sys/fs/btrfs/<uuid>/debug/
  */
 static const struct attribute *btrfs_debug_mount_attrs[] = {
        NULL,
 };
 
+/*
+ * Runtime debugging exported via sysfs, applies to all mounted filesystems.
+ *
+ * Path: /sys/fs/btrfs/debug
+ */
 static struct attribute *btrfs_debug_feature_attrs[] = {
        NULL
 };
@@ -637,6 +676,11 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
        return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
 }
 
+/*
+ * Allocation information about block group profiles.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>/
+ */
 static struct attribute *raid_attrs[] = {
        BTRFS_ATTR_PTR(raid, total_bytes),
        BTRFS_ATTR_PTR(raid, used_bytes),
@@ -676,6 +720,11 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
 SPACE_INFO_ATTR(disk_used);
 SPACE_INFO_ATTR(disk_total);
 
+/*
+ * Allocation information about block group types.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/
+ */
 static struct attribute *space_info_attrs[] = {
        BTRFS_ATTR_PTR(space_info, flags),
        BTRFS_ATTR_PTR(space_info, total_bytes),
@@ -703,6 +752,11 @@ static struct kobj_type space_info_ktype = {
        .default_groups = space_info_groups,
 };
 
+/*
+ * Allocation information about block groups.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/
+ */
 static const struct attribute *allocation_attrs[] = {
        BTRFS_ATTR_PTR(allocation, global_rsv_reserved),
        BTRFS_ATTR_PTR(allocation, global_rsv_size),
@@ -974,7 +1028,8 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
        struct btrfs_fs_info *fs_info = to_fs_info(kobj);
        ssize_t ret;
 
-       ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold);
+       ret = scnprintf(buf, PAGE_SIZE, "%d\n",
+                       READ_ONCE(fs_info->bg_reclaim_threshold));
 
        return ret;
 }
@@ -991,16 +1046,21 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
        if (ret)
                return ret;
 
-       if (thresh <= 50 || thresh > 100)
+       if (thresh != 0 && (thresh <= 50 || thresh > 100))
                return -EINVAL;
 
-       fs_info->bg_reclaim_threshold = thresh;
+       WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh);
 
        return len;
 }
 BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
              btrfs_bg_reclaim_threshold_store);
 
+/*
+ * Per-filesystem information and stats.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/
+ */
 static const struct attribute *btrfs_attrs[] = {
        BTRFS_ATTR_PTR(, label),
        BTRFS_ATTR_PTR(, nodesize),
@@ -1510,6 +1570,11 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
 }
 BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show);
 
+/*
+ * Information about one device.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/devinfo/<devid>/
+ */
 static struct attribute *devid_attrs[] = {
        BTRFS_ATTR_PTR(devid, error_stats),
        BTRFS_ATTR_PTR(devid, in_fs_metadata),
@@ -1799,6 +1864,11 @@ QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA);
 QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS);
 QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC);
 
+/*
+ * Qgroup information.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>/
+ */
 static struct attribute *qgroup_attrs[] = {
        BTRFS_ATTR_PTR(qgroup, referenced),
        BTRFS_ATTR_PTR(qgroup, exclusive),
index 98b5aab..19ba7d5 100644
@@ -223,8 +223,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
         * we can only call btrfs_qgroup_account_extent() directly to test
         * quota.
         */
-       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
-                       false, false);
+       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
        if (ret) {
                ulist_free(old_roots);
                test_err("couldn't find old roots: %d", ret);
@@ -236,8 +235,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
        if (ret)
                return ret;
 
-       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
-                       false, false);
+       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
        if (ret) {
                ulist_free(old_roots);
                ulist_free(new_roots);
@@ -260,8 +258,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
        old_roots = NULL;
        new_roots = NULL;
 
-       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
-                       false, false);
+       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
        if (ret) {
                ulist_free(old_roots);
                test_err("couldn't find old roots: %d", ret);
@@ -272,8 +269,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
        if (ret)
                return -EINVAL;
 
-       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
-                       false, false);
+       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
        if (ret) {
                ulist_free(old_roots);
                ulist_free(new_roots);
@@ -324,8 +320,7 @@ static int test_multiple_refs(struct btrfs_root *root,
                return ret;
        }
 
-       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
-                       false, false);
+       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
        if (ret) {
                ulist_free(old_roots);
                test_err("couldn't find old roots: %d", ret);
@@ -337,8 +332,7 @@ static int test_multiple_refs(struct btrfs_root *root,
        if (ret)
                return ret;
 
-       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
-                       false, false);
+       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
        if (ret) {
                ulist_free(old_roots);
                ulist_free(new_roots);
@@ -359,8 +353,7 @@ static int test_multiple_refs(struct btrfs_root *root,
                return -EINVAL;
        }
 
-       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
-                       false, false);
+       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
        if (ret) {
                ulist_free(old_roots);
                test_err("couldn't find old roots: %d", ret);
@@ -372,8 +365,7 @@ static int test_multiple_refs(struct btrfs_root *root,
        if (ret)
                return ret;
 
-       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
-                       false, false);
+       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
        if (ret) {
                ulist_free(old_roots);
                ulist_free(new_roots);
@@ -400,8 +392,7 @@ static int test_multiple_refs(struct btrfs_root *root,
                return -EINVAL;
        }
 
-       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
-                       false, false);
+       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
        if (ret) {
                ulist_free(old_roots);
                test_err("couldn't find old roots: %d", ret);
@@ -413,8 +404,7 @@ static int test_multiple_refs(struct btrfs_root *root,
        if (ret)
                return ret;
 
-       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
-                       false, false);
+       ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
        if (ret) {
                ulist_free(old_roots);
                ulist_free(new_roots);
index a8b2e0d..7733e8a 100644
@@ -24,6 +24,7 @@
 #include "compression.h"
 #include "volumes.h"
 #include "misc.h"
+#include "btrfs_inode.h"
 
 /*
  * Error message should follow the following format:
@@ -873,13 +874,22 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
                }
        }
 
-       if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
-                    (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
-                    (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
-                    (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
-                    (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
+       if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 &&
+                     sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) ||
+                    (type & BTRFS_BLOCK_GROUP_RAID1 &&
+                     num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) ||
+                    (type & BTRFS_BLOCK_GROUP_RAID1C3 &&
+                     num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) ||
+                    (type & BTRFS_BLOCK_GROUP_RAID1C4 &&
+                     num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) ||
+                    (type & BTRFS_BLOCK_GROUP_RAID5 &&
+                     num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) ||
+                    (type & BTRFS_BLOCK_GROUP_RAID6 &&
+                     num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) ||
+                    (type & BTRFS_BLOCK_GROUP_DUP &&
+                     num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) ||
                     ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
-                     num_stripes != 1))) {
+                     num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) {
                chunk_err(leaf, chunk, logical,
                        "invalid num_stripes:sub_stripes %u:%u for profile %llu",
                        num_stripes, sub_stripes,
@@ -999,6 +1009,8 @@ static int check_inode_item(struct extent_buffer *leaf,
        u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
        u32 mode;
        int ret;
+       u32 flags;
+       u32 ro_flags;
 
        ret = check_inode_key(leaf, key, slot);
        if (unlikely(ret < 0))
@@ -1054,11 +1066,17 @@ static int check_inode_item(struct extent_buffer *leaf,
                        btrfs_inode_nlink(leaf, iitem));
                return -EUCLEAN;
        }
-       if (unlikely(btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)) {
+       btrfs_inode_split_flags(btrfs_inode_flags(leaf, iitem), &flags, &ro_flags);
+       if (unlikely(flags & ~BTRFS_INODE_FLAG_MASK)) {
                inode_item_err(leaf, slot,
-                              "unknown flags detected: 0x%llx",
-                              btrfs_inode_flags(leaf, iitem) &
-                              ~BTRFS_INODE_FLAG_MASK);
+                              "unknown incompat flags detected: 0x%x", flags);
+               return -EUCLEAN;
+       }
+       if (unlikely(!sb_rdonly(fs_info->sb) &&
+                    (ro_flags & ~BTRFS_INODE_RO_FLAG_MASK))) {
+               inode_item_err(leaf, slot,
+                       "unknown ro-compat flags detected on writeable mount: 0x%x",
+                       ro_flags);
                return -EUCLEAN;
        }
        return 0;
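
Replacing the literal stripe counts with btrfs_raid_array lookups keeps the checker in sync with the allocator now that degenerate profiles (single-device raid0, two-device raid10) are accepted, and it adds the previously unchecked raid1c3/raid1c4 cases. A runnable sketch of the table-driven idea, with illustrative minimums rather than the kernel's table:

	#include <stdio.h>
	#include <stdbool.h>

	struct raid_attr {
		const char *name;
		int devs_min;	/* illustrative values, not the kernel table */
		bool exact;	/* mirror profiles need an exact stripe count */
	};

	static const struct raid_attr raid_array[] = {
		{ "raid1",   2, true  },
		{ "raid1c3", 3, true  },
		{ "raid1c4", 4, true  },
		{ "raid5",   2, false },
		{ "raid6",   3, false },
	};

	static bool num_stripes_valid(const struct raid_attr *r, int num_stripes)
	{
		return r->exact ? num_stripes == r->devs_min
				: num_stripes >= r->devs_min;
	}

	int main(void)
	{
		printf("raid1/2 stripes: %d\n", num_stripes_valid(&raid_array[0], 2));
		printf("raid6/2 stripes: %d\n", num_stripes_valid(&raid_array[4], 2));
		return 0;
	}
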
index e6430ac..f7efc26 100644
@@ -753,7 +753,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                         */
                        ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
                                                ins.offset);
-                       if (ret == 0) {
+                       if (ret < 0) {
+                               goto out;
+                       } else if (ret == 0) {
                                btrfs_init_generic_ref(&ref,
                                                BTRFS_ADD_DELAYED_REF,
                                                ins.objectid, ins.offset, 0);
@@ -3039,8 +3041,6 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
                list_del_init(&ctx->list);
                ctx->log_ret = error;
        }
-
-       INIT_LIST_HEAD(&root->log_ctxs[index]);
 }
 
 /*
@@ -3328,10 +3328,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                goto out_wake_log_root;
        }
 
-       mutex_lock(&root->log_mutex);
-       if (root->last_log_commit < log_transid)
-               root->last_log_commit = log_transid;
-       mutex_unlock(&root->log_mutex);
+       /*
+        * We know there can only be one task here, since we have not yet set
+        * root->log_commit[index1] to 0 and any task attempting to sync the
+        * log must wait for the previous log transaction to commit if it's
+        * still in progress or wait for the current log transaction commit if
+        * someone else already started it. We use <= and not < because the
+        * first log transaction has an ID of 0.
+        */
+       ASSERT(root->last_log_commit <= log_transid);
+       root->last_log_commit = log_transid;
 
 out_wake_log_root:
        mutex_lock(&log_root_tree->log_mutex);
@@ -3417,14 +3423,10 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 }
 
 /*
- * Check if an inode was logged in the current transaction. We can't always rely
- * on an inode's logged_trans value, because it's an in-memory only field and
- * therefore not persisted. This means that its value is lost if the inode gets
- * evicted and loaded again from disk (in which case it has a value of 0, and
- * certainly it is smaller then any possible transaction ID), when that happens
- * the full_sync flag is set in the inode's runtime flags, so on that case we
- * assume eviction happened and ignore the logged_trans value, assuming the
- * worst case, that the inode was logged before in the current transaction.
+ * Check if an inode was logged in the current transaction. This may often
+ * return some false positives, because logged_trans is an in memory only field,
+ * not persisted anywhere. This is meant to be used in contexts where a false
+ * positive has no functional consequences.
  */
 static bool inode_logged(struct btrfs_trans_handle *trans,
                         struct btrfs_inode *inode)
@@ -3432,8 +3434,17 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
        if (inode->logged_trans == trans->transid)
                return true;
 
-       if (inode->last_trans == trans->transid &&
-           test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+       /*
+        * The inode's logged_trans is always 0 when we load it (because it is
+        * not persisted in the inode item or elsewhere). So if it is 0 and the
+        * inode was last modified in the current transaction, the inode may have
+        * been logged before in the current transaction, then evicted and loaded
+        * again in the current transaction - or it may never have been logged in
+        * the current transaction at all. Since we can not be sure, we have to
+        * assume it was, otherwise our callers can leave an inconsistent log.
+        */
+       if (inode->logged_trans == 0 &&
+           inode->last_trans == trans->transid &&
            !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
                return true;
 
@@ -3913,6 +3924,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
                            u64 logged_isize)
 {
        struct btrfs_map_token token;
+       u64 flags;
 
        btrfs_init_map_token(&token, leaf);
 
@@ -3962,20 +3974,49 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
        btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
        btrfs_set_token_inode_transid(&token, item, trans->transid);
        btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
-       btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+       flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+                                         BTRFS_I(inode)->ro_flags);
+       btrfs_set_token_inode_flags(&token, item, flags);
        btrfs_set_token_inode_block_group(&token, item, 0);
 }
 
 static int log_inode_item(struct btrfs_trans_handle *trans,
                          struct btrfs_root *log, struct btrfs_path *path,
-                         struct btrfs_inode *inode)
+                         struct btrfs_inode *inode, bool inode_item_dropped)
 {
        struct btrfs_inode_item *inode_item;
        int ret;
 
-       ret = btrfs_insert_empty_item(trans, log, path,
-                                     &inode->location, sizeof(*inode_item));
-       if (ret && ret != -EEXIST)
+       /*
+        * If we are doing a fast fsync and the inode was logged before in the
+        * current transaction, then we know the inode was previously logged and
+        * it exists in the log tree. For performance reasons, in this case use
+        * btrfs_search_slot() directly with ins_len set to 0 so that we never
+        * attempt a write lock on the leaf's parent, which adds unnecessary lock
+        * contention in case there are concurrent fsyncs for other inodes of the
+        * same subvolume. Using btrfs_insert_empty_item() when the inode item
+        * already exists can also result in unnecessarily splitting a leaf.
+        */
+       if (!inode_item_dropped && inode->logged_trans == trans->transid) {
+               ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
+               ASSERT(ret <= 0);
+               if (ret > 0)
+                       ret = -ENOENT;
+       } else {
+               /*
+                * This means it is the first fsync in the current transaction,
+                * so the inode item is not in the log and we need to insert it.
+                * We can never get -EEXIST because we are only called for a fast
+                * fsync, and if an inode eviction happens after the inode was
+                * logged in the current transaction, then when we load the inode
+                * again we set BTRFS_INODE_NEEDS_FULL_SYNC in its runtime flags
+                * and set ->logged_trans to 0.
+                */
+               ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
+                                             sizeof(*inode_item));
+               ASSERT(ret != -EEXIST);
+       }
+       if (ret)
                return ret;
        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                    struct btrfs_inode_item);
@@ -4160,7 +4201,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 static int extent_cmp(void *priv, const struct list_head *a,
                      const struct list_head *b)
 {
-       struct extent_map *em1, *em2;
+       const struct extent_map *em1, *em2;
 
        em1 = list_entry(a, struct extent_map, list);
        em2 = list_entry(b, struct extent_map, list);
@@ -5053,8 +5094,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
                /*
                 * Check the inode's logged_trans only instead of
                 * btrfs_inode_in_log(). This is because the last_log_commit of
-                * the inode is not updated when we only log that it exists and
-                * it has the full sync bit set (see btrfs_log_inode()).
+                * the inode is not updated when we only log that it exists (see
+                * btrfs_log_inode()).
                 */
                if (BTRFS_I(inode)->logged_trans == trans->transid) {
                        spin_unlock(&BTRFS_I(inode)->lock);
@@ -5299,6 +5340,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        bool need_log_inode_item = true;
        bool xattrs_logged = false;
        bool recursive_logging = false;
+       bool inode_item_dropped = true;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -5433,6 +5475,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                } else {
                        if (inode_only == LOG_INODE_ALL)
                                fast_search = true;
+                       inode_item_dropped = false;
                        goto log_extents;
                }
 
@@ -5466,7 +5509,7 @@ log_extents:
        btrfs_release_path(path);
        btrfs_release_path(dst_path);
        if (need_log_inode_item) {
-               err = log_inode_item(trans, log, dst_path, inode);
+               err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
                if (err)
                        goto out_unlock;
                /*
@@ -5572,6 +5615,13 @@ out_unlock:
 static bool need_log_inode(struct btrfs_trans_handle *trans,
                           struct btrfs_inode *inode)
 {
+       /*
+        * If a directory was not modified, no dentries added or removed, we can
+        * and should avoid logging it.
+        */
+       if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
+               return false;
+
        /*
         * If this inode does not have new/updated/deleted xattrs since the last
         * time it was logged and is flagged as logged in the current transaction,
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
new file mode 100644
index 0000000..28d443d
--- /dev/null
@@ -0,0 +1,811 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/iversion.h>
+#include <linux/fsverity.h>
+#include <linux/sched/mm.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+
+/*
+ * Implementation of the interface defined in struct fsverity_operations.
+ *
+ * The main question is how and where to store the verity descriptor and the
+ * Merkle tree. We store both in dedicated btree items in the filesystem tree,
+ * together with the rest of the inode metadata. This means we'll need to do
+ * extra work to encrypt them once encryption is supported in btrfs, but btrfs
+ * has a lot of careful code around i_size and it seems better to make a new key
+ * type than try and adjust all of our expectations for i_size.
+ *
+ * Note that this differs from the implementation in ext4 and f2fs, where
+ * this data is stored as if it were in the file, but past EOF. However, btrfs
+ * does not have a widespread mechanism for caching opaque metadata pages, so we
+ * do pretend that the Merkle tree pages themselves are past EOF for the
+ * purposes of caching them (as opposed to creating a virtual inode).
+ *
+ * fs verity items are stored under two different key types on disk.
+ * The descriptor items:
+ * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ]
+ *
+ * At offset 0, we store a btrfs_verity_descriptor_item which tracks the
+ * size of the descriptor item and some extra data for encryption.
+ * Starting at offset 1, these hold the generic fs verity descriptor.
+ * The latter are opaque to btrfs, we just read and write them as a blob for
+ * the higher level verity code.  The most common descriptor size is 256 bytes.
+ *
+ * The merkle tree items:
+ * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ]
+ *
+ * These also start at offset 0, and correspond to the merkle tree bytes.
+ * So when fsverity asks for page 0 of the merkle tree, we pull up one page
+ * starting at offset 0 for this key type.  These are also opaque to btrfs,
+ * we're blindly storing whatever fsverity sends down.
+ *
+ * Another important consideration is the fact that the Merkle tree data scales
+ * linearly with the size of the file (with 4K pages/blocks and SHA-256, it's
+ * ~1/127th the size) so for large files, writing the tree can be a lengthy
+ * operation. For that reason, we guard the whole enable verity operation
+ * (between begin_enable_verity and end_enable_verity) with an orphan item.
+ * Again, because the data can be pretty large, it's quite possible that we
+ * could run out of space writing it, so we try our best to handle errors by
+ * stopping and rolling back rather than aborting the victim transaction.
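+ *
+ * As a concrete illustration (example values, not part of the format): a file
+ * with a 256 byte descriptor and an 8K Merkle tree ends up with the items
+ *   (ino, BTRFS_VERITY_DESC_ITEM_KEY, 0)       descriptor item header
+ *   (ino, BTRFS_VERITY_DESC_ITEM_KEY, 1)       256 bytes of descriptor
+ *   (ino, BTRFS_VERITY_MERKLE_ITEM_KEY, 0)     first 2K of the tree
+ *   (ino, BTRFS_VERITY_MERKLE_ITEM_KEY, 2048)  second 2K of the tree
+ *   (ino, BTRFS_VERITY_MERKLE_ITEM_KEY, 4096)  third 2K of the tree
+ *   (ino, BTRFS_VERITY_MERKLE_ITEM_KEY, 6144)  last 2K of the tree
+ * because write_key_bytes() below writes at most 2K per item.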
+ */
+
+#define MERKLE_START_ALIGN                     65536
+
+/*
+ * Compute the logical file offset where we cache the Merkle tree.
+ *
+ * @inode:  inode of the verity file
+ *
+ * For the purposes of caching the Merkle tree pages, as required by
+ * fs-verity, it is convenient to do size computations in terms of a file
+ * offset, rather than in terms of page indices.
+ *
+ * Use 64K to be sure it's past the last page in the file, even with 64K pages.
+ * That rounding operation itself can overflow loff_t, so we do it in u64 and
+ * check.
+ *
+ * Returns the file offset on success, negative error code on failure.
+ */
+static loff_t merkle_file_pos(const struct inode *inode)
+{
+       u64 sz = inode->i_size;
+       u64 rounded = round_up(sz, MERKLE_START_ALIGN);
+
+       if (rounded > inode->i_sb->s_maxbytes)
+               return -EFBIG;
+
+       return rounded;
+}
+
+/*
+ * Drop all the items for this inode with this key_type.
+ *
+ * @inode:     inode to drop items for
+ * @key_type:  type of items to drop (BTRFS_VERITY_DESC_ITEM or
+ *             BTRFS_VERITY_MERKLE_ITEM)
+ *
+ * Before doing a verity enable we cleanup any existing verity items.
+ * This is also used to clean up if a verity enable failed half way through.
+ *
+ * Returns number of dropped items on success, negative error code on failure.
+ */
+static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = inode->root;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       int count = 0;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       while (1) {
+               /* 1 for the item being dropped */
+               trans = btrfs_start_transaction(root, 1);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       goto out;
+               }
+
+               /*
+                * Walk backwards through all the items until we find one that
+                * isn't from our key type or objectid
+                */
+               key.objectid = btrfs_ino(inode);
+               key.type = key_type;
+               key.offset = (u64)-1;
+
+               ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+               if (ret > 0) {
+                       ret = 0;
+                       /* No more keys of this type, we're done */
+                       if (path->slots[0] == 0)
+                               break;
+                       path->slots[0]--;
+               } else if (ret < 0) {
+                       btrfs_end_transaction(trans);
+                       goto out;
+               }
+
+               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+               /* No more keys of this type, we're done */
+               if (key.objectid != btrfs_ino(inode) || key.type != key_type)
+                       break;
+
+               /*
+                * This shouldn't be a performance sensitive function because
+                * it's not used as part of truncate.  If it ever becomes
+                * perf sensitive, change this to walk forward and bulk delete
+                * items
+                */
+               ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
+               if (ret) {
+                       btrfs_end_transaction(trans);
+                       goto out;
+               }
+               count++;
+               btrfs_release_path(path);
+               btrfs_end_transaction(trans);
+       }
+       ret = count;
+       btrfs_end_transaction(trans);
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Drop all verity items
+ *
+ * @inode:  inode to drop verity items for
+ *
+ * In most contexts where we are dropping verity items, we want to do it for all
+ * the types of verity items, not a particular one.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+       int ret;
+
+       ret = drop_verity_items(inode, BTRFS_VERITY_DESC_ITEM_KEY);
+       if (ret < 0)
+               return ret;
+       ret = drop_verity_items(inode, BTRFS_VERITY_MERKLE_ITEM_KEY);
+       if (ret < 0)
+               return ret;
+
+       return 0;
+}
+
+/*
+ * Insert and write inode items with a given key type and offset.
+ *
+ * @inode:     inode to insert for
+ * @key_type:  key type to insert
+ * @offset:    item offset to insert at
+ * @src:       source data to write
+ * @len:       length of source data to write
+ *
+ * Write len bytes from src into items of up to 2K length.
+ * The inserted items will have key (ino, key_type, offset + off) where off is
+ * consecutively increasing from 0 up to the last item ending at offset + len.
+ *
+ * Returns 0 on success and a negative error code on failure.
+ */
+static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
+                          const char *src, u64 len)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_path *path;
+       struct btrfs_root *root = inode->root;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       unsigned long copy_bytes;
+       unsigned long src_offset = 0;
+       void *data;
+       int ret = 0;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       while (len > 0) {
+               /* 1 for the new item being inserted */
+               trans = btrfs_start_transaction(root, 1);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       break;
+               }
+
+               key.objectid = btrfs_ino(inode);
+               key.type = key_type;
+               key.offset = offset;
+
+               /*
+                * Insert 2K at a time mostly to be friendly for smaller leaf
+                * size filesystems
+                */
+               copy_bytes = min_t(u64, len, 2048);
+
+               ret = btrfs_insert_empty_item(trans, root, path, &key, copy_bytes);
+               if (ret) {
+                       btrfs_end_transaction(trans);
+                       break;
+               }
+
+               leaf = path->nodes[0];
+
+               data = btrfs_item_ptr(leaf, path->slots[0], void);
+               write_extent_buffer(leaf, src + src_offset,
+                                   (unsigned long)data, copy_bytes);
+               offset += copy_bytes;
+               src_offset += copy_bytes;
+               len -= copy_bytes;
+
+               btrfs_release_path(path);
+               btrfs_end_transaction(trans);
+       }
+
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Read inode items of the given key type and offset from the btree.
+ *
+ * @inode:      inode to read items of
+ * @key_type:   key type to read
+ * @offset:     item offset to read from
+ * @dest:       Buffer to read into. This parameter has slightly tricky
+ *              semantics.  If it is NULL, the function will not do any copying
+ *              and will just return the size of all the items up to len bytes.
+ *              If dest_page is passed, then the function will kmap_local the
+ *              page and ignore dest, but it must still be non-NULL to avoid the
+ *              counting-only behavior.
+ * @len:        length in bytes to read
+ * @dest_page:  copy into this page instead of the dest buffer
+ *
+ * Helper function to read items from the btree.  This returns the number of
+ * bytes read or < 0 for errors.  We can return short reads if the items don't
+ * exist on disk or aren't big enough to fill the desired length.  Supports
+ * reading into a provided buffer (dest) or into the page cache.
+ *
+ * Returns number of bytes read or a negative error code on failure.
+ */
+static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
+                         char *dest, u64 len, struct page *dest_page)
+{
+       struct btrfs_path *path;
+       struct btrfs_root *root = inode->root;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       u64 item_end;
+       u64 copy_end;
+       int copied = 0;
+       u32 copy_offset;
+       unsigned long copy_bytes;
+       unsigned long dest_offset = 0;
+       void *data;
+       char *kaddr = dest;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       if (dest_page)
+               path->reada = READA_FORWARD;
+
+       key.objectid = btrfs_ino(inode);
+       key.type = key_type;
+       key.offset = offset;
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0) {
+               goto out;
+       } else if (ret > 0) {
+               ret = 0;
+               if (path->slots[0] == 0)
+                       goto out;
+               path->slots[0]--;
+       }
+
+       while (len > 0) {
+               leaf = path->nodes[0];
+               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+               if (key.objectid != btrfs_ino(inode) || key.type != key_type)
+                       break;
+
+               item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset;
+
+               if (copied > 0) {
+                       /*
+                        * Once we've copied something, we want all of the items
+                        * to be sequential
+                        */
+                       if (key.offset != offset)
+                               break;
+               } else {
+                       /*
+                        * Our initial offset might be in the middle of an
+                        * item.  Make sure it all makes sense.
+                        */
+                       if (key.offset > offset)
+                               break;
+                       if (item_end <= offset)
+                               break;
+               }
+
+               /* dest = NULL to just sum all the item lengths */
+               if (!dest)
+                       copy_end = item_end;
+               else
+                       copy_end = min(offset + len, item_end);
+
+               /* Number of bytes in this item we want to copy */
+               copy_bytes = copy_end - offset;
+
+               /* Offset from the start of item for copying */
+               copy_offset = offset - key.offset;
+
+               if (dest) {
+                       if (dest_page)
+                               kaddr = kmap_local_page(dest_page);
+
+                       data = btrfs_item_ptr(leaf, path->slots[0], void);
+                       read_extent_buffer(leaf, kaddr + dest_offset,
+                                          (unsigned long)data + copy_offset,
+                                          copy_bytes);
+
+                       if (dest_page)
+                               kunmap_local(kaddr);
+               }
+
+               offset += copy_bytes;
+               dest_offset += copy_bytes;
+               len -= copy_bytes;
+               copied += copy_bytes;
+
+               path->slots[0]++;
+               if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+                       /*
+                        * We've reached the last slot in this leaf and we need
+                        * to go to the next leaf.
+                        */
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0) {
+                               break;
+                       } else if (ret > 0) {
+                               ret = 0;
+                               break;
+                       }
+               }
+       }
+out:
+       btrfs_free_path(path);
+       if (!ret)
+               ret = copied;
+       return ret;
+}
+
+/*
+ * Delete an fsverity orphan
+ *
+ * @trans:  transaction to do the delete in
+ * @inode:  inode to delete the orphan item for
+ *
+ * Capture the verity orphan specific logic that is repeated in the couple of
+ * places where we delete verity orphans: handling ENOENT and ignoring inodes
+ * with 0 links.
+ *
+ * Returns zero on success or a negative error code on failure.
+ */
+static int del_orphan(struct btrfs_trans_handle *trans, struct btrfs_inode *inode)
+{
+       struct btrfs_root *root = inode->root;
+       int ret;
+
+       /*
+        * If the inode has no links, it is either already unlinked, or was
+        * created with O_TMPFILE. In either case, it should have an orphan from
+        * that other operation. Rather than reference count the orphans, we
+        * simply ignore them here, because we only invoke the verity path in
+        * the orphan logic when i_nlink is 1.
+        */
+       if (!inode->vfs_inode.i_nlink)
+               return 0;
+
+       ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
+       if (ret == -ENOENT)
+               ret = 0;
+       return ret;
+}
+
+/*
+ * Rollback in-progress verity if we encounter an error.
+ *
+ * @inode:  inode verity had an error for
+ *
+ * We try to handle recoverable errors while enabling verity by rolling it back
+ * and just failing the operation, rather than having an fs level error no
+ * matter what. However, any error in rollback is unrecoverable.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int rollback_verity(struct btrfs_inode *inode)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = inode->root;
+       int ret;
+
+       ASSERT(inode_is_locked(&inode->vfs_inode));
+       truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size);
+       clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+       ret = btrfs_drop_verity_items(inode);
+       if (ret) {
+               btrfs_handle_fs_error(root->fs_info, ret,
+                               "failed to drop verity items in rollback %llu",
+                               (u64)inode->vfs_inode.i_ino);
+               goto out;
+       }
+
+       /*
+        * 1 for updating the inode flag
+        * 1 for deleting the orphan
+        */
+       trans = btrfs_start_transaction(root, 2);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               btrfs_handle_fs_error(root->fs_info, ret,
+                       "failed to start transaction in verity rollback %llu",
+                       (u64)inode->vfs_inode.i_ino);
+               goto out;
+       }
+       inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
+       btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+       ret = btrfs_update_inode(trans, root, inode);
+       if (ret) {
+               btrfs_abort_transaction(trans, ret);
+               goto out;
+       }
+       ret = del_orphan(trans, inode);
+       if (ret) {
+               btrfs_abort_transaction(trans, ret);
+               goto out;
+       }
+       btrfs_end_transaction(trans);
+out:
+       return ret;
+}
+
+/*
+ * Finalize making the file a valid verity file
+ *
+ * @inode:      inode to be marked as verity
+ * @desc:       contents of the verity descriptor to write (not NULL)
+ * @desc_size:  size of the verity descriptor
+ *
+ * Do the actual work of finalizing verity after successfully writing the Merkle
+ * tree:
+ *
+ * - write out the descriptor items
+ * - mark the inode with the verity flag
+ * - delete the orphan item
+ * - mark the ro compat bit
+ * - clear the in progress bit
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int finish_verity(struct btrfs_inode *inode, const void *desc,
+                        size_t desc_size)
+{
+       struct btrfs_trans_handle *trans = NULL;
+       struct btrfs_root *root = inode->root;
+       struct btrfs_verity_descriptor_item item;
+       int ret;
+
+       /* Write out the descriptor item */
+       memset(&item, 0, sizeof(item));
+       btrfs_set_stack_verity_descriptor_size(&item, desc_size);
+       ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0,
+                             (const char *)&item, sizeof(item));
+       if (ret)
+               goto out;
+
+       /* Write out the descriptor itself */
+       ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1,
+                             desc, desc_size);
+       if (ret)
+               goto out;
+
+       /*
+        * 1 for updating the inode flag
+        * 1 for deleting the orphan
+        */
+       trans = btrfs_start_transaction(root, 2);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out;
+       }
+       inode->ro_flags |= BTRFS_INODE_RO_VERITY;
+       btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+       ret = btrfs_update_inode(trans, root, inode);
+       if (ret)
+               goto end_trans;
+       ret = del_orphan(trans, inode);
+       if (ret)
+               goto end_trans;
+       clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+       btrfs_set_fs_compat_ro(root->fs_info, VERITY);
+end_trans:
+       btrfs_end_transaction(trans);
+out:
+       return ret;
+}
+
+/*
+ * fsverity op that begins enabling verity.
+ *
+ * @filp:  file to enable verity on
+ *
+ * Begin enabling fsverity for the file. We drop any existing verity items, add
+ * an orphan and set the in progress bit.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int btrfs_begin_enable_verity(struct file *filp)
+{
+       struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
+       struct btrfs_root *root = inode->root;
+       struct btrfs_trans_handle *trans;
+       int ret;
+
+       ASSERT(inode_is_locked(file_inode(filp)));
+
+       if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags))
+               return -EBUSY;
+
+       /*
+        * This should almost never do anything, but theoretically, it's
+        * possible that we failed to enable verity on a file, then were
+        * interrupted or failed while rolling back, failed to cleanup the
+        * orphan, and finally attempt to enable verity again.
+        */
+       ret = btrfs_drop_verity_items(inode);
+       if (ret)
+               return ret;
+
+       /* 1 for the orphan item */
+       trans = btrfs_start_transaction(root, 1);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
+
+       ret = btrfs_orphan_add(trans, inode);
+       if (!ret)
+               set_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+       btrfs_end_transaction(trans);
+
+       return 0;
+}
+
+/*
+ * fsverity op that ends enabling verity.
+ *
+ * @filp:              file we are finishing enabling verity on
+ * @desc:              verity descriptor to write out (NULL in error conditions)
+ * @desc_size:         size of the verity descriptor (variable with signatures)
+ * @merkle_tree_size:  size of the merkle tree in bytes
+ *
+ * If desc is NULL, then the VFS is signaling that an error occurred during
+ * verity enable, and we should try to roll back. Otherwise, attempt to finish
+ * verity.
+ *
+ * Returns 0 on success, negative error code on error.
+ */
+static int btrfs_end_enable_verity(struct file *filp, const void *desc,
+                                  size_t desc_size, u64 merkle_tree_size)
+{
+       struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
+       int ret = 0;
+       int rollback_ret;
+
+       ASSERT(inode_is_locked(file_inode(filp)));
+
+       if (desc == NULL)
+               goto rollback;
+
+       ret = finish_verity(inode, desc, desc_size);
+       if (ret)
+               goto rollback;
+       return ret;
+
+rollback:
+       rollback_ret = rollback_verity(inode);
+       if (rollback_ret)
+               btrfs_err(inode->root->fs_info,
+                         "failed to rollback verity items: %d", rollback_ret);
+       return ret;
+}
+
+/*
+ * fsverity op that gets the struct fsverity_descriptor.
+ *
+ * @inode:     inode to get the descriptor of
+ * @buf:       output buffer for the descriptor contents
+ * @buf_size:  size of the output buffer. 0 to query the size
+ *
+ * fsverity does a two pass setup for reading the descriptor, in the first pass
+ * it calls with buf_size = 0 to query the size of the descriptor, and then in
+ * the second pass it actually reads the descriptor off disk.
+ *
+ * Returns the size on success or a negative error code on failure.
+ */
+static int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
+                                      size_t buf_size)
+{
+       u64 true_size;
+       int ret = 0;
+       struct btrfs_verity_descriptor_item item;
+
+       memset(&item, 0, sizeof(item));
+       ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 0,
+                            (char *)&item, sizeof(item), NULL);
+       if (ret < 0)
+               return ret;
+
+       if (item.reserved[0] != 0 || item.reserved[1] != 0)
+               return -EUCLEAN;
+
+       true_size = btrfs_stack_verity_descriptor_size(&item);
+       if (true_size > INT_MAX)
+               return -EUCLEAN;
+
+       if (buf_size == 0)
+               return true_size;
+       if (buf_size < true_size)
+               return -ERANGE;
+
+       ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 1,
+                            buf, buf_size, NULL);
+       if (ret < 0)
+               return ret;
+       if (ret != true_size)
+               return -EIO;
+
+       return true_size;
+}
+
+/*
+ * fsverity op that reads and caches a merkle tree page.
+ *
+ * @inode:         inode to read a merkle tree page for
+ * @index:         page index relative to the start of the merkle tree
+ * @num_ra_pages:  number of pages to readahead. Optional, we ignore it
+ *
+ * The Merkle tree is stored in the filesystem btree, but its pages are cached
+ * with a logical position past EOF in the inode's mapping.
+ *
+ * Returns the page we read, or an ERR_PTR on error.
+ */
+static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
+                                               pgoff_t index,
+                                               unsigned long num_ra_pages)
+{
+       struct page *page;
+       u64 off = (u64)index << PAGE_SHIFT;
+       loff_t merkle_pos = merkle_file_pos(inode);
+       int ret;
+
+       if (merkle_pos < 0)
+               return ERR_PTR(merkle_pos);
+       if (merkle_pos > inode->i_sb->s_maxbytes - off - PAGE_SIZE)
+               return ERR_PTR(-EFBIG);
+       index += merkle_pos >> PAGE_SHIFT;
+again:
+       page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
+       if (page) {
+               if (PageUptodate(page))
+                       return page;
+
+               lock_page(page);
+               /*
+                * We only insert uptodate pages, so !Uptodate has to be
+                * an error
+                */
+               if (!PageUptodate(page)) {
+                       unlock_page(page);
+                       put_page(page);
+                       return ERR_PTR(-EIO);
+               }
+               unlock_page(page);
+               return page;
+       }
+
+       page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
+       if (!page)
+               return ERR_PTR(-ENOMEM);
+
+       /*
+        * Merkle item keys are indexed from byte 0 in the merkle tree.
+        * They have the form:
+        *
+        * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset in bytes ]
+        */
+       ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off,
+                            page_address(page), PAGE_SIZE, page);
+       if (ret < 0) {
+               put_page(page);
+               return ERR_PTR(ret);
+       }
+       if (ret < PAGE_SIZE)
+               memzero_page(page, ret, PAGE_SIZE - ret);
+
+       SetPageUptodate(page);
+       ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS);
+
+       if (!ret) {
+               /* Inserted and ready for fsverity */
+               unlock_page(page);
+       } else {
+               put_page(page);
+               /* Did someone race us into inserting this page? */
+               if (ret == -EEXIST)
+                       goto again;
+               page = ERR_PTR(ret);
+       }
+       return page;
+}
+
+/*
+ * fsverity op that writes a Merkle tree block into the btree.
+ *
+ * @inode:          inode to write a Merkle tree block for
+ * @buf:            Merkle tree data block to write
+ * @index:          index of the block in the Merkle tree
+ * @log_blocksize:  log base 2 of the Merkle tree block size
+ *
+ * Note that the block size could be different from the page size, so it is not
+ * safe to assume that index is a page index.
+ *
+ * Returns 0 on success or negative error code on failure
+ */
+static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
+                                       u64 index, int log_blocksize)
+{
+       u64 off = index << log_blocksize;
+       u64 len = 1ULL << log_blocksize;
+       loff_t merkle_pos = merkle_file_pos(inode);
+
+       if (merkle_pos < 0)
+               return merkle_pos;
+       if (merkle_pos > inode->i_sb->s_maxbytes - off - len)
+               return -EFBIG;
+
+       return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY,
+                              off, buf, len);
+}
+
+const struct fsverity_operations btrfs_verityops = {
+       .begin_enable_verity     = btrfs_begin_enable_verity,
+       .end_enable_verity       = btrfs_end_enable_verity,
+       .get_verity_descriptor   = btrfs_get_verity_descriptor,
+       .read_merkle_tree_page   = btrfs_read_merkle_tree_page,
+       .write_merkle_tree_block = btrfs_write_merkle_tree_block,
+};
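
Note: these ops are exercised through the generic fs-verity ioctls, not a
btrfs specific interface. A minimal userspace sketch using the standard
linux/fsverity.h API (error handling trimmed; the helper name is made up):

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fsverity.h>

	/* Enable fs-verity; this drives btrfs_begin/end_enable_verity() above. */
	int enable_verity(const char *path)
	{
		struct fsverity_enable_arg arg = {
			.version = 1,
			.hash_algorithm = FS_VERITY_HASH_ALG_SHA256,
			.block_size = 4096,
		};
		int fd = open(path, O_RDONLY);
		int ret;

		if (fd < 0)
			return -1;
		ret = ioctl(fd, FS_IOC_ENABLE_VERITY, &arg);
		close(fd);
		return ret;
	}

Once the ioctl succeeds, the inode carries BTRFS_INODE_RO_VERITY and the
filesystem gains the VERITY ro-compat bit, matching finish_verity() above.
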
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 70f94b7..ec3a874 100644
@@ -38,7 +38,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .sub_stripes    = 2,
                .dev_stripes    = 1,
                .devs_max       = 0,    /* 0 == as many as possible */
-               .devs_min       = 4,
+               .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
@@ -103,7 +103,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
-               .devs_min       = 2,
+               .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
@@ -153,6 +153,32 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        },
 };
 
+/*
+ * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
+ * can be used as index to access btrfs_raid_array[].
+ */
+enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
+{
+       if (flags & BTRFS_BLOCK_GROUP_RAID10)
+               return BTRFS_RAID_RAID10;
+       else if (flags & BTRFS_BLOCK_GROUP_RAID1)
+               return BTRFS_RAID_RAID1;
+       else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
+               return BTRFS_RAID_RAID1C3;
+       else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
+               return BTRFS_RAID_RAID1C4;
+       else if (flags & BTRFS_BLOCK_GROUP_DUP)
+               return BTRFS_RAID_DUP;
+       else if (flags & BTRFS_BLOCK_GROUP_RAID0)
+               return BTRFS_RAID_RAID0;
+       else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+               return BTRFS_RAID_RAID5;
+       else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+               return BTRFS_RAID_RAID6;
+
+       return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+}
+
 const char *btrfs_bg_type_to_raid_name(u64 flags)
 {
        const int index = btrfs_bg_flags_to_raid_index(flags);
@@ -404,44 +430,6 @@ void __exit btrfs_cleanup_fs_uuids(void)
        }
 }
 
-/*
- * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
- * Returned struct is not linked onto any lists and must be destroyed using
- * btrfs_free_device.
- */
-static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
-{
-       struct btrfs_device *dev;
-
-       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-       if (!dev)
-               return ERR_PTR(-ENOMEM);
-
-       /*
-        * Preallocate a bio that's always going to be used for flushing device
-        * barriers and matches the device lifespan
-        */
-       dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
-       if (!dev->flush_bio) {
-               kfree(dev);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       INIT_LIST_HEAD(&dev->dev_list);
-       INIT_LIST_HEAD(&dev->dev_alloc_list);
-       INIT_LIST_HEAD(&dev->post_commit_list);
-
-       atomic_set(&dev->reada_in_flight, 0);
-       atomic_set(&dev->dev_stats_ccnt, 0);
-       btrfs_device_data_ordered_init(dev);
-       INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
-       INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
-       extent_io_tree_init(fs_info, &dev->alloc_state,
-                           IO_TREE_DEVICE_ALLOC_STATE, NULL);
-
-       return dev;
-}
-
 static noinline struct btrfs_fs_devices *find_fsid(
                const u8 *fsid, const u8 *metadata_fsid)
 {
@@ -1130,6 +1118,9 @@ static void btrfs_close_one_device(struct btrfs_device *device)
                fs_devices->rw_devices--;
        }
 
+       if (device->devid == BTRFS_DEV_REPLACE_DEVID)
+               clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+
        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                fs_devices->missing_devices--;
 
@@ -1228,7 +1219,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
 static int devid_cmp(void *priv, const struct list_head *a,
                     const struct list_head *b)
 {
-       struct btrfs_device *dev1, *dev2;
+       const struct btrfs_device *dev1, *dev2;
 
        dev1 = list_entry(a, struct btrfs_device, dev_list);
        dev2 = list_entry(b, struct btrfs_device, dev_list);
@@ -1598,14 +1589,9 @@ again:
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;
 
-       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       ret = btrfs_search_backwards(root, &key, path);
        if (ret < 0)
                goto out;
-       if (ret > 0) {
-               ret = btrfs_previous_item(root, path, key.objectid, key.type);
-               if (ret < 0)
-                       goto out;
-       }
 
        while (1) {
                l = path->nodes[0];
@@ -1759,48 +1745,6 @@ out:
        return ret;
 }
 
-static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
-                                 struct btrfs_device *device,
-                                 u64 chunk_offset, u64 start, u64 num_bytes)
-{
-       int ret;
-       struct btrfs_path *path;
-       struct btrfs_fs_info *fs_info = device->fs_info;
-       struct btrfs_root *root = fs_info->dev_root;
-       struct btrfs_dev_extent *extent;
-       struct extent_buffer *leaf;
-       struct btrfs_key key;
-
-       WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
-       WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
-       key.objectid = device->devid;
-       key.offset = start;
-       key.type = BTRFS_DEV_EXTENT_KEY;
-       ret = btrfs_insert_empty_item(trans, root, path, &key,
-                                     sizeof(*extent));
-       if (ret)
-               goto out;
-
-       leaf = path->nodes[0];
-       extent = btrfs_item_ptr(leaf, path->slots[0],
-                               struct btrfs_dev_extent);
-       btrfs_set_dev_extent_chunk_tree(leaf, extent,
-                                       BTRFS_CHUNK_TREE_OBJECTID);
-       btrfs_set_dev_extent_chunk_objectid(leaf, extent,
-                                           BTRFS_FIRST_CHUNK_TREE_OBJECTID);
-       btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
-
-       btrfs_set_dev_extent_length(leaf, extent, num_bytes);
-       btrfs_mark_buffer_dirty(leaf);
-out:
-       btrfs_free_path(path);
-       return ret;
-}
-
 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
 {
        struct extent_map_tree *em_tree;
@@ -2003,12 +1947,8 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
                if (!(all_avail & btrfs_raid_array[i].bg_flag))
                        continue;
 
-               if (num_devices < btrfs_raid_array[i].devs_min) {
-                       int ret = btrfs_raid_array[i].mindev_error;
-
-                       if (ret)
-                               return ret;
-               }
+               if (num_devices < btrfs_raid_array[i].devs_min)
+                       return btrfs_raid_array[i].mindev_error;
        }
 
        return 0;
@@ -2137,7 +2077,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 
        if (IS_ERR(device)) {
                if (PTR_ERR(device) == -ENOENT &&
-                   strcmp(device_path, "missing") == 0)
+                   device_path && strcmp(device_path, "missing") == 0)
                        ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
                else
                        ret = PTR_ERR(device);
@@ -3622,10 +3562,7 @@ static u64 calc_data_stripes(u64 type, int num_stripes)
        const int ncopies = btrfs_raid_array[index].ncopies;
        const int nparity = btrfs_raid_array[index].nparity;
 
-       if (nparity)
-               return num_stripes - nparity;
-       else
-               return num_stripes / ncopies;
+       return (num_stripes - nparity) / ncopies;
 }
 
 /* [pstart, pend) */
@@ -4025,6 +3962,13 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
        if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
                return true;
 
+       if (fs_info->sectorsize < PAGE_SIZE &&
+               bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+               btrfs_err(fs_info,
+               "RAID56 is not yet supported for sectorsize %u with page size %lu",
+                         fs_info->sectorsize, PAGE_SIZE);
+               return false;
+       }
        /* Profile is valid and does not have bits outside of the allowed set */
        if (alloc_profile_is_valid(bargs->target, 1) &&
            (bargs->target & ~allowed) == 0)
@@ -5463,56 +5407,6 @@ out:
        return block_group;
 }
 
-/*
- * This function, btrfs_finish_chunk_alloc(), belongs to phase 2.
- *
- * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
- * phases.
- */
-int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
-                            u64 chunk_offset, u64 chunk_size)
-{
-       struct btrfs_fs_info *fs_info = trans->fs_info;
-       struct btrfs_device *device;
-       struct extent_map *em;
-       struct map_lookup *map;
-       u64 dev_offset;
-       u64 stripe_size;
-       int i;
-       int ret = 0;
-
-       em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
-       if (IS_ERR(em))
-               return PTR_ERR(em);
-
-       map = em->map_lookup;
-       stripe_size = em->orig_block_len;
-
-       /*
-        * Take the device list mutex to prevent races with the final phase of
-        * a device replace operation that replaces the device object associated
-        * with the map's stripes, because the device object's id can change
-        * at any time during that final phase of the device replace operation
-        * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
-        * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
-        * resulting in persisting a device extent item with such ID.
-        */
-       mutex_lock(&fs_info->fs_devices->device_list_mutex);
-       for (i = 0; i < map->num_stripes; i++) {
-               device = map->stripes[i].dev;
-               dev_offset = map->stripes[i].physical;
-
-               ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
-                                            dev_offset, stripe_size);
-               if (ret)
-                       break;
-       }
-       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-
-       free_extent_map(em);
-       return ret;
-}
-
 /*
  * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
  * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
@@ -6923,9 +6817,31 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
        if (WARN_ON(!devid && !fs_info))
                return ERR_PTR(-EINVAL);
 
-       dev = __alloc_device(fs_info);
-       if (IS_ERR(dev))
-               return dev;
+       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+       if (!dev)
+               return ERR_PTR(-ENOMEM);
+
+       /*
+        * Preallocate a bio that's always going to be used for flushing device
+        * barriers and matches the device lifespan
+        */
+       dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
+       if (!dev->flush_bio) {
+               kfree(dev);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       INIT_LIST_HEAD(&dev->dev_list);
+       INIT_LIST_HEAD(&dev->dev_alloc_list);
+       INIT_LIST_HEAD(&dev->post_commit_list);
+
+       atomic_set(&dev->reada_in_flight, 0);
+       atomic_set(&dev->dev_stats_ccnt, 0);
+       btrfs_device_data_ordered_init(dev);
+       INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+       INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+       extent_io_tree_init(fs_info, &dev->alloc_state,
+                           IO_TREE_DEVICE_ALLOC_STATE, NULL);
 
        if (devid)
                tmp = *devid;
@@ -6961,15 +6877,7 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
 
 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
 {
-       int index = btrfs_bg_flags_to_raid_index(type);
-       int ncopies = btrfs_raid_array[index].ncopies;
-       const int nparity = btrfs_raid_array[index].nparity;
-       int data_stripes;
-
-       if (nparity)
-               data_stripes = num_stripes - nparity;
-       else
-               data_stripes = num_stripes / ncopies;
+       const int data_stripes = calc_data_stripes(type, num_stripes);
 
        return div_u64(chunk_len, data_stripes);
 }
@@ -8144,7 +8052,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
                goto out;
 
        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
-               ret = btrfs_next_item(root, path);
+               ret = btrfs_next_leaf(root, path);
                if (ret < 0)
                        goto out;
                /* No dev extents at all? Not good */
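
Note: the unified formula in calc_data_stripes() works because the parity
profiles carry their redundancy in nparity (with ncopies staying 1), while
the mirrored profiles carry it in ncopies (with nparity 0). Worked values,
using the btrfs_raid_array attributes (e.g. ncopies=2 for RAID10 as shown in
the first hunk above):

	RAID10, num_stripes = 4:  (4 - 0) / 2 = 2 data stripes
	RAID6,  num_stripes = 6:  (6 - 2) / 1 = 4 data stripes
	SINGLE, num_stripes = 1:  (1 - 0) / 1 = 1 data stripe
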
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 55a8ba2..b082250 100644
@@ -508,8 +508,6 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
                           u64 logical, u64 len);
 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
                                    u64 logical);
-int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
-                            u64 chunk_offset, u64 chunk_size);
 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
                                     struct btrfs_block_group *bg);
 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
@@ -568,32 +566,6 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
        atomic_inc(&dev->dev_stats_ccnt);
 }
 
-/*
- * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
- * can be used as index to access btrfs_raid_array[].
- */
-static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
-{
-       if (flags & BTRFS_BLOCK_GROUP_RAID10)
-               return BTRFS_RAID_RAID10;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID1)
-               return BTRFS_RAID_RAID1;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
-               return BTRFS_RAID_RAID1C3;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
-               return BTRFS_RAID_RAID1C4;
-       else if (flags & BTRFS_BLOCK_GROUP_DUP)
-               return BTRFS_RAID_DUP;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID0)
-               return BTRFS_RAID_RAID0;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID5)
-               return BTRFS_RAID_RAID5;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID6)
-               return BTRFS_RAID_RAID6;
-
-       return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
-}
-
 void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
 
 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
@@ -603,6 +575,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
                               struct block_device *bdev,
                               const char *device_path);
 
+enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags);
 int btrfs_bg_type_to_factor(u64 flags);
 const char *btrfs_bg_type_to_raid_name(u64 flags);
 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index c3fa7d3..8afa900 100644
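
Note: dropping __GFP_HIGHMEM from the output page allocations is what makes
the kmap()/kunmap() removal safe in the hunks below: a lowmem page has a
permanent kernel mapping, so page_address() can replace the map/unmap pairs.
A standalone sketch of the invariant (illustrative only):

	struct page *page = alloc_page(GFP_NOFS);	/* no __GFP_HIGHMEM: lowmem */

	if (page) {
		char *addr = page_address(page);	/* permanently mapped */

		memset(addr, 0, PAGE_SIZE);		/* no kmap()/kunmap() needed */
		__free_page(page);
	}
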
@@ -121,12 +121,12 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
        workspace->strm.total_in = 0;
        workspace->strm.total_out = 0;
 
-       out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+       out_page = alloc_page(GFP_NOFS);
        if (out_page == NULL) {
                ret = -ENOMEM;
                goto out;
        }
-       cpage_out = kmap(out_page);
+       cpage_out = page_address(out_page);
        pages[0] = out_page;
        nr_pages = 1;
 
@@ -148,26 +148,22 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
                                int i;
 
                                for (i = 0; i < in_buf_pages; i++) {
-                                       if (in_page) {
-                                               kunmap(in_page);
+                                       if (in_page)
                                                put_page(in_page);
-                                       }
                                        in_page = find_get_page(mapping,
                                                                start >> PAGE_SHIFT);
-                                       data_in = kmap(in_page);
+                                       data_in = page_address(in_page);
                                        memcpy(workspace->buf + i * PAGE_SIZE,
                                               data_in, PAGE_SIZE);
                                        start += PAGE_SIZE;
                                }
                                workspace->strm.next_in = workspace->buf;
                        } else {
-                               if (in_page) {
-                                       kunmap(in_page);
+                               if (in_page)
                                        put_page(in_page);
-                               }
                                in_page = find_get_page(mapping,
                                                        start >> PAGE_SHIFT);
-                               data_in = kmap(in_page);
+                               data_in = page_address(in_page);
                                start += PAGE_SIZE;
                                workspace->strm.next_in = data_in;
                        }
@@ -196,18 +192,17 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
                 * the stream end if required
                 */
                if (workspace->strm.avail_out == 0) {
-                       kunmap(out_page);
                        if (nr_pages == nr_dest_pages) {
                                out_page = NULL;
                                ret = -E2BIG;
                                goto out;
                        }
-                       out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+                       out_page = alloc_page(GFP_NOFS);
                        if (out_page == NULL) {
                                ret = -ENOMEM;
                                goto out;
                        }
-                       cpage_out = kmap(out_page);
+                       cpage_out = page_address(out_page);
                        pages[nr_pages] = out_page;
                        nr_pages++;
                        workspace->strm.avail_out = PAGE_SIZE;
@@ -234,18 +229,17 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
                        goto out;
                } else if (workspace->strm.avail_out == 0) {
                        /* get another page for the stream end */
-                       kunmap(out_page);
                        if (nr_pages == nr_dest_pages) {
                                out_page = NULL;
                                ret = -E2BIG;
                                goto out;
                        }
-                       out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+                       out_page = alloc_page(GFP_NOFS);
                        if (out_page == NULL) {
                                ret = -ENOMEM;
                                goto out;
                        }
-                       cpage_out = kmap(out_page);
+                       cpage_out = page_address(out_page);
                        pages[nr_pages] = out_page;
                        nr_pages++;
                        workspace->strm.avail_out = PAGE_SIZE;
@@ -264,13 +258,8 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
        *total_in = workspace->strm.total_in;
 out:
        *out_pages = nr_pages;
-       if (out_page)
-               kunmap(out_page);
-
-       if (in_page) {
-               kunmap(in_page);
+       if (in_page)
                put_page(in_page);
-       }
        return ret;
 }
 
@@ -286,10 +275,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
        unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
        unsigned long buf_start;
        struct page **pages_in = cb->compressed_pages;
-       u64 disk_start = cb->start;
-       struct bio *orig_bio = cb->orig_bio;
 
-       data_in = kmap(pages_in[page_in_index]);
+       data_in = page_address(pages_in[page_in_index]);
        workspace->strm.next_in = data_in;
        workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
        workspace->strm.total_in = 0;
@@ -311,7 +298,6 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 
        if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
                pr_warn("BTRFS: inflateInit failed\n");
-               kunmap(pages_in[page_in_index]);
                return -EIO;
        }
        while (workspace->strm.total_in < srclen) {
@@ -326,9 +312,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
                if (buf_start == total_out)
                        break;
 
-               ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
-                                                total_out, disk_start,
-                                                orig_bio);
+               ret2 = btrfs_decompress_buf2page(workspace->buf,
+                               total_out - buf_start, cb, buf_start);
                if (ret2 == 0) {
                        ret = 0;
                        goto done;
@@ -339,17 +324,16 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 
                if (workspace->strm.avail_in == 0) {
                        unsigned long tmp;
-                       kunmap(pages_in[page_in_index]);
+
                        page_in_index++;
                        if (page_in_index >= total_pages_in) {
                                data_in = NULL;
                                break;
                        }
-                       data_in = kmap(pages_in[page_in_index]);
+                       data_in = page_address(pages_in[page_in_index]);
                        workspace->strm.next_in = data_in;
                        tmp = srclen - workspace->strm.total_in;
-                       workspace->strm.avail_in = min(tmp,
-                                                          PAGE_SIZE);
+                       workspace->strm.avail_in = min(tmp, PAGE_SIZE);
                }
        }
        if (ret != Z_STREAM_END)
@@ -358,10 +342,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
                ret = 0;
 done:
        zlib_inflateEnd(&workspace->strm);
-       if (data_in)
-               kunmap(pages_in[page_in_index]);
        if (!ret)
-               zero_fill_bio(orig_bio);
+               zero_fill_bio(cb->orig_bio);
        return ret;
 }
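The zlib hunks above (and the zstd hunks further down) all follow one pattern: output pages are now allocated without __GFP_HIGHMEM, so they always have a permanent kernel mapping and page_address() can replace every kmap()/kunmap() pair. A minimal sketch of the pattern, with the hedge that the series also relies on the input page-cache pages not being highmem:

	/* Sketch only: a page allocated without __GFP_HIGHMEM is in the
	 * direct map, so its kernel address is valid for the lifetime of
	 * the page and no kmap()/kunmap() pairing is required. */
	static int fill_lowmem_page(struct page **ret_page)
	{
		struct page *page = alloc_page(GFP_NOFS);	/* lowmem only */

		if (!page)
			return -ENOMEM;
		/* Directly mapped: no kmap(), nothing to unmap later. */
		memset(page_address(page), 0, PAGE_SIZE);
		*ret_page = page;
		return 0;
	}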
 
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 907c2cc..47af1ab 100644
@@ -245,7 +245,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
                goto out;
 
        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
-               ret = btrfs_next_item(root, path);
+               ret = btrfs_next_leaf(root, path);
                if (ret < 0)
                        goto out;
                /* No dev extents at all? Not good */
@@ -296,7 +296,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_zoned_device_info *zone_info = NULL;
        struct block_device *bdev = device->bdev;
-       struct request_queue *queue = bdev_get_queue(bdev);
        sector_t nr_sectors;
        sector_t sector = 0;
        struct blk_zone *zones = NULL;
@@ -348,19 +347,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
 
        nr_sectors = bdev_nr_sectors(bdev);
        zone_info->zone_size_shift = ilog2(zone_info->zone_size);
-       zone_info->max_zone_append_size =
-               (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
        zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
        if (!IS_ALIGNED(nr_sectors, zone_sectors))
                zone_info->nr_zones++;
 
-       if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) {
-               btrfs_err(fs_info, "zoned: device %pg does not support zone append",
-                         bdev);
-               ret = -EINVAL;
-               goto out;
-       }
-
        zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
        if (!zone_info->seq_zones) {
                ret = -ENOMEM;
@@ -529,7 +519,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
        u64 zoned_devices = 0;
        u64 nr_devices = 0;
        u64 zone_size = 0;
-       u64 max_zone_append_size = 0;
        const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
        int ret = 0;
 
@@ -565,11 +554,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
                                ret = -EINVAL;
                                goto out;
                        }
-                       if (!max_zone_append_size ||
-                           (zone_info->max_zone_append_size &&
-                            zone_info->max_zone_append_size < max_zone_append_size))
-                               max_zone_append_size =
-                                       zone_info->max_zone_append_size;
                }
                nr_devices++;
        }
@@ -619,7 +603,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
        }
 
        fs_info->zone_size = zone_size;
-       fs_info->max_zone_append_size = max_zone_append_size;
        fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
 
        /*
@@ -1318,9 +1301,6 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
        if (!btrfs_is_zoned(fs_info))
                return false;
 
-       if (!fs_info->max_zone_append_size)
-               return false;
-
        if (!is_data_inode(&inode->vfs_inode))
                return false;
 
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index b0ae260..4b29970 100644
@@ -22,7 +22,6 @@ struct btrfs_zoned_device_info {
         */
        u64 zone_size;
        u8  zone_size_shift;
-       u64 max_zone_append_size;
        u32 nr_zones;
        unsigned long *seq_zones;
        unsigned long *empty_zones;
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 3e26b46..56dce9f 100644
@@ -399,19 +399,19 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 
        /* map in the first page of input data */
        in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-       workspace->in_buf.src = kmap(in_page);
+       workspace->in_buf.src = page_address(in_page);
        workspace->in_buf.pos = 0;
        workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
 
 
        /* Allocate and map in the output buffer */
-       out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+       out_page = alloc_page(GFP_NOFS);
        if (out_page == NULL) {
                ret = -ENOMEM;
                goto out;
        }
        pages[nr_pages++] = out_page;
-       workspace->out_buf.dst = kmap(out_page);
+       workspace->out_buf.dst = page_address(out_page);
        workspace->out_buf.pos = 0;
        workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
 
@@ -446,19 +446,18 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
                if (workspace->out_buf.pos == workspace->out_buf.size) {
                        tot_out += PAGE_SIZE;
                        max_out -= PAGE_SIZE;
-                       kunmap(out_page);
                        if (nr_pages == nr_dest_pages) {
                                out_page = NULL;
                                ret = -E2BIG;
                                goto out;
                        }
-                       out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+                       out_page = alloc_page(GFP_NOFS);
                        if (out_page == NULL) {
                                ret = -ENOMEM;
                                goto out;
                        }
                        pages[nr_pages++] = out_page;
-                       workspace->out_buf.dst = kmap(out_page);
+                       workspace->out_buf.dst = page_address(out_page);
                        workspace->out_buf.pos = 0;
                        workspace->out_buf.size = min_t(size_t, max_out,
                                                        PAGE_SIZE);
@@ -473,13 +472,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
                /* Check if we need more input */
                if (workspace->in_buf.pos == workspace->in_buf.size) {
                        tot_in += PAGE_SIZE;
-                       kunmap(in_page);
                        put_page(in_page);
 
                        start += PAGE_SIZE;
                        len -= PAGE_SIZE;
                        in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-                       workspace->in_buf.src = kmap(in_page);
+                       workspace->in_buf.src = page_address(in_page);
                        workspace->in_buf.pos = 0;
                        workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
                }
@@ -506,19 +504,18 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 
                tot_out += PAGE_SIZE;
                max_out -= PAGE_SIZE;
-               kunmap(out_page);
                if (nr_pages == nr_dest_pages) {
                        out_page = NULL;
                        ret = -E2BIG;
                        goto out;
                }
-               out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+               out_page = alloc_page(GFP_NOFS);
                if (out_page == NULL) {
                        ret = -ENOMEM;
                        goto out;
                }
                pages[nr_pages++] = out_page;
-               workspace->out_buf.dst = kmap(out_page);
+               workspace->out_buf.dst = page_address(out_page);
                workspace->out_buf.pos = 0;
                workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
        }
@@ -534,12 +531,8 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 out:
        *out_pages = nr_pages;
        /* Cleanup */
-       if (in_page) {
-               kunmap(in_page);
+       if (in_page)
                put_page(in_page);
-       }
-       if (out_page)
-               kunmap(out_page);
        return ret;
 }
 
@@ -547,8 +540,6 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 {
        struct workspace *workspace = list_entry(ws, struct workspace, list);
        struct page **pages_in = cb->compressed_pages;
-       u64 disk_start = cb->start;
-       struct bio *orig_bio = cb->orig_bio;
        size_t srclen = cb->compressed_len;
        ZSTD_DStream *stream;
        int ret = 0;
@@ -565,7 +556,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
                goto done;
        }
 
-       workspace->in_buf.src = kmap(pages_in[page_in_index]);
+       workspace->in_buf.src = page_address(pages_in[page_in_index]);
        workspace->in_buf.pos = 0;
        workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
 
@@ -589,7 +580,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
                workspace->out_buf.pos = 0;
 
                ret = btrfs_decompress_buf2page(workspace->out_buf.dst,
-                               buf_start, total_out, disk_start, orig_bio);
+                               total_out - buf_start, cb, buf_start);
                if (ret == 0)
                        break;
 
@@ -601,23 +592,21 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
                        break;
 
                if (workspace->in_buf.pos == workspace->in_buf.size) {
-                       kunmap(pages_in[page_in_index++]);
+                       page_in_index++;
                        if (page_in_index >= total_pages_in) {
                                workspace->in_buf.src = NULL;
                                ret = -EIO;
                                goto done;
                        }
                        srclen -= PAGE_SIZE;
-                       workspace->in_buf.src = kmap(pages_in[page_in_index]);
+                       workspace->in_buf.src = page_address(pages_in[page_in_index]);
                        workspace->in_buf.pos = 0;
                        workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
                }
        }
        ret = 0;
-       zero_fill_bio(orig_bio);
+       zero_fill_bio(cb->orig_bio);
 done:
-       if (workspace->in_buf.src)
-               kunmap(pages_in[page_in_index]);
        return ret;
 }
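Both decompressors now call btrfs_decompress_buf2page() with a new argument order. Reconstructed from the call sites above (a hedged reading; the actual prototype lives in fs/btrfs/compression.h):

	/* Shape implied by the callers: copy buf[0..buf_len) into the pages
	 * of cb->orig_bio, where @decompressed is how many bytes of output
	 * were produced before this buffer. Judging by the call sites, it
	 * returns 0 once cb->orig_bio is fully filled and non-zero while
	 * more output is still wanted. */
	int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
				      struct compressed_bio *cb, u32 decompressed);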
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4c33705..eb57dad 100644
@@ -2729,23 +2729,6 @@ int write_inode_now(struct inode *inode, int sync)
 }
 EXPORT_SYMBOL(write_inode_now);
 
-/**
- * sync_inode - write an inode and its pages to disk.
- * @inode: the inode to sync
- * @wbc: controls the writeback mode
- *
- * sync_inode() will write an inode and its pages to disk.  It will also
- * correctly update the inode on its superblock's dirty inode lists and will
- * update inode->i_state.
- *
- * The caller must have a ref on the inode.
- */
-int sync_inode(struct inode *inode, struct writeback_control *wbc)
-{
-       return writeback_single_inode(inode, wbc);
-}
-EXPORT_SYMBOL(sync_inode);
-
 /**
  * sync_inode_metadata - write an inode to disk
  * @inode: the inode to sync
@@ -2762,6 +2745,6 @@ int sync_inode_metadata(struct inode *inode, int wait)
                .nr_to_write = 0, /* metadata-only */
        };
 
-       return sync_inode(inode, &wbc);
+       return writeback_single_inode(inode, &wbc);
 }
 EXPORT_SYMBOL(sync_inode_metadata);
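With sync_inode() gone, external callers that only needed data writeback under a custom writeback_control can switch to the filemap_fdatawrite_wbc() helper exported later in this series (see the mm/filemap.c hunk below). A hedged sketch of that migration, roughly how the btrfs call sites moved; names here are illustrative:

	static int write_inode_data_range(struct inode *inode, loff_t start,
					  loff_t end)
	{
		struct writeback_control wbc = {
			.sync_mode	= WB_SYNC_NONE,
			.nr_to_write	= LONG_MAX,
			.range_start	= start,
			.range_end	= end,
		};

		/* Data pages only; inode metadata writeback stays with
		 * write_inode_now() and friends. */
		return filemap_fdatawrite_wbc(inode->i_mapping, &wbc);
	}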
diff --git a/fs/namei.c b/fs/namei.c
index 32351c0..d049d39 100644
@@ -2598,8 +2598,9 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 }
 EXPORT_SYMBOL(vfs_path_lookup);
 
-static int lookup_one_len_common(const char *name, struct dentry *base,
-                                int len, struct qstr *this)
+static int lookup_one_common(struct user_namespace *mnt_userns,
+                            const char *name, struct dentry *base, int len,
+                            struct qstr *this)
 {
        this->name = name;
        this->len = len;
@@ -2627,7 +2628,7 @@ static int lookup_one_len_common(const char *name, struct dentry *base,
                        return err;
        }
 
-       return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC);
+       return inode_permission(mnt_userns, base->d_inode, MAY_EXEC);
 }
 
 /**
@@ -2651,7 +2652,7 @@ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len
 
        WARN_ON_ONCE(!inode_is_locked(base->d_inode));
 
-       err = lookup_one_len_common(name, base, len, &this);
+       err = lookup_one_common(&init_user_ns, name, base, len, &this);
        if (err)
                return ERR_PTR(err);
 
@@ -2678,7 +2679,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 
        WARN_ON_ONCE(!inode_is_locked(base->d_inode));
 
-       err = lookup_one_len_common(name, base, len, &this);
+       err = lookup_one_common(&init_user_ns, name, base, len, &this);
        if (err)
                return ERR_PTR(err);
 
@@ -2687,6 +2688,36 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one - filesystem helper to lookup single pathname component
+ * @mnt_userns:        user namespace of the mount the lookup is performed from
+ * @name:      pathname component to lookup
+ * @base:      base directory to lookup from
+ * @len:       maximum length of @name to consider during the lookup
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
+ */
+struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name,
+                         struct dentry *base, int len)
+{
+       struct dentry *dentry;
+       struct qstr this;
+       int err;
+
+       WARN_ON_ONCE(!inode_is_locked(base->d_inode));
+
+       err = lookup_one_common(mnt_userns, name, base, len, &this);
+       if (err)
+               return ERR_PTR(err);
+
+       dentry = lookup_dcache(&this, base, 0);
+       return dentry ? dentry : __lookup_slow(&this, base, 0);
+}
+EXPORT_SYMBOL(lookup_one);
+
 /**
  * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
  * @name:      pathname component to lookup
@@ -2706,7 +2737,7 @@ struct dentry *lookup_one_len_unlocked(const char *name,
        int err;
        struct dentry *ret;
 
-       err = lookup_one_len_common(name, base, len, &this);
+       err = lookup_one_common(&init_user_ns, name, base, len, &this);
        if (err)
                return ERR_PTR(err);
 
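The new lookup_one() above is what lets btrfs resolve names relative to an idmapped mount instead of hardcoding &init_user_ns. A hedged usage sketch (a hypothetical caller, not a specific call site from this series); note the kernel-doc requirement that base->i_mutex be held:

	static struct dentry *lookup_child(struct user_namespace *mnt_userns,
					   struct dentry *dir, const char *name)
	{
		struct dentry *dentry;

		inode_lock(d_inode(dir));	/* lookup_one() asserts this */
		dentry = lookup_one(mnt_userns, name, dir, strlen(name));
		inode_unlock(d_inode(dir));

		return dentry;	/* ERR_PTR() on failure, e.g. -EACCES */
	}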
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 3d67d0f..00952e9 100644
@@ -374,7 +374,7 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip,
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
-extern void bio_trim(struct bio *bio, int offset, int size);
+void bio_trim(struct bio *bio, sector_t offset, sector_t size);
 extern struct bio *bio_split(struct bio *bio, int sectors,
                             gfp_t gfp, struct bio_set *bs);
 
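bio_trim() now takes sector_t for both arguments; the BIO_MAX_SECTORS cap added in the next hunk exists because bi_iter.bi_size still counts bytes in a u32, so UINT_MAX >> SECTOR_SHIFT is the largest sector count a single bio can describe. A hedged sketch of a caller validating against the cap (bio_clone_fast() is the clone helper as of this kernel; the function itself is illustrative):

	static struct bio *clone_and_trim(struct bio *src, sector_t offset,
					  sector_t nr_sects, struct bio_set *bs)
	{
		struct bio *clone;

		if (nr_sects > BIO_MAX_SECTORS)	/* would overflow bi_iter.bi_size */
			return ERR_PTR(-EINVAL);

		clone = bio_clone_fast(src, GFP_NOFS, bs);
		if (!clone)
			return ERR_PTR(-ENOMEM);

		bio_trim(clone, offset, nr_sects);
		return clone;
	}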
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 9e392da..be622b5 100644
@@ -277,6 +277,7 @@ struct bio {
 };
 
 #define BIO_RESET_BYTES                offsetof(struct bio, bi_max_vecs)
+#define BIO_MAX_SECTORS                (UINT_MAX >> SECTOR_SHIFT)
 
 /*
  * bio flags
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c58c261..2c2bcfb 100644
@@ -2500,7 +2500,6 @@ static inline void file_accessed(struct file *file)
 
 extern int file_modified(struct file *file);
 
-int sync_inode(struct inode *inode, struct writeback_control *wbc);
 int sync_inode_metadata(struct inode *inode, int wait);
 
 struct file_system_type {
@@ -2852,6 +2851,8 @@ extern int filemap_fdatawrite_range(struct address_space *mapping,
                                loff_t start, loff_t end);
 extern int filemap_check_errors(struct address_space *mapping);
 extern void __filemap_set_wb_err(struct address_space *mapping, int err);
+int filemap_fdatawrite_wbc(struct address_space *mapping,
+                          struct writeback_control *wbc);
 
 static inline int filemap_write_and_wait(struct address_space *mapping)
 {
diff --git a/include/linux/namei.h b/include/linux/namei.h
index be9a2b3..e89329b 100644
@@ -68,6 +68,7 @@ extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int);
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
 extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int);
+struct dentry *lookup_one(struct user_namespace *, const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *);
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index b671b1f..8f58fd9 100644
@@ -94,6 +94,7 @@ struct btrfs_space_info;
        EM( FLUSH_DELAYED_ITEMS,        "FLUSH_DELAYED_ITEMS")          \
        EM( FLUSH_DELALLOC,             "FLUSH_DELALLOC")               \
        EM( FLUSH_DELALLOC_WAIT,        "FLUSH_DELALLOC_WAIT")          \
+       EM( FLUSH_DELALLOC_FULL,        "FLUSH_DELALLOC_FULL")          \
        EM( FLUSH_DELAYED_REFS_NR,      "FLUSH_DELAYED_REFS_NR")        \
        EM( FLUSH_DELAYED_REFS,         "FLUSH_DELAYED_REFS")           \
        EM( ALLOC_CHUNK,                "ALLOC_CHUNK")                  \
@@ -2037,7 +2038,7 @@ TRACE_EVENT(btrfs_convert_extent_bit,
 );
 
 DECLARE_EVENT_CLASS(btrfs_dump_space_info,
-       TP_PROTO(const struct btrfs_fs_info *fs_info,
+       TP_PROTO(struct btrfs_fs_info *fs_info,
                 const struct btrfs_space_info *sinfo),
 
        TP_ARGS(fs_info, sinfo),
@@ -2057,6 +2058,8 @@ DECLARE_EVENT_CLASS(btrfs_dump_space_info,
                __field(        u64,    delayed_refs_reserved   )
                __field(        u64,    delayed_reserved        )
                __field(        u64,    free_chunk_space        )
+               __field(        u64,    delalloc_bytes          )
+               __field(        u64,    ordered_bytes           )
        ),
 
        TP_fast_assign_btrfs(fs_info,
@@ -2074,6 +2077,8 @@ DECLARE_EVENT_CLASS(btrfs_dump_space_info,
                __entry->delayed_refs_reserved  =       fs_info->delayed_refs_rsv.reserved;
                __entry->delayed_reserved       =       fs_info->delayed_block_rsv.reserved;
                __entry->free_chunk_space       =       atomic64_read(&fs_info->free_chunk_space);
+               __entry->delalloc_bytes         =       percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+               __entry->ordered_bytes          =       percpu_counter_sum_positive(&fs_info->ordered_bytes);
        ),
 
        TP_printk_btrfs("flags=%s total_bytes=%llu bytes_used=%llu "
@@ -2081,7 +2086,8 @@ DECLARE_EVENT_CLASS(btrfs_dump_space_info,
                        "bytes_may_use=%llu bytes_readonly=%llu "
                        "reclaim_size=%llu clamp=%d global_reserved=%llu "
                        "trans_reserved=%llu delayed_refs_reserved=%llu "
-                       "delayed_reserved=%llu chunk_free_space=%llu",
+                       "delayed_reserved=%llu chunk_free_space=%llu "
+                       "delalloc_bytes=%llu ordered_bytes=%llu",
                        __print_flags(__entry->flags, "|", BTRFS_GROUP_FLAGS),
                        __entry->total_bytes, __entry->bytes_used,
                        __entry->bytes_pinned, __entry->bytes_reserved,
@@ -2089,11 +2095,18 @@ DECLARE_EVENT_CLASS(btrfs_dump_space_info,
                        __entry->reclaim_size, __entry->clamp,
                        __entry->global_reserved, __entry->trans_reserved,
                        __entry->delayed_refs_reserved,
-                       __entry->delayed_reserved, __entry->free_chunk_space)
+                       __entry->delayed_reserved, __entry->free_chunk_space,
+                       __entry->delalloc_bytes, __entry->ordered_bytes)
 );
 
 DEFINE_EVENT(btrfs_dump_space_info, btrfs_done_preemptive_reclaim,
-       TP_PROTO(const struct btrfs_fs_info *fs_info,
+       TP_PROTO(struct btrfs_fs_info *fs_info,
+                const struct btrfs_space_info *sinfo),
+       TP_ARGS(fs_info, sinfo)
+);
+
+DEFINE_EVENT(btrfs_dump_space_info, btrfs_fail_all_tickets,
+       TP_PROTO(struct btrfs_fs_info *fs_info,
                 const struct btrfs_space_info *sinfo),
        TP_ARGS(fs_info, sinfo)
 );
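The TP_PROTO()s above lose their const qualifier on fs_info because the two new fields are sampled with percpu_counter_sum_positive(), which needs a mutable pointer (it takes the counter's internal spinlock while summing the per-cpu deltas):

	/* From include/linux/percpu_counter.h: locking the counter rules
	 * out a const argument. */
	s64 percpu_counter_sum_positive(struct percpu_counter *fbc);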
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 22cd037..d7d3cfe 100644
@@ -288,6 +288,7 @@ struct btrfs_ioctl_fs_info_args {
  * first mount when booting older kernel versions.
  */
 #define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID  (1ULL << 1)
+#define BTRFS_FEATURE_COMPAT_RO_VERITY                 (1ULL << 2)
 
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF   (1ULL << 0)
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL  (1ULL << 1)
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index ccdb40f..e1c4c73 100644
 #define BTRFS_INODE_REF_KEY            12
 #define BTRFS_INODE_EXTREF_KEY         13
 #define BTRFS_XATTR_ITEM_KEY           24
+
+/*
+ * fs verity items are stored under two different key types on disk.
+ * The descriptor items:
+ * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ]
+ *
+ * At offset 0, we store a btrfs_verity_descriptor_item which tracks the size
+ * of the descriptor item and some extra data for encryption.
+ * Starting at offset 1, these hold the generic fs verity descriptor.  These
+ * descriptors are opaque to btrfs; we just read and write them as a blob for
+ * the higher level verity code.  The most common descriptor size is 256 bytes.
+ *
+ * The merkle tree items:
+ * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ]
+ *
+ * These also start at offset 0, and correspond to the merkle tree bytes.  When
+ * fsverity asks for page 0 of the merkle tree, we pull up one page starting at
+ * offset 0 for this key type.  These are also opaque to btrfs; we're blindly
+ * storing whatever fsverity sends down.
+ */
+#define BTRFS_VERITY_DESC_ITEM_KEY     36
+#define BTRFS_VERITY_MERKLE_ITEM_KEY   37
+
 #define BTRFS_ORPHAN_ITEM_KEY          48
 /* reserve 2-15 close to the inode for later flexibility */
 
@@ -991,4 +1014,16 @@ struct btrfs_qgroup_limit_item {
        __le64 rsv_excl;
 } __attribute__ ((__packed__));
 
+struct btrfs_verity_descriptor_item {
+       /* Size of the verity descriptor in bytes */
+       __le64 size;
+       /*
+        * When we implement support for fscrypt, we will need to encrypt the
+        * Merkle tree for encrypted verity files. These 128 bits are for the
+        * eventual storage of an fscrypt initialization vector.
+        */
+       __le64 reserved[2];
+       __u8 encryption;
+} __attribute__ ((__packed__));
+
 #endif /* _BTRFS_CTREE_H_ */
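The comment above fixes the on-disk addressing: descriptor data lives at (inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset), with the header item at offset 0. A hedged sketch of locating that header (a hypothetical helper, not part of this series):

	static int lookup_verity_desc_header(struct btrfs_root *root, u64 ino,
					     struct btrfs_path *path)
	{
		struct btrfs_key key;

		key.objectid = ino;
		key.type = BTRFS_VERITY_DESC_ITEM_KEY;
		key.offset = 0;	/* struct btrfs_verity_descriptor_item lives here */

		/* Read-only search: no transaction, no COW, no insert space. */
		return btrfs_search_slot(NULL, root, &key, path, 0, 0);
	}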
diff --git a/mm/filemap.c b/mm/filemap.c
index 0fad083..920e8dc 100644
@@ -379,6 +379,32 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
        return 0;
 }
 
+/**
+ * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
+ * @mapping:   address space structure to write
+ * @wbc:       the writeback_control controlling the writeout
+ *
+ * Call writepages on the mapping using the provided wbc to control the
+ * writeout.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int filemap_fdatawrite_wbc(struct address_space *mapping,
+                          struct writeback_control *wbc)
+{
+       int ret;
+
+       if (!mapping_can_writeback(mapping) ||
+           !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+               return 0;
+
+       wbc_attach_fdatawrite_inode(wbc, mapping->host);
+       ret = do_writepages(mapping, wbc);
+       wbc_detach_inode(wbc);
+       return ret;
+}
+EXPORT_SYMBOL(filemap_fdatawrite_wbc);
+
 /**
  * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
  * @mapping:   address space structure to write
@@ -399,7 +425,6 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
                                loff_t end, int sync_mode)
 {
-       int ret;
        struct writeback_control wbc = {
                .sync_mode = sync_mode,
                .nr_to_write = LONG_MAX,
@@ -407,14 +432,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
                .range_end = end,
        };
 
-       if (!mapping_can_writeback(mapping) ||
-           !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-               return 0;
-
-       wbc_attach_fdatawrite_inode(&wbc, mapping->host);
-       ret = do_writepages(mapping, &wbc);
-       wbc_detach_inode(&wbc);
-       return ret;
+       return filemap_fdatawrite_wbc(mapping, &wbc);
 }
 
 static inline int __filemap_fdatawrite(struct address_space *mapping,