Merge branch 'for-linus-4.3' of git://git.kernel.org/pub/scm/linux/kernel/git/mason...
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 5 Sep 2015 22:14:43 +0000 (15:14 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 5 Sep 2015 22:14:43 +0000 (15:14 -0700)
Pull btrfs updates from Chris Mason:
 "This has Jeff Mahoney's long standing trim patch that fixes corners
  where trims were missing.  Omar has some raid5/6 fixes, especially for
  using scrub and device replace when devices are missing.

   Zhao Lei continues cleaning and fixing things; this series fixes some
   really hard-to-hit corner cases exposed by xfstests.  I had to pull it
   last merge window due to some deadlocks, but those are now resolved.

   I added support for Tejun's new blkio controllers.  It seems to work
   well for single devices; we'll expand to multi-device as well"

* 'for-linus-4.3' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (47 commits)
  btrfs: fix compile when block cgroups are not enabled
  Btrfs: fix file read corruption after extent cloning and fsync
  Btrfs: check if previous transaction aborted to avoid fs corruption
  btrfs: use __GFP_NOFAIL in alloc_btrfs_bio
  btrfs: Prevent from early transaction abort
  btrfs: Remove unused arguments in tree-log.c
  btrfs: Remove useless condition in start_log_trans()
  Btrfs: add support for blkio controllers
  Btrfs: remove unused mutex from struct 'btrfs_fs_info'
  Btrfs: fix parity scrub of RAID 5/6 with missing device
  Btrfs: fix device replace of a missing RAID 5/6 device
  Btrfs: add RAID 5/6 BTRFS_RBIO_REBUILD_MISSING operation
  Btrfs: count devices correctly in readahead during RAID 5/6 replace
  Btrfs: remove misleading handling of missing device scrub
  btrfs: fix clone / extent-same deadlocks
  Btrfs: fix defrag to merge tail file extent
  Btrfs: fix warning in backref walking
  btrfs: Add WARN_ON() for double lock in btrfs_tree_lock()
  btrfs: Remove root argument in extent_data_ref_count()
  btrfs: Fix wrong comment of btrfs_alloc_tree_block()
  ...

21 files changed:
fs/btrfs/backref.c
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/free-space-cache.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/locking.c
fs/btrfs/raid56.c
fs/btrfs/raid56.h
fs/btrfs/reada.c
fs/btrfs/relocation.c
fs/btrfs/scrub.c
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-log.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 802fabb..ecbc63d 100644
@@ -206,10 +206,33 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
                return -ENOMEM;
 
        ref->root_id = root_id;
-       if (key)
+       if (key) {
                ref->key_for_search = *key;
-       else
+               /*
+                * We can often find data backrefs with an offset that is too
+                * large (>= LLONG_MAX, maximum allowed file offset) due to
+                * underflows when subtracting a file's offset by the data
+                * offset of its corresponding extent data item. This can
+                * happen for example in the clone ioctl.
+                * So if we detect such a case we set the search key's offset
+                * to zero to make sure we will find the matching file extent
+                * item at add_all_parents(), otherwise we will miss it because
+                * the offset taken from the backref is much larger than the
+                * offset of the file extent item. This can make us scan a very
+                * large number of file extent items, but at least it will not
+                * make us miss any.
+                * This is an ugly workaround for a behaviour that should have
+                * never existed, but it does, and a fix for the clone ioctl
+                * would touch a lot of places, cause backwards incompatibility
+                * and would not fix the problem for extents cloned with older
+                * kernels.
+                */
+               if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY &&
+                   ref->key_for_search.offset >= LLONG_MAX)
+                       ref->key_for_search.offset = 0;
+       } else {
                memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
+       }
 
        ref->inode_list = NULL;
        ref->level = level;
@@ -632,7 +655,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
                        struct btrfs_delayed_tree_ref *ref;
 
                        ref = btrfs_delayed_node_to_tree_ref(node);
-                       ret = __add_prelim_ref(prefs, ref->root, NULL,
+                       ret = __add_prelim_ref(prefs, 0, NULL,
                                               ref->level + 1, ref->parent,
                                               node->bytenr,
                                               node->ref_mod * sgn, GFP_ATOMIC);
@@ -664,11 +687,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
                        struct btrfs_delayed_data_ref *ref;
 
                        ref = btrfs_delayed_node_to_data_ref(node);
-
-                       key.objectid = ref->objectid;
-                       key.type = BTRFS_EXTENT_DATA_KEY;
-                       key.offset = ref->offset;
-                       ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+                       ret = __add_prelim_ref(prefs, 0, NULL, 0,
                                               ref->parent, node->bytenr,
                                               node->ref_mod * sgn, GFP_ATOMIC);
                        break;
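
The comment added above describes an unsigned underflow: the search offset is
computed by subtracting an extent data item's data offset from a file offset,
and for cloned extents the subtraction can wrap around. A minimal userspace
sketch of the wraparound and of the clamp __add_prelim_ref() now applies (the
offsets are hypothetical, not taken from the patch):

    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t file_offset = 0;       /* hypothetical clone target offset */
            uint64_t data_offset = 4096;    /* hypothetical extent data offset */

            /* u64 subtraction wraps: 0 - 4096 becomes a value >= LLONG_MAX */
            uint64_t key_offset = file_offset - data_offset;

            if (key_offset >= LLONG_MAX)    /* the clamp the patch adds */
                    key_offset = 0;

            printf("search key offset: %llu\n", (unsigned long long)key_offset);
            return 0;
    }
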
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 54114b4..5f745ea 100644
@@ -1159,8 +1159,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
-               if (ret)
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
                        return ret;
+               }
        }
 
        if (buf == root->node) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index aac314e..938efe3 100644
@@ -1300,7 +1300,7 @@ struct btrfs_block_group_cache {
        /* for raid56, this is a full stripe, without parity */
        unsigned long full_stripe_len;
 
-       unsigned int ro:1;
+       unsigned int ro;
        unsigned int iref:1;
        unsigned int has_caching_ctl:1;
        unsigned int removed:1;
@@ -1518,12 +1518,6 @@ struct btrfs_fs_info {
         */
        struct mutex ordered_operations_mutex;
 
-       /*
-        * Same as ordered_operations_mutex except this is for ordered extents
-        * and not the operations.
-        */
-       struct mutex ordered_extent_flush_mutex;
-
        struct rw_semaphore commit_root_sem;
 
        struct rw_semaphore cleanup_work_sem;
@@ -3437,6 +3431,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 group_start,
                             struct extent_map *em);
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
+void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root);
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
@@ -3495,9 +3491,9 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
 void btrfs_block_rsv_release(struct btrfs_root *root,
                             struct btrfs_block_rsv *block_rsv,
                             u64 num_bytes);
-int btrfs_set_block_group_ro(struct btrfs_root *root,
+int btrfs_inc_block_group_ro(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache);
-void btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_dec_block_group_ro(struct btrfs_root *root,
                              struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
@@ -4073,6 +4069,7 @@ __cold
 void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
                     unsigned int line, int errno, const char *fmt, ...);
 
+const char *btrfs_decode_error(int errno);
 
 __cold
 void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
@@ -4185,8 +4182,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct extent_buffer *buf,
                          struct extent_buffer *cow);
-void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
-                             struct btrfs_pending_snapshot *pending,
+void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
                              u64 *bytes_to_reserve);
 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
                              struct btrfs_pending_snapshot *pending);
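
The ro field in struct btrfs_block_group_cache changes from a one-bit flag to
a counter so that scrub, balance and device replace can each hold a block
group read-only at the same time, and the accessors are renamed from set/rw
to inc/dec to match. A minimal userspace model of the new pairing (the real
code does this under cache->lock and also adjusts space_info accounting):

    #include <assert.h>
    #include <stdio.h>

    struct block_group { unsigned int ro; };

    static void inc_ro(struct block_group *bg)
    {
            bg->ro++;               /* stacks; no BUG_ON(cache->ro) anymore */
    }

    static void dec_ro(struct block_group *bg)
    {
            assert(bg->ro > 0);
            if (!--bg->ro)          /* only the last holder flips it back */
                    printf("block group is writable again\n");
    }

    int main(void)
    {
            struct block_group bg = { 0 };

            inc_ro(&bg);    /* e.g. relocation */
            inc_ro(&bg);    /* e.g. scrub, running concurrently */
            dec_ro(&bg);
            dec_ro(&bg);    /* prints once, when the count reaches zero */
            return 0;
    }
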
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5e307bd..9ebd34f 100644
@@ -1730,6 +1730,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
        bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
        bdi->congested_fn       = btrfs_congested_fn;
        bdi->congested_data     = info;
+       bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
        return 0;
 }
 
@@ -2613,7 +2614,6 @@ int open_ctree(struct super_block *sb,
 
 
        mutex_init(&fs_info->ordered_operations_mutex);
-       mutex_init(&fs_info->ordered_extent_flush_mutex);
        mutex_init(&fs_info->tree_log_mutex);
        mutex_init(&fs_info->chunk_mutex);
        mutex_init(&fs_info->transaction_kthread_mutex);
@@ -2955,8 +2955,9 @@ retry_root_backup:
        if (fs_info->fs_devices->missing_devices >
             fs_info->num_tolerated_disk_barrier_failures &&
            !(sb->s_flags & MS_RDONLY)) {
-               printk(KERN_WARNING "BTRFS: "
-                       "too many missing devices, writeable mount is not allowed\n");
+               pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n",
+                       fs_info->fs_devices->missing_devices,
+                       fs_info->num_tolerated_disk_barrier_failures);
                goto fail_sysfs;
        }
 
@@ -3763,6 +3764,15 @@ void close_ctree(struct btrfs_root *root)
        cancel_work_sync(&fs_info->async_reclaim_work);
 
        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+               /*
+                * If the cleaner thread is stopped and there are
+                * block groups queued for removal, the deletion will be
+                * skipped when we quit the cleaner thread.
+                */
+               mutex_lock(&root->fs_info->cleaner_mutex);
+               btrfs_delete_unused_bgs(root->fs_info);
+               mutex_unlock(&root->fs_info->cleaner_mutex);
+
                ret = btrfs_commit_super(root);
                if (ret)
                        btrfs_err(fs_info, "commit super ret %d", ret);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 07204bf..5411f0a 100644
@@ -1316,8 +1316,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-static noinline u32 extent_data_ref_count(struct btrfs_root *root,
-                                         struct btrfs_path *path,
+static noinline u32 extent_data_ref_count(struct btrfs_path *path,
                                          struct btrfs_extent_inline_ref *iref)
 {
        struct btrfs_key key;
@@ -1883,10 +1882,77 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-static int btrfs_issue_discard(struct block_device *bdev,
-                               u64 start, u64 len)
+#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
+static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
+                              u64 *discarded_bytes)
 {
-       return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+       int j, ret = 0;
+       u64 bytes_left, end;
+       u64 aligned_start = ALIGN(start, 1 << 9);
+
+       if (WARN_ON(start != aligned_start)) {
+               len -= aligned_start - start;
+               len = round_down(len, 1 << 9);
+               start = aligned_start;
+       }
+
+       *discarded_bytes = 0;
+
+       if (!len)
+               return 0;
+
+       end = start + len;
+       bytes_left = len;
+
+       /* Skip any superblocks on this device. */
+       for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
+               u64 sb_start = btrfs_sb_offset(j);
+               u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
+               u64 size = sb_start - start;
+
+               if (!in_range(sb_start, start, bytes_left) &&
+                   !in_range(sb_end, start, bytes_left) &&
+                   !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
+                       continue;
+
+               /*
+                * Superblock spans beginning of range.  Adjust start and
+                * try again.
+                */
+               if (sb_start <= start) {
+                       start += sb_end - start;
+                       if (start > end) {
+                               bytes_left = 0;
+                               break;
+                       }
+                       bytes_left = end - start;
+                       continue;
+               }
+
+               if (size) {
+                       ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
+                                                  GFP_NOFS, 0);
+                       if (!ret)
+                               *discarded_bytes += size;
+                       else if (ret != -EOPNOTSUPP)
+                               return ret;
+               }
+
+               start = sb_end;
+               if (start > end) {
+                       bytes_left = 0;
+                       break;
+               }
+               bytes_left = end - start;
+       }
+
+       if (bytes_left) {
+               ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
+                                          GFP_NOFS, 0);
+               if (!ret)
+                       *discarded_bytes += bytes_left;
+       }
+       return ret;
 }
 
 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1907,14 +1973,16 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 
 
                for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+                       u64 bytes;
                        if (!stripe->dev->can_discard)
                                continue;
 
                        ret = btrfs_issue_discard(stripe->dev->bdev,
                                                  stripe->physical,
-                                                 stripe->length);
+                                                 stripe->length,
+                                                 &bytes);
                        if (!ret)
-                               discarded_bytes += stripe->length;
+                               discarded_bytes += bytes;
                        else if (ret != -EOPNOTSUPP)
                                break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
 
@@ -6062,20 +6130,19 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_block_group_cache *block_group, *tmp;
+       struct list_head *deleted_bgs;
        struct extent_io_tree *unpin;
        u64 start;
        u64 end;
        int ret;
 
-       if (trans->aborted)
-               return 0;
-
        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
                unpin = &fs_info->freed_extents[1];
        else
                unpin = &fs_info->freed_extents[0];
 
-       while (1) {
+       while (!trans->aborted) {
                mutex_lock(&fs_info->unused_bg_unpin_mutex);
                ret = find_first_extent_bit(unpin, 0, &start, &end,
                                            EXTENT_DIRTY, NULL);
@@ -6094,6 +6161,34 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                cond_resched();
        }
 
+       /*
+        * Transaction is finished.  We don't need the lock anymore.  We
+        * do need to clean up the block groups in case of a transaction
+        * abort.
+        */
+       deleted_bgs = &trans->transaction->deleted_bgs;
+       list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
+               u64 trimmed = 0;
+
+               ret = -EROFS;
+               if (!trans->aborted)
+                       ret = btrfs_discard_extent(root,
+                                                  block_group->key.objectid,
+                                                  block_group->key.offset,
+                                                  &trimmed);
+
+               list_del_init(&block_group->bg_list);
+               btrfs_put_block_group_trimming(block_group);
+               btrfs_put_block_group(block_group);
+
+               if (ret) {
+                       const char *errstr = btrfs_decode_error(ret);
+                       btrfs_warn(fs_info,
+                                  "Discard failed while removing blockgroup: errno=%d %s\n",
+                                  ret, errstr);
+               }
+       }
+
        return 0;
 }
 
@@ -6349,7 +6444,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
        } else {
                if (found_extent) {
                        BUG_ON(is_data && refs_to_drop !=
-                              extent_data_ref_count(root, path, iref));
+                              extent_data_ref_count(path, iref));
                        if (iref) {
                                BUG_ON(path->slots[0] != extent_slot);
                        } else {
@@ -7567,9 +7662,6 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
 
 /*
  * finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
  * returns the tree buffer or an ERR_PTR on error.
  */
 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
@@ -8723,14 +8815,13 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
        return flags;
 }
 
-static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 {
        struct btrfs_space_info *sinfo = cache->space_info;
        u64 num_bytes;
        u64 min_allocable_bytes;
        int ret = -ENOSPC;
 
-
        /*
         * We need some metadata space and system metadata space for
         * allocating chunks in some corner cases until we force to set
@@ -8747,6 +8838,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
        spin_lock(&cache->lock);
 
        if (cache->ro) {
+               cache->ro++;
                ret = 0;
                goto out;
        }
@@ -8758,7 +8850,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
            sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
            min_allocable_bytes <= sinfo->total_bytes) {
                sinfo->bytes_readonly += num_bytes;
-               cache->ro = 1;
+               cache->ro++;
                list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
                ret = 0;
        }
@@ -8768,7 +8860,7 @@ out:
        return ret;
 }
 
-int btrfs_set_block_group_ro(struct btrfs_root *root,
+int btrfs_inc_block_group_ro(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache)
 
 {
@@ -8776,8 +8868,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
        u64 alloc_flags;
        int ret;
 
-       BUG_ON(cache->ro);
-
 again:
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
@@ -8820,7 +8910,7 @@ again:
                        goto out;
        }
 
-       ret = set_block_group_ro(cache, 0);
+       ret = inc_block_group_ro(cache, 0);
        if (!ret)
                goto out;
        alloc_flags = get_alloc_profile(root, cache->space_info->flags);
@@ -8828,7 +8918,7 @@ again:
                             CHUNK_ALLOC_FORCE);
        if (ret < 0)
                goto out;
-       ret = set_block_group_ro(cache, 0);
+       ret = inc_block_group_ro(cache, 0);
 out:
        if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
                alloc_flags = update_block_group_flags(root, cache->flags);
@@ -8891,7 +8981,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
        return free_bytes;
 }
 
-void btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_dec_block_group_ro(struct btrfs_root *root,
                              struct btrfs_block_group_cache *cache)
 {
        struct btrfs_space_info *sinfo = cache->space_info;
@@ -8901,11 +8991,13 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
 
        spin_lock(&sinfo->lock);
        spin_lock(&cache->lock);
-       num_bytes = cache->key.offset - cache->reserved - cache->pinned -
-                   cache->bytes_super - btrfs_block_group_used(&cache->item);
-       sinfo->bytes_readonly -= num_bytes;
-       cache->ro = 0;
-       list_del_init(&cache->ro_list);
+       if (!--cache->ro) {
+               num_bytes = cache->key.offset - cache->reserved -
+                           cache->pinned - cache->bytes_super -
+                           btrfs_block_group_used(&cache->item);
+               sinfo->bytes_readonly -= num_bytes;
+               list_del_init(&cache->ro_list);
+       }
        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);
 }
@@ -9421,7 +9513,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
                set_avail_alloc_bits(root->fs_info, cache->flags);
                if (btrfs_chunk_readonly(root, cache->key.objectid)) {
-                       set_block_group_ro(cache, 1);
+                       inc_block_group_ro(cache, 1);
                } else if (btrfs_block_group_used(&cache->item) == 0) {
                        spin_lock(&info->unused_bgs_lock);
                        /* Should always be true but just in case. */
@@ -9449,11 +9541,11 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                list_for_each_entry(cache,
                                &space_info->block_groups[BTRFS_RAID_RAID0],
                                list)
-                       set_block_group_ro(cache, 1);
+                       inc_block_group_ro(cache, 1);
                list_for_each_entry(cache,
                                &space_info->block_groups[BTRFS_RAID_SINGLE],
                                list)
-                       set_block_group_ro(cache, 1);
+                       inc_block_group_ro(cache, 1);
        }
 
        init_global_block_rsv(info);
@@ -9834,6 +9926,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         * currently running transaction might finish and a new one start,
         * allowing for new block groups to be created that can reuse the same
         * physical device locations unless we take this special care.
+        *
+        * There may also be an implicit trim operation if the file system
+        * is mounted with -odiscard. The same protections must remain
+        * in place until the extents have been discarded completely when
+        * the transaction commit has completed.
         */
        remove_em = (atomic_read(&block_group->trimming) == 0);
        /*
@@ -9908,6 +10005,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
        spin_lock(&fs_info->unused_bgs_lock);
        while (!list_empty(&fs_info->unused_bgs)) {
                u64 start, end;
+               int trimming;
 
                block_group = list_first_entry(&fs_info->unused_bgs,
                                               struct btrfs_block_group_cache,
@@ -9941,7 +10039,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                spin_unlock(&block_group->lock);
 
                /* We don't want to force the issue, only flip if it's ok. */
-               ret = set_block_group_ro(block_group, 0);
+               ret = inc_block_group_ro(block_group, 0);
                up_write(&space_info->groups_sem);
                if (ret < 0) {
                        ret = 0;
@@ -9955,7 +10053,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                /* 1 for btrfs_orphan_reserve_metadata() */
                trans = btrfs_start_transaction(root, 1);
                if (IS_ERR(trans)) {
-                       btrfs_set_block_group_rw(root, block_group);
+                       btrfs_dec_block_group_ro(root, block_group);
                        ret = PTR_ERR(trans);
                        goto next;
                }
@@ -9982,14 +10080,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                                  EXTENT_DIRTY, GFP_NOFS);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
-                       btrfs_set_block_group_rw(root, block_group);
+                       btrfs_dec_block_group_ro(root, block_group);
                        goto end_trans;
                }
                ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
                                  EXTENT_DIRTY, GFP_NOFS);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
-                       btrfs_set_block_group_rw(root, block_group);
+                       btrfs_dec_block_group_ro(root, block_group);
                        goto end_trans;
                }
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
@@ -10007,12 +10105,39 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                spin_unlock(&block_group->lock);
                spin_unlock(&space_info->lock);
 
+               /* DISCARD can flip during remount */
+               trimming = btrfs_test_opt(root, DISCARD);
+
+               /* Implicit trim during transaction commit. */
+               if (trimming)
+                       btrfs_get_block_group_trimming(block_group);
+
                /*
                 * Btrfs_remove_chunk will abort the transaction if things go
                 * horribly wrong.
                 */
                ret = btrfs_remove_chunk(trans, root,
                                         block_group->key.objectid);
+
+               if (ret) {
+                       if (trimming)
+                               btrfs_put_block_group_trimming(block_group);
+                       goto end_trans;
+               }
+
+               /*
+                * If we're not mounted with -odiscard, we can just forget
+                * about this block group. Otherwise we'll need to wait
+                * until transaction commit to do the actual discard.
+                */
+               if (trimming) {
+                       WARN_ON(!list_empty(&block_group->bg_list));
+                       spin_lock(&trans->transaction->deleted_bgs_lock);
+                       list_move(&block_group->bg_list,
+                                 &trans->transaction->deleted_bgs);
+                       spin_unlock(&trans->transaction->deleted_bgs_lock);
+                       btrfs_get_block_group(block_group);
+               }
 end_trans:
                btrfs_end_transaction(trans, root);
 next:
@@ -10066,10 +10191,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
        return unpin_extent_range(root, start, end, false);
 }
 
+/*
+ * It used to be that old block groups would be left around forever.
+ * Iterating over them would be enough to trim unused space.  Since we
+ * now automatically remove them, we also need to iterate over unallocated
+ * space.
+ *
+ * We don't want a transaction for this since the discard may take a
+ * substantial amount of time.  We don't require that a transaction be
+ * running, but we do need to take a running transaction into account
+ * to ensure that we're not discarding chunks that were released in
+ * the current transaction.
+ *
+ * Holding the chunks lock will prevent other threads from allocating
+ * or releasing chunks, but it won't prevent a running transaction
+ * from committing and releasing the memory that the pending chunks
+ * list head uses.  For that, we need to take a reference to the
+ * transaction.
+ */
+static int btrfs_trim_free_extents(struct btrfs_device *device,
+                                  u64 minlen, u64 *trimmed)
+{
+       u64 start = 0, len = 0;
+       int ret;
+
+       *trimmed = 0;
+
+       /* Not writeable = nothing to do. */
+       if (!device->writeable)
+               return 0;
+
+       /* No free space = nothing to do. */
+       if (device->total_bytes <= device->bytes_used)
+               return 0;
+
+       ret = 0;
+
+       while (1) {
+               struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
+               struct btrfs_transaction *trans;
+               u64 bytes;
+
+               ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
+               if (ret)
+                       return ret;
+
+               down_read(&fs_info->commit_root_sem);
+
+               spin_lock(&fs_info->trans_lock);
+               trans = fs_info->running_transaction;
+               if (trans)
+                       atomic_inc(&trans->use_count);
+               spin_unlock(&fs_info->trans_lock);
+
+               ret = find_free_dev_extent_start(trans, device, minlen, start,
+                                                &start, &len);
+               if (trans)
+                       btrfs_put_transaction(trans);
+
+               if (ret) {
+                       up_read(&fs_info->commit_root_sem);
+                       mutex_unlock(&fs_info->chunk_mutex);
+                       if (ret == -ENOSPC)
+                               ret = 0;
+                       break;
+               }
+
+               ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
+               up_read(&fs_info->commit_root_sem);
+               mutex_unlock(&fs_info->chunk_mutex);
+
+               if (ret)
+                       break;
+
+               start += len;
+               *trimmed += bytes;
+
+               if (fatal_signal_pending(current)) {
+                       ret = -ERESTARTSYS;
+                       break;
+               }
+
+               cond_resched();
+       }
+
+       return ret;
+}
+
 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_group_cache *cache = NULL;
+       struct btrfs_device *device;
+       struct list_head *devices;
        u64 group_trimmed;
        u64 start;
        u64 end;
@@ -10124,6 +10338,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
                cache = next_block_group(fs_info->tree_root, cache);
        }
 
+       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+       devices = &root->fs_info->fs_devices->alloc_list;
+       list_for_each_entry(device, devices, dev_alloc_list) {
+               ret = btrfs_trim_free_extents(device, range->minlen,
+                                             &group_trimmed);
+               if (ret)
+                       break;
+
+               trimmed += group_trimmed;
+       }
+       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
        range->len = trimmed;
        return ret;
 }
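
Taken together, the extent-tree.c changes do two things: btrfs_issue_discard()
now splits a discard range around the reserved superblock mirrors instead of
trimming through them, and btrfs_trim_fs() additionally walks each device's
unallocated space. A userspace sketch of the range-splitting idea, with one
superblock copy and illustrative offsets rather than the kernel's
btrfs_sb_offset() values:

    #include <stdint.h>
    #include <stdio.h>

    #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))

    static void discard(uint64_t start, uint64_t end)
    {
            if (start < end)
                    printf("discard [%llu, %llu)\n",
                           (unsigned long long)start, (unsigned long long)end);
    }

    int main(void)
    {
            uint64_t start = 0, end = 128 << 20;    /* range to trim */
            uint64_t sb_start = 64 << 20;           /* assumed mirror offset */
            uint64_t sb_len = 4096;                 /* assumed superblock size */

            if (in_range(sb_start, start, end - start)) {
                    discard(start, sb_start);       /* up to the superblock */
                    start = sb_start + sb_len;      /* then skip over it */
            }
            discard(start, end);                    /* rest of the range */
            return 0;
    }
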
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 68b12bb..f1018cf 100644
@@ -2723,6 +2723,12 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
                btrfs_bio->csum = NULL;
                btrfs_bio->csum_allocated = NULL;
                btrfs_bio->end_io = NULL;
+
+#ifdef CONFIG_BLK_CGROUP
+               /* FIXME, put this into bio_clone_bioset */
+               if (bio->bi_css)
+                       bio_associate_blkcg(new, bio->bi_css);
+#endif
        }
        return new;
 }
@@ -2783,6 +2789,7 @@ static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
 }
 
 static int submit_extent_page(int rw, struct extent_io_tree *tree,
+                             struct writeback_control *wbc,
                              struct page *page, sector_t sector,
                              size_t size, unsigned long offset,
                              struct block_device *bdev,
@@ -2817,6 +2824,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
                        }
                        bio = NULL;
                } else {
+                       if (wbc)
+                               wbc_account_io(wbc, page, page_size);
                        return 0;
                }
        }
@@ -2829,6 +2838,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
        bio_add_page(bio, page, page_size, offset);
        bio->bi_end_io = end_io_func;
        bio->bi_private = tree;
+       if (wbc) {
+               wbc_init_bio(wbc, bio);
+               wbc_account_io(wbc, page, page_size);
+       }
 
        if (bio_ret)
                *bio_ret = bio;
@@ -3039,7 +3052,7 @@ static int __do_readpage(struct extent_io_tree *tree,
                }
 
                pnr -= page->index;
-               ret = submit_extent_page(rw, tree, page,
+               ret = submit_extent_page(rw, tree, NULL, page,
                                         sector, disk_io_size, pg_offset,
                                         bdev, bio, pnr,
                                         end_bio_extent_readpage, mirror_num,
@@ -3434,7 +3447,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
                                       page->index, cur, end);
                        }
 
-                       ret = submit_extent_page(write_flags, tree, page,
+                       ret = submit_extent_page(write_flags, tree, wbc, page,
                                                 sector, iosize, pg_offset,
                                                 bdev, &epd->bio, max_nr,
                                                 end_bio_extent_writepage,
@@ -3738,7 +3751,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 
                clear_page_dirty_for_io(p);
                set_page_writeback(p);
-               ret = submit_extent_page(rw, tree, p, offset >> 9,
+               ret = submit_extent_page(rw, tree, wbc, p, offset >> 9,
                                         PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
                                         -1, end_bio_extent_buffer_writepage,
                                         0, epd->bio_flags, bio_flags);
@@ -4603,9 +4616,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
 {
        struct extent_buffer *eb = NULL;
 
-       eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
-       if (eb == NULL)
-               return NULL;
+       eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
        eb->start = start;
        eb->len = len;
        eb->fs_info = fs_info;
@@ -4863,7 +4874,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
                return NULL;
 
        for (i = 0; i < num_pages; i++, index++) {
-               p = find_or_create_page(mapping, index, GFP_NOFS);
+               p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
                if (!p)
                        goto free_eb;
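
submit_extent_page() now takes the writeback_control so data bios can be
charged to the cgroup that issued the writeback. A hedged kernel-style
fragment of the two calls being wired up (it assumes the 4.3-era names
wbc_init_bio()/wbc_account_io() and is not a drop-in btrfs function):

    #include <linux/bio.h>
    #include <linux/writeback.h>

    /* Tag a freshly built bio with the wbc's blkcg and account the page
     * against it, as submit_extent_page() does when wbc is non-NULL. */
    static void charge_bio_to_cgroup(struct writeback_control *wbc,
                                     struct bio *bio, struct page *page,
                                     size_t len)
    {
            if (!wbc)
                    return;                 /* reads pass a NULL wbc */
            wbc_init_bio(wbc, bio);         /* associate bio with the blkcg */
            wbc_account_io(wbc, page, len); /* charge len bytes of writeback */
    }
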
 
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index fb5a6b1..abe3a66 100644
@@ -3272,35 +3272,23 @@ next:
        return ret;
 }
 
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
-                          u64 *trimmed, u64 start, u64 end, u64 minlen)
+void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache)
 {
-       int ret;
+       atomic_inc(&cache->trimming);
+}
 
-       *trimmed = 0;
+void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
+{
+       struct extent_map_tree *em_tree;
+       struct extent_map *em;
+       bool cleanup;
 
        spin_lock(&block_group->lock);
-       if (block_group->removed) {
-               spin_unlock(&block_group->lock);
-               return 0;
-       }
-       atomic_inc(&block_group->trimming);
+       cleanup = (atomic_dec_and_test(&block_group->trimming) &&
+                  block_group->removed);
        spin_unlock(&block_group->lock);
 
-       ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
-       if (ret)
-               goto out;
-
-       ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
-out:
-       spin_lock(&block_group->lock);
-       if (atomic_dec_and_test(&block_group->trimming) &&
-           block_group->removed) {
-               struct extent_map_tree *em_tree;
-               struct extent_map *em;
-
-               spin_unlock(&block_group->lock);
-
+       if (cleanup) {
                lock_chunks(block_group->fs_info->chunk_root);
                em_tree = &block_group->fs_info->mapping_tree.map_tree;
                write_lock(&em_tree->lock);
@@ -3324,10 +3312,31 @@ out:
                 * this block group have left 1 entry each one. Free them.
                 */
                __btrfs_remove_free_space_cache(block_group->free_space_ctl);
-       } else {
+       }
+}
+
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+                          u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+       int ret;
+
+       *trimmed = 0;
+
+       spin_lock(&block_group->lock);
+       if (block_group->removed) {
                spin_unlock(&block_group->lock);
+               return 0;
        }
+       btrfs_get_block_group_trimming(block_group);
+       spin_unlock(&block_group->lock);
+
+       ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+       if (ret)
+               goto out;
 
+       ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+out:
+       btrfs_put_block_group_trimming(block_group);
        return ret;
 }
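
Note the shape of btrfs_put_block_group_trimming() above: the decision is
made under block_group->lock, but the chunk-mutex and em_tree work happens
only after the spinlock is dropped, since those sleeping locks must not nest
inside it. A sketch of that decide-then-act pattern, with free_chunk_mapping()
as a hypothetical stand-in for the em_tree removal shown above:

    bool cleanup;

    spin_lock(&block_group->lock);
    cleanup = (atomic_dec_and_test(&block_group->trimming) &&
               block_group->removed);
    spin_unlock(&block_group->lock);

    if (cleanup)
            free_chunk_mapping(block_group);        /* hypothetical helper */
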
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f924d9a..237da01 100644
@@ -3658,6 +3658,35 @@ cache_index:
                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                        &BTRFS_I(inode)->runtime_flags);
 
+       /*
+        * We don't persist the id of the transaction where an unlink operation
+        * against the inode was last made. So here we assume the inode might
+        * have been evicted, and therefore the exact value of last_unlink_trans
+        * lost, and set it to last_trans to avoid metadata inconsistencies
+        * between the inode and its parent if the inode is fsync'ed and the log
+        * replayed. For example, in the scenario:
+        *
+        * touch mydir/foo
+        * ln mydir/foo mydir/bar
+        * sync
+        * unlink mydir/bar
+        * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
+        * xfs_io -c fsync mydir/foo
+        * <power failure>
+        * mount fs, triggers fsync log replay
+        *
+        * We must make sure that when we fsync our inode foo we also log its
+        * parent inode, otherwise after log replay the parent still has the
+        * dentry with the "bar" name but our inode foo has a link count of 1
+        * and doesn't have an inode ref with the name "bar" anymore.
+        *
+        * Setting last_unlink_trans to last_trans is a pessimistic approach,
+        * but it guarantees correctness at the expense of occasional full
+        * transaction commits on fsync if our inode is a directory, or if our
+        * inode is not a directory, logging its parent unnecessarily.
+        */
+       BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+
        path->slots[0]++;
        if (inode->i_nlink != 1 ||
            path->slots[0] >= btrfs_header_nritems(leaf))
@@ -7958,7 +7987,11 @@ out:
 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
                                       u64 first_sector, gfp_t gfp_flags)
 {
-       return btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
+       struct bio *bio;
+       bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
+       if (bio)
+               bio_associate_current(bio);
+       return bio;
 }
 
 static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0770c91..0adf542 100644
@@ -1030,6 +1030,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
        struct extent_map *em;
        int ret = 1;
        bool next_mergeable = true;
+       bool prev_mergeable = true;
 
        /*
         * make sure that once we start defragging an extent, we keep on
@@ -1050,13 +1051,16 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
                goto out;
        }
 
+       if (!*defrag_end)
+               prev_mergeable = false;
+
        next_mergeable = defrag_check_next_extent(inode, em);
        /*
         * we hit a real extent, if it is big or the next extent is not a
         * real extent, don't bother defragging it
         */
        if (!compress && (*last_len == 0 || *last_len >= thresh) &&
-           (em->len >= thresh || !next_mergeable))
+           (em->len >= thresh || (!next_mergeable && !prev_mergeable)))
                ret = 0;
 out:
        /*
@@ -1933,6 +1937,7 @@ static noinline int copy_to_sk(struct btrfs_root *root,
        u64 found_transid;
        struct extent_buffer *leaf;
        struct btrfs_ioctl_search_header sh;
+       struct btrfs_key test;
        unsigned long item_off;
        unsigned long item_len;
        int nritems;
@@ -2016,12 +2021,17 @@ static noinline int copy_to_sk(struct btrfs_root *root,
        }
 advance_key:
        ret = 0;
-       if (key->offset < (u64)-1 && key->offset < sk->max_offset)
+       test.objectid = sk->max_objectid;
+       test.type = sk->max_type;
+       test.offset = sk->max_offset;
+       if (btrfs_comp_cpu_keys(key, &test) >= 0)
+               ret = 1;
+       else if (key->offset < (u64)-1)
                key->offset++;
-       else if (key->type < (u8)-1 && key->type < sk->max_type) {
+       else if (key->type < (u8)-1) {
                key->offset = 0;
                key->type++;
-       } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
+       } else if (key->objectid < (u64)-1) {
                key->offset = 0;
                key->type = 0;
                key->objectid++;
@@ -2842,8 +2852,7 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
                swap(inode1, inode2);
 
        mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
-       if (inode1 != inode2)
-               mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+       mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
 }
 
 static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
@@ -2861,8 +2870,7 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
                swap(loff1, loff2);
        }
        lock_extent_range(inode1, loff1, len);
-       if (inode1 != inode2)
-               lock_extent_range(inode2, loff2, len);
+       lock_extent_range(inode2, loff2, len);
 }
 
 struct cmp_pages {
@@ -3787,13 +3795,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                goto out_fput;
 
        if (!same_inode) {
-               if (inode < src) {
-                       mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-                       mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
-               } else {
-                       mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
-                       mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
-               }
+               btrfs_double_inode_lock(src, inode);
        } else {
                mutex_lock(&src->i_mutex);
        }
@@ -3843,8 +3845,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
                lock_extent_range(src, lock_start, lock_len);
        } else {
-               lock_extent_range(src, off, len);
-               lock_extent_range(inode, destoff, len);
+               btrfs_double_extent_lock(src, off, inode, destoff, len);
        }
 
        ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
@@ -3855,9 +3856,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
                unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
        } else {
-               unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
-               unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
-                             destoff + len - 1);
+               btrfs_double_extent_unlock(src, off, inode, destoff, len);
        }
        /*
         * Truncate page cache pages so that future reads will see the cloned
@@ -3866,17 +3865,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
        truncate_inode_pages_range(&inode->i_data, destoff,
                                   PAGE_CACHE_ALIGN(destoff + len) - 1);
 out_unlock:
-       if (!same_inode) {
-               if (inode < src) {
-                       mutex_unlock(&src->i_mutex);
-                       mutex_unlock(&inode->i_mutex);
-               } else {
-                       mutex_unlock(&inode->i_mutex);
-                       mutex_unlock(&src->i_mutex);
-               }
-       } else {
+       if (!same_inode)
+               btrfs_double_inode_unlock(src, inode);
+       else
                mutex_unlock(&src->i_mutex);
-       }
 out_fput:
        fdput(src_file);
 out_drop_write:
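
The clone ioctl now reuses the same locking helpers as extent-same, which
encode a single rule: order the two inodes (and the two extent ranges) by
address before locking, so no pair of callers can ever take the locks in
opposite orders. A minimal userspace model of that rule, mirroring
btrfs_double_inode_lock(), which locks the higher-addressed inode first:

    #include <pthread.h>
    #include <stdio.h>

    struct inode { pthread_mutex_t i_mutex; };

    static void double_inode_lock(struct inode *inode1, struct inode *inode2)
    {
            if (inode1 < inode2) {          /* same ordering rule as the patch */
                    struct inode *tmp = inode1;
                    inode1 = inode2;
                    inode2 = tmp;
            }
            pthread_mutex_lock(&inode1->i_mutex);
            pthread_mutex_lock(&inode2->i_mutex);
    }

    int main(void)
    {
            struct inode a = { PTHREAD_MUTEX_INITIALIZER };
            struct inode b = { PTHREAD_MUTEX_INITIALIZER };

            /* Either argument order acquires the mutexes in the same
             * sequence, so tasks cloning a->b and b->a cannot deadlock. */
            double_inode_lock(&a, &b);
            puts("locked both in a stable order");
            return 0;
    }
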
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index f8229ef..d7e6baf 100644
@@ -241,6 +241,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
  */
 void btrfs_tree_lock(struct extent_buffer *eb)
 {
+       WARN_ON(eb->lock_owner == current->pid);
 again:
        wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
        wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 0a02e24..fcf7265 100644
 #define RBIO_CACHE_SIZE 1024
 
 enum btrfs_rbio_ops {
-       BTRFS_RBIO_WRITE        = 0,
-       BTRFS_RBIO_READ_REBUILD = 1,
-       BTRFS_RBIO_PARITY_SCRUB = 2,
+       BTRFS_RBIO_WRITE,
+       BTRFS_RBIO_READ_REBUILD,
+       BTRFS_RBIO_PARITY_SCRUB,
+       BTRFS_RBIO_REBUILD_MISSING,
 };
 
 struct btrfs_raid_bio {
@@ -602,6 +603,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
            cur->operation == BTRFS_RBIO_PARITY_SCRUB)
                return 0;
 
+       if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
+           cur->operation == BTRFS_RBIO_REBUILD_MISSING)
+               return 0;
+
        return 1;
 }
 
@@ -793,7 +798,10 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 
                        if (next->operation == BTRFS_RBIO_READ_REBUILD)
                                async_read_rebuild(next);
-                       else if (next->operation == BTRFS_RBIO_WRITE) {
+                       else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
+                               steal_rbio(rbio, next);
+                               async_read_rebuild(next);
+                       } else if (next->operation == BTRFS_RBIO_WRITE) {
                                steal_rbio(rbio, next);
                                async_rmw_stripe(next);
                        } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
@@ -1805,7 +1813,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
        faila = rbio->faila;
        failb = rbio->failb;
 
-       if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+       if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+           rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
                spin_lock_irq(&rbio->bio_list_lock);
                set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
                spin_unlock_irq(&rbio->bio_list_lock);
@@ -1830,7 +1839,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
                         * if we're rebuilding a read, we have to use
                         * pages from the bio list
                         */
-                       if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
+                       if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+                            rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
                            (stripe == faila || stripe == failb)) {
                                page = page_in_rbio(rbio, stripe, pagenr, 0);
                        } else {
@@ -1939,7 +1949,8 @@ pstripe:
                         * if we're rebuilding a read, we have to use
                         * pages from the bio list
                         */
-                       if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
+                       if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+                            rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
                            (stripe == faila || stripe == failb)) {
                                page = page_in_rbio(rbio, stripe, pagenr, 0);
                        } else {
@@ -1960,6 +1971,8 @@ cleanup_io:
                else
                        clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 
+               rbio_orig_end_io(rbio, err);
+       } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
                rbio_orig_end_io(rbio, err);
        } else if (err == 0) {
                rbio->faila = -1;
@@ -2096,7 +2109,8 @@ out:
        return 0;
 
 cleanup:
-       if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
+       if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+           rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
                rbio_orig_end_io(rbio, -EIO);
        return -EIO;
 }
@@ -2227,8 +2241,9 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
        return rbio;
 }
 
-void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
-                                  struct page *page, u64 logical)
+/* Used for both parity scrub and missing. */
+void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
+                           u64 logical)
 {
        int stripe_offset;
        int index;
@@ -2662,3 +2677,55 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
        if (!lock_stripe_add(rbio))
                async_scrub_parity(rbio);
 }
+
+/* The following code is used for dev replace of a missing RAID 5/6 device. */
+
+struct btrfs_raid_bio *
+raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio,
+                         struct btrfs_bio *bbio, u64 length)
+{
+       struct btrfs_raid_bio *rbio;
+
+       rbio = alloc_rbio(root, bbio, length);
+       if (IS_ERR(rbio))
+               return NULL;
+
+       rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
+       bio_list_add(&rbio->bio_list, bio);
+       /*
+        * This is a special bio which is used to hold the completion handler
+        * and make this rbio look similar to the other scrub rbio types
+        */
+       ASSERT(!bio->bi_iter.bi_size);
+
+       rbio->faila = find_logical_bio_stripe(rbio, bio);
+       if (rbio->faila == -1) {
+               BUG();
+               kfree(rbio);
+               return NULL;
+       }
+
+       return rbio;
+}
+
+static void missing_raid56_work(struct btrfs_work *work)
+{
+       struct btrfs_raid_bio *rbio;
+
+       rbio = container_of(work, struct btrfs_raid_bio, work);
+       __raid56_parity_recover(rbio);
+}
+
+static void async_missing_raid56(struct btrfs_raid_bio *rbio)
+{
+       btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+                       missing_raid56_work, NULL, NULL);
+
+       btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
+}
+
+void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
+{
+       if (!lock_stripe_add(rbio))
+               async_missing_raid56(rbio);
+}
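
A hedged sketch of how a scrub/replace caller is expected to drive the new
BTRFS_RBIO_REBUILD_MISSING operation through the functions declared in
raid56.h just below; error handling is elided, and nr_pages, pages[] and
logical are assumptions of the sketch, not names from the patch:

    struct btrfs_raid_bio *rbio;
    int i;

    rbio = raid56_alloc_missing_rbio(root, bio, bbio, length);
    if (!rbio)
            return -ENOMEM;

    for (i = 0; i < nr_pages; i++)          /* pages to reconstruct into */
            raid56_add_scrub_pages(rbio, pages[i],
                                   logical + i * PAGE_CACHE_SIZE);

    raid56_submit_missing_rbio(rbio);       /* rebuilds from the survivors */
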
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 2b5d797..8b69469 100644
@@ -48,15 +48,21 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
                               struct btrfs_bio *bbio, u64 stripe_len);
 
+void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
+                           u64 logical);
+
 struct btrfs_raid_bio *
 raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
                               struct btrfs_bio *bbio, u64 stripe_len,
                               struct btrfs_device *scrub_dev,
                               unsigned long *dbitmap, int stripe_nsectors);
-void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
-                                  struct page *page, u64 logical);
 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
 
+struct btrfs_raid_bio *
+raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio,
+                         struct btrfs_bio *bbio, u64 length);
+void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
+
 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
 #endif
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 0e7beea..4645cd1 100644
@@ -328,6 +328,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
        struct btrfs_device *prev_dev;
        u32 blocksize;
        u64 length;
+       int real_stripes;
        int nzones = 0;
        int i;
        unsigned long index = logical >> PAGE_CACHE_SHIFT;
@@ -369,7 +370,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
                goto error;
        }
 
-       for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
+       real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+       for (nzones = 0; nzones < real_stripes; ++nzones) {
                struct reada_zone *zone;
 
                dev = bbio->stripes[nzones].dev;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 88cbb59..303babe 100644
@@ -2523,8 +2523,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
  * counted. return -ENOENT if the block is root of reloc tree.
  */
 static noinline_for_stack
-struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
-                                  struct backref_node *node)
+struct btrfs_root *select_one_root(struct backref_node *node)
 {
        struct backref_node *next;
        struct btrfs_root *root;
@@ -2912,7 +2911,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
                return 0;
 
        BUG_ON(node->processed);
-       root = select_one_root(trans, node);
+       root = select_one_root(node);
        if (root == ERR_PTR(-ENOENT)) {
                update_processed_blocks(rc, node);
                goto out;
@@ -3755,8 +3754,7 @@ out:
  * helper to find next unprocessed extent
  */
 static noinline_for_stack
-int find_next_extent(struct btrfs_trans_handle *trans,
-                    struct reloc_control *rc, struct btrfs_path *path,
+int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
                     struct btrfs_key *extent_key)
 {
        struct btrfs_key key;
@@ -3951,7 +3949,7 @@ restart:
                        continue;
                }
 
-               ret = find_next_extent(trans, rc, path, &key);
+               ret = find_next_extent(rc, path, &key);
                if (ret < 0)
                        err = ret;
                if (ret != 0)
@@ -3976,6 +3974,10 @@ restart:
                               sizeof(struct btrfs_extent_item_v0));
                        ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
                                                  &path_change);
+                       if (ret < 0) {
+                               err = ret;
+                               break;
+                       }
                        if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
                                flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
                        else
@@ -4140,7 +4142,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root;
        struct btrfs_key key;
-       u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+       u64 objectid;
        int err = 0;
 
        root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
@@ -4215,14 +4217,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
        rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
        BUG_ON(!rc->block_group);
 
-       if (!rc->block_group->ro) {
-               ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
-               if (ret) {
-                       err = ret;
-                       goto out;
-               }
-               rw = 1;
+       ret = btrfs_inc_block_group_ro(extent_root, rc->block_group);
+       if (ret) {
+               err = ret;
+               goto out;
        }
+       rw = 1;
 
        path = btrfs_alloc_path();
        if (!path) {
@@ -4294,7 +4294,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
        WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
 out:
        if (err && rw)
-               btrfs_set_block_group_rw(extent_root, rc->block_group);
+               btrfs_dec_block_group_ro(extent_root, rc->block_group);
        iput(rc->data_inode);
        btrfs_put_block_group(rc->block_group);
        kfree(rc);
@@ -4594,8 +4594,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
  * called before creating a snapshot. It calculates the metadata reservation
  * required for relocating tree blocks in the snapshot
  */
-void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
-                             struct btrfs_pending_snapshot *pending,
+void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
                              u64 *bytes_to_reserve)
 {
        struct btrfs_root *root;
index 9c146d8..9a11db0 100644 (file)
@@ -125,6 +125,7 @@ struct scrub_block {
                /* It is for the data with checksum */
                unsigned int    data_corrected:1;
        };
+       struct btrfs_work       work;
 };
 
 /* Used for the chunks with parity stripe such as RAID5/6 */
@@ -332,11 +333,14 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
        }
 }
 
-static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+static void scrub_pause_on(struct btrfs_fs_info *fs_info)
 {
        atomic_inc(&fs_info->scrubs_paused);
        wake_up(&fs_info->scrub_pause_wait);
+}
 
+static void scrub_pause_off(struct btrfs_fs_info *fs_info)
+{
        mutex_lock(&fs_info->scrub_lock);
        __scrub_blocked_if_needed(fs_info);
        atomic_dec(&fs_info->scrubs_paused);
@@ -345,6 +349,12 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
        wake_up(&fs_info->scrub_pause_wait);
 }
 
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+       scrub_pause_on(fs_info);
+       scrub_pause_off(fs_info);
+}
+
 /*
  * used for workers that require transaction commits (i.e., for the
  * NOCOW case)
@@ -2074,21 +2084,7 @@ static void scrub_submit(struct scrub_ctx *sctx)
        sbio = sctx->bios[sctx->curr];
        sctx->curr = -1;
        scrub_pending_bio_inc(sctx);
-
-       if (!sbio->bio->bi_bdev) {
-               /*
-                * this case should not happen. If btrfs_map_block() is
-                * wrong, it could happen for dev-replace operations on
-                * missing devices when no mirrors are available, but in
-                * this case it should already fail the mount.
-                * This case is handled correctly (but _very_ slowly).
-                */
-               printk_ratelimited(KERN_WARNING
-                       "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
-               bio_io_error(sbio->bio);
-       } else {
-               btrfsic_submit_bio(READ, sbio->bio);
-       }
+       btrfsic_submit_bio(READ, sbio->bio);
 }
 
 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
@@ -2165,6 +2161,134 @@ again:
        return 0;
 }
 
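+/*
+ * Completion callback for the rebuild of a block that sits on a missing
+ * device. Checksum verification can't run in bio completion context, so
+ * hand the block off to a scrub worker.
+ */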
+static void scrub_missing_raid56_end_io(struct bio *bio)
+{
+       struct scrub_block *sblock = bio->bi_private;
+       struct btrfs_fs_info *fs_info = sblock->sctx->dev_root->fs_info;
+
+       if (bio->bi_error)
+               sblock->no_io_error_seen = 0;
+
+       btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
+}
+
+static void scrub_missing_raid56_worker(struct btrfs_work *work)
+{
+       struct scrub_block *sblock = container_of(work, struct scrub_block, work);
+       struct scrub_ctx *sctx = sblock->sctx;
+       struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+       unsigned int is_metadata;
+       unsigned int have_csum;
+       u8 *csum;
+       u64 generation;
+       u64 logical;
+       struct btrfs_device *dev;
+
+       is_metadata = !(sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA);
+       have_csum = sblock->pagev[0]->have_csum;
+       csum = sblock->pagev[0]->csum;
+       generation = sblock->pagev[0]->generation;
+       logical = sblock->pagev[0]->logical;
+       dev = sblock->pagev[0]->dev;
+
+       if (sblock->no_io_error_seen) {
+               scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
+                                            have_csum, csum, generation,
+                                            sctx->csum_size);
+       }
+
+       if (!sblock->no_io_error_seen) {
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.read_errors++;
+               spin_unlock(&sctx->stat_lock);
+               printk_ratelimited_in_rcu(KERN_ERR
+                       "BTRFS: I/O error rebuilding logical %llu for dev %s\n",
+                       logical, rcu_str_deref(dev->name));
+       } else if (sblock->header_error || sblock->checksum_error) {
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.uncorrectable_errors++;
+               spin_unlock(&sctx->stat_lock);
+               printk_ratelimited_in_rcu(KERN_ERR
+                       "BTRFS: failed to rebuild valid logical %llu for dev %s\n",
+                       logical, rcu_str_deref(dev->name));
+       } else {
+               scrub_write_block_to_dev_replace(sblock);
+       }
+
+       scrub_block_put(sblock);
+
+       if (sctx->is_dev_replace &&
+           atomic_read(&sctx->wr_ctx.flush_all_writes)) {
+               mutex_lock(&sctx->wr_ctx.wr_lock);
+               scrub_wr_submit(sctx);
+               mutex_unlock(&sctx->wr_ctx.wr_lock);
+       }
+
+       scrub_pending_bio_dec(sctx);
+}
+
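+/*
+ * Blocks on a missing device can't be read directly; rebuild them from the
+ * remaining RAID 5/6 stripes and verify the result in
+ * scrub_missing_raid56_worker().
+ */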
+static void scrub_missing_raid56_pages(struct scrub_block *sblock)
+{
+       struct scrub_ctx *sctx = sblock->sctx;
+       struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+       u64 length = sblock->page_count * PAGE_SIZE;
+       u64 logical = sblock->pagev[0]->logical;
+       struct btrfs_bio *bbio;
+       struct bio *bio;
+       struct btrfs_raid_bio *rbio;
+       int ret;
+       int i;
+
+       ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
+                              &bbio, 0, 1);
+       if (ret || !bbio || !bbio->raid_map)
+               goto bbio_out;
+
+       if (WARN_ON(!sctx->is_dev_replace ||
+                   !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
+               /*
+                * We shouldn't be scrubbing a missing device. Even for dev
+                * replace, we should only get here for RAID 5/6. We either
+                * managed to mount something with no mirrors remaining or
+                * there's a bug in scrub_remap_extent()/btrfs_map_block().
+                */
+               goto bbio_out;
+       }
+
+       bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+       if (!bio)
+               goto bbio_out;
+
+       bio->bi_iter.bi_sector = logical >> 9;
+       bio->bi_private = sblock;
+       bio->bi_end_io = scrub_missing_raid56_end_io;
+
+       rbio = raid56_alloc_missing_rbio(sctx->dev_root, bio, bbio, length);
+       if (!rbio)
+               goto rbio_out;
+
+       for (i = 0; i < sblock->page_count; i++) {
+               struct scrub_page *spage = sblock->pagev[i];
+
+               raid56_add_scrub_pages(rbio, spage->page, spage->logical);
+       }
+
+       btrfs_init_work(&sblock->work, btrfs_scrub_helper,
+                       scrub_missing_raid56_worker, NULL, NULL);
+       scrub_block_get(sblock);
+       scrub_pending_bio_inc(sctx);
+       raid56_submit_missing_rbio(rbio);
+       return;
+
+rbio_out:
+       bio_put(bio);
+bbio_out:
+       btrfs_put_bbio(bbio);
+       spin_lock(&sctx->stat_lock);
+       sctx->stat.malloc_errors++;
+       spin_unlock(&sctx->stat_lock);
+}
+
 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
                       u64 physical, struct btrfs_device *dev, u64 flags,
                       u64 gen, int mirror_num, u8 *csum, int force,
@@ -2228,19 +2352,27 @@ leave_nomem:
        }
 
        WARN_ON(sblock->page_count == 0);
-       for (index = 0; index < sblock->page_count; index++) {
-               struct scrub_page *spage = sblock->pagev[index];
-               int ret;
+       if (dev->missing) {
+               /*
+                * This case should only be hit for RAID 5/6 device replace. See
+                * the comment in scrub_missing_raid56_pages() for details.
+                */
+               scrub_missing_raid56_pages(sblock);
+       } else {
+               for (index = 0; index < sblock->page_count; index++) {
+                       struct scrub_page *spage = sblock->pagev[index];
+                       int ret;
 
-               ret = scrub_add_page_to_rd_bio(sctx, spage);
-               if (ret) {
-                       scrub_block_put(sblock);
-                       return ret;
+                       ret = scrub_add_page_to_rd_bio(sctx, spage);
+                       if (ret) {
+                               scrub_block_put(sblock);
+                               return ret;
+                       }
                }
-       }
 
-       if (force)
-               scrub_submit(sctx);
+               if (force)
+                       scrub_submit(sctx);
+       }
 
        /* last one frees, either here or in bio completion for last page */
        scrub_block_put(sblock);
@@ -2551,6 +2683,11 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
        u8 csum[BTRFS_CSUM_SIZE];
        u32 blocksize;
 
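+       /*
+        * Nothing can be read from a missing device; mark the sectors as
+        * errors so that the parity rebuild repairs them.
+        */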
+       if (dev->missing) {
+               scrub_parity_mark_sectors_error(sparity, logical, len);
+               return 0;
+       }
+
        if (flags & BTRFS_EXTENT_FLAG_DATA) {
                blocksize = sctx->sectorsize;
        } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
@@ -2689,7 +2826,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
                           sparity->nsectors))
                goto out;
 
-       length = sparity->logic_end - sparity->logic_start + 1;
+       length = sparity->logic_end - sparity->logic_start;
        ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
                               sparity->logic_start,
                               &length, &bbio, 0, 1);
@@ -2712,8 +2849,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
                goto rbio_out;
 
        list_for_each_entry(spage, &sparity->spages, list)
-               raid56_parity_add_scrub_pages(rbio, spage->page,
-                                             spage->logical);
+               raid56_add_scrub_pages(rbio, spage->page, spage->logical);
 
        scrub_pending_bio_inc(sctx);
        raid56_parity_submit_scrub_rbio(rbio);
@@ -2761,6 +2897,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
        struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_root *csum_root = fs_info->csum_root;
        struct btrfs_extent_item *extent;
+       struct btrfs_bio *bbio = NULL;
        u64 flags;
        int ret;
        int slot;
@@ -2770,6 +2907,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
        u64 extent_logical;
        u64 extent_physical;
        u64 extent_len;
+       u64 mapped_length;
        struct btrfs_device *extent_dev;
        struct scrub_parity *sparity;
        int nsectors;
@@ -2843,6 +2981,10 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
                        }
                        btrfs_item_key_to_cpu(l, &key, slot);
 
+                       if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+                           key.type != BTRFS_METADATA_ITEM_KEY)
+                               goto next;
+
                        if (key.type == BTRFS_METADATA_ITEM_KEY)
                                bytes = root->nodesize;
                        else
@@ -2851,11 +2993,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
                        if (key.objectid + bytes <= logic_start)
                                goto next;
 
-                       if (key.type != BTRFS_EXTENT_ITEM_KEY &&
-                           key.type != BTRFS_METADATA_ITEM_KEY)
-                               goto next;
-
-                       if (key.objectid > logic_end) {
+                       if (key.objectid >= logic_end) {
                                stop_loop = 1;
                                break;
                        }
@@ -2868,11 +3006,12 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
                        flags = btrfs_extent_flags(l, extent);
                        generation = btrfs_extent_generation(l, extent);
 
-                       if (key.objectid < logic_start &&
-                           (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
-                               btrfs_err(fs_info,
-                                         "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
-                                          key.objectid, logic_start);
+                       if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+                           (key.objectid < logic_start ||
+                            key.objectid + bytes >
+                            logic_start + map->stripe_len)) {
+                               btrfs_err(fs_info, "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
+                                         key.objectid, logic_start);
                                goto next;
                        }
 again:
@@ -2892,10 +3031,21 @@ again:
                        scrub_parity_mark_sectors_data(sparity, extent_logical,
                                                       extent_len);
 
-                       scrub_remap_extent(fs_info, extent_logical,
-                                          extent_len, &extent_physical,
-                                          &extent_dev,
-                                          &extent_mirror_num);
+                       mapped_length = extent_len;
+                       ret = btrfs_map_block(fs_info, READ, extent_logical,
+                                             &mapped_length, &bbio, 0);
+                       if (!ret) {
+                               if (!bbio || mapped_length < extent_len)
+                                       ret = -EIO;
+                       }
+                       if (ret) {
+                               btrfs_put_bbio(bbio);
+                               goto out;
+                       }
+                       extent_physical = bbio->stripes[0].physical;
+                       extent_mirror_num = bbio->mirror_num;
+                       extent_dev = bbio->stripes[0].dev;
+                       btrfs_put_bbio(bbio);
 
                        ret = btrfs_lookup_csums_range(csum_root,
                                                extent_logical,
@@ -2910,10 +3060,12 @@ again:
                                                      extent_dev, flags,
                                                      generation,
                                                      extent_mirror_num);
+
+                       scrub_free_csums(sctx);
+
                        if (ret)
                                goto out;
 
-                       scrub_free_csums(sctx);
                        if (extent_logical + extent_len <
                            key.objectid + bytes) {
                                logic_start += map->stripe_len;
@@ -2942,7 +3094,7 @@ next:
 out:
        if (ret < 0)
                scrub_parity_mark_sectors_error(sparity, logic_start,
-                                               logic_end - logic_start + 1);
+                                               logic_end - logic_start);
        scrub_parity_put(sparity);
        scrub_submit(sctx);
        mutex_lock(&sctx->wr_ctx.wr_lock);
@@ -3091,22 +3243,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
         */
        ret = 0;
        while (physical < physical_end) {
-               /* for raid56, we skip parity stripe */
-               if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-                       ret = get_raid56_logic_offset(physical, num,
-                                       map, &logical, &stripe_logical);
-                       logical += base;
-                       if (ret) {
-                               stripe_logical += base;
-                               stripe_end = stripe_logical + increment - 1;
-                               ret = scrub_raid56_parity(sctx, map, scrub_dev,
-                                               ppath, stripe_logical,
-                                               stripe_end);
-                               if (ret)
-                                       goto out;
-                               goto skip;
-                       }
-               }
                /*
                 * canceled?
                 */
@@ -3131,6 +3267,24 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
                        scrub_blocked_if_needed(fs_info);
                }
 
+               /* for raid56, we skip parity stripe */
+               if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+                       ret = get_raid56_logic_offset(physical, num, map,
+                                                     &logical,
+                                                     &stripe_logical);
+                       logical += base;
+                       if (ret) {
+                               stripe_logical += base;
+                               stripe_end = stripe_logical + increment;
+                               ret = scrub_raid56_parity(sctx, map, scrub_dev,
+                                                         ppath, stripe_logical,
+                                                         stripe_end);
+                               if (ret)
+                                       goto out;
+                               goto skip;
+                       }
+               }
+
                if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
                        key.type = BTRFS_METADATA_ITEM_KEY;
                else
@@ -3175,6 +3329,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
                        }
                        btrfs_item_key_to_cpu(l, &key, slot);
 
+                       if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+                           key.type != BTRFS_METADATA_ITEM_KEY)
+                               goto next;
+
                        if (key.type == BTRFS_METADATA_ITEM_KEY)
                                bytes = root->nodesize;
                        else
@@ -3183,10 +3341,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
                        if (key.objectid + bytes <= logical)
                                goto next;
 
-                       if (key.type != BTRFS_EXTENT_ITEM_KEY &&
-                           key.type != BTRFS_METADATA_ITEM_KEY)
-                               goto next;
-
                        if (key.objectid >= logical + map->stripe_len) {
                                /* out of this device extent */
                                if (key.objectid >= logic_end)
@@ -3199,8 +3353,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
                        flags = btrfs_extent_flags(l, extent);
                        generation = btrfs_extent_generation(l, extent);
 
-                       if (key.objectid < logical &&
-                           (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+                       if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+                           (key.objectid < logical ||
+                            key.objectid + bytes >
+                            logical + map->stripe_len)) {
                                btrfs_err(fs_info,
                                           "scrub: tree block %llu spanning "
                                           "stripes, ignored. logical=%llu",
@@ -3234,9 +3390,11 @@ again:
                                                   &extent_dev,
                                                   &extent_mirror_num);
 
-                       ret = btrfs_lookup_csums_range(csum_root, logical,
-                                               logical + map->stripe_len - 1,
-                                               &sctx->csum_list, 1);
+                       ret = btrfs_lookup_csums_range(csum_root,
+                                                      extent_logical,
+                                                      extent_logical +
+                                                      extent_len - 1,
+                                                      &sctx->csum_list, 1);
                        if (ret)
                                goto out;
 
@@ -3244,10 +3402,12 @@ again:
                                           extent_physical, extent_dev, flags,
                                           generation, extent_mirror_num,
                                           extent_logical - logical + physical);
+
+                       scrub_free_csums(sctx);
+
                        if (ret)
                                goto out;
 
-                       scrub_free_csums(sctx);
                        if (extent_logical + extent_len <
                            key.objectid + bytes) {
                                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
@@ -3265,7 +3425,7 @@ loop:
                                        if (ret && physical < physical_end) {
                                                stripe_logical += base;
                                                stripe_end = stripe_logical +
-                                                               increment - 1;
+                                                               increment;
                                                ret = scrub_raid56_parity(sctx,
                                                        map, scrub_dev, ppath,
                                                        stripe_logical,
@@ -3374,7 +3534,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
        u64 chunk_tree;
        u64 chunk_objectid;
        u64 chunk_offset;
-       int ret;
+       int ret = 0;
        int slot;
        struct extent_buffer *l;
        struct btrfs_key key;
@@ -3402,8 +3562,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                        if (path->slots[0] >=
                            btrfs_header_nritems(path->nodes[0])) {
                                ret = btrfs_next_leaf(root, path);
-                               if (ret)
+                               if (ret < 0)
+                                       break;
+                               if (ret > 0) {
+                                       ret = 0;
                                        break;
+                               }
+                       } else {
+                               ret = 0;
                        }
                }
 
@@ -3445,6 +3611,22 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                if (!cache)
                        goto skip;
 
+               /*
+                * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
+                * to avoid deadlock caused by:
+                * btrfs_inc_block_group_ro()
+                * -> btrfs_wait_for_commit()
+                * -> btrfs_commit_transaction()
+                * -> btrfs_scrub_pause()
+                */
+               scrub_pause_on(fs_info);
+               ret = btrfs_inc_block_group_ro(root, cache);
+               scrub_pause_off(fs_info);
+               if (ret) {
+                       btrfs_put_block_group(cache);
+                       break;
+               }
+
                dev_replace->cursor_right = found_key.offset + length;
                dev_replace->cursor_left = found_key.offset;
                dev_replace->item_needs_writeback = 1;
@@ -3470,8 +3652,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 
                wait_event(sctx->list_wait,
                           atomic_read(&sctx->bios_in_flight) == 0);
-               atomic_inc(&fs_info->scrubs_paused);
-               wake_up(&fs_info->scrub_pause_wait);
+
+               scrub_pause_on(fs_info);
 
                /*
                 * must be called before we decrease @scrub_paused.
@@ -3482,11 +3664,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                           atomic_read(&sctx->workers_pending) == 0);
                atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
 
-               mutex_lock(&fs_info->scrub_lock);
-               __scrub_blocked_if_needed(fs_info);
-               atomic_dec(&fs_info->scrubs_paused);
-               mutex_unlock(&fs_info->scrub_lock);
-               wake_up(&fs_info->scrub_pause_wait);
+               scrub_pause_off(fs_info);
+
+               btrfs_dec_block_group_ro(root, cache);
 
                btrfs_put_block_group(cache);
                if (ret)
@@ -3510,11 +3690,7 @@ skip:
 
        btrfs_free_path(path);
 
-       /*
-        * ret can still be 1 from search_slot or next_leaf,
-        * that's not an error
-        */
-       return ret < 0 ? ret : 0;
+       return ret;
 }
 
 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
index 6bad633..2b07b35 100644 (file)
@@ -69,7 +69,7 @@ static struct file_system_type btrfs_fs_type;
 
 static int btrfs_remount(struct super_block *sb, int *flags, char *data);
 
-static const char *btrfs_decode_error(int errno)
+const char *btrfs_decode_error(int errno)
 {
        char *errstr = "unknown";
 
@@ -1033,6 +1033,7 @@ static int btrfs_fill_super(struct super_block *sb,
        sb->s_flags |= MS_POSIXACL;
 #endif
        sb->s_flags |= MS_I_VERSION;
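+       /* opt in to cgroup-aware writeback for the new blkio controllers */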
+       sb->s_iflags |= SB_I_CGROUPWB;
        err = open_ctree(sb, fs_devices, (char *)data);
        if (err) {
                printk(KERN_ERR "BTRFS: open_ctree failed\n");
@@ -1650,6 +1651,17 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 
                sb->s_flags |= MS_RDONLY;
 
+               /*
+                * Setting MS_RDONLY will put the cleaner thread to
+                * sleep at the next loop if it's already active.
+                * If it's already asleep, we'll leave unused block
+                * groups on disk until we're mounted read-write again
+                * unless we clean them up here.
+                */
+               mutex_lock(&root->fs_info->cleaner_mutex);
+               btrfs_delete_unused_bgs(fs_info);
+               mutex_unlock(&root->fs_info->cleaner_mutex);
+
                btrfs_dev_replace_suspend_for_unmount(fs_info);
                btrfs_scrub_cancel(fs_info);
                btrfs_pause_balance(fs_info);
index f5021fc..68ad89e 100644 (file)
@@ -258,6 +258,8 @@ loop:
        mutex_init(&cur_trans->cache_write_mutex);
        cur_trans->num_dirty_bgs = 0;
        spin_lock_init(&cur_trans->dirty_bgs_lock);
+       INIT_LIST_HEAD(&cur_trans->deleted_bgs);
+       spin_lock_init(&cur_trans->deleted_bgs_lock);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
        extent_io_tree_init(&cur_trans->dirty_pages,
                             fs_info->btree_inode->i_mapping);
@@ -1301,7 +1303,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
         */
        btrfs_set_skip_qgroup(trans, objectid);
 
-       btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
+       btrfs_reloc_pre_snapshot(pending, &to_reserve);
 
        if (to_reserve > 0) {
                pending->error = btrfs_block_rsv_add(root,
@@ -1893,8 +1895,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                        spin_unlock(&root->fs_info->trans_lock);
 
                        wait_for_commit(root, prev_trans);
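+                       /*
+                        * The transaction we waited for may itself have
+                        * aborted; don't commit on top of a broken state.
+                        */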
+                       ret = prev_trans->aborted;
 
                        btrfs_put_transaction(prev_trans);
+                       if (ret)
+                               goto cleanup_transaction;
                } else {
                        spin_unlock(&root->fs_info->trans_lock);
                }
index eb09c20..edc2fbc 100644 (file)
@@ -74,6 +74,8 @@ struct btrfs_transaction {
         */
        struct mutex cache_write_mutex;
        spinlock_t dirty_bgs_lock;
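+       /* block groups deleted while this transaction was running */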
+       struct list_head deleted_bgs;
+       spinlock_t deleted_bgs_lock;
        struct btrfs_delayed_ref_root delayed_refs;
        int aborted;
        int dirty_bg_run;
index 9c45431..1bbaace 100644 (file)
@@ -140,55 +140,46 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_log_ctx *ctx)
 {
-       int index;
-       int ret;
+       int ret = 0;
 
        mutex_lock(&root->log_mutex);
+
        if (root->log_root) {
                if (btrfs_need_log_full_commit(root->fs_info, trans)) {
                        ret = -EAGAIN;
                        goto out;
                }
+
                if (!root->log_start_pid) {
-                       root->log_start_pid = current->pid;
                        clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+                       root->log_start_pid = current->pid;
                } else if (root->log_start_pid != current->pid) {
                        set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
                }
+       } else {
+               mutex_lock(&root->fs_info->tree_log_mutex);
+               if (!root->fs_info->log_root_tree)
+                       ret = btrfs_init_log_root_tree(trans, root->fs_info);
+               mutex_unlock(&root->fs_info->tree_log_mutex);
+               if (ret)
+                       goto out;
 
-               atomic_inc(&root->log_batch);
-               atomic_inc(&root->log_writers);
-               if (ctx) {
-                       index = root->log_transid % 2;
-                       list_add_tail(&ctx->list, &root->log_ctxs[index]);
-                       ctx->log_transid = root->log_transid;
-               }
-               mutex_unlock(&root->log_mutex);
-               return 0;
-       }
-
-       ret = 0;
-       mutex_lock(&root->fs_info->tree_log_mutex);
-       if (!root->fs_info->log_root_tree)
-               ret = btrfs_init_log_root_tree(trans, root->fs_info);
-       mutex_unlock(&root->fs_info->tree_log_mutex);
-       if (ret)
-               goto out;
-
-       if (!root->log_root) {
                ret = btrfs_add_log_tree(trans, root);
                if (ret)
                        goto out;
+
+               clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+               root->log_start_pid = current->pid;
        }
-       clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
-       root->log_start_pid = current->pid;
+
        atomic_inc(&root->log_batch);
        atomic_inc(&root->log_writers);
        if (ctx) {
-               index = root->log_transid % 2;
+               int index = root->log_transid % 2;
                list_add_tail(&ctx->list, &root->log_ctxs[index]);
                ctx->log_transid = root->log_transid;
        }
+
 out:
        mutex_unlock(&root->log_mutex);
        return ret;
@@ -731,11 +722,65 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                                &ordered_sums, 0);
                        if (ret)
                                goto out;
+                       /*
+                        * Now delete all existing csums in the csum root that
+                        * cover our range. We do this because we can have an
+                        * extent that is completely referenced by one file
+                        * extent item and partially referenced by another
+                        * file extent item (like after using the clone or
+                        * extent_same ioctls). In this case if we end up doing
+                        * the replay of the one that partially references the
+                        * extent first, and we do not do the csum deletion
+                        * below, we can get 2 csum items in the csum tree that
+                        * overlap each other. For example, imagine our log has
+                        * the following two file extent items:
+                        *
+                        * key (257 EXTENT_DATA 409600)
+                        *     extent data disk byte 12845056 nr 102400
+                        *     extent data offset 20480 nr 20480 ram 102400
+                        *
+                        * key (257 EXTENT_DATA 819200)
+                        *     extent data disk byte 12845056 nr 102400
+                        *     extent data offset 0 nr 102400 ram 102400
+                        *
+                        * Where the second one fully references the 100K extent
+                        * that starts at disk byte 12845056, and the log tree
+                        * has a single csum item that covers the entire range
+                        * of the extent:
+                        *
+                        * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+                        *
+                        * After the first file extent item is replayed, the
+                        * csum tree gets the following csum item:
+                        *
+                        * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+                        *
+                        * Which covers the 20K sub-range starting at offset 20K
+                        * of our extent. Now when we replay the second file
+                        * extent item, if we do not delete existing csum items
+                        * that cover any of its blocks, we end up getting two
+                        * csum items in our csum tree that overlap each other:
+                        *
+                        * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+                        * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+                        *
+                        * Which is a problem, because after this anyone trying
+                        * to look up the checksum of any block of our
+                        * extent starting at an offset of 40K or higher, will
+                        * end up looking at the second csum item only, which
+                        * does not contain the checksum for any block starting
+                        * at offset 40K or higher of our extent.
+                        */
                        while (!list_empty(&ordered_sums)) {
                                struct btrfs_ordered_sum *sums;
                                sums = list_entry(ordered_sums.next,
                                                struct btrfs_ordered_sum,
                                                list);
+                               if (!ret)
+                                       ret = btrfs_del_csums(trans,
+                                                     root->fs_info->csum_root,
+                                                     sums->bytenr,
+                                                     sums->len);
                                if (!ret)
                                        ret = btrfs_csum_file_blocks(trans,
                                                root->fs_info->csum_root,
@@ -1549,9 +1594,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
  */
 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
-                                   struct btrfs_path *path,
                                    u64 dirid, u64 index,
-                                   char *name, int name_len, u8 type,
+                                   char *name, int name_len,
                                    struct btrfs_key *location)
 {
        struct inode *inode;
@@ -1613,6 +1657,9 @@ static bool name_in_log_ref(struct btrfs_root *log_root,
  * not exist in the FS, it is skipped.  fsyncs on directories
  * do not force down inodes inside that directory, just changes to the
  * names or unlinks in a directory.
+ *
+ * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
+ * non-existing inode) and 1 if the name was replayed.
  */
 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
@@ -1631,6 +1678,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
        int exists;
        int ret = 0;
        bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
+       bool name_added = false;
 
        dir = read_one_inode(root, key->objectid);
        if (!dir)
@@ -1708,6 +1756,8 @@ out:
        }
        kfree(name);
        iput(dir);
+       if (!ret && name_added)
+               ret = 1;
        return ret;
 
 insert:
@@ -1719,10 +1769,12 @@ insert:
                goto out;
        }
        btrfs_release_path(path);
-       ret = insert_one_name(trans, root, path, key->objectid, key->offset,
-                             name, name_len, log_type, &log_key);
+       ret = insert_one_name(trans, root, key->objectid, key->offset,
+                             name, name_len, &log_key);
        if (ret && ret != -ENOENT && ret != -EEXIST)
                goto out;
+       if (!ret)
+               name_added = true;
        update_size = false;
        ret = 0;
        goto out;
@@ -1740,12 +1792,13 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
                                        struct extent_buffer *eb, int slot,
                                        struct btrfs_key *key)
 {
-       int ret;
+       int ret = 0;
        u32 item_size = btrfs_item_size_nr(eb, slot);
        struct btrfs_dir_item *di;
        int name_len;
        unsigned long ptr;
        unsigned long ptr_end;
+       struct btrfs_path *fixup_path = NULL;
 
        ptr = btrfs_item_ptr_offset(eb, slot);
        ptr_end = ptr + item_size;
@@ -1755,12 +1808,59 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
                        return -EIO;
                name_len = btrfs_dir_name_len(eb, di);
                ret = replay_one_name(trans, root, path, eb, di, key);
-               if (ret)
-                       return ret;
+               if (ret < 0)
+                       break;
                ptr = (unsigned long)(di + 1);
                ptr += name_len;
+
+               /*
+                * If this entry refers to a non-directory (directories cannot
+                * have a link count > 1) and it was added in the transaction
+                * that was not committed, make sure we fixup the link count of
+                * the inode the entry points to. Otherwise something like
+                * the following would result in a directory pointing to an
+                * inode with a wrong link count that does not account for this dir
+                * entry:
+                *
+                * mkdir testdir
+                * touch testdir/foo
+                * touch testdir/bar
+                * sync
+                *
+                * ln testdir/bar testdir/bar_link
+                * ln testdir/foo testdir/foo_link
+                * xfs_io -c "fsync" testdir/bar
+                *
+                * <power failure>
+                *
+                * mount fs, log replay happens
+                *
+                * File foo would remain with a link count of 1 when it has two
+                * entries pointing to it in the directory testdir. This would
+                * make it impossible to ever delete the parent directory as
+                * it would result in stale dentries that can never be deleted.
+                */
+               if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+                       struct btrfs_key di_key;
+
+                       if (!fixup_path) {
+                               fixup_path = btrfs_alloc_path();
+                               if (!fixup_path) {
+                                       ret = -ENOMEM;
+                                       break;
+                               }
+                       }
+
+                       btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+                       ret = link_to_fixup_dir(trans, root, fixup_path,
+                                               di_key.objectid);
+                       if (ret)
+                               break;
+               }
+               ret = 0;
        }
-       return 0;
+       btrfs_free_path(fixup_path);
+       return ret;
 }
 
 /*
@@ -2535,8 +2635,7 @@ static int update_log_root(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-static void wait_log_commit(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root, int transid)
+static void wait_log_commit(struct btrfs_root *root, int transid)
 {
        DEFINE_WAIT(wait);
        int index = transid % 2;
@@ -2561,8 +2660,7 @@ static void wait_log_commit(struct btrfs_trans_handle *trans,
                 atomic_read(&root->log_commit[index]));
 }
 
-static void wait_for_writer(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root)
+static void wait_for_writer(struct btrfs_root *root)
 {
        DEFINE_WAIT(wait);
 
@@ -2642,7 +2740,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
        index1 = log_transid % 2;
        if (atomic_read(&root->log_commit[index1])) {
-               wait_log_commit(trans, root, log_transid);
+               wait_log_commit(root, log_transid);
                mutex_unlock(&root->log_mutex);
                return ctx->log_ret;
        }
@@ -2651,7 +2749,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
        /* wait for previous tree log sync to complete */
        if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
-               wait_log_commit(trans, root, log_transid - 1);
+               wait_log_commit(root, log_transid - 1);
 
        while (1) {
                int batch = atomic_read(&root->log_batch);
@@ -2662,7 +2760,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                        schedule_timeout_uninterruptible(1);
                        mutex_lock(&root->log_mutex);
                }
-               wait_for_writer(trans, root);
+               wait_for_writer(root);
                if (batch == atomic_read(&root->log_batch))
                        break;
        }
@@ -2759,7 +2857,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
                                                mark);
                btrfs_wait_logged_extents(trans, log, log_transid);
-               wait_log_commit(trans, log_root_tree,
+               wait_log_commit(log_root_tree,
                                root_log_ctx.log_transid);
                mutex_unlock(&log_root_tree->log_mutex);
                if (!ret)
@@ -2770,11 +2868,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        atomic_set(&log_root_tree->log_commit[index2], 1);
 
        if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
-               wait_log_commit(trans, log_root_tree,
+               wait_log_commit(log_root_tree,
                                root_log_ctx.log_transid - 1);
        }
 
-       wait_for_writer(trans, log_root_tree);
+       wait_for_writer(log_root_tree);
 
        /*
         * now that we've moved on to the tree of log tree roots,
@@ -4904,6 +5002,94 @@ next_dir_inode:
        return ret;
 }
 
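+/*
+ * Log every directory that still has an entry pointing at @inode, found by
+ * walking the inode's INODE_REF/INODE_EXTREF items in the commit root.
+ */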
+static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
+                                struct inode *inode,
+                                struct btrfs_log_ctx *ctx)
+{
+       int ret;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       const u64 ino = btrfs_ino(inode);
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       path->skip_locking = 1;
+       path->search_commit_root = 1;
+
+       key.objectid = ino;
+       key.type = BTRFS_INODE_REF_KEY;
+       key.offset = 0;
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+
+       while (true) {
+               struct extent_buffer *leaf = path->nodes[0];
+               int slot = path->slots[0];
+               u32 cur_offset = 0;
+               u32 item_size;
+               unsigned long ptr;
+
+               if (slot >= btrfs_header_nritems(leaf)) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0)
+                               goto out;
+                       else if (ret > 0)
+                               break;
+                       continue;
+               }
+
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
+               if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
+                       break;
+
+               item_size = btrfs_item_size_nr(leaf, slot);
+               ptr = btrfs_item_ptr_offset(leaf, slot);
+               while (cur_offset < item_size) {
+                       struct btrfs_key inode_key;
+                       struct inode *dir_inode;
+
+                       inode_key.type = BTRFS_INODE_ITEM_KEY;
+                       inode_key.offset = 0;
+
+                       if (key.type == BTRFS_INODE_EXTREF_KEY) {
+                               struct btrfs_inode_extref *extref;
+
+                               extref = (struct btrfs_inode_extref *)
+                                       (ptr + cur_offset);
+                               inode_key.objectid = btrfs_inode_extref_parent(
+                                       leaf, extref);
+                               cur_offset += sizeof(*extref);
+                               cur_offset += btrfs_inode_extref_name_len(leaf,
+                                       extref);
+                       } else {
+                               inode_key.objectid = key.offset;
+                               cur_offset = item_size;
+                       }
+
+                       dir_inode = btrfs_iget(root->fs_info->sb, &inode_key,
+                                              root, NULL);
+                       /* If parent inode was deleted, skip it. */
+                       if (IS_ERR(dir_inode))
+                               continue;
+
+                       ret = btrfs_log_inode(trans, root, dir_inode,
+                                             LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+                       iput(dir_inode);
+                       if (ret)
+                               goto out;
+               }
+               path->slots[0]++;
+       }
+       ret = 0;
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
 /*
  * helper function around btrfs_log_inode to make sure newly created
  * parent directories also end up in the log.  A minimal inode and backref
@@ -4923,9 +5109,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
        struct dentry *old_parent = NULL;
        int ret = 0;
        u64 last_committed = root->fs_info->last_trans_committed;
-       const struct dentry * const first_parent = parent;
-       const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
-                                last_committed);
        bool log_dentries = false;
        struct inode *orig_inode = inode;
 
@@ -4986,6 +5169,53 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
        if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
                log_dentries = true;
 
+       /*
+        * On unlink we must make sure all our current and old parent directories'
+        * inodes are fully logged. This is to prevent leaving dangling
+        * directory index entries in directories that were our parents but are
+        * not anymore. Not doing this results in the old parent directory being
+        * impossible to delete after log replay (rmdir will always fail with
+        * error -ENOTEMPTY).
+        *
+        * Example 1:
+        *
+        * mkdir testdir
+        * touch testdir/foo
+        * ln testdir/foo testdir/bar
+        * sync
+        * unlink testdir/bar
+        * xfs_io -c fsync testdir/foo
+        * <power failure>
+        * mount fs, triggers log replay
+        *
+        * If we don't log the parent directory (testdir), after log replay the
+        * directory still has an entry pointing to the file inode using the bar
+        * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
+        * the file inode has a link count of 1.
+        *
+        * Example 2:
+        *
+        * mkdir testdir
+        * touch foo
+        * ln foo testdir/foo2
+        * ln foo testdir/foo3
+        * sync
+        * unlink testdir/foo3
+        * xfs_io -c fsync foo
+        * <power failure>
+        * mount fs, triggers log replay
+        *
+        * As in the first example, after log replay the parent directory
+        * testdir still has an entry pointing to the file inode with name foo3
+        * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
+        * and has a link count of 2.
+        */
+       if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+               ret = btrfs_log_all_parents(trans, orig_inode, ctx);
+               if (ret)
+                       goto end_trans;
+       }
+
        while (1) {
                if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
                        break;
@@ -4994,23 +5224,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                if (root != BTRFS_I(inode)->root)
                        break;
 
-               /*
-                * On unlink we must make sure our immediate parent directory
-                * inode is fully logged. This is to prevent leaving dangling
-                * directory index entries and a wrong directory inode's i_size.
-                * Not doing so can result in a directory being impossible to
-                * delete after log replay (rmdir will always fail with error
-                * -ENOTEMPTY).
-                */
-               if (did_unlink && parent == first_parent)
-                       inode_only = LOG_INODE_ALL;
-               else
-                       inode_only = LOG_INODE_EXISTS;
-
-               if (BTRFS_I(inode)->generation >
-                   root->fs_info->last_trans_committed ||
-                   inode_only == LOG_INODE_ALL) {
-                       ret = btrfs_log_inode(trans, root, inode, inode_only,
+               if (BTRFS_I(inode)->generation > last_committed) {
+                       ret = btrfs_log_inode(trans, root, inode,
+                                             LOG_INODE_EXISTS,
                                              0, LLONG_MAX, ctx);
                        if (ret)
                                goto end_trans;
index 762476f..76201d6 100644 (file)
@@ -1116,15 +1116,18 @@ out:
        return ret;
 }
 
-static int contains_pending_extent(struct btrfs_trans_handle *trans,
+static int contains_pending_extent(struct btrfs_transaction *transaction,
                                   struct btrfs_device *device,
                                   u64 *start, u64 len)
 {
+       struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
        struct extent_map *em;
-       struct list_head *search_list = &trans->transaction->pending_chunks;
+       struct list_head *search_list = &fs_info->pinned_chunks;
        int ret = 0;
        u64 physical_start = *start;
 
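+       /*
+        * Without a running transaction there are no pending chunks;
+        * only the already pinned ones need to be avoided.
+        */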
+       if (transaction)
+               search_list = &transaction->pending_chunks;
 again:
        list_for_each_entry(em, search_list, list) {
                struct map_lookup *map;
@@ -1159,8 +1162,8 @@ again:
                        }
                }
        }
-       if (search_list == &trans->transaction->pending_chunks) {
-               search_list = &trans->root->fs_info->pinned_chunks;
+       if (search_list != &fs_info->pinned_chunks) {
+               search_list = &fs_info->pinned_chunks;
                goto again;
        }
 
@@ -1169,12 +1172,13 @@ again:
 
 
 /*
- * find_free_dev_extent - find free space in the specified device
- * @device:    the device which we search the free space in
- * @num_bytes: the size of the free space that we need
- * @start:     store the start of the free space.
- * @len:       the size of the free space. that we find, or the size of the max
- *             free space if we don't find suitable free space
+ * find_free_dev_extent_start - find free space in the specified device
+ * @device:      the device which we search the free space in
+ * @num_bytes:   the size of the free space that we need
+ * @search_start: the position from which to begin the search
+ * @start:       store the start of the free space.
+ * @len:         the size of the free space that we find, or the size
+ *               of the max free space if we don't find suitable free space
  *
  * this uses a pretty simple search, the expectation is that it is
  * called very infrequently and that a given device has a small number
@@ -1188,9 +1192,9 @@ again:
  * But if we don't find suitable free space, it is used to store the size of
  * the max free space.
  */
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
-                        struct btrfs_device *device, u64 num_bytes,
-                        u64 *start, u64 *len)
+int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+                              struct btrfs_device *device, u64 num_bytes,
+                              u64 search_start, u64 *start, u64 *len)
 {
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
@@ -1200,19 +1204,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
-       u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;
 
-       /* FIXME use last free of some kind */
-
-       /* we don't want to overwrite the superblock on the drive,
-        * so we make sure to start at an offset of at least 1MB
-        */
-       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
-
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -1273,7 +1269,7 @@ again:
                         * Have to check before we set max_hole_start, otherwise
                         * we could end up sending back this offset anyway.
                         */
-                       if (contains_pending_extent(trans, device,
+                       if (contains_pending_extent(transaction, device,
                                                    &search_start,
                                                    hole_size)) {
                                if (key.offset >= search_start) {
@@ -1322,7 +1318,7 @@ next:
        if (search_end > search_start) {
                hole_size = search_end - search_start;
 
-               if (contains_pending_extent(trans, device, &search_start,
+               if (contains_pending_extent(transaction, device, &search_start,
                                            hole_size)) {
                        btrfs_release_path(path);
                        goto again;
@@ -1348,6 +1344,24 @@ out:
        return ret;
 }
 
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+                        struct btrfs_device *device, u64 num_bytes,
+                        u64 *start, u64 *len)
+{
+       struct btrfs_root *root = device->dev_root;
+       u64 search_start;
+
+       /* FIXME use last free of some kind */
+
+       /*
+        * we don't want to overwrite the superblock on the drive,
+        * so we make sure to start at an offset of at least 1MB
+        */
+       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+       return find_free_dev_extent_start(trans->transaction, device,
+                                         num_bytes, search_start, start, len);
+}
+
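
find_free_dev_extent() survives as the thin wrapper above: it keeps the old trans-handle signature and only derives the default search offset before delegating to find_free_dev_extent_start(). A hedged illustration of that default, which never falls below 1MiB so the superblock area at the front of the device is never handed out (the helper name is hypothetical):

#include <stdint.h>

#define ONE_MIB (1024ULL * 1024ULL)

/* same max(fs_info->alloc_start, 1MB) computed by the wrapper above */
static uint64_t default_dev_search_start(uint64_t alloc_start)
{
	return alloc_start > ONE_MIB ? alloc_start : ONE_MIB;
}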
 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
                          struct btrfs_device *device,
                          u64 start, u64 *dev_extent_len)
@@ -2755,9 +2769,7 @@ out:
        return ret;
 }
 
-static int btrfs_relocate_chunk(struct btrfs_root *root,
-                               u64 chunk_objectid,
-                               u64 chunk_offset)
+static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
 {
        struct btrfs_root *extent_root;
        struct btrfs_trans_handle *trans;
@@ -2785,7 +2797,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
                return -ENOSPC;
 
        /* step one, relocate all the extents inside this chunk */
+       btrfs_scrub_pause(root);
        ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+       btrfs_scrub_continue(root);
        if (ret)
                return ret;
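
The relocation step is now bracketed by btrfs_scrub_pause()/btrfs_scrub_continue(), so a running scrub is quiesced while the block group moves and is resumed on every path before the error check. The generic shape of that bracket, with hypothetical names:

/*
 * Run op() with a background worker paused; the worker is always
 * resumed, on success and on failure, before the result is examined.
 */
static int run_with_paused(void (*pause)(void *), void (*resume)(void *),
			   int (*op)(void *), void *arg, void *worker)
{
	int ret;

	pause(worker);
	ret = op(arg);
	resume(worker);
	return ret;
}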
 
@@ -2855,7 +2869,6 @@ again:
 
                if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
                        ret = btrfs_relocate_chunk(chunk_root,
-                                                  found_key.objectid,
                                                   found_key.offset);
                        if (ret == -ENOSPC)
                                failed++;
@@ -3375,7 +3388,6 @@ again:
                }
 
                ret = btrfs_relocate_chunk(chunk_root,
-                                          found_key.objectid,
                                           found_key.offset);
                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
                if (ret && ret != -ENOSPC)
@@ -4077,7 +4089,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
        struct btrfs_dev_extent *dev_extent = NULL;
        struct btrfs_path *path;
        u64 length;
-       u64 chunk_objectid;
        u64 chunk_offset;
        int ret;
        int slot;
@@ -4154,11 +4165,10 @@ again:
                        break;
                }
 
-               chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
                btrfs_release_path(path);
 
-               ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
+               ret = btrfs_relocate_chunk(root, chunk_offset);
                mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
                if (ret && ret != -ENOSPC)
                        goto done;
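
The call sites above stop passing a chunk objectid because it carries no information: every chunk item lives in the chunk tree under one well-known objectid, so the offset alone identifies the chunk. For reference, the constant as defined in ctree.h:

/* all chunk items share this objectid; only the offset differs */
#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL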
@@ -4200,7 +4210,8 @@ again:
                u64 start = new_size;
                u64 len = old_size - new_size;
 
-               if (contains_pending_extent(trans, device, &start, len)) {
+               if (contains_pending_extent(trans->transaction, device,
+                                           &start, len)) {
                        unlock_chunks(root);
                        checked_pending_chunks = true;
                        failed = 0;
@@ -5071,9 +5082,7 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
                 * and the stripes
                 */
                sizeof(u64) * (total_stripes),
-               GFP_NOFS);
-       if (!bbio)
-               return NULL;
+               GFP_NOFS|__GFP_NOFAIL);
 
        atomic_set(&bbio->error, 0);
        atomic_set(&bbio->refs, 1);
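
With __GFP_NOFAIL the page allocator retries indefinitely rather than returning NULL, which is why the hunk above can delete the NULL check in alloc_btrfs_bio(). A userspace analogue of that allocate-or-never-return contract, where abort() stands in for the kernel's retry loop:

#include <stdio.h>
#include <stdlib.h>

static void *xmalloc_nofail(size_t size)
{
	void *p = malloc(size);

	/* the kernel would keep retrying; userspace can only bail out */
	if (!p) {
		fprintf(stderr, "out of memory\n");
		abort();
	}
	return p;
}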
index 95842a9..2ca784a 100644 (file)
@@ -453,6 +453,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+                        struct btrfs_device *device, u64 num_bytes,
+                        u64 search_start, u64 *start, u64 *max_avail);
 int find_free_dev_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *max_avail);