Merge tag 'for-5.11/drivers-2020-12-14' of git://git.kernel.dk/linux-block
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c0f350c..fe30460 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -11,6 +11,7 @@
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/btrfs.h>
+#include <linux/sched/mm.h>
 
 #include "ctree.h"
 #include "transaction.h"
@@ -497,13 +498,13 @@ next2:
                        break;
        }
 out:
+       btrfs_free_path(path);
        fs_info->qgroup_flags |= flags;
        if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
                clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
                 ret >= 0)
                ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
-       btrfs_free_path(path);
 
        if (ret < 0) {
                ulist_free(fs_info->qgroup_ulist);
@@ -893,8 +894,6 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
 
-       path->leave_spinning = 1;
-
        key.objectid = 0;
        key.offset = 0;
        key.type = 0;
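
The dropped assignment is fallout from the extent buffer locking rework that landed during the 5.11 cycle: tree locks became plain rw_semaphores, the spinning vs blocking distinction disappeared, and struct btrfs_path lost its leave_spinning field entirely. A condensed before/after sketch (names as they existed before the rework):

    /* Before: keep the path's locks spinning across the search. */
    path->leave_spinning = 1;
    ret = btrfs_search_slot(trans, root, &key, path, -1, 1);

    /* After: there is no spinning mode to preserve, so the flag is
     * simply gone and the search call is unchanged. */
    ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
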
@@ -936,6 +935,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
        struct btrfs_key found_key;
        struct btrfs_qgroup *qgroup = NULL;
        struct btrfs_trans_handle *trans = NULL;
+       struct ulist *ulist = NULL;
        int ret = 0;
        int slot;
 
@@ -943,8 +943,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
        if (fs_info->quota_root)
                goto out;
 
-       fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
-       if (!fs_info->qgroup_ulist) {
+       ulist = ulist_alloc(GFP_KERNEL);
+       if (!ulist) {
                ret = -ENOMEM;
                goto out;
        }
@@ -952,6 +952,22 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
        ret = btrfs_sysfs_add_qgroups(fs_info);
        if (ret < 0)
                goto out;
+
+       /*
+        * Unlock qgroup_ioctl_lock before starting the transaction. This is to
+        * avoid lock acquisition inversion problems (reported by lockdep) between
+        * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
+        * start a transaction.
+        * After we have started the transaction, lock qgroup_ioctl_lock again
+        * and check if someone else created the quota root in the meantime. If
+        * so, just return success and release the transaction handle.
+        *
+        * Also, we don't need to worry about a concurrent call to
+        * btrfs_sysfs_add_qgroups() after we unlock failing, because that
+        * function returns 0 (success) when the sysfs entries already exist.
+        */
+       mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
        /*
         * 1 for quota root item
         * 1 for BTRFS_QGROUP_STATUS item
@@ -961,12 +977,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
         * would be a lot of overkill.
         */
        trans = btrfs_start_transaction(tree_root, 2);
+
+       mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                trans = NULL;
                goto out;
        }
 
+       if (fs_info->quota_root)
+               goto out;
+
+       fs_info->qgroup_ulist = ulist;
+       ulist = NULL;
+
        /*
         * initially create the quota tree
         */
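
The long comment above describes the ordering in full; the skeleton of the pattern, using only names visible in this hunk, is:

    mutex_unlock(&fs_info->qgroup_ioctl_lock);     /* drop our lock first */
    trans = btrfs_start_transaction(tree_root, 2); /* may take vfs freeze sems */
    mutex_lock(&fs_info->qgroup_ioctl_lock);       /* reacquire, then revalidate */
    if (!IS_ERR(trans) && fs_info->quota_root)
            goto out;                              /* lost the race: already enabled */

Because the mutex is dropped, everything decided before btrfs_start_transaction() must be rechecked after it. That is also why the ulist is now preallocated into a local variable and only assigned to fs_info->qgroup_ulist once we know this task is the one that proceeds.
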
@@ -1026,6 +1050,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
                btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
                if (found_key.type == BTRFS_ROOT_REF_KEY) {
+
+                       /* Release locks on tree_root before we access quota_root */
+                       btrfs_release_path(path);
+
                        ret = add_qgroup_item(trans, quota_root,
                                              found_key.offset);
                        if (ret) {
@@ -1044,6 +1072,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }
+                       ret = btrfs_search_slot_for_read(tree_root, &found_key,
+                                                        path, 1, 0);
+                       if (ret < 0) {
+                               btrfs_abort_transaction(trans, ret);
+                               goto out_free_path;
+                       }
+                       if (ret > 0) {
+                               /*
+                                * Shouldn't happen, but in case it does we
+                                * don't need to do the btrfs_next_item, just
+                                * continue.
+                                */
+                               continue;
+                       }
                }
                ret = btrfs_next_item(tree_root, path);
                if (ret < 0) {
@@ -1106,11 +1148,14 @@ out:
        if (ret) {
                ulist_free(fs_info->qgroup_ulist);
                fs_info->qgroup_ulist = NULL;
-               if (trans)
-                       btrfs_end_transaction(trans);
                btrfs_sysfs_del_qgroups(fs_info);
        }
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
+       if (ret && trans)
+               btrfs_end_transaction(trans);
+       else if (trans)
+               ret = btrfs_end_transaction(trans);
+       ulist_free(ulist);
        return ret;
 }
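
The epilogue ends the transaction only after qgroup_ioctl_lock has been released, keeping the transaction machinery out of the mutex's critical section, consistent with the ordering established above. The two branches keep the first error but still surface a failure from ending the transaction on an otherwise successful run; an equivalent formulation of the same logic (a sketch, not what the patch uses):

    mutex_unlock(&fs_info->qgroup_ioctl_lock);
    if (trans) {
            int end_ret = btrfs_end_transaction(trans);

            /* Keep an earlier error, otherwise report the result of
             * ending the transaction, since that can fail too. */
            if (!ret)
                    ret = end_ret;
    }
    ulist_free(ulist);   /* NULL-safe: frees the ulist only if it went unused */
    return ret;
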
 
@@ -1123,19 +1168,29 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (!fs_info->quota_root)
                goto out;
+       mutex_unlock(&fs_info->qgroup_ioctl_lock);
 
        /*
         * 1 For the root item
         *
         * We should also reserve enough items for the quota tree deletion in
         * btrfs_clean_quota_tree but this is not done.
+        *
+        * Also, we must always start a transaction without holding the mutex
+        * qgroup_ioctl_lock, see btrfs_quota_enable().
         */
        trans = btrfs_start_transaction(fs_info->tree_root, 1);
+
+       mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
+               trans = NULL;
                goto out;
        }
 
+       if (!fs_info->quota_root)
+               goto out;
+
        clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        btrfs_qgroup_wait_for_completion(fs_info, false);
        spin_lock(&fs_info->qgroup_lock);
@@ -1149,13 +1204,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
        ret = btrfs_clean_quota_tree(trans, quota_root);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
-               goto end_trans;
+               goto out;
        }
 
        ret = btrfs_del_root(trans, &quota_root->root_key);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
-               goto end_trans;
+               goto out;
        }
 
        list_del(&quota_root->dirty_list);
@@ -1167,10 +1222,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
 
        btrfs_put_root(quota_root);
 
-end_trans:
-       ret = btrfs_end_transaction(trans);
 out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
+       if (ret && trans)
+               btrfs_end_transaction(trans);
+       else if (trans)
+               ret = btrfs_end_transaction(trans);
+
        return ret;
 }
 
@@ -1306,13 +1364,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
        struct btrfs_qgroup *member;
        struct btrfs_qgroup_list *list;
        struct ulist *tmp;
+       unsigned int nofs_flag;
        int ret = 0;
 
        /* Check the level of src and dst first */
        if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
                return -EINVAL;
 
+       /* We hold a transaction handle open, must do a NOFS allocation. */
+       nofs_flag = memalloc_nofs_save();
        tmp = ulist_alloc(GFP_KERNEL);
+       memalloc_nofs_restore(nofs_flag);
        if (!tmp)
                return -ENOMEM;
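
This is the scoped-NOFS pattern, and the reason for the new <linux/sched/mm.h> include at the top of the diff. Instead of passing GFP_NOFS at the call site, the task temporarily masks __GFP_FS so that reclaim triggered by any allocation in the scope cannot recurse into the filesystem and deadlock against the transaction handle we hold. Distilled (both call sites in this diff follow the same shape):

    unsigned int nofs_flag;

    nofs_flag = memalloc_nofs_save();   /* allocations below behave as GFP_NOFS */
    tmp = ulist_alloc(GFP_KERNEL);
    memalloc_nofs_restore(nofs_flag);   /* restore the previous allocator mask */
    if (!tmp)
            return -ENOMEM;
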
 
@@ -1369,10 +1431,14 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
        struct btrfs_qgroup_list *list;
        struct ulist *tmp;
        bool found = false;
+       unsigned int nofs_flag;
        int ret = 0;
        int ret2;
 
+       /* We hold a transaction handle open, must do a NOFS allocation. */
+       nofs_flag = memalloc_nofs_save();
        tmp = ulist_alloc(GFP_KERNEL);
+       memalloc_nofs_restore(nofs_flag);
        if (!tmp)
                return -ENOMEM;
 
@@ -1876,34 +1942,22 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
                struct btrfs_key dst_key;
 
                if (src_path->nodes[cur_level] == NULL) {
-                       struct btrfs_key first_key;
                        struct extent_buffer *eb;
                        int parent_slot;
-                       u64 child_gen;
-                       u64 child_bytenr;
 
                        eb = src_path->nodes[cur_level + 1];
                        parent_slot = src_path->slots[cur_level + 1];
-                       child_bytenr = btrfs_node_blockptr(eb, parent_slot);
-                       child_gen = btrfs_node_ptr_generation(eb, parent_slot);
-                       btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
 
-                       eb = read_tree_block(fs_info, child_bytenr, child_gen,
-                                            cur_level, &first_key);
+                       eb = btrfs_read_node_slot(eb, parent_slot);
                        if (IS_ERR(eb)) {
                                ret = PTR_ERR(eb);
                                goto out;
-                       } else if (!extent_buffer_uptodate(eb)) {
-                               free_extent_buffer(eb);
-                               ret = -EIO;
-                               goto out;
                        }
 
                        src_path->nodes[cur_level] = eb;
 
                        btrfs_tree_read_lock(eb);
-                       btrfs_set_lock_blocking_read(eb);
-                       src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+                       src_path->locks[cur_level] = BTRFS_READ_LOCK;
                }
 
                src_path->slots[cur_level] = dst_path->slots[cur_level];
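
The same substitution repeats in the next two hunks: btrfs_read_node_slot() folds in everything the removed lines did by hand. Its contract, roughly, as of this series (a sketch of the caller-visible behavior, not the helper's body):

    /* Read the child node at @slot of @parent: the helper looks up the
     * child's bytenr, generation and first key from the parent block,
     * reads it, and turns a non-uptodate buffer into ERR_PTR(-EIO)
     * itself, so callers need only the IS_ERR() check. */
    eb = btrfs_read_node_slot(parent, slot);
    if (IS_ERR(eb)) {
            ret = PTR_ERR(eb);
            goto out;
    }
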
@@ -1998,10 +2052,8 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
 
        /* Read the tree block if needed */
        if (dst_path->nodes[cur_level] == NULL) {
-               struct btrfs_key first_key;
                int parent_slot;
                u64 child_gen;
-               u64 child_bytenr;
 
                /*
                 * dst_path->nodes[root_level] must be initialized before
@@ -2020,31 +2072,23 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
                 */
                eb = dst_path->nodes[cur_level + 1];
                parent_slot = dst_path->slots[cur_level + 1];
-               child_bytenr = btrfs_node_blockptr(eb, parent_slot);
                child_gen = btrfs_node_ptr_generation(eb, parent_slot);
-               btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
 
                /* This node is old, no need to trace */
                if (child_gen < last_snapshot)
                        goto out;
 
-               eb = read_tree_block(fs_info, child_bytenr, child_gen,
-                                    cur_level, &first_key);
+               eb = btrfs_read_node_slot(eb, parent_slot);
                if (IS_ERR(eb)) {
                        ret = PTR_ERR(eb);
                        goto out;
-               } else if (!extent_buffer_uptodate(eb)) {
-                       free_extent_buffer(eb);
-                       ret = -EIO;
-                       goto out;
                }
 
                dst_path->nodes[cur_level] = eb;
                dst_path->slots[cur_level] = 0;
 
                btrfs_tree_read_lock(eb);
-               btrfs_set_lock_blocking_read(eb);
-               dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+               dst_path->locks[cur_level] = BTRFS_READ_LOCK;
                need_cleanup = true;
        }
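
The BTRFS_READ_LOCK_BLOCKING removals in this and the neighboring hunks come from the same locking rework that deleted path->leave_spinning earlier in the diff: the extent buffer lock is now a plain rw_semaphore, so there is no spinning lock to convert and no blocking variant to record. The read-lock idiom reduces to:

    btrfs_tree_read_lock(eb);                 /* read-side of the buffer's rwsem */
    path->locks[level] = BTRFS_READ_LOCK;     /* so btrfs_release_path() unlocks it */
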
 
@@ -2188,38 +2232,28 @@ walk_down:
        level = root_level;
        while (level >= 0) {
                if (path->nodes[level] == NULL) {
-                       struct btrfs_key first_key;
                        int parent_slot;
-                       u64 child_gen;
                        u64 child_bytenr;
 
                        /*
-                        * We need to get child blockptr/gen from parent before
-                        * we can read it.
+                        * We need to get child blockptr from parent before we
+                        * can read it.
                         */
                        eb = path->nodes[level + 1];
                        parent_slot = path->slots[level + 1];
                        child_bytenr = btrfs_node_blockptr(eb, parent_slot);
-                       child_gen = btrfs_node_ptr_generation(eb, parent_slot);
-                       btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
 
-                       eb = read_tree_block(fs_info, child_bytenr, child_gen,
-                                            level, &first_key);
+                       eb = btrfs_read_node_slot(eb, parent_slot);
                        if (IS_ERR(eb)) {
                                ret = PTR_ERR(eb);
                                goto out;
-                       } else if (!extent_buffer_uptodate(eb)) {
-                               free_extent_buffer(eb);
-                               ret = -EIO;
-                               goto out;
                        }
 
                        path->nodes[level] = eb;
                        path->slots[level] = 0;
 
                        btrfs_tree_read_lock(eb);
-                       btrfs_set_lock_blocking_read(eb);
-                       path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+                       path->locks[level] = BTRFS_READ_LOCK;
 
                        ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
                                                        fs_info->nodesize,
@@ -2315,7 +2349,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
  * Update qgroup rfer/excl counters.
  * Rfer update is easy, the code can explain itself.
  *
- * Excl update is tricky, the update is split into 2 part.
+ * Excl update is tricky, the update is split into 2 parts.
  * Part 1: Possible exclusive <-> sharing detect:
  *     |       A       |       !A      |
  *  -------------------------------------
@@ -3417,24 +3451,20 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode,
 {
        struct rb_node *node;
        struct rb_node *next;
-       struct ulist_node *entry = NULL;
+       struct ulist_node *entry;
        int ret = 0;
 
        node = reserved->range_changed.root.rb_node;
+       if (!node)
+               return 0;
        while (node) {
                entry = rb_entry(node, struct ulist_node, rb_node);
                if (entry->val < start)
                        node = node->rb_right;
-               else if (entry)
-                       node = node->rb_left;
                else
-                       break;
+                       node = node->rb_left;
        }
 
-       /* Empty changeset */
-       if (!entry)
-               return 0;
-
        if (entry->val > start && rb_prev(&entry->rb_node))
                entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
                                 rb_node);
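
The removed descent was broken: "else if (entry)" is always true once the loop body has run, so the trailing "else break;" was unreachable, and the post-loop "if (!entry)" could only catch a completely empty tree, a case now handled up front. What remains is a standard rbtree descent keyed on ulist_node.val:

    /* Walk toward the insertion point for "start", remembering the last
     * node visited.  The loop exits with "entry" at one of the two
     * neighbors of that point; the fixup that follows steps back one
     * node when entry->val > start, so the forward scan begins at the
     * last range starting before "start", which may still overlap it. */
    while (node) {
            entry = rb_entry(node, struct ulist_node, rb_node);
            if (entry->val < start)
                    node = node->rb_right;
            else
                    node = node->rb_left;
    }
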
@@ -3498,6 +3528,7 @@ static int try_flush_qgroup(struct btrfs_root *root)
 {
        struct btrfs_trans_handle *trans;
        int ret;
+       bool can_commit = true;
 
        /*
         * We don't want to run flush again and again, so if there is a running
@@ -3509,6 +3540,20 @@ static int try_flush_qgroup(struct btrfs_root *root)
                return 0;
        }
 
+       /*
+        * If the current process holds a transaction, we shouldn't flush, as
+        * we assume all space reservation happens before a transaction handle
+        * is held.
+        *
+        * But there are cases like btrfs_delayed_item_reserve_metadata() where
+        * we try to reserve space with a transaction handle already held.
+        * In that case we can't commit the transaction, but at least try to
+        * end it and hope the started data writes can free some space.
+        */
+       if (current->journal_info &&
+           current->journal_info != BTRFS_SEND_TRANS_STUB)
+               can_commit = false;
+
        ret = btrfs_start_delalloc_snapshot(root);
        if (ret < 0)
                goto out;
@@ -3520,7 +3565,10 @@ static int try_flush_qgroup(struct btrfs_root *root)
                goto out;
        }
 
-       ret = btrfs_commit_transaction(trans);
+       if (can_commit)
+               ret = btrfs_commit_transaction(trans);
+       else
+               ret = btrfs_end_transaction(trans);
 out:
        clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
        wake_up(&root->qgroup_flush_wait);
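
current->journal_info is how btrfs marks "this task is inside a transaction": it holds the task's btrfs_trans_handle, or the special BTRFS_SEND_TRANS_STUB marker during send, which is not a real handle. The decision above, condensed into one expression:

    /* Committing while this task already holds a transaction handle
     * would deadlock on our own open transaction; merely ending the
     * new handle still lets the delalloc flush started earlier
     * complete and return reserved space. */
    bool can_commit = !(current->journal_info &&
                        current->journal_info != BTRFS_SEND_TRANS_STUB);

    ret = can_commit ? btrfs_commit_transaction(trans)
                     : btrfs_end_transaction(trans);
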
@@ -4160,7 +4208,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
        spin_unlock(&blocks->lock);
 
        /* Read out reloc subtree root */
-       reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
+       reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0,
                                   block->reloc_generation, block->level,
                                   &block->first_key);
        if (IS_ERR(reloc_eb)) {
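
The bare 0 slotted into the argument list is the owner_root parameter that read_tree_block() gained elsewhere in this merge window. For reference, the prototype after the change, as found in the 5.11 tree:

    struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info,
                                          u64 bytenr, u64 owner_root,
                                          u64 parent_transid, int level,
                                          struct btrfs_key *first_key);

owner_root is used to pick the buffer's lockdep class; passing 0 here falls back to the default class, presumably because the swapped-block bookkeeping does not record which root owns the reloc tree block.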