btrfs_end_write_no_snapshoting(root);
btrfs_delalloc_release_metadata(inode, release_bytes);
} else {
- btrfs_delalloc_release_space(inode, pos, release_bytes);
+ btrfs_delalloc_release_space(inode,
+ round_down(pos, root->sectorsize),
+ release_bytes);
}
}
return num_written ? num_written : ret;
}
-static ssize_t __btrfs_direct_write(struct kiocb *iocb,
- struct iov_iter *from,
- loff_t pos)
+static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
+ loff_t pos = iocb->ki_pos;
ssize_t written;
ssize_t written_buffered;
loff_t endbyte;
int err;
- written = generic_file_direct_write(iocb, from, pos);
+ written = generic_file_direct_write(iocb, from);
if (written < 0 || !iov_iter_count(from))
return written;
atomic_inc(&BTRFS_I(inode)->sync_writers);
if (iocb->ki_flags & IOCB_DIRECT) {
- num_written = __btrfs_direct_write(iocb, from, pos);
+ num_written = __btrfs_direct_write(iocb, from);
} else {
num_written = __btrfs_buffered_write(file, from, pos);
if (num_written > 0)
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->last_sub_trans = root->log_transid;
spin_unlock(&BTRFS_I(inode)->lock);
- if (num_written > 0) {
- err = generic_write_sync(file, pos, num_written);
- if (err < 0)
- num_written = err;
- }
+ if (num_written > 0)
+ num_written = generic_write_sync(iocb, num_written);
if (sync)
atomic_dec(&BTRFS_I(inode)->sync_writers);
.fallocate = btrfs_fallocate,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
- .compat_ioctl = btrfs_ioctl,
+ .compat_ioctl = btrfs_compat_ioctl,
#endif
.copy_file_range = btrfs_copy_file_range,
.clone_file_range = btrfs_clone_file_range,
async_extent->ram_size - 1, 0);
goto out_free_reserve;
}
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
/*
* clear dirty, set writeback and unlock the pages.
}
return;
out_free_reserve:
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
extent_clear_unlock_delalloc(inode, async_extent->start,
goto out_drop_extent_cache;
}
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+
if (disk_num_bytes < cur_alloc_size)
break;
out_drop_extent_cache:
btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_unlock:
extent_clear_unlock_delalloc(inode, start, end, locked_page,
*/
if (csum_exist_in_range(root, disk_bytenr, num_bytes))
goto out_check;
+ if (!btrfs_inc_nocow_writers(root->fs_info,
+ disk_bytenr))
+ goto out_check;
nocow = 1;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
extent_end = found_key.offset +
path->slots[0]++;
if (!nolock && nocow)
btrfs_end_write_no_snapshoting(root);
+ if (nocow)
+ btrfs_dec_nocow_writers(root->fs_info,
+ disk_bytenr);
goto next_slot;
}
if (!nocow) {
if (ret) {
if (!nolock && nocow)
btrfs_end_write_no_snapshoting(root);
+ if (nocow)
+ btrfs_dec_nocow_writers(root->fs_info,
+ disk_bytenr);
goto error;
}
cow_start = (u64)-1;
ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
num_bytes, num_bytes, type);
+ if (nocow)
+ btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
return em;
}
+ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
+ const u64 start,
+ const u64 len,
+ const u64 orig_start,
+ const u64 block_start,
+ const u64 block_len,
+ const u64 orig_block_len,
+ const u64 ram_bytes,
+ const int type)
+ {
+ struct extent_map *em = NULL;
+ int ret;
+
+ down_read(&BTRFS_I(inode)->dio_sem);
+ if (type != BTRFS_ORDERED_NOCOW) {
+ em = create_pinned_em(inode, start, len, orig_start,
+ block_start, block_len, orig_block_len,
+ ram_bytes, type);
+ if (IS_ERR(em))
+ goto out;
+ }
+ ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
+ len, block_len, type);
+ if (ret) {
+ if (em) {
+ free_extent_map(em);
+ btrfs_drop_extent_cache(inode, start,
+ start + len - 1, 0);
+ }
+ em = ERR_PTR(ret);
+ }
+ out:
+ up_read(&BTRFS_I(inode)->dio_sem);
+
+ return em;
+ }
+
static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
u64 start, u64 len)
{
if (ret)
return ERR_PTR(ret);
- /*
- * Create the ordered extent before the extent map. This is to avoid
- * races with the fast fsync path that would lead to it logging file
- * extent items that point to disk extents that were not yet written to.
- * The fast fsync path collects ordered extents into a local list and
- * then collects all the new extent maps, so we must create the ordered
- * extent first and make sure the fast fsync path collects any new
- * ordered extents after collecting new extent maps as well.
- * The fsync path simply can not rely on inode_dio_wait() because it
- * causes deadlock with AIO.
- */
- ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
- ins.offset, ins.offset, 0);
- if (ret) {
+ em = btrfs_create_dio_extent(inode, start, ins.offset, start,
+ ins.objectid, ins.offset, ins.offset,
+ ins.offset, 0);
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+ if (IS_ERR(em))
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
- return ERR_PTR(ret);
- }
-
- em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
- ins.offset, ins.offset, ins.offset, 0);
- if (IS_ERR(em)) {
- struct btrfs_ordered_extent *oe;
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
- oe = btrfs_lookup_ordered_extent(inode, start);
- ASSERT(oe);
- if (WARN_ON(!oe))
- return em;
- set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
- set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
- btrfs_remove_ordered_extent(inode, oe);
- /* Once for our lookup and once for the ordered extents tree. */
- btrfs_put_ordered_extent(oe);
- btrfs_put_ordered_extent(oe);
- }
return em;
}
block_start = em->block_start + (start - em->start);
if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes) == 1) {
+ &orig_block_len, &ram_bytes) == 1 &&
+ btrfs_inc_nocow_writers(root->fs_info, block_start)) {
+ struct extent_map *em2;
+
+ em2 = btrfs_create_dio_extent(inode, start, len,
+ orig_start, block_start,
+ len, orig_block_len,
+ ram_bytes, type);
+ btrfs_dec_nocow_writers(root->fs_info, block_start);
if (type == BTRFS_ORDERED_PREALLOC) {
free_extent_map(em);
- em = create_pinned_em(inode, start, len,
- orig_start,
- block_start, len,
- orig_block_len,
- ram_bytes, type);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto unlock_err;
- }
+ em = em2;
}
-
- ret = btrfs_add_ordered_extent_dio(inode, start,
- block_start, len, len, type);
- if (ret) {
- free_extent_map(em);
+ if (em2 && IS_ERR(em2)) {
+ ret = PTR_ERR(em2);
goto unlock_err;
}
goto unlock;
return retval;
}
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_dio_data dio_data = { 0 };
+ loff_t offset = iocb->ki_pos;
size_t count = 0;
int flags = 0;
bool wakeup = true;
ret = __blockdev_direct_IO(iocb, inode,
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
- iter, offset, btrfs_get_blocks_direct, NULL,
+ iter, btrfs_get_blocks_direct, NULL,
btrfs_submit_direct, flags);
if (iov_iter_rw(iter) == WRITE) {
current->journal_info = NULL;
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
+ init_rwsem(&ei->dio_sem);
return inode;
}
return 0;
}
+ static int btrfs_rename_exchange(struct inode *old_dir,
+ struct dentry *old_dentry,
+ struct inode *new_dir,
+ struct dentry *new_dentry)
+ {
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(old_dir)->root;
+ struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+ struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = old_dentry->d_inode;
+ struct timespec ctime = CURRENT_TIME;
+ struct dentry *parent;
+ u64 old_ino = btrfs_ino(old_inode);
+ u64 new_ino = btrfs_ino(new_inode);
+ u64 old_idx = 0;
+ u64 new_idx = 0;
+ u64 root_objectid;
+ int ret;
+ bool root_log_pinned = false;
+ bool dest_log_pinned = false;
+
+ /* we only allow rename subvolume link between subvolumes */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+ return -EXDEV;
+
+ /* close the race window with snapshot create/destroy ioctl */
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ down_read(&root->fs_info->subvol_sem);
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+ down_read(&dest->fs_info->subvol_sem);
+
+ /*
+ * We want to reserve the absolute worst case amount of items. So if
+ * both inodes are subvols and we need to unlink them then that would
+ * require 4 item modifications, but if they are both normal inodes it
+ * would require 5 item modifications, so we'll assume their normal
+ * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items
+ * should cover the worst case number of items we'll modify.
+ */
+ trans = btrfs_start_transaction(root, 12);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_notrans;
+ }
+
+ /*
+ * We need to find a free sequence number both in the source and
+ * in the destination directory for the exchange.
+ */
+ ret = btrfs_set_inode_index(new_dir, &old_idx);
+ if (ret)
+ goto out_fail;
+ ret = btrfs_set_inode_index(old_dir, &new_idx);
+ if (ret)
+ goto out_fail;
+
+ BTRFS_I(old_inode)->dir_index = 0ULL;
+ BTRFS_I(new_inode)->dir_index = 0ULL;
+
+ /* Reference for the source. */
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /* force full log commit if subvolume involved. */
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ } else {
+ btrfs_pin_log_trans(root);
+ root_log_pinned = true;
+ ret = btrfs_insert_inode_ref(trans, dest,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len,
+ old_ino,
+ btrfs_ino(new_dir), old_idx);
+ if (ret)
+ goto out_fail;
+ }
+
+ /* And now for the dest. */
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /* force full log commit if subvolume involved. */
+ btrfs_set_log_full_commit(dest->fs_info, trans);
+ } else {
+ btrfs_pin_log_trans(dest);
+ dest_log_pinned = true;
+ ret = btrfs_insert_inode_ref(trans, root,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len,
+ new_ino,
+ btrfs_ino(old_dir), new_idx);
+ if (ret)
+ goto out_fail;
+ }
+
+ /* Update inode version and ctime/mtime. */
+ inode_inc_iversion(old_dir);
+ inode_inc_iversion(new_dir);
+ inode_inc_iversion(old_inode);
+ inode_inc_iversion(new_inode);
+ old_dir->i_ctime = old_dir->i_mtime = ctime;
+ new_dir->i_ctime = new_dir->i_mtime = ctime;
+ old_inode->i_ctime = ctime;
+ new_inode->i_ctime = ctime;
+
+ if (old_dentry->d_parent != new_dentry->d_parent) {
+ btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+ btrfs_record_unlink_dir(trans, new_dir, new_inode, 1);
+ }
+
+ /* src is a subvolume */
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+ ret = btrfs_unlink_subvol(trans, root, old_dir,
+ root_objectid,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ } else { /* src is an inode */
+ ret = __btrfs_unlink_inode(trans, root, old_dir,
+ old_dentry->d_inode,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ if (!ret)
+ ret = btrfs_update_inode(trans, root, old_inode);
+ }
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
+
+ /* dest is a subvolume */
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
+ ret = btrfs_unlink_subvol(trans, dest, new_dir,
+ root_objectid,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ } else { /* dest is an inode */
+ ret = __btrfs_unlink_inode(trans, dest, new_dir,
+ new_dentry->d_inode,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ if (!ret)
+ ret = btrfs_update_inode(trans, dest, new_inode);
+ }
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
+
+ ret = btrfs_add_link(trans, new_dir, old_inode,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len, 0, old_idx);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
+
+ ret = btrfs_add_link(trans, old_dir, new_inode,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len, 0, new_idx);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
+
+ if (old_inode->i_nlink == 1)
+ BTRFS_I(old_inode)->dir_index = old_idx;
+ if (new_inode->i_nlink == 1)
+ BTRFS_I(new_inode)->dir_index = new_idx;
+
+ if (root_log_pinned) {
+ parent = new_dentry->d_parent;
+ btrfs_log_new_name(trans, old_inode, old_dir, parent);
+ btrfs_end_log_trans(root);
+ root_log_pinned = false;
+ }
+ if (dest_log_pinned) {
+ parent = old_dentry->d_parent;
+ btrfs_log_new_name(trans, new_inode, new_dir, parent);
+ btrfs_end_log_trans(dest);
+ dest_log_pinned = false;
+ }
+ out_fail:
+ /*
+ * If we have pinned a log and an error happened, we unpin tasks
+ * trying to sync the log and force them to fallback to a transaction
+ * commit if the log currently contains any of the inodes involved in
+ * this rename operation (to ensure we do not persist a log with an
+ * inconsistent state for any of these inodes or leading to any
+ * inconsistencies when replayed). If the transaction was aborted, the
+ * abortion reason is propagated to userspace when attempting to commit
+ * the transaction. If the log does not contain any of these inodes, we
+ * allow the tasks to sync it.
+ */
+ if (ret && (root_log_pinned || dest_log_pinned)) {
+ if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+ btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+ btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+ (new_inode &&
+ btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+ btrfs_set_log_full_commit(root->fs_info, trans);
+
+ if (root_log_pinned) {
+ btrfs_end_log_trans(root);
+ root_log_pinned = false;
+ }
+ if (dest_log_pinned) {
+ btrfs_end_log_trans(dest);
+ dest_log_pinned = false;
+ }
+ }
+ ret = btrfs_end_transaction(trans, root);
+ out_notrans:
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+ up_read(&dest->fs_info->subvol_sem);
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ up_read(&root->fs_info->subvol_sem);
+
+ return ret;
+ }
+
+ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir,
+ struct dentry *dentry)
+ {
+ int ret;
+ struct inode *inode;
+ u64 objectid;
+ u64 index;
+
+ ret = btrfs_find_free_ino(root, &objectid);
+ if (ret)
+ return ret;
+
+ inode = btrfs_new_inode(trans, root, dir,
+ dentry->d_name.name,
+ dentry->d_name.len,
+ btrfs_ino(dir),
+ objectid,
+ S_IFCHR | WHITEOUT_MODE,
+ &index);
+
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ return ret;
+ }
+
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode,
+ WHITEOUT_DEV);
+
+ ret = btrfs_init_inode_security(trans, inode, dir,
+ &dentry->d_name);
+ if (ret)
+ goto out;
+
+ ret = btrfs_add_nondir(trans, dir, dentry,
+ inode, 0, index);
+ if (ret)
+ goto out;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ out:
+ unlock_new_inode(inode);
+ if (ret)
+ inode_dec_link_count(inode);
+ iput(inode);
+
+ return ret;
+ }
+
static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct btrfs_trans_handle *trans;
+ unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = d_inode(new_dentry);
u64 root_objectid;
int ret;
u64 old_ino = btrfs_ino(old_inode);
+ bool log_pinned = false;
if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
* We want to reserve the absolute worst case amount of items. So if
* both inodes are subvols and we need to unlink them then that would
* require 4 item modifications, but if they are both normal inodes it
- * would require 5 item modifications, so we'll assume their normal
+ * would require 5 item modifications, so we'll assume they are normal
* inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
* should cover the worst case number of items we'll modify.
+ * If our rename has the whiteout flag, we need more 5 units for the
+ * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
+ * when selinux is enabled).
*/
- trans = btrfs_start_transaction(root, 11);
+ trans_num_items = 11;
+ if (flags & RENAME_WHITEOUT)
+ trans_num_items += 5;
+ trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out_notrans;
- }
+ ret = PTR_ERR(trans);
+ goto out_notrans;
+ }
if (dest != root)
btrfs_record_root_in_trans(trans, dest);
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(root->fs_info, trans);
} else {
+ btrfs_pin_log_trans(root);
+ log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
btrfs_ino(new_dir), index);
if (ret)
goto out_fail;
- /*
- * this is an ugly little race, but the rename is required
- * to make sure that if we crash, the inode is either at the
- * old name or the new one. pinning the log transaction lets
- * us make sure we don't allow a log commit to come in after
- * we unlink the name but before we add the new name back in.
- */
- btrfs_pin_log_trans(root);
}
inode_inc_iversion(old_dir);
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ if (log_pinned) {
struct dentry *parent = new_dentry->d_parent;
+
btrfs_log_new_name(trans, old_inode, old_dir, parent);
btrfs_end_log_trans(root);
+ log_pinned = false;
+ }
+
+ if (flags & RENAME_WHITEOUT) {
+ ret = btrfs_whiteout_for_rename(trans, root, old_dir,
+ old_dentry);
+
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
}
out_fail:
+ /*
+ * If we have pinned the log and an error happened, we unpin tasks
+ * trying to sync the log and force them to fallback to a transaction
+ * commit if the log currently contains any of the inodes involved in
+ * this rename operation (to ensure we do not persist a log with an
+ * inconsistent state for any of these inodes or leading to any
+ * inconsistencies when replayed). If the transaction was aborted, the
+ * abortion reason is propagated to userspace when attempting to commit
+ * the transaction. If the log does not contain any of these inodes, we
+ * allow the tasks to sync it.
+ */
+ if (ret && log_pinned) {
+ if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+ btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+ btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+ (new_inode &&
+ btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+ btrfs_set_log_full_commit(root->fs_info, trans);
+
+ btrfs_end_log_trans(root);
+ log_pinned = false;
+ }
btrfs_end_transaction(trans, root);
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
- if (flags & ~RENAME_NOREPLACE)
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
- return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (flags & RENAME_EXCHANGE)
+ return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+ new_dentry);
+
+ return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
}
static void btrfs_run_delalloc_work(struct btrfs_work *work)
btrfs_end_transaction(trans, root);
break;
}
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
last_alloc = ins.offset;
ret = insert_reserved_file_extent(trans, inode,
.symlink = btrfs_symlink,
.setattr = btrfs_setattr,
.mknod = btrfs_mknod,
- .setxattr = btrfs_setxattr,
+ .setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
- .removexattr = btrfs_removexattr,
+ .removexattr = generic_removexattr,
.permission = btrfs_permission,
.get_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.iterate = btrfs_real_readdir,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
- .compat_ioctl = btrfs_ioctl,
+ .compat_ioctl = btrfs_compat_ioctl,
#endif
.release = btrfs_release_file,
.fsync = btrfs_sync_file,
static const struct inode_operations btrfs_file_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
- .setxattr = btrfs_setxattr,
+ .setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
- .removexattr = btrfs_removexattr,
+ .removexattr = generic_removexattr,
.permission = btrfs_permission,
.fiemap = btrfs_fiemap,
.get_acl = btrfs_get_acl,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
- .setxattr = btrfs_setxattr,
+ .setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
- .removexattr = btrfs_removexattr,
+ .removexattr = generic_removexattr,
.get_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
- .setxattr = btrfs_setxattr,
+ .setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
- .removexattr = btrfs_removexattr,
+ .removexattr = generic_removexattr,
.update_time = btrfs_update_time,
};
if (flags & BTRFS_INODE_NODATACOW)
iflags |= FS_NOCOW_FL;
- if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS))
- iflags |= FS_COMPR_FL;
- else if (flags & BTRFS_INODE_NOCOMPRESS)
+ if (flags & BTRFS_INODE_NOCOMPRESS)
iflags |= FS_NOCOMP_FL;
+ else if (flags & BTRFS_INODE_COMPRESS)
+ iflags |= FS_COMPR_FL;
return iflags;
}
{
struct btrfs_trans_handle *trans;
struct btrfs_key key;
- struct btrfs_root_item root_item;
+ struct btrfs_root_item *root_item;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
struct btrfs_root *root = BTRFS_I(dir)->root;
u64 qgroup_reserved;
uuid_le new_uuid;
+ root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
+ if (!root_item)
+ return -ENOMEM;
+
ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
if (ret)
- return ret;
+ goto fail_free;
/*
* Don't create subvolume whose level is not zero. Or qgroup will be
* screwed up since it assume subvolme qgroup's level to be 0.
*/
- if (btrfs_qgroup_level(objectid))
- return -ENOSPC;
+ if (btrfs_qgroup_level(objectid)) {
+ ret = -ENOSPC;
+ goto fail_free;
+ }
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
8, &qgroup_reserved, false);
if (ret)
- return ret;
+ goto fail_free;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_subvolume_release_metadata(root, &block_rsv,
qgroup_reserved);
- return ret;
+ goto fail_free;
}
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
BTRFS_UUID_SIZE);
btrfs_mark_buffer_dirty(leaf);
- memset(&root_item, 0, sizeof(root_item));
-
- inode_item = &root_item.inode;
+ inode_item = &root_item->inode;
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
- btrfs_set_root_flags(&root_item, 0);
- btrfs_set_root_limit(&root_item, 0);
+ btrfs_set_root_flags(root_item, 0);
+ btrfs_set_root_limit(root_item, 0);
btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
- btrfs_set_root_bytenr(&root_item, leaf->start);
- btrfs_set_root_generation(&root_item, trans->transid);
- btrfs_set_root_level(&root_item, 0);
- btrfs_set_root_refs(&root_item, 1);
- btrfs_set_root_used(&root_item, leaf->len);
- btrfs_set_root_last_snapshot(&root_item, 0);
+ btrfs_set_root_bytenr(root_item, leaf->start);
+ btrfs_set_root_generation(root_item, trans->transid);
+ btrfs_set_root_level(root_item, 0);
+ btrfs_set_root_refs(root_item, 1);
+ btrfs_set_root_used(root_item, leaf->len);
+ btrfs_set_root_last_snapshot(root_item, 0);
- btrfs_set_root_generation_v2(&root_item,
- btrfs_root_generation(&root_item));
+ btrfs_set_root_generation_v2(root_item,
+ btrfs_root_generation(root_item));
uuid_le_gen(&new_uuid);
- memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
- btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec);
- btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec);
- root_item.ctime = root_item.otime;
- btrfs_set_root_ctransid(&root_item, trans->transid);
- btrfs_set_root_otransid(&root_item, trans->transid);
+ memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+ btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
+ btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
+ root_item->ctime = root_item->otime;
+ btrfs_set_root_ctransid(root_item, trans->transid);
+ btrfs_set_root_otransid(root_item, trans->transid);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
leaf = NULL;
- btrfs_set_root_dirid(&root_item, new_dirid);
+ btrfs_set_root_dirid(root_item, new_dirid);
key.objectid = objectid;
key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
- &root_item);
+ root_item);
if (ret)
goto fail;
BUG_ON(ret);
ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
- root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+ root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
objectid);
if (ret)
btrfs_abort_transaction(trans, root, ret);
fail:
+ kfree(root_item);
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
d_instantiate(dentry, inode);
}
return ret;
+
+ fail_free:
+ kfree(root_item);
+ return ret;
}
static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
if (ret)
goto dec_and_free;
- btrfs_wait_ordered_extents(root, -1);
+ btrfs_wait_ordered_extents(root, -1, 0, (u64)-1);
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
struct dentry *dentry;
int error;
- error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
- if (error == -EINTR)
- return error;
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+ // XXX: should've been
+ // mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ // if (error == -EINTR)
+ // return error;
dentry = lookup_one_len(name, parent->dentry, namelen);
error = PTR_ERR(dentry);
goto out;
- err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
- if (err == -EINTR)
- goto out_drop_write;
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+ // XXX: should've been
+ // err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ // if (err == -EINTR)
+ // goto out_drop_write;
dentry = lookup_one_len(vol_args->name, parent, namelen);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
dput(dentry);
out_unlock_dir:
inode_unlock(dir);
-out_drop_write:
+//out_drop_write:
mnt_drop_write_file(file);
out:
kfree(vol_args);
return ret;
}
- static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
+ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
- struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
if (!capable(CAP_SYS_ADMIN))
goto err_drop;
}
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ /* Check for compatibility reject unknown flags */
+ if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
+ return -EOPNOTSUPP;
if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1)) {
}
mutex_lock(&root->fs_info->volume_mutex);
- ret = btrfs_rm_device(root, vol_args->name);
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+ ret = btrfs_rm_device(root, NULL, vol_args->devid);
+ } else {
+ vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ ret = btrfs_rm_device(root, vol_args->name, 0);
+ }
mutex_unlock(&root->fs_info->volume_mutex);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
- if (!ret)
- btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
-
+ if (!ret) {
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
+ btrfs_info(root->fs_info, "device deleted: id %llu",
+ vol_args->devid);
+ else
+ btrfs_info(root->fs_info, "device deleted: %s",
+ vol_args->name);
+ }
out:
kfree(vol_args);
err_drop:
return ret;
}
+ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
+ {
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_vol_args *vol_args;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ goto out_drop_write;
+ }
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ mutex_lock(&root->fs_info->volume_mutex);
+ ret = btrfs_rm_device(root, vol_args->name, 0);
+ mutex_unlock(&root->fs_info->volume_mutex);
+
+ if (!ret)
+ btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
+ kfree(vol_args);
+ out:
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ out_drop_write:
+ mnt_drop_write_file(file);
+
+ return ret;
+ }
+
static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
{
struct btrfs_ioctl_fs_info_args *fi_args;
u64 last_dest_end = destoff;
ret = -ENOMEM;
- buf = vmalloc(root->nodesize);
- if (!buf)
- return ret;
+ buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN);
+ if (!buf) {
+ buf = vmalloc(root->nodesize);
+ if (!buf)
+ return ret;
+ }
path = btrfs_alloc_path();
if (!path) {
- vfree(buf);
+ kvfree(buf);
return ret;
}
out:
btrfs_free_path(path);
- vfree(buf);
+ kvfree(buf);
return ret;
}
1)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
- ret = btrfs_dev_replace_start(root, p);
+ ret = btrfs_dev_replace_by_ioctl(root, p);
atomic_set(
&root->fs_info->mutually_exclusive_operation_running,
0);
/* update qgroup status and info */
err = btrfs_run_qgroups(trans, root->fs_info);
if (err < 0)
- btrfs_std_error(root->fs_info, ret,
- "failed to update qgroup status and info\n");
+ btrfs_handle_fs_error(root->fs_info, err,
+ "failed to update qgroup status and info");
err = btrfs_end_transaction(trans, root);
if (err && !ret)
ret = err;
if (ret)
return ret;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_drop_write;
+ }
spin_lock(&root->fs_info->super_lock);
newflags = btrfs_super_compat_flags(super_block);
btrfs_set_super_incompat_flags(super_block, newflags);
spin_unlock(&root->fs_info->super_lock);
- return btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root);
+ out_drop_write:
+ mnt_drop_write_file(file);
+
+ return ret;
}
long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_add_dev(root, argp);
case BTRFS_IOC_RM_DEV:
return btrfs_ioctl_rm_dev(file, argp);
+ case BTRFS_IOC_RM_DEV_V2:
+ return btrfs_ioctl_rm_dev_v2(file, argp);
case BTRFS_IOC_FS_INFO:
return btrfs_ioctl_fs_info(root, argp);
case BTRFS_IOC_DEV_INFO:
return -ENOTTY;
}
+
+ #ifdef CONFIG_COMPAT
+ long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+ {
+ switch (cmd) {
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ case FS_IOC32_GETVERSION:
+ cmd = FS_IOC_GETVERSION;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+ }
+ #endif
INIT_LIST_HEAD(&extents);
+ down_write(&BTRFS_I(inode)->dio_sem);
write_lock(&tree->lock);
test_gen = root->fs_info->last_trans_committed;
}
list_sort(NULL, &extents, extent_cmp);
+ btrfs_get_logged_extents(inode, logged_list, start, end);
/*
- * Collect any new ordered extents within the range. This is to
- * prevent logging file extent items without waiting for the disk
- * location they point to being written. We do this only to deal
- * with races against concurrent lockless direct IO writes.
+ * Some ordered extents started by fsync might have completed
+ * before we could collect them into the list logged_list, which
+ * means they're gone, not in our logged_list nor in the inode's
+ * ordered tree. We want the application/user space to know an
+ * error happened while attempting to persist file data so that
+ * it can take proper action. If such error happened, we leave
+ * without writing to the log tree and the fsync must report the
+ * file data write error and not commit the current transaction.
*/
- btrfs_get_logged_extents(inode, logged_list, start, end);
+ ret = btrfs_inode_check_errors(inode);
+ if (ret)
+ ctx->io_err = ret;
process:
while (!list_empty(&extents)) {
em = list_entry(extents.next, struct extent_map, list);
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
+ up_write(&BTRFS_I(inode)->dio_sem);
btrfs_release_path(path);
return ret;
mutex_lock(&BTRFS_I(inode)->log_mutex);
- /*
- * Collect ordered extents only if we are logging data. This is to
- * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
- * will process the ordered extents if they still exists at the time,
- * because when we collect them we test and set for the flag
- * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
- * same ordered extents. The consequence for the LOG_INODE_ALL log mode
- * not processing the ordered extents is that we end up logging the
- * corresponding file extent items, based on the extent maps in the
- * inode's extent_map_tree's modified_list, without logging the
- * respective checksums (since the may still be only attached to the
- * ordered extents and have not been inserted in the csum tree by
- * btrfs_finish_ordered_io() yet).
- */
- if (inode_only == LOG_INODE_ALL)
- btrfs_get_logged_extents(inode, &logged_list, start, end);
-
/*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
goto out_unlock;
}
if (fast_search) {
- /*
- * Some ordered extents started by fsync might have completed
- * before we collected the ordered extents in logged_list, which
- * means they're gone, not in our logged_list nor in the inode's
- * ordered tree. We want the application/user space to know an
- * error happened while attempting to persist file data so that
- * it can take proper action. If such error happened, we leave
- * without writing to the log tree and the fsync must report the
- * file data write error and not commit the current transaction.
- */
- err = btrfs_inode_check_errors(inode);
- if (err) {
- ctx->io_err = err;
- goto out_unlock;
- }
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
&logged_list, ctx, start, end);
if (ret) {
goto out;
if (!S_ISDIR(inode->i_mode)) {
- if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+ if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
goto out;
inode = d_inode(parent);
}
break;
}
- if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+ if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
break;
if (IS_ROOT(parent))
}
ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR)
+ if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
log_mode = LOG_INODE_ALL;
btrfs_release_path(path);
ret = btrfs_log_inode(trans, root, di_inode,
if (IS_ERR(dir_inode))
continue;
+ if (ctx)
+ ctx->log_new_dentries = false;
ret = btrfs_log_inode(trans, root, dir_inode,
LOG_INODE_ALL, 0, LLONG_MAX, ctx);
if (!ret &&
btrfs_must_commit_transaction(trans, dir_inode))
ret = 1;
+ if (!ret && ctx && ctx->log_new_dentries)
+ ret = log_new_dir_dentries(trans, root,
+ dir_inode, ctx);
iput(dir_inode);
if (ret)
goto out;
}
while (1) {
- if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+ if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
break;
inode = d_inode(parent);
ret = walk_log_tree(trans, log_root_tree, &wc);
if (ret) {
- btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
+ btrfs_handle_fs_error(fs_info, ret, "Failed to pin buffers while "
"recovering log root tree.");
goto error;
}
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
- btrfs_std_error(fs_info, ret,
+ btrfs_handle_fs_error(fs_info, ret,
"Couldn't find tree log root.");
goto error;
}
log = btrfs_read_fs_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
- btrfs_std_error(fs_info, ret,
+ btrfs_handle_fs_error(fs_info, ret,
"Couldn't read tree log root.");
goto error;
}
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
kfree(log);
- btrfs_std_error(fs_info, ret, "Couldn't read target root "
+ btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root "
"for tree log recovery.");
goto error;
}
* into the file. When the file is logged we check it and
* don't log the parents if the file is fully on disk.
*/
- if (S_ISREG(inode->i_mode)) {
- mutex_lock(&BTRFS_I(inode)->log_mutex);
- BTRFS_I(inode)->last_unlink_trans = trans->transid;
- mutex_unlock(&BTRFS_I(inode)->log_mutex);
- }
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
+ BTRFS_I(inode)->last_unlink_trans = trans->transid;
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
/*
* if this directory was already logged any new
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
-#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
+#include <linux/uuid.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
[BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6,
};
+ /*
+ * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
+ * condition is not met. Zero means there's no corresponding
+ * BTRFS_ERROR_DEV_*_NOT_MET value.
+ */
+ const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
+ [BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
+ [BTRFS_RAID_DUP] = 0,
+ [BTRFS_RAID_RAID0] = 0,
+ [BTRFS_RAID_SINGLE] = 0,
+ [BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
+ [BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
+ };
+
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
* if there is new btrfs on an already registered device,
* then remove the stale device entry.
*/
- btrfs_free_stale_device(device);
+ if (ret > 0)
+ btrfs_free_stale_device(device);
*fs_devices_ret = fs_devices;
return ret;
}
+ void btrfs_release_disk_super(struct page *page)
+ {
+ kunmap(page);
+ put_page(page);
+ }
+
+ int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
+ struct page **page, struct btrfs_super_block **disk_super)
+ {
+ void *p;
+ pgoff_t index;
+
+ /* make sure our super fits in the device */
+ if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
+ return 1;
+
+ /* make sure our super fits in the page */
+ if (sizeof(**disk_super) > PAGE_SIZE)
+ return 1;
+
+ /* make sure our super doesn't straddle pages on disk */
+ index = bytenr >> PAGE_SHIFT;
+ if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
+ return 1;
+
+ /* pull in the page with our super */
+ *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+ index, GFP_KERNEL);
+
+ if (IS_ERR_OR_NULL(*page))
+ return 1;
+
+ p = kmap(*page);
+
+ /* align our pointer to the offset of the super block */
+ *disk_super = p + (bytenr & ~PAGE_MASK);
+
+ if (btrfs_super_bytenr(*disk_super) != bytenr ||
+ btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
+ btrfs_release_disk_super(*page);
+ return 1;
+ }
+
+ if ((*disk_super)->label[0] &&
+ (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
+ (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
+
+ return 0;
+ }
+
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
struct btrfs_super_block *disk_super;
struct block_device *bdev;
struct page *page;
- void *p;
int ret = -EINVAL;
u64 devid;
u64 transid;
u64 total_devices;
u64 bytenr;
- pgoff_t index;
/*
* we would like to check all the supers, but that would make
mutex_lock(&uuid_mutex);
bdev = blkdev_get_by_path(path, flags, holder);
-
if (IS_ERR(bdev)) {
ret = PTR_ERR(bdev);
goto error;
}
- /* make sure our super fits in the device */
- if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
- goto error_bdev_put;
-
- /* make sure our super fits in the page */
- if (sizeof(*disk_super) > PAGE_SIZE)
- goto error_bdev_put;
-
- /* make sure our super doesn't straddle pages on disk */
- index = bytenr >> PAGE_SHIFT;
- if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
- goto error_bdev_put;
-
- /* pull in the page with our super */
- page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
- index, GFP_NOFS);
-
- if (IS_ERR_OR_NULL(page))
+ if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
goto error_bdev_put;
- p = kmap(page);
-
- /* align our pointer to the offset of the super block */
- disk_super = p + (bytenr & ~PAGE_MASK);
-
- if (btrfs_super_bytenr(disk_super) != bytenr ||
- btrfs_super_magic(disk_super) != BTRFS_MAGIC)
- goto error_unmap;
-
devid = btrfs_stack_device_id(&disk_super->dev_item);
transid = btrfs_super_generation(disk_super);
total_devices = btrfs_super_num_devices(disk_super);
ret = device_list_add(path, disk_super, devid, fs_devices_ret);
if (ret > 0) {
if (disk_super->label[0]) {
- if (disk_super->label[BTRFS_LABEL_SIZE - 1])
- disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
} else {
printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
if (!ret && fs_devices_ret)
(*fs_devices_ret)->total_devices = total_devices;
- error_unmap:
- kunmap(page);
- put_page(page);
+ btrfs_release_disk_super(page);
error_bdev_put:
blkdev_put(bdev, flags);
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
- btrfs_std_error(root->fs_info, ret, "Slot search failed");
+ btrfs_handle_fs_error(root->fs_info, ret, "Slot search failed");
goto out;
}
ret = btrfs_del_item(trans, root, path);
if (ret) {
- btrfs_std_error(root->fs_info, ret,
+ btrfs_handle_fs_error(root->fs_info, ret,
"Failed to remove dev extent item");
} else {
set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
return ret;
}
- int btrfs_rm_device(struct btrfs_root *root, char *device_path)
+ /*
+ * Verify that @num_devices satisfies the RAID profile constraints in the whole
+ * filesystem. It's up to the caller to adjust that number regarding eg. device
+ * replace.
+ */
+ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
+ u64 num_devices)
+ {
+ u64 all_avail;
+ unsigned seq;
+ int i;
+
+ do {
+ seq = read_seqbegin(&fs_info->profiles_lock);
+
+ all_avail = fs_info->avail_data_alloc_bits |
+ fs_info->avail_system_alloc_bits |
+ fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&fs_info->profiles_lock, seq));
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (!(all_avail & btrfs_raid_group[i]))
+ continue;
+
+ if (num_devices < btrfs_raid_array[i].devs_min) {
+ int ret = btrfs_raid_mindev_error[i];
+
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+ }
+
+ struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs,
+ struct btrfs_device *device)
{
- struct btrfs_device *device;
struct btrfs_device *next_device;
- struct block_device *bdev;
- struct buffer_head *bh = NULL;
- struct btrfs_super_block *disk_super;
+
+ list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
+ if (next_device != device &&
+ !next_device->missing && next_device->bdev)
+ return next_device;
+ }
+
+ return NULL;
+ }
+
+ /*
+ * Helper function to check if the given device is part of s_bdev / latest_bdev
+ * and replace it with the provided or the next active device, in the context
+ * where this function called, there should be always be another device (or
+ * this_dev) which is active.
+ */
+ void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *device, struct btrfs_device *this_dev)
+ {
+ struct btrfs_device *next_device;
+
+ if (this_dev)
+ next_device = this_dev;
+ else
+ next_device = btrfs_find_next_active_device(fs_info->fs_devices,
+ device);
+ ASSERT(next_device);
+
+ if (fs_info->sb->s_bdev &&
+ (fs_info->sb->s_bdev == device->bdev))
+ fs_info->sb->s_bdev = next_device->bdev;
+
+ if (fs_info->fs_devices->latest_bdev == device->bdev)
+ fs_info->fs_devices->latest_bdev = next_device->bdev;
+ }
+
+ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
+ {
+ struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
- u64 all_avail;
- u64 devid;
u64 num_devices;
- u8 *dev_uuid;
- unsigned seq;
int ret = 0;
bool clear_super = false;
+ char *dev_name = NULL;
mutex_lock(&uuid_mutex);
- do {
- seq = read_seqbegin(&root->fs_info->profiles_lock);
-
- all_avail = root->fs_info->avail_data_alloc_bits |
- root->fs_info->avail_system_alloc_bits |
- root->fs_info->avail_metadata_alloc_bits;
- } while (read_seqretry(&root->fs_info->profiles_lock, seq));
-
num_devices = root->fs_info->fs_devices->num_devices;
btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0);
if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
}
btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0);
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
- ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
- goto out;
- }
-
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
- ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
+ ret = btrfs_check_raid_min_devices(root->fs_info, num_devices - 1);
+ if (ret)
goto out;
- }
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
- root->fs_info->fs_devices->rw_devices <= 2) {
- ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
- goto out;
- }
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
- root->fs_info->fs_devices->rw_devices <= 3) {
- ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
+ ret = btrfs_find_device_by_devspec(root, devid, device_path,
+ &device);
+ if (ret)
goto out;
- }
-
- if (strcmp(device_path, "missing") == 0) {
- struct list_head *devices;
- struct btrfs_device *tmp;
-
- device = NULL;
- devices = &root->fs_info->fs_devices->devices;
- /*
- * It is safe to read the devices since the volume_mutex
- * is held.
- */
- list_for_each_entry(tmp, devices, dev_list) {
- if (tmp->in_fs_metadata &&
- !tmp->is_tgtdev_for_dev_replace &&
- !tmp->bdev) {
- device = tmp;
- break;
- }
- }
- bdev = NULL;
- bh = NULL;
- disk_super = NULL;
- if (!device) {
- ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
- goto out;
- }
- } else {
- ret = btrfs_get_bdev_and_sb(device_path,
- FMODE_WRITE | FMODE_EXCL,
- root->fs_info->bdev_holder, 0,
- &bdev, &bh);
- if (ret)
- goto out;
- disk_super = (struct btrfs_super_block *)bh->b_data;
- devid = btrfs_stack_device_id(&disk_super->dev_item);
- dev_uuid = disk_super->dev_item.uuid;
- device = btrfs_find_device(root->fs_info, devid, dev_uuid,
- disk_super->fsid);
- if (!device) {
- ret = -ENOENT;
- goto error_brelse;
- }
- }
if (device->is_tgtdev_for_dev_replace) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
- goto error_brelse;
+ goto out;
}
if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
- goto error_brelse;
+ goto out;
}
if (device->writeable) {
list_del_init(&device->dev_alloc_list);
device->fs_devices->rw_devices--;
unlock_chunks(root);
+ dev_name = kstrdup(device->name->str, GFP_KERNEL);
+ if (!dev_name) {
+ ret = -ENOMEM;
+ goto error_undo;
+ }
clear_super = true;
}
if (device->missing)
device->fs_devices->missing_devices--;
- next_device = list_entry(root->fs_info->fs_devices->devices.next,
- struct btrfs_device, dev_list);
- if (device->bdev == root->fs_info->sb->s_bdev)
- root->fs_info->sb->s_bdev = next_device->bdev;
- if (device->bdev == root->fs_info->fs_devices->latest_bdev)
- root->fs_info->fs_devices->latest_bdev = next_device->bdev;
+ btrfs_assign_next_active_device(root->fs_info, device, NULL);
if (device->bdev) {
device->fs_devices->open_devices--;
* at this point, the device is zero sized. We want to
* remove it from the devices list and zero out the old super
*/
- if (clear_super && disk_super) {
- u64 bytenr;
- int i;
-
- /* make sure this device isn't detected as part of
- * the FS anymore
- */
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
- set_buffer_dirty(bh);
- sync_dirty_buffer(bh);
-
- /* clear the mirror copies of super block on the disk
- * being removed, 0th copy is been taken care above and
- * the below would take of the rest
- */
- for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
- bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >=
- i_size_read(bdev->bd_inode))
- break;
-
- brelse(bh);
- bh = __bread(bdev, bytenr / 4096,
- BTRFS_SUPER_INFO_SIZE);
- if (!bh)
- continue;
-
- disk_super = (struct btrfs_super_block *)bh->b_data;
-
- if (btrfs_super_bytenr(disk_super) != bytenr ||
- btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
- continue;
- }
- memset(&disk_super->magic, 0,
- sizeof(disk_super->magic));
- set_buffer_dirty(bh);
- sync_dirty_buffer(bh);
+ if (clear_super) {
+ struct block_device *bdev;
+
+ bdev = blkdev_get_by_path(dev_name, FMODE_READ | FMODE_EXCL,
+ root->fs_info->bdev_holder);
+ if (!IS_ERR(bdev)) {
+ btrfs_scratch_superblocks(bdev, dev_name);
+ blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
}
}
- ret = 0;
-
- if (bdev) {
- /* Notify udev that device has changed */
- btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
-
- /* Update ctime/mtime for device path for libblkid */
- update_dev_time(device_path);
- }
-
- error_brelse:
- brelse(bh);
- if (bdev)
- blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
+ kfree(dev_name);
+
mutex_unlock(&uuid_mutex);
return ret;
+
error_undo:
if (device->writeable) {
lock_chunks(root);
device->fs_devices->rw_devices++;
unlock_chunks(root);
}
- goto error_brelse;
+ goto out;
}
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
if (srcdev->missing)
fs_devices->missing_devices--;
- if (srcdev->writeable) {
+ if (srcdev->writeable)
fs_devices->rw_devices--;
- /* zero out the old super if it is writable */
- btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
- }
if (srcdev->bdev)
fs_devices->open_devices--;
{
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
+ if (srcdev->writeable) {
+ /* zero out the old super if it is writable */
+ btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
+ }
call_rcu(&srcdev->rcu, free_device);
/*
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev)
{
- struct btrfs_device *next_device;
-
mutex_lock(&uuid_mutex);
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
- if (tgtdev->bdev) {
- btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+ if (tgtdev->bdev)
fs_info->fs_devices->open_devices--;
- }
+
fs_info->fs_devices->num_devices--;
- next_device = list_entry(fs_info->fs_devices->devices.next,
- struct btrfs_device, dev_list);
- if (tgtdev->bdev == fs_info->sb->s_bdev)
- fs_info->sb->s_bdev = next_device->bdev;
- if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
- fs_info->fs_devices->latest_bdev = next_device->bdev;
- list_del_rcu(&tgtdev->dev_list);
+ btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
- call_rcu(&tgtdev->rcu, free_device);
+ list_del_rcu(&tgtdev->dev_list);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
+
+ /*
+ * The update_dev_time() with in btrfs_scratch_superblocks()
+ * may lead to a call to btrfs_show_devname() which will try
+ * to hold device_list_mutex. And here this device
+ * is already out of device list, so we don't have to hold
+ * the device_list_mutex lock.
+ */
+ btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+ call_rcu(&tgtdev->rcu, free_device);
}
static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
}
}
+ /*
+ * Lookup a device given by device id, or the path if the id is 0.
+ */
+ int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid,
+ char *devpath,
+ struct btrfs_device **device)
+ {
+ int ret;
+
+ if (devid) {
+ ret = 0;
+ *device = btrfs_find_device(root->fs_info, devid, NULL,
+ NULL);
+ if (!*device)
+ ret = -ENOENT;
+ } else {
+ if (!devpath || !devpath[0])
+ return -EINVAL;
+
+ ret = btrfs_find_device_missing_or_by_path(root, devpath,
+ device);
+ }
+ return ret;
+ }
+
/*
* does all the dirty work required for changing file system's UUID.
*/
ret = btrfs_relocate_sys_chunks(root);
if (ret < 0)
- btrfs_std_error(root->fs_info, ret,
+ btrfs_handle_fs_error(root->fs_info, ret,
"Failed to relocate sys chunks after "
"device initialization. This can be fixed "
"using the \"btrfs balance\" command.");
if (ret < 0)
goto out;
else if (ret > 0) { /* Logic error or corruption */
- btrfs_std_error(root->fs_info, -ENOENT,
+ btrfs_handle_fs_error(root->fs_info, -ENOENT,
"Failed lookup while freeing chunk.");
ret = -ENOENT;
goto out;
ret = btrfs_del_item(trans, root, path);
if (ret < 0)
- btrfs_std_error(root->fs_info, ret,
+ btrfs_handle_fs_error(root->fs_info, ret,
"Failed to delete chunk item.");
out:
btrfs_free_path(path);
chunk_offset);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_std_error(root->fs_info, ret, NULL);
+ btrfs_handle_fs_error(root->fs_info, ret, NULL);
return ret;
}
u32 count_meta = 0;
u32 count_sys = 0;
int chunk_reserved = 0;
+ u64 bytes_used = 0;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
goto loop;
}
- if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) {
+ ASSERT(fs_info->data_sinfo);
+ spin_lock(&fs_info->data_sinfo->lock);
+ bytes_used = fs_info->data_sinfo->bytes_used;
+ spin_unlock(&fs_info->data_sinfo->lock);
+
+ if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
+ !chunk_reserved && !bytes_used) {
trans = btrfs_start_transaction(chunk_root, 0);
if (IS_ERR(trans)) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
unset_balance_control(fs_info);
ret = del_balance_item(fs_info->tree_root);
if (ret)
- btrfs_std_error(fs_info, ret, NULL);
+ btrfs_handle_fs_error(fs_info, ret, NULL);
atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
}
num_devices--;
}
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
- allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
- if (num_devices == 1)
- allowed |= BTRFS_BLOCK_GROUP_DUP;
- else if (num_devices > 1)
+ allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
+ if (num_devices > 1)
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
if (num_devices > 2)
allowed |= BTRFS_BLOCK_GROUP_RAID5;
stripe_nr = div64_u64(stripe_nr, stripe_len);
stripe_offset = stripe_nr * stripe_len;
- BUG_ON(offset < stripe_offset);
+ if (offset < stripe_offset) {
+ btrfs_crit(fs_info, "stripe math has gone wrong, "
+ "stripe_offset=%llu, offset=%llu, start=%llu, "
+ "logical=%llu, stripe_len=%llu",
+ stripe_offset, offset, em->start, logical,
+ stripe_len);
+ free_extent_map(em);
+ return -EINVAL;
+ }
/* stripe_offset is the offset of this block in its stripe*/
stripe_offset = offset - stripe_offset;
&stripe_index);
mirror_num = stripe_index + 1;
}
- BUG_ON(stripe_index >= map->num_stripes);
+ if (stripe_index >= map->num_stripes) {
+ btrfs_crit(fs_info, "stripe index math went horribly wrong, "
+ "got stripe_index=%u, num_stripes=%u",
+ stripe_index, map->num_stripes);
+ ret = -EINVAL;
+ goto out;
+ }
num_alloc_stripes = num_stripes;
if (dev_replace_is_ongoing) {
"invalid chunk length %llu", length);
return -EIO;
}
- if (!is_power_of_2(stripe_len)) {
+ if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
btrfs_err(root->fs_info, "invalid chunk stripe length: %llu",
stripe_len);
return -EIO;