Merge 5.17-rc6 into char-misc-next
[linux-2.6-microblaze.git] / fs/btrfs/file.c
index a176236..a0179cc 100644
@@ -50,11 +50,14 @@ struct inode_defrag {
        /* root objectid */
        u64 root;
 
-       /* last offset we were able to defrag */
-       u64 last_offset;
-
-       /* if we've wrapped around back to zero once already */
-       int cycled;
+       /*
+        * The extent size threshold for autodefrag.
+        *
+        * This value is different for compressed/non-compressed extents,
+        * thus needs to be passed from higher layer.
+        * (aka, inode_should_defrag())
+        */
+       u32 extent_thresh;
 };
 
 static int __compare_inode_defrag(struct inode_defrag *defrag1,
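
For context, the threshold added above is chosen by the write path rather than by the defrag code itself. A minimal sketch of what such a caller could look like, assuming a helper along the lines of the inode_should_defrag() named in the comment (the SZ_16K/SZ_64K values and the exact small-write check are illustrative assumptions, not taken from this diff):

    /*
     * Illustrative sketch only: the thresholds and the small-write condition
     * are assumptions; see inode_should_defrag() in fs/btrfs/inode.c for the
     * real caller.
     */
    static void example_should_defrag(struct btrfs_inode *inode, u64 start,
                                      u64 end, u64 num_bytes, bool compressed)
    {
            /* Compressed extents are capped at 128K, so a smaller threshold fits. */
            u32 extent_thresh = compressed ? SZ_16K : SZ_64K;

            /* Queue only small writes that land inside the existing file size. */
            if (num_bytes < extent_thresh &&
                (start > 0 || end + 1 < inode->disk_i_size))
                    btrfs_add_inode_defrag(NULL, inode, extent_thresh);
    }
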
@@ -107,8 +110,8 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
                         */
                        if (defrag->transid < entry->transid)
                                entry->transid = defrag->transid;
-                       if (defrag->last_offset > entry->last_offset)
-                               entry->last_offset = defrag->last_offset;
+                       entry->extent_thresh = min(defrag->extent_thresh,
+                                                  entry->extent_thresh);
                        return -EEXIST;
                }
        }
@@ -134,7 +137,7 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
  * enabled
  */
 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
-                          struct btrfs_inode *inode)
+                          struct btrfs_inode *inode, u32 extent_thresh)
 {
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -160,6 +163,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
        defrag->ino = btrfs_ino(inode);
        defrag->transid = transid;
        defrag->root = root->root_key.objectid;
+       defrag->extent_thresh = extent_thresh;
 
        spin_lock(&fs_info->defrag_inodes_lock);
        if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
@@ -178,34 +182,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
        return 0;
 }
 
-/*
- * Requeue the defrag object. If there is a defrag object that points to
- * the same inode in the tree, we will merge them together (by
- * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
- */
-static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
-                                      struct inode_defrag *defrag)
-{
-       struct btrfs_fs_info *fs_info = inode->root->fs_info;
-       int ret;
-
-       if (!__need_auto_defrag(fs_info))
-               goto out;
-
-       /*
-        * Here we don't check the IN_DEFRAG flag, because we need merge
-        * them together.
-        */
-       spin_lock(&fs_info->defrag_inodes_lock);
-       ret = __btrfs_add_inode_defrag(inode, defrag);
-       spin_unlock(&fs_info->defrag_inodes_lock);
-       if (ret)
-               goto out;
-       return;
-out:
-       kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
-}
-
 /*
  * pick the defragable inode that we want, if it doesn't exist, we will get
  * the next one.
@@ -278,8 +254,14 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
        struct btrfs_root *inode_root;
        struct inode *inode;
        struct btrfs_ioctl_defrag_range_args range;
-       int num_defrag;
-       int ret;
+       int ret = 0;
+       u64 cur = 0;
+
+again:
+       if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+               goto cleanup;
+       if (!__need_auto_defrag(fs_info))
+               goto cleanup;
 
        /* get the inode */
        inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
@@ -295,39 +277,30 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
                goto cleanup;
        }
 
+       if (cur >= i_size_read(inode)) {
+               iput(inode);
+               goto cleanup;
+       }
+
        /* do a chunk of defrag */
        clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
        memset(&range, 0, sizeof(range));
        range.len = (u64)-1;
-       range.start = defrag->last_offset;
+       range.start = cur;
+       range.extent_thresh = defrag->extent_thresh;
 
        sb_start_write(fs_info->sb);
-       num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+       ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
                                       BTRFS_DEFRAG_BATCH);
        sb_end_write(fs_info->sb);
-       /*
-        * if we filled the whole defrag batch, there
-        * must be more work to do.  Queue this defrag
-        * again
-        */
-       if (num_defrag == BTRFS_DEFRAG_BATCH) {
-               defrag->last_offset = range.start;
-               btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
-       } else if (defrag->last_offset && !defrag->cycled) {
-               /*
-                * we didn't fill our defrag batch, but
-                * we didn't start at zero.  Make sure we loop
-                * around to the start of the file.
-                */
-               defrag->last_offset = 0;
-               defrag->cycled = 1;
-               btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
-       } else {
-               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
-       }
-
        iput(inode);
-       return 0;
+
+       if (ret < 0)
+               goto cleanup;
+
+       cur = max(cur + fs_info->sectorsize, range.start);
+       goto again;
+
 cleanup:
        kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
        return ret;
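
To illustrate the progress guarantee of the new loop: assuming btrfs_defrag_file() advances range.start to the next offset it would process, then with a 4 KiB sectorsize a pass that stops at 1 MiB makes the next pass start at max(0 + 4096, 1048576) = 1 MiB, while a pass that makes no progress at all still moves cur forward by one sector, so the loop always terminates once cur reaches i_size (or earlier on error, remount, or autodefrag being disabled).
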
@@ -437,9 +410,15 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
 /*
  * unlocks pages after btrfs_file_write is done with them
  */
-static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
+                            struct page **pages, size_t num_pages,
+                            u64 pos, u64 copied)
 {
        size_t i;
+       u64 block_start = round_down(pos, fs_info->sectorsize);
+       u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
+
+       ASSERT(block_len <= U32_MAX);
        for (i = 0; i < num_pages; i++) {
                /* page checked is some magic around finding pages that
                 * have been modified without going through btrfs_set_page_dirty
@@ -447,7 +426,8 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
                 * accessed as prepare_pages should have marked them accessed
                 * in prepare_pages via find_or_create_page()
                 */
-               ClearPageChecked(pages[i]);
+               btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
+                                              block_len);
                unlock_page(pages[i]);
                put_page(pages[i]);
        }
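
As a worked example of the clamped range: on a subpage setup (e.g. 64 KiB pages with a 4 KiB sectorsize), a copy with pos = 3000 and copied = 3000 gives block_start = round_down(3000, 4096) = 0 and block_len = round_up(6000, 4096) - 0 = 8192, so only the two sectors actually covered by the copy get their checked bit cleared instead of the whole page.
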
@@ -504,7 +484,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
                struct page *p = pages[i];
 
                btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
-               ClearPageChecked(p);
+               btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
                btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
        }
 
@@ -869,7 +849,8 @@ next_slot:
                                btrfs_init_data_ref(&ref,
                                                root->root_key.objectid,
                                                new_key.objectid,
-                                               args->start - extent_offset);
+                                               args->start - extent_offset,
+                                               0, false);
                                ret = btrfs_inc_extent_ref(trans, &ref);
                                BUG_ON(ret); /* -ENOMEM */
                        }
@@ -955,7 +936,8 @@ delete_extent_item:
                                btrfs_init_data_ref(&ref,
                                                root->root_key.objectid,
                                                key.objectid,
-                                               key.offset - extent_offset);
+                                               key.offset - extent_offset, 0,
+                                               false);
                                ret = btrfs_free_extent(trans, &ref);
                                BUG_ON(ret); /* -ENOMEM */
                                args->bytes_found += extent_end - key.offset;
@@ -1020,8 +1002,7 @@ delete_extent_item:
                        if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
                                path->slots[0]++;
                }
-               setup_items_for_insert(root, path, &key,
-                                      &args->extent_item_size, 1);
+               btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
                args->extent_inserted = true;
        }
 
@@ -1232,7 +1213,7 @@ again:
                btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
                                       num_bytes, 0);
                btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
-                                   orig_offset);
+                                   orig_offset, 0, false);
                ret = btrfs_inc_extent_ref(trans, &ref);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
@@ -1257,7 +1238,8 @@ again:
        other_end = 0;
        btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
                               num_bytes, 0);
-       btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
+       btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
+                           0, false);
        if (extent_mergeable(leaf, path->slots[0] + 1,
                             ino, bytenr, orig_offset,
                             &other_start, &other_end)) {
@@ -1709,7 +1691,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
                 * Fault pages before locking them in prepare_pages
                 * to avoid recursive lock
                 */
-               if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
+               if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
                        ret = -EFAULT;
                        break;
                }
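
Note that fault_in_iov_iter_readable() returns the number of bytes it could not fault in (0 on full success), whereas the old iov_iter_fault_in_readable() returned 0 or -EFAULT; either way, a non-zero value here means the pages could not be faulted in and the write bails out with -EFAULT.
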
@@ -1844,7 +1826,7 @@ again:
 
                btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
                if (ret) {
-                       btrfs_drop_pages(pages, num_pages);
+                       btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
                        break;
                }
 
@@ -1852,7 +1834,7 @@ again:
                if (only_release_metadata)
                        btrfs_check_nocow_unlock(BTRFS_I(inode));
 
-               btrfs_drop_pages(pages, num_pages);
+               btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
 
                cond_resched();
 
@@ -1903,16 +1885,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
 
 static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 {
+       const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        loff_t pos;
        ssize_t written = 0;
        ssize_t written_buffered;
+       size_t prev_left = 0;
        loff_t endbyte;
        ssize_t err;
        unsigned int ilock_flags = 0;
-       struct iomap_dio *dio = NULL;
 
        if (iocb->ki_flags & IOCB_NOWAIT)
                ilock_flags |= BTRFS_ILOCK_TRY;
@@ -1955,23 +1938,80 @@ relock:
                goto buffered;
        }
 
-       dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-                            0);
+       /*
+        * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
+        * calls generic_write_sync() (through iomap_dio_complete()), because
+        * that results in calling fsync (btrfs_sync_file()) which will try to
+        * lock the inode in exclusive/write mode.
+        */
+       if (is_sync_write)
+               iocb->ki_flags &= ~IOCB_DSYNC;
 
-       btrfs_inode_unlock(inode, ilock_flags);
+       /*
+        * The iov_iter can be mapped to the same file range we are writing to.
+        * If that's the case, then we will deadlock in the iomap code, because
+        * it first calls our callback btrfs_dio_iomap_begin(), which will create
+        * an ordered extent, and after that it will fault in the pages that the
+        * iov_iter refers to. During the fault in we end up in the readahead
+        * pages code (starting at btrfs_readahead()), which will lock the range,
+        * find that ordered extent and then wait for it to complete (at
+        * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
+        * the ordered extent can never complete as we haven't yet submitted
+        * the respective bio(s). This always happens when the buffer is
+        * memory mapped to the same file range, since the iomap DIO code always
+        * invalidates pages in the target file range (after starting and waiting
+        * for any writeback).
+        *
+        * So here we disable page faults in the iov_iter and then retry if we
+        * got -EFAULT, faulting in the pages before the retry.
+        */
+again:
+       from->nofault = true;
+       err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+                          IOMAP_DIO_PARTIAL, written);
+       from->nofault = false;
 
-       if (IS_ERR_OR_NULL(dio)) {
-               err = PTR_ERR_OR_ZERO(dio);
-               if (err < 0 && err != -ENOTBLK)
-                       goto out;
-       } else {
-               written = iomap_dio_complete(dio);
+       /* No increment (+=) because iomap returns a cumulative value. */
+       if (err > 0)
+               written = err;
+
+       if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
+               const size_t left = iov_iter_count(from);
+               /*
+                * We have more data left to write. Try to fault in as many of
+                * the remaining pages as possible and retry. We do this without
+                * releasing and re-locking the inode, to prevent races with
+                * truncate.
+                *
+                * Also, in case the iov refers to pages in the file range of the
+                * file we want to write to (due to a mmap), we could enter an
+                * infinite loop if we retry after faulting the pages in, since
+                * iomap will invalidate any pages in the range early on, before
+                * it tries to fault in the pages of the iov. So we keep track of
+                * how much of the iov was left after the previous EFAULT and fall
+                * back to buffered IO in case we haven't made any progress.
+                */
+               if (left == prev_left) {
+                       err = -ENOTBLK;
+               } else {
+                       fault_in_iov_iter_readable(from, left);
+                       prev_left = left;
+                       goto again;
+               }
        }
 
-       if (written < 0 || !iov_iter_count(from)) {
-               err = written;
+       btrfs_inode_unlock(inode, ilock_flags);
+
+       /*
+        * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
+        * the fsync (call generic_write_sync()).
+        */
+       if (is_sync_write)
+               iocb->ki_flags |= IOCB_DSYNC;
+
+       /* If 'err' is -ENOTBLK then it means we must fall back to buffered IO. */
+       if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
                goto out;
-       }
 
 buffered:
        pos = iocb->ki_pos;
@@ -1996,7 +2036,7 @@ buffered:
        invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
                                 endbyte >> PAGE_SHIFT);
 out:
-       return written ? written : err;
+       return err < 0 ? err : written;
 }
 
 static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
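
Two details of the retry loop above are easy to miss: the last argument passed to iomap_dio_rw() (written) is the amount already transferred by earlier passes, which together with IOMAP_DIO_PARTIAL is why iomap reports a cumulative value and the code assigns rather than adds; and the left == prev_left check is the stall detector, so when the iov is an mmap of the very range being written (which iomap keeps invalidating), a pass with no progress sets -ENOTBLK and the remainder goes through the buffered path instead of looping forever.
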
@@ -2012,7 +2052,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
         * have opened a file as writable, we have to stop this write operation
         * to ensure consistency.
         */
-       if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
+       if (BTRFS_FS_ERROR(inode->root->fs_info))
                return -EROFS;
 
        if (!(iocb->ki_flags & IOCB_DIRECT) &&
@@ -2620,7 +2660,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
                                       extent_info->disk_len, 0);
                ref_offset = extent_info->file_offset - extent_info->data_offset;
                btrfs_init_data_ref(&ref, root->root_key.objectid,
-                                   btrfs_ino(inode), ref_offset);
+                                   btrfs_ino(inode), ref_offset, 0, false);
                ret = btrfs_inc_extent_ref(trans, &ref);
        }
 
@@ -3650,6 +3690,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
 static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
 {
        struct inode *inode = file_inode(iocb->ki_filp);
+       size_t prev_left = 0;
+       ssize_t read = 0;
        ssize_t ret;
 
        if (fsverity_active(inode))
@@ -3659,9 +3701,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
                return 0;
 
        btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
-       ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
+again:
+       /*
+        * This is similar to what we do for direct IO writes, see the comment
+        * at btrfs_direct_write(), but here we also disable page faults at the
+        * task level (pagefault_disable()), in addition to disabling them at
+        * the iov_iter level. This is because when reading from a hole or
+        * prealloc extent, iomap calls iov_iter_zero(), which can still
+        * trigger page faults despite ->nofault being set on our 'to' iov_iter.
+        *
+        * The difference to direct IO writes is that we deadlock when trying
+        * to lock the extent range in the inode's tree during the page reads
+        * triggered by the fault in (while for writes it is due to waiting for
+        * our own ordered extent). This is because for direct IO reads,
+        * btrfs_dio_iomap_begin() returns with the extent range locked, which
+        * is only unlocked in the endio callback (end_bio_extent_readpage()).
+        */
+       pagefault_disable();
+       to->nofault = true;
+       ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+                          IOMAP_DIO_PARTIAL, read);
+       to->nofault = false;
+       pagefault_enable();
+
+       /* No increment (+=) because iomap returns a cumulative value. */
+       if (ret > 0)
+               read = ret;
+
+       if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
+               const size_t left = iov_iter_count(to);
+
+               if (left == prev_left) {
+                       /*
+                        * We didn't make any progress since the last attempt,
+                        * so fall back to a buffered read for the remainder of the
+                        * range. This is just to avoid any possibility of looping
+                        * for too long.
+                        */
+                       ret = read;
+               } else {
+                       /*
+                        * We made some progress since the last retry or this is
+                        * the first time we are retrying. Fault in as many pages
+                        * as possible and retry.
+                        */
+                       fault_in_iov_iter_writeable(to, left);
+                       prev_left = left;
+                       goto again;
+               }
+       }
        btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
-       return ret;
+       return ret < 0 ? ret : read;
 }
 
 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
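
The read side mirrors the write side with two differences worth noting: page faults are additionally disabled at the task level (pagefault_disable()) because iov_iter_zero(), used when reading from holes or prealloc extents, does not honour ->nofault; and on a stall (left == prev_left) the function simply returns the bytes read so far, letting the caller, btrfs_file_read_iter(), complete the remainder through the buffered read path.
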