#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
+#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);
- kaddr = kmap_atomic(cpage);
+ kaddr = page_address(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_atomic(kaddr);
i++;
ptr += cur_size;
*/
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
+ /* Subpage doesn't support compression yet */
+ if (inode->root->fs_info->sectorsize < PAGE_SIZE)
+ return false;
if (inode->flags & BTRFS_INODE_NODATACOW ||
inode->flags & BTRFS_INODE_NODATASUM)
return false;
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) {
+ if (inode_need_compress(BTRFS_I(inode), start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
}
}
cont:
- if (start == 0) {
+ /*
+ * Check cow_file_range() for why we don't even try to create inline
+ * extent for subpage case.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
if (ret || total_in < actual_end) {
/* we didn't compress the entire range, try
p->mapping = inode->vfs_inode.i_mapping;
btrfs_writepage_endio_finish_ordered(inode, p, start,
- end, 0);
+ end, false);
p->mapping = NULL;
extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
- if (start == 0) {
+ /*
+ * Due to the page size limit, for subpage we can only trigger the
+ * writeback for the dirty sectors of page, that means data writeback
+ * is doing more writeback than what we want.
+ *
+ * This is especially unexpected for some call sites like fallocate,
+ * where we only increase i_size after everything is done.
+ * This means we can trigger inline extent even if we didn't want to.
+ * So here we skip inline extent creation completely.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(inode, start, end, 0,
BTRFS_COMPRESS_NONE, NULL);
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
- /* atomic_sub_return implies a barrier */
- if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
- 5 * SZ_1M)
- cond_wake_up_nomb(&fs_info->async_submit_wait);
-
/*
* ->inode could be NULL if async_chunk_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
*/
if (async_chunk->inode)
submit_compressed_extents(async_chunk);
+
+ /* atomic_sub_return implies a barrier */
+ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+ 5 * SZ_1M)
+ cond_wake_up_nomb(&fs_info->async_submit_wait);
}
static noinline void async_cow_free(struct btrfs_work *work)
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
page_started, nr_written);
}
+ ASSERT(ret <= 0);
if (ret)
btrfs_cleanup_ordered_extents(inode, locked_page, start,
end - start + 1);
struct extent_map *split_mid = NULL;
struct extent_map *split_post = NULL;
int ret = 0;
- int modified;
unsigned long flags;
/* Sanity check */
ASSERT(em->len == len);
ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+ ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+ ASSERT(!list_empty(&em->list));
flags = em->flags;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &flags);
- modified = !list_empty(&em->list);
/* First, replace the em with a new extent_map starting from * em->start */
split_pre->start = em->start;
split_pre->compress_type = em->compress_type;
split_pre->generation = em->generation;
- replace_extent_mapping(em_tree, em, split_pre, modified);
+ replace_extent_mapping(em_tree, em, split_pre, 1);
/*
* Now we only have an extent_map at:
split_mid->flags = flags;
split_mid->compress_type = em->compress_type;
split_mid->generation = em->generation;
- add_extent_mapping(em_tree, split_mid, modified);
+ add_extent_mapping(em_tree, split_mid, 1);
}
if (post) {
split_post->flags = flags;
split_post->compress_type = em->compress_type;
split_post->generation = em->generation;
- add_extent_mapping(em_tree, split_post, modified);
+ add_extent_mapping(em_tree, split_post, 1);
}
/* Once for us */
* to fix it up. The async helper will wait for ordered extents, set
* the delalloc bit and make it safe to write the page.
*/
-int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
+int btrfs_writepage_cow_fixup(struct page *page)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
- u64 end, int uptodate)
+ u64 end, bool uptodate)
{
trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
return 0;
}
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ /*
+ * For subpage case, above PageChecked is not safe as it's not subpage
+ * compatible.
+ * But for now only cow fixup and compressed read utilize PageChecked
+ * flag, while in this context we can easily use io_bio->csum to
+ * determine if we really need to do csum verification.
+ *
+ * So for now, just exit if io_bio->csum is NULL, as it means it's
+ * compressed read, and its compressed data csum has already been
+ * verified.
+ */
+ if (io_bio->csum == NULL)
return 0;
- if (!root->fs_info->csum_root)
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
return 0;
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
- test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
- clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
+ if (!root->fs_info->csum_root)
return 0;
- }
ASSERT(page_offset(page) <= start &&
end <= page_offset(page) + PAGE_SIZE - 1);
for (pg_off = offset_in_page(start);
pg_off < offset_in_page(end);
pg_off += sectorsize, bio_offset += sectorsize) {
+ u64 file_offset = pg_off + page_offset(page);
int ret;
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ test_range_bit(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM, 1, NULL)) {
+ /* Skip the range without csum for data reloc inode */
+ clear_extent_bits(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM);
+ continue;
+ }
ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
if (ret < 0) {
/*
* If we have an inode with links, there are a couple of
- * possibilities. Old kernels (before v3.12) used to create an
+ * possibilities:
+ *
+ * 1. We were halfway through creating fsverity metadata for the
+ * file. In that case, the orphan item represents incomplete
+ * fsverity metadata which must be cleaned up with
+ * btrfs_drop_verity_items and deleting the orphan item.
+
+ * 2. Old kernels (before v3.12) used to create an
* orphan item for truncate indicating that there were possibly
* extent items past i_size that needed to be deleted. In v3.12,
* truncate was changed to update i_size in sync with the extent
* but either way, we can delete the orphan item.
*/
if (ret == -ENOENT || inode->i_nlink) {
- if (!ret)
+ if (!ret) {
+ ret = btrfs_drop_verity_items(BTRFS_I(inode));
iput(inode);
+ if (ret)
+ goto out;
+ }
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
rdev = btrfs_inode_rdev(leaf, inode_item);
BTRFS_I(inode)->index_cnt = (u64)-1;
- BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+ btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
+ &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
cache_index:
/*
struct inode *inode)
{
struct btrfs_map_token token;
+ u64 flags;
btrfs_init_map_token(&token, leaf);
btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
btrfs_set_token_inode_transid(&token, item, trans->transid);
btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
- btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_token_inode_flags(&token, item, flags);
btrfs_set_token_inode_block_group(&token, item, 0);
}
int ret;
/*
- * Still need to make sure the inode looks like it's been updated so
- * that any holes get logged if we fsync.
+ * If NO_HOLES is enabled, we don't need to do anything.
+ * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
+ * or btrfs_update_inode() will be called, which guarantee that the next
+ * fsync will know this inode was changed and needs to be logged.
*/
- if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
- inode->last_trans = fs_info->generation;
- inode->last_sub_trans = root->log_transid;
- inode->last_log_commit = root->last_log_commit;
+ if (btrfs_fs_incompat(fs_info, NO_HOLES))
return 0;
- }
/*
* 1 - for the one we're dropping
if (btrfs_root_readonly(root))
return -EROFS;
- err = setattr_prepare(&init_user_ns, dentry, attr);
+ err = setattr_prepare(mnt_userns, dentry, attr);
if (err)
return err;
}
if (attr->ia_valid) {
- setattr_copy(&init_user_ns, inode, attr);
+ setattr_copy(mnt_userns, inode, attr);
inode_inc_iversion(inode);
err = btrfs_dirty_inode(inode);
if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(&init_user_ns, inode,
- inode->i_mode);
+ err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
}
return err;
trace_btrfs_inode_evict(inode);
if (!root) {
+ fsverity_cleanup_inode(inode);
clear_inode(inode);
return;
}
* to retry these periodically in the future.
*/
btrfs_remove_delayed_node(BTRFS_I(inode));
+ fsverity_cleanup_inode(inode);
clear_inode(inode);
}
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
+ struct user_namespace *mnt_userns,
struct inode *dir,
const char *name, int name_len,
u64 ref_objectid, u64 objectid,
if (ret != 0)
goto fail_unlock;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode_set_bytes(inode, 0);
inode->i_mtime = current_time(inode);
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
- mode, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
- mode, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
if (err)
goto out_fail;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid,
S_IFDIR | mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
return dip;
}
- static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
+ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
struct bio *dio_bio, loff_t file_offset)
{
+ struct inode *inode = iter->inode;
const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
u64 start_sector;
int async_submit = 0;
u64 submit_len;
- int clone_offset = 0;
- int clone_len;
+ u64 clone_offset = 0;
+ u64 clone_len;
u64 logical;
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
- struct btrfs_dio_data *dio_data = iomap->private;
+ struct btrfs_dio_data *dio_data = iter->iomap.private;
struct extent_map *em = NULL;
dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
status = errno_to_blk_status(ret);
goto out_err_em;
}
- ASSERT(geom.len <= INT_MAX);
- clone_len = min_t(int, submit_len, geom.len);
+ clone_len = min(submit_len, geom.len);
+ ASSERT(clone_len <= UINT_MAX);
/*
* This will never fail as it's passing GPF_NOFS and
extent_readahead(rac);
}
+/*
+ * For releasepage() and invalidatepage() we have a race window where
+ * end_page_writeback() is called but the subpage spinlock is not yet released.
+ * If we continue to release/invalidate the page, we could cause use-after-free
+ * for subpage spinlock. So this function is to spin and wait for subpage
+ * spinlock.
+ */
+static void wait_subpage_spinlock(struct page *page)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_subpage *subpage;
+
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return;
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ /*
+ * This may look insane as we just acquire the spinlock and release it,
+ * without doing anything. But we just want to make sure no one is
+ * still holding the subpage spinlock.
+ * And since the page is not dirty nor writeback, and we have page
+ * locked, the only possible way to hold a spinlock is from the endio
+ * function to clear page writeback.
+ *
+ * Here we just acquire the spinlock so that all existing callers
+ * should exit and we're safe to release/invalidate the page.
+ */
+ spin_lock_irq(&subpage->lock);
+ spin_unlock_irq(&subpage->lock);
+}
+
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
int ret = try_release_extent_mapping(page, gfp_flags);
- if (ret == 1)
+
+ if (ret == 1) {
+ wait_subpage_spinlock(page);
clear_page_extent_mapped(page);
+ }
return ret;
}
* do double ordered extent accounting on the same page.
*/
wait_on_page_writeback(page);
+ wait_subpage_spinlock(page);
/*
* For subpage case, we have call sites like
spin_unlock_irq(&inode->ordered_tree.lock);
if (btrfs_dec_test_ordered_pending(inode, &ordered,
- cur, range_end + 1 - cur, 1)) {
+ cur, range_end + 1 - cur)) {
btrfs_finish_ordered_io(ordered);
/*
* The ordered extent has finished, now we're again
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- struct btrfs_root *parent_root)
+ struct btrfs_root *parent_root,
+ struct user_namespace *mnt_userns)
{
struct inode *inode;
int err;
if (err < 0)
return err;
- inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino,
+ inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
+ ino, ino,
S_IFDIR | (~current_umask() & S_IRWXUGO),
&index);
if (IS_ERR(inode))
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
+ ei->ro_flags = 0;
ei->csum_bytes = 0;
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
struct inode *inode = d_inode(path->dentry);
u32 blocksize = inode->i_sb->s_blocksize;
u32 bi_flags = BTRFS_I(inode)->flags;
+ u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (bi_flags & BTRFS_INODE_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
+ if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
+ stat->attributes |= STATX_ATTR_VERITY;
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(&init_user_ns, inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
spin_lock(&BTRFS_I(inode)->lock);
bool dest_log_pinned = false;
bool need_abort = false;
- /* we only allow rename subvolume link between subvolumes */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+ /*
+ * For non-subvolumes allow exchange only within one subvolume, in the
+ * same inode namespace. Two subvolumes (represented as directory) can
+ * be exchanged as they're a logical link and have a fixed inode number.
+ */
+ if (root != dest &&
+ (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
+ new_ino != BTRFS_FIRST_FREE_OBJECTID))
return -EXDEV;
/* close the race window with snapshot create/destroy ioctl */
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- root_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(dest);
- dest_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, root,
old_dentry->d_name.name,
old_dentry->d_name.len,
BTRFS_I(new_inode), 1);
}
+ /*
+ * Now pin the logs of the roots. We do it to ensure that no other task
+ * can sync the logs while we are in progress with the rename, because
+ * that could result in an inconsistency in case any of the inodes that
+ * are part of this rename operation were logged before.
+ *
+ * We pin the logs even if at this precise moment none of the inodes was
+ * logged before. This is because right after we checked for that, some
+ * other task fsyncing some other inode not involved with this rename
+ * operation could log that one of our inodes exists.
+ *
+ * We don't need to pin the logs before the above calls to
+ * btrfs_insert_inode_ref(), since those don't ever need to change a log.
+ */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(root);
+ root_log_pinned = true;
+ }
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(dest);
+ dest_log_pinned = true;
+ }
+
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- (new_inode &&
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
+ btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
btrfs_set_log_full_commit(trans);
if (root_log_pinned) {
static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
+ struct user_namespace *mnt_userns,
struct inode *dir,
struct dentry *dentry)
{
if (ret)
return ret;
- inode = btrfs_new_inode(trans, root, dir,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
dentry->d_name.name,
dentry->d_name.len,
btrfs_ino(BTRFS_I(dir)),
return ret;
}
-static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+static int btrfs_rename(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
struct btrfs_trans_handle *trans;
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
+ /*
+ * Now pin the log. We do it to ensure that no other task can
+ * sync the log while we are in progress with the rename, as
+ * that could result in an inconsistency in case any of the
+ * inodes that are part of this rename operation were logged
+ * before.
+ *
+ * We pin the log even if at this precise moment none of the
+ * inodes was logged before. This is because right after we
+ * checked for that, some other task fsyncing some other inode
+ * not involved with this rename operation could log that one of
+ * our inodes exists.
+ *
+ * We don't need to pin the logs before the above call to
+ * btrfs_insert_inode_ref(), since that does not need to change
+ * a log.
+ */
+ btrfs_pin_log_trans(root);
+ log_pinned = true;
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
}
if (flags & RENAME_WHITEOUT) {
- ret = btrfs_whiteout_for_rename(trans, root, old_dir,
- old_dentry);
+ ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
+ old_dir, old_dentry);
if (ret) {
btrfs_abort_transaction(trans, ret);
return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
new_dentry);
- return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+ return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
+ new_dentry, flags);
}
struct btrfs_delalloc_work {
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
} else {
- ret = sync_inode(inode, wbc);
- if (!ret &&
- test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = sync_inode(inode, wbc);
+ ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
btrfs_add_delayed_iput(inode);
if (ret || wbc->nr_to_write <= 0)
goto out;
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
- objectid, S_IFLNK|S_IRWXUGO, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid,
+ S_IFLNK | S_IRWXUGO, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
return -EACCES;
}
- return generic_permission(&init_user_ns, inode, mask);
+ return generic_permission(mnt_userns, inode, mask);
}
static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (ret)
goto out;
- inode = btrfs_new_inode(trans, root, dir, NULL, 0,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
return rc;
id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), &kaddr, NULL);
+ rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
if (rc < 0) {
dax_read_unlock(id);
return rc;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
- static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
+ static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
{
return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
}
- static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
+ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
pfn_t *pfnp)
{
const sector_t sector = dax_iomap_sector(iomap, pos);
return ret;
}
+ #ifdef CONFIG_FS_DAX_PMD
+ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+ const struct iomap *iomap, void **entry)
+ {
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ unsigned long pmd_addr = vmf->address & PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ struct inode *inode = mapping->host;
+ pgtable_t pgtable = NULL;
+ struct page *zero_page;
+ spinlock_t *ptl;
+ pmd_t pmd_entry;
+ pfn_t pfn;
+
+ zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
+
+ if (unlikely(!zero_page))
+ goto fallback;
+
+ pfn = page_to_pfn_t(zero_page);
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
+ DAX_PMD | DAX_ZERO_PAGE, false);
+
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+
+ ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
+ if (!pmd_none(*(vmf->pmd))) {
+ spin_unlock(ptl);
+ goto fallback;
+ }
+
+ if (pgtable) {
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+ mm_inc_nr_ptes(vma->vm_mm);
+ }
+ pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
+ pmd_entry = pmd_mkhuge(pmd_entry);
+ set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
+ spin_unlock(ptl);
+ trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
+ return VM_FAULT_NOPAGE;
+
+ fallback:
+ if (pgtable)
+ pte_free(vma->vm_mm, pgtable);
+ trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
+ return VM_FAULT_FALLBACK;
+ }
+ #else
+ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+ const struct iomap *iomap, void **entry)
+ {
+ return VM_FAULT_FALLBACK;
+ }
+ #endif /* CONFIG_FS_DAX_PMD */
+
s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
{
sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
return size;
}
- static loff_t
- dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap, struct iomap *srcmap)
+ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
+ struct iov_iter *iter)
{
+ const struct iomap *iomap = &iomi->iomap;
+ loff_t length = iomap_length(iomi);
+ loff_t pos = iomi->pos;
struct block_device *bdev = iomap->bdev;
struct dax_device *dax_dev = iomap->dax_dev;
- struct iov_iter *iter = data;
loff_t end = pos + length, done = 0;
ssize_t ret = 0;
size_t xfer;
int id;
if (iov_iter_rw(iter) == READ) {
- end = min(end, i_size_read(inode));
+ end = min(end, i_size_read(iomi->inode));
if (pos >= end)
return 0;
* written by write(2) is visible in mmap.
*/
if (iomap->flags & IOMAP_F_NEW) {
- invalidate_inode_pages2_range(inode->i_mapping,
+ invalidate_inode_pages2_range(iomi->inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
}
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops)
{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- struct inode *inode = mapping->host;
- loff_t pos = iocb->ki_pos, ret = 0, done = 0;
- unsigned flags = 0;
+ struct iomap_iter iomi = {
+ .inode = iocb->ki_filp->f_mapping->host,
+ .pos = iocb->ki_pos,
+ .len = iov_iter_count(iter),
+ };
+ loff_t done = 0;
+ int ret;
if (iov_iter_rw(iter) == WRITE) {
- lockdep_assert_held_write(&inode->i_rwsem);
- flags |= IOMAP_WRITE;
+ lockdep_assert_held_write(&iomi.inode->i_rwsem);
+ iomi.flags |= IOMAP_WRITE;
} else {
- lockdep_assert_held(&inode->i_rwsem);
+ lockdep_assert_held(&iomi.inode->i_rwsem);
}
if (iocb->ki_flags & IOCB_NOWAIT)
- flags |= IOMAP_NOWAIT;
+ iomi.flags |= IOMAP_NOWAIT;
- while (iov_iter_count(iter)) {
- ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
- iter, dax_iomap_actor);
- if (ret <= 0)
- break;
- pos += ret;
- done += ret;
- }
+ while ((ret = iomap_iter(&iomi, ops)) > 0)
+ iomi.processed = dax_iomap_iter(&iomi, iter);
- iocb->ki_pos += done;
+ done = iomi.pos - iocb->ki_pos;
+ iocb->ki_pos = iomi.pos;
return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);
* flushed on write-faults (non-cow), but not read-faults.
*/
static bool dax_fault_is_synchronous(unsigned long flags,
- struct vm_area_struct *vma, struct iomap *iomap)
+ struct vm_area_struct *vma, const struct iomap *iomap)
{
return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
&& (iomap->flags & IOMAP_F_DIRTY);
}
+ /*
+ * When handling a synchronous page fault and the inode need a fsync, we can
+ * insert the PTE/PMD into page tables only after that fsync happened. Skip
+ * insertion for now and return the pfn so that caller can insert it after the
+ * fsync is done.
+ */
+ static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
+ {
+ if (WARN_ON_ONCE(!pfnp))
+ return VM_FAULT_SIGBUS;
+ *pfnp = pfn;
+ return VM_FAULT_NEEDDSYNC;
+ }
+
+ static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
+ const struct iomap_iter *iter)
+ {
+ sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos);
+ unsigned long vaddr = vmf->address;
+ vm_fault_t ret;
+ int error = 0;
+
+ switch (iter->iomap.type) {
+ case IOMAP_HOLE:
+ case IOMAP_UNWRITTEN:
+ clear_user_highpage(vmf->cow_page, vaddr);
+ break;
+ case IOMAP_MAPPED:
+ error = copy_cow_page_dax(iter->iomap.bdev, iter->iomap.dax_dev,
+ sector, vmf->cow_page, vaddr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ error = -EIO;
+ break;
+ }
+
+ if (error)
+ return dax_fault_return(error);
+
+ __SetPageUptodate(vmf->cow_page);
+ ret = finish_fault(vmf);
+ if (!ret)
+ return VM_FAULT_DONE_COW;
+ return ret;
+ }
+
+ /**
+ * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
+ * @vmf: vm fault instance
+ * @iter: iomap iter
+ * @pfnp: pfn to be returned
+ * @xas: the dax mapping tree of a file
+ * @entry: an unlocked dax entry to be inserted
+ * @pmd: distinguish whether it is a pmd fault
+ */
+ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
+ const struct iomap_iter *iter, pfn_t *pfnp,
+ struct xa_state *xas, void **entry, bool pmd)
+ {
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ const struct iomap *iomap = &iter->iomap;
+ size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
+ loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+ bool sync = dax_fault_is_synchronous(iter->flags, vmf->vma, iomap);
+ unsigned long entry_flags = pmd ? DAX_PMD : 0;
+ int err = 0;
+ pfn_t pfn;
+
+ if (!pmd && vmf->cow_page)
+ return dax_fault_cow_page(vmf, iter);
+
+ /* if we are reading UNWRITTEN and HOLE, return a hole. */
+ if (!write &&
+ (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
+ if (!pmd)
+ return dax_load_hole(xas, mapping, entry, vmf);
+ return dax_pmd_load_hole(xas, vmf, iomap, entry);
+ }
+
+ if (iomap->type != IOMAP_MAPPED) {
+ WARN_ON_ONCE(1);
+ return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
+ }
+
+ err = dax_iomap_pfn(&iter->iomap, pos, size, &pfn);
+ if (err)
+ return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
+
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags,
+ write && !sync);
+
+ if (sync)
+ return dax_fault_synchronous_pfnp(pfnp, pfn);
+
+ /* insert PMD pfn */
+ if (pmd)
+ return vmf_insert_pfn_pmd(vmf, pfn, write);
+
+ /* insert PTE pfn */
+ if (write)
+ return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+ return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+ }
+
static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
int *iomap_errp, const struct iomap_ops *ops)
{
- struct vm_area_struct *vma = vmf->vma;
- struct address_space *mapping = vma->vm_file->f_mapping;
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
- struct inode *inode = mapping->host;
- unsigned long vaddr = vmf->address;
- loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
- struct iomap iomap = { .type = IOMAP_HOLE };
- struct iomap srcmap = { .type = IOMAP_HOLE };
- unsigned flags = IOMAP_FAULT;
- int error, major = 0;
- bool write = vmf->flags & FAULT_FLAG_WRITE;
- bool sync;
+ struct iomap_iter iter = {
+ .inode = mapping->host,
+ .pos = (loff_t)vmf->pgoff << PAGE_SHIFT,
+ .len = PAGE_SIZE,
+ .flags = IOMAP_FAULT,
+ };
vm_fault_t ret = 0;
void *entry;
- pfn_t pfn;
+ int error;
- trace_dax_pte_fault(inode, vmf, ret);
+ trace_dax_pte_fault(iter.inode, vmf, ret);
/*
* Check whether offset isn't beyond end of file now. Caller is supposed
* to hold locks serializing us with truncate / punch hole so this is
* a reliable test.
*/
- if (pos >= i_size_read(inode)) {
+ if (iter.pos >= i_size_read(iter.inode)) {
ret = VM_FAULT_SIGBUS;
goto out;
}
- if (write && !vmf->cow_page)
- flags |= IOMAP_WRITE;
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+ iter.flags |= IOMAP_WRITE;
entry = grab_mapping_entry(&xas, mapping, 0);
if (xa_is_internal(entry)) {
goto unlock_entry;
}
- /*
- * Note that we don't bother to use iomap_apply here: DAX required
- * the file system block size to be equal the page size, which means
- * that we never have to deal with more than a single extent here.
- */
- error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
- if (iomap_errp)
- *iomap_errp = error;
- if (error) {
- ret = dax_fault_return(error);
- goto unlock_entry;
- }
- if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
- error = -EIO; /* fs corruption? */
- goto error_finish_iomap;
- }
-
- if (vmf->cow_page) {
- sector_t sector = dax_iomap_sector(&iomap, pos);
-
- switch (iomap.type) {
- case IOMAP_HOLE:
- case IOMAP_UNWRITTEN:
- clear_user_highpage(vmf->cow_page, vaddr);
- break;
- case IOMAP_MAPPED:
- error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev,
- sector, vmf->cow_page, vaddr);
- break;
- default:
- WARN_ON_ONCE(1);
- error = -EIO;
- break;
+ while ((error = iomap_iter(&iter, ops)) > 0) {
+ if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
+ iter.processed = -EIO; /* fs corruption? */
+ continue;
}
- if (error)
- goto error_finish_iomap;
-
- __SetPageUptodate(vmf->cow_page);
- ret = finish_fault(vmf);
- if (!ret)
- ret = VM_FAULT_DONE_COW;
- goto finish_iomap;
- }
-
- sync = dax_fault_is_synchronous(flags, vma, &iomap);
-
- switch (iomap.type) {
- case IOMAP_MAPPED:
- if (iomap.flags & IOMAP_F_NEW) {
+ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
+ if (ret != VM_FAULT_SIGBUS &&
+ (iter.iomap.flags & IOMAP_F_NEW)) {
count_vm_event(PGMAJFAULT);
- count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
- major = VM_FAULT_MAJOR;
+ count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
+ ret |= VM_FAULT_MAJOR;
}
- error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
- if (error < 0)
- goto error_finish_iomap;
- entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
- 0, write && !sync);
-
- /*
- * If we are doing synchronous page fault and inode needs fsync,
- * we can insert PTE into page tables only after that happens.
- * Skip insertion for now and return the pfn so that caller can
- * insert it after fsync is done.
- */
- if (sync) {
- if (WARN_ON_ONCE(!pfnp)) {
- error = -EIO;
- goto error_finish_iomap;
- }
- *pfnp = pfn;
- ret = VM_FAULT_NEEDDSYNC | major;
- goto finish_iomap;
- }
- trace_dax_insert_mapping(inode, vmf, entry);
- if (write)
- ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
- else
- ret = vmf_insert_mixed(vma, vaddr, pfn);
-
- goto finish_iomap;
- case IOMAP_UNWRITTEN:
- case IOMAP_HOLE:
- if (!write) {
- ret = dax_load_hole(&xas, mapping, &entry, vmf);
- goto finish_iomap;
- }
- fallthrough;
- default:
- WARN_ON_ONCE(1);
- error = -EIO;
- break;
+ if (!(ret & VM_FAULT_ERROR))
+ iter.processed = PAGE_SIZE;
}
- error_finish_iomap:
- ret = dax_fault_return(error);
- finish_iomap:
- if (ops->iomap_end) {
- int copied = PAGE_SIZE;
+ if (iomap_errp)
+ *iomap_errp = error;
+ if (!ret && error)
+ ret = dax_fault_return(error);
- if (ret & VM_FAULT_ERROR)
- copied = 0;
- /*
- * The fault is done by now and there's no way back (other
- * thread may be already happily using PTE we have installed).
- * Just ignore error from ->iomap_end since we cannot do much
- * with it.
- */
- ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
- }
- unlock_entry:
+ unlock_entry:
dax_unlock_entry(&xas, entry);
- out:
- trace_dax_pte_fault_done(inode, vmf, ret);
- return ret | major;
+ out:
+ trace_dax_pte_fault_done(iter.inode, vmf, ret);
+ return ret;
}
#ifdef CONFIG_FS_DAX_PMD
- static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
- struct iomap *iomap, void **entry)
+ static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
+ pgoff_t max_pgoff)
{
- struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
- struct vm_area_struct *vma = vmf->vma;
- struct inode *inode = mapping->host;
- pgtable_t pgtable = NULL;
- struct page *zero_page;
- spinlock_t *ptl;
- pmd_t pmd_entry;
- pfn_t pfn;
-
- zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
-
- if (unlikely(!zero_page))
- goto fallback;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
- pfn = page_to_pfn_t(zero_page);
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_PMD | DAX_ZERO_PAGE, false);
+ /*
+ * Make sure that the faulting address's PMD offset (color) matches
+ * the PMD offset from the start of the file. This is necessary so
+ * that a PMD range in the page table overlaps exactly with a PMD
+ * range in the page cache.
+ */
+ if ((vmf->pgoff & PG_PMD_COLOUR) !=
+ ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
+ return true;
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
+ /* Fall back to PTEs if we're going to COW */
+ if (write && !(vmf->vma->vm_flags & VM_SHARED))
+ return true;
- ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
- if (!pmd_none(*(vmf->pmd))) {
- spin_unlock(ptl);
- goto fallback;
- }
+ /* If the PMD would extend outside the VMA */
+ if (pmd_addr < vmf->vma->vm_start)
+ return true;
+ if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
+ return true;
- if (pgtable) {
- pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
- mm_inc_nr_ptes(vma->vm_mm);
- }
- pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
- pmd_entry = pmd_mkhuge(pmd_entry);
- set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
- spin_unlock(ptl);
- trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
- return VM_FAULT_NOPAGE;
+ /* If the PMD would extend beyond the file size */
+ if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
+ return true;
- fallback:
- if (pgtable)
- pte_free(vma->vm_mm, pgtable);
- trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
- return VM_FAULT_FALLBACK;
+ return false;
}
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
- struct vm_area_struct *vma = vmf->vma;
- struct address_space *mapping = vma->vm_file->f_mapping;
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
- unsigned long pmd_addr = vmf->address & PMD_MASK;
- bool write = vmf->flags & FAULT_FLAG_WRITE;
- bool sync;
- unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
- struct inode *inode = mapping->host;
- vm_fault_t result = VM_FAULT_FALLBACK;
- struct iomap iomap = { .type = IOMAP_HOLE };
- struct iomap srcmap = { .type = IOMAP_HOLE };
+ struct iomap_iter iter = {
+ .inode = mapping->host,
+ .len = PMD_SIZE,
+ .flags = IOMAP_FAULT,
+ };
+ vm_fault_t ret = VM_FAULT_FALLBACK;
pgoff_t max_pgoff;
void *entry;
- loff_t pos;
int error;
- pfn_t pfn;
+
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ iter.flags |= IOMAP_WRITE;
/*
* Check whether offset isn't beyond end of file now. Caller is
* supposed to hold locks serializing us with truncate / punch hole so
* this is a reliable test.
*/
- max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
-
- trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
-
- /*
- * Make sure that the faulting address's PMD offset (color) matches
- * the PMD offset from the start of the file. This is necessary so
- * that a PMD range in the page table overlaps exactly with a PMD
- * range in the page cache.
- */
- if ((vmf->pgoff & PG_PMD_COLOUR) !=
- ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
- goto fallback;
+ max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);
- /* Fall back to PTEs if we're going to COW */
- if (write && !(vma->vm_flags & VM_SHARED))
- goto fallback;
-
- /* If the PMD would extend outside the VMA */
- if (pmd_addr < vma->vm_start)
- goto fallback;
- if ((pmd_addr + PMD_SIZE) > vma->vm_end)
- goto fallback;
+ trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);
if (xas.xa_index >= max_pgoff) {
- result = VM_FAULT_SIGBUS;
+ ret = VM_FAULT_SIGBUS;
goto out;
}
- /* If the PMD would extend beyond the file size */
- if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
+ if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
goto fallback;
/*
*/
entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
if (xa_is_internal(entry)) {
- result = xa_to_internal(entry);
+ ret = xa_to_internal(entry);
goto fallback;
}
*/
if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
!pmd_devmap(*vmf->pmd)) {
- result = 0;
+ ret = 0;
goto unlock_entry;
}
- /*
- * Note that we don't use iomap_apply here. We aren't doing I/O, only
- * setting up a mapping, so really we're using iomap_begin() as a way
- * to look up our filesystem block.
- */
- pos = (loff_t)xas.xa_index << PAGE_SHIFT;
- error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
- &srcmap);
- if (error)
- goto unlock_entry;
-
- if (iomap.offset + iomap.length < pos + PMD_SIZE)
- goto finish_iomap;
-
- sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
-
- switch (iomap.type) {
- case IOMAP_MAPPED:
- error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
- if (error < 0)
- goto finish_iomap;
-
- entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
- DAX_PMD, write && !sync);
-
- /*
- * If we are doing synchronous page fault and inode needs fsync,
- * we can insert PMD into page tables only after that happens.
- * Skip insertion for now and return the pfn so that caller can
- * insert it after fsync is done.
- */
- if (sync) {
- if (WARN_ON_ONCE(!pfnp))
- goto finish_iomap;
- *pfnp = pfn;
- result = VM_FAULT_NEEDDSYNC;
- goto finish_iomap;
- }
+ iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
+ while ((error = iomap_iter(&iter, ops)) > 0) {
+ if (iomap_length(&iter) < PMD_SIZE)
+ continue; /* actually breaks out of the loop */
- trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
- result = vmf_insert_pfn_pmd(vmf, pfn, write);
- break;
- case IOMAP_UNWRITTEN:
- case IOMAP_HOLE:
- if (WARN_ON_ONCE(write))
- break;
- result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
- break;
- default:
- WARN_ON_ONCE(1);
- break;
+ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
+ if (ret != VM_FAULT_FALLBACK)
+ iter.processed = PMD_SIZE;
}
- finish_iomap:
- if (ops->iomap_end) {
- int copied = PMD_SIZE;
-
- if (result == VM_FAULT_FALLBACK)
- copied = 0;
- /*
- * The fault is done by now and there's no way back (other
- * thread may be already happily using PMD we have installed).
- * Just ignore error from ->iomap_end since we cannot do much
- * with it.
- */
- ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
- &iomap);
- }
- unlock_entry:
+ unlock_entry:
dax_unlock_entry(&xas, entry);
- fallback:
- if (result == VM_FAULT_FALLBACK) {
- split_huge_pmd(vma, vmf->pmd, vmf->address);
+ fallback:
+ if (ret == VM_FAULT_FALLBACK) {
+ split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
count_vm_event(THP_FAULT_FALLBACK);
}
out:
- trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
- return result;
+ trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
+ return ret;
}
#else
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,