From: Linus Torvalds Date: Tue, 7 Sep 2021 19:18:29 +0000 (-0700) Subject: Merge tag 'fuse-update-5.15' of git://git.kernel.org/pub/scm/linux/kernel/git/mszered... X-Git-Tag: microblaze-v5.16~73 X-Git-Url: http://git.monstr.eu/?p=linux-2.6-microblaze.git;a=commitdiff_plain;h=75b96f0ec5faf730128c32187e3e28441c27a094;hp=996fe06160998a38ff07189feb3ec8ab8f68fd4e Merge tag 'fuse-update-5.15' of git://git./linux/kernel/git/mszeredi/fuse Pull fuse updates from Miklos Szeredi: - Allow mounting an active fuse device. Previously the fuse device would always be mounted during initialization, and sharing a fuse superblock was only possible through mount or namespace cloning - Fix data flushing in syncfs (virtiofs only) - Fix data flushing in copy_file_range() - Fix a possible deadlock in atomic O_TRUNC - Misc fixes and cleanups * tag 'fuse-update-5.15' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse: fuse: remove unused arg in fuse_write_file_get() fuse: wait for writepages in syncfs fuse: flush extending writes fuse: truncate pagecache on atomic_o_trunc fuse: allow sharing existing sb fuse: move fget() to fuse_get_tree() fuse: move option checking into fuse_fill_super() fuse: name fs_context consistently fuse: fix use after free in fuse_read_interrupt() --- diff --git a/fs/fuse/control.c b/fs/fuse/control.c index cc7e94d73c6c..000d2e5627e9 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -328,7 +328,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) drop_nlink(d_inode(fuse_control_sb->s_root)); } -static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx) +static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc) { static const struct tree_descr empty_descr = {""}; struct fuse_conn *fc; @@ -354,18 +354,18 @@ static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx) return 0; } -static int fuse_ctl_get_tree(struct fs_context *fc) +static int fuse_ctl_get_tree(struct fs_context *fsc) { - return get_tree_single(fc, fuse_ctl_fill_super); + return get_tree_single(fsc, fuse_ctl_fill_super); } static const struct fs_context_operations fuse_ctl_context_ops = { .get_tree = fuse_ctl_get_tree, }; -static int fuse_ctl_init_fs_context(struct fs_context *fc) +static int fuse_ctl_init_fs_context(struct fs_context *fsc) { - fc->ops = &fuse_ctl_context_ops; + fsc->ops = &fuse_ctl_context_ops; return 0; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1c8f79b3dd06..dde341a6388a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -288,10 +288,10 @@ void fuse_request_end(struct fuse_req *req) /* * test_and_set_bit() implies smp_mb() between bit - * changing and below intr_entry check. Pairs with + * changing and below FR_INTERRUPTED check. Pairs with * smp_mb() from queue_interrupt(). */ - if (!list_empty(&req->intr_entry)) { + if (test_bit(FR_INTERRUPTED, &req->flags)) { spin_lock(&fiq->lock); list_del_init(&req->intr_entry); spin_unlock(&fiq->lock); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 621a662c19fb..11404f8c21c7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -198,12 +198,11 @@ void fuse_finish_open(struct inode *inode, struct file *file) struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); - if (!(ff->open_flags & FOPEN_KEEP_CACHE)) - invalidate_inode_pages2(inode->i_mapping); if (ff->open_flags & FOPEN_STREAM) stream_open(inode, file); else if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -211,10 +210,14 @@ void fuse_finish_open(struct inode *inode, struct file *file) fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, 0); spin_unlock(&fi->lock); + truncate_pagecache(inode, 0); fuse_invalidate_attr(inode); if (fc->writeback_cache) file_update_time(file); + } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) { + invalidate_inode_pages2(inode->i_mapping); } + if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); } @@ -389,6 +392,7 @@ struct fuse_writepage_args { struct list_head queue_entry; struct fuse_writepage_args *next; struct inode *inode; + struct fuse_sync_bucket *bucket; }; static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, @@ -1608,6 +1612,9 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) struct fuse_args_pages *ap = &wpa->ia.ap; int i; + if (wpa->bucket) + fuse_sync_bucket_dec(wpa->bucket); + for (i = 0; i < ap->num_pages; i++) __free_page(ap->pages[i]); @@ -1813,8 +1820,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, fuse_writepage_free(wpa); } -static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi) { struct fuse_file *ff = NULL; @@ -1829,22 +1835,20 @@ static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, return ff; } -static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi) { - struct fuse_file *ff = __fuse_write_file_get(fc, fi); + struct fuse_file *ff = __fuse_write_file_get(fi); WARN_ON(!ff); return ff; } int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff; int err; - ff = __fuse_write_file_get(fc, fi); + ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) fuse_file_put(ff, false, false); @@ -1871,6 +1875,20 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void) } +static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, + struct fuse_writepage_args *wpa) +{ + if (!fc->sync_fs) + return; + + rcu_read_lock(); + /* Prevent resurrection of dead bucket in unlikely race with syncfs */ + do { + wpa->bucket = rcu_dereference(fc->curr_bucket); + } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count))); + rcu_read_unlock(); +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; @@ -1894,10 +1912,11 @@ static int fuse_writepage_locked(struct page *page) goto err_free; error = -EIO; - wpa->ia.ff = fuse_write_file_get(fc, fi); + wpa->ia.ff = fuse_write_file_get(fi); if (!wpa->ia.ff) goto err_nofile; + fuse_writepage_add_to_bucket(fc, wpa); fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); copy_highpage(tmp_page, page); @@ -2113,7 +2132,7 @@ static int fuse_writepages_fill(struct page *page, if (!data->ff) { err = -EIO; - data->ff = fuse_write_file_get(fc, fi); + data->ff = fuse_write_file_get(fi); if (!data->ff) goto out_unlock; } @@ -2148,6 +2167,8 @@ static int fuse_writepages_fill(struct page *page, __free_page(tmp_page); goto out_unlock; } + fuse_writepage_add_to_bucket(fc, wpa); + data->max_pages = 1; ap = &wpa->ia.ap; @@ -2881,7 +2902,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) { - int err = filemap_write_and_wait_range(inode->i_mapping, start, end); + int err = filemap_write_and_wait_range(inode->i_mapping, start, -1); if (!err) fuse_sync_writes(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 3d18556a01ad..319596df5dc6 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -482,6 +482,7 @@ struct fuse_dev { struct fuse_fs_context { int fd; + struct file *file; unsigned int rootmode; kuid_t user_id; kgid_t group_id; @@ -508,6 +509,13 @@ struct fuse_fs_context { void **fudptr; }; +struct fuse_sync_bucket { + /* count is a possible scalability bottleneck */ + atomic_t count; + wait_queue_head_t waitq; + struct rcu_head rcu; +}; + /** * A Fuse connection. * @@ -800,6 +808,9 @@ struct fuse_conn { /** List of filesystems using this connection */ struct list_head mounts; + + /* New writepages go into this bucket */ + struct fuse_sync_bucket __rcu *curr_bucket; }; /* @@ -903,6 +914,15 @@ static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, descs[i].length = PAGE_SIZE - descs[i].offset; } +static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket) +{ + /* Need RCU protection to prevent use after free after the decrement */ + rcu_read_lock(); + if (atomic_dec_and_test(&bucket->count)) + wake_up(&bucket->waitq); + rcu_read_unlock(); +} + /** Device operations */ extern const struct file_operations fuse_dev_operations; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e07e429f32e1..36cd03114b6d 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -137,12 +137,12 @@ static void fuse_evict_inode(struct inode *inode) } } -static int fuse_reconfigure(struct fs_context *fc) +static int fuse_reconfigure(struct fs_context *fsc) { - struct super_block *sb = fc->root->d_sb; + struct super_block *sb = fsc->root->d_sb; sync_filesystem(sb); - if (fc->sb_flags & SB_MANDLOCK) + if (fsc->sb_flags & SB_MANDLOCK) return -EINVAL; return 0; @@ -505,6 +505,57 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) return err; } +static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void) +{ + struct fuse_sync_bucket *bucket; + + bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL); + if (bucket) { + init_waitqueue_head(&bucket->waitq); + /* Initial active count */ + atomic_set(&bucket->count, 1); + } + return bucket; +} + +static void fuse_sync_fs_writes(struct fuse_conn *fc) +{ + struct fuse_sync_bucket *bucket, *new_bucket; + int count; + + new_bucket = fuse_sync_bucket_alloc(); + spin_lock(&fc->lock); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + count = atomic_read(&bucket->count); + WARN_ON(count < 1); + /* No outstanding writes? */ + if (count == 1) { + spin_unlock(&fc->lock); + kfree(new_bucket); + return; + } + + /* + * Completion of new bucket depends on completion of this bucket, so add + * one more count. + */ + atomic_inc(&new_bucket->count); + rcu_assign_pointer(fc->curr_bucket, new_bucket); + spin_unlock(&fc->lock); + /* + * Drop initial active count. At this point if all writes in this and + * ancestor buckets complete, the count will go to zero and this task + * will be woken up. + */ + atomic_dec(&bucket->count); + + wait_event(bucket->waitq, atomic_read(&bucket->count) == 0); + + /* Drop temp count on descendant bucket */ + fuse_sync_bucket_dec(new_bucket); + kfree_rcu(bucket, rcu); +} + static int fuse_sync_fs(struct super_block *sb, int wait) { struct fuse_mount *fm = get_fuse_mount_super(sb); @@ -527,6 +578,8 @@ static int fuse_sync_fs(struct super_block *sb, int wait) if (!fc->sync_fs) return 0; + fuse_sync_fs_writes(fc); + memset(&inarg, 0, sizeof(inarg)); args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); @@ -572,38 +625,38 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = { {} }; -static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) +static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) { struct fs_parse_result result; - struct fuse_fs_context *ctx = fc->fs_private; + struct fuse_fs_context *ctx = fsc->fs_private; int opt; - if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { /* * Ignore options coming from mount(MS_REMOUNT) for backward * compatibility. */ - if (fc->oldapi) + if (fsc->oldapi) return 0; - return invalfc(fc, "No changes allowed in reconfigure"); + return invalfc(fsc, "No changes allowed in reconfigure"); } - opt = fs_parse(fc, fuse_fs_parameters, param, &result); + opt = fs_parse(fsc, fuse_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case OPT_SOURCE: - if (fc->source) - return invalfc(fc, "Multiple sources specified"); - fc->source = param->string; + if (fsc->source) + return invalfc(fsc, "Multiple sources specified"); + fsc->source = param->string; param->string = NULL; break; case OPT_SUBTYPE: if (ctx->subtype) - return invalfc(fc, "Multiple subtypes specified"); + return invalfc(fsc, "Multiple subtypes specified"); ctx->subtype = param->string; param->string = NULL; return 0; @@ -615,22 +668,22 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) case OPT_ROOTMODE: if (!fuse_valid_type(result.uint_32)) - return invalfc(fc, "Invalid rootmode"); + return invalfc(fsc, "Invalid rootmode"); ctx->rootmode = result.uint_32; ctx->rootmode_present = true; break; case OPT_USER_ID: - ctx->user_id = make_kuid(fc->user_ns, result.uint_32); + ctx->user_id = make_kuid(fsc->user_ns, result.uint_32); if (!uid_valid(ctx->user_id)) - return invalfc(fc, "Invalid user_id"); + return invalfc(fsc, "Invalid user_id"); ctx->user_id_present = true; break; case OPT_GROUP_ID: - ctx->group_id = make_kgid(fc->user_ns, result.uint_32); + ctx->group_id = make_kgid(fsc->user_ns, result.uint_32); if (!gid_valid(ctx->group_id)) - return invalfc(fc, "Invalid group_id"); + return invalfc(fsc, "Invalid group_id"); ctx->group_id_present = true; break; @@ -648,7 +701,7 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) case OPT_BLKSIZE: if (!ctx->is_bdev) - return invalfc(fc, "blksize only supported for fuseblk"); + return invalfc(fsc, "blksize only supported for fuseblk"); ctx->blksize = result.uint_32; break; @@ -659,9 +712,9 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } -static void fuse_free_fc(struct fs_context *fc) +static void fuse_free_fsc(struct fs_context *fsc) { - struct fuse_fs_context *ctx = fc->fs_private; + struct fuse_fs_context *ctx = fsc->fs_private; if (ctx) { kfree(ctx->subtype); @@ -762,6 +815,7 @@ void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { struct fuse_iqueue *fiq = &fc->iq; + struct fuse_sync_bucket *bucket; if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_conn_free(fc); @@ -769,6 +823,11 @@ void fuse_conn_put(struct fuse_conn *fc) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); put_user_ns(fc->user_ns); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + if (bucket) { + WARN_ON(atomic_read(&bucket->count) != 1); + kfree(bucket); + } fc->release(fc); } } @@ -1417,6 +1476,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) if (sb->s_flags & SB_MANDLOCK) goto err; + rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc()); fuse_sb_defaults(sb); if (ctx->is_bdev) { @@ -1508,34 +1568,33 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common); static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; - struct file *file; int err; struct fuse_conn *fc; struct fuse_mount *fm; - err = -EINVAL; - file = fget(ctx->fd); - if (!file) - goto err; + if (!ctx->file || !ctx->rootmode_present || + !ctx->user_id_present || !ctx->group_id_present) + return -EINVAL; /* * Require mount to happen from the same user namespace which * opened /dev/fuse to prevent potential attacks. */ - if ((file->f_op != &fuse_dev_operations) || - (file->f_cred->user_ns != sb->s_user_ns)) - goto err_fput; - ctx->fudptr = &file->private_data; + err = -EINVAL; + if ((ctx->file->f_op != &fuse_dev_operations) || + (ctx->file->f_cred->user_ns != sb->s_user_ns)) + goto err; + ctx->fudptr = &ctx->file->private_data; fc = kmalloc(sizeof(*fc), GFP_KERNEL); err = -ENOMEM; if (!fc) - goto err_fput; + goto err; fm = kzalloc(sizeof(*fm), GFP_KERNEL); if (!fm) { kfree(fc); - goto err_fput; + goto err; } fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); @@ -1546,12 +1605,8 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) err = fuse_fill_super_common(sb, ctx); if (err) goto err_put_conn; - /* - * atomic_dec_and_test() in fput() provides the necessary - * memory barrier for file->private_data to be visible on all - * CPUs after this - */ - fput(file); + /* file->private_data shall be visible on all CPUs after this */ + smp_mb(); fuse_send_init(get_fuse_mount_super(sb)); return 0; @@ -1559,30 +1614,68 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) fuse_conn_put(fc); kfree(fm); sb->s_fs_info = NULL; - err_fput: - fput(file); err: return err; } -static int fuse_get_tree(struct fs_context *fc) +/* + * This is the path where user supplied an already initialized fuse dev. In + * this case never create a new super if the old one is gone. + */ +static int fuse_set_no_super(struct super_block *sb, struct fs_context *fsc) { - struct fuse_fs_context *ctx = fc->fs_private; + return -ENOTCONN; +} - if (!ctx->fd_present || !ctx->rootmode_present || - !ctx->user_id_present || !ctx->group_id_present) - return -EINVAL; +static int fuse_test_super(struct super_block *sb, struct fs_context *fsc) +{ -#ifdef CONFIG_BLOCK - if (ctx->is_bdev) - return get_tree_bdev(fc, fuse_fill_super); -#endif + return fsc->sget_key == get_fuse_conn_super(sb); +} + +static int fuse_get_tree(struct fs_context *fsc) +{ + struct fuse_fs_context *ctx = fsc->fs_private; + struct fuse_dev *fud; + struct super_block *sb; + int err; - return get_tree_nodev(fc, fuse_fill_super); + if (ctx->fd_present) + ctx->file = fget(ctx->fd); + + if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) { + err = get_tree_bdev(fsc, fuse_fill_super); + goto out_fput; + } + /* + * While block dev mount can be initialized with a dummy device fd + * (found by device name), normal fuse mounts can't + */ + if (!ctx->file) + return -EINVAL; + + /* + * Allow creating a fuse mount with an already initialized fuse + * connection + */ + fud = READ_ONCE(ctx->file->private_data); + if (ctx->file->f_op == &fuse_dev_operations && fud) { + fsc->sget_key = fud->fc; + sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super); + err = PTR_ERR_OR_ZERO(sb); + if (!IS_ERR(sb)) + fsc->root = dget(sb->s_root); + } else { + err = get_tree_nodev(fsc, fuse_fill_super); + } +out_fput: + if (ctx->file) + fput(ctx->file); + return err; } static const struct fs_context_operations fuse_context_ops = { - .free = fuse_free_fc, + .free = fuse_free_fsc, .parse_param = fuse_parse_param, .reconfigure = fuse_reconfigure, .get_tree = fuse_get_tree, @@ -1591,7 +1684,7 @@ static const struct fs_context_operations fuse_context_ops = { /* * Set up the filesystem mount context. */ -static int fuse_init_fs_context(struct fs_context *fc) +static int fuse_init_fs_context(struct fs_context *fsc) { struct fuse_fs_context *ctx; @@ -1604,14 +1697,14 @@ static int fuse_init_fs_context(struct fs_context *fc) ctx->legacy_opts_show = true; #ifdef CONFIG_BLOCK - if (fc->fs_type == &fuseblk_fs_type) { + if (fsc->fs_type == &fuseblk_fs_type) { ctx->is_bdev = true; ctx->destroy = true; } #endif - fc->fs_private = ctx; - fc->ops = &fuse_context_ops; + fsc->fs_private = ctx; + fsc->ops = &fuse_context_ops; return 0; } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 8f52cdaa8445..0ad89c6629d7 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -97,14 +97,14 @@ static const struct fs_parameter_spec virtio_fs_parameters[] = { {} }; -static int virtio_fs_parse_param(struct fs_context *fc, +static int virtio_fs_parse_param(struct fs_context *fsc, struct fs_parameter *param) { struct fs_parse_result result; - struct fuse_fs_context *ctx = fc->fs_private; + struct fuse_fs_context *ctx = fsc->fs_private; int opt; - opt = fs_parse(fc, virtio_fs_parameters, param, &result); + opt = fs_parse(fsc, virtio_fs_parameters, param, &result); if (opt < 0) return opt; @@ -119,9 +119,9 @@ static int virtio_fs_parse_param(struct fs_context *fc, return 0; } -static void virtio_fs_free_fc(struct fs_context *fc) +static void virtio_fs_free_fsc(struct fs_context *fsc) { - struct fuse_fs_context *ctx = fc->fs_private; + struct fuse_fs_context *ctx = fsc->fs_private; kfree(ctx); } @@ -1488,7 +1488,7 @@ out_err: } static const struct fs_context_operations virtio_fs_context_ops = { - .free = virtio_fs_free_fc, + .free = virtio_fs_free_fsc, .parse_param = virtio_fs_parse_param, .get_tree = virtio_fs_get_tree, };