Merge tag 'fuse-update-5.15' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 7 Sep 2021 19:18:29 +0000 (12:18 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 7 Sep 2021 19:18:29 +0000 (12:18 -0700)
Pull fuse updates from Miklos Szeredi:

 - Allow mounting an active fuse device. Previously the fuse device
   would always be mounted during initialization, and sharing a fuse
   superblock was only possible through mount or namespace cloning.

 - Fix data flushing in syncfs (virtiofs only)

 - Fix data flushing in copy_file_range()

 - Fix a possible deadlock in atomic O_TRUNC

 - Misc fixes and cleanups

* tag 'fuse-update-5.15' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse:
  fuse: remove unused arg in fuse_write_file_get()
  fuse: wait for writepages in syncfs
  fuse: flush extending writes
  fuse: truncate pagecache on atomic_o_trunc
  fuse: allow sharing existing sb
  fuse: move fget() to fuse_get_tree()
  fuse: move option checking into fuse_fill_super()
  fuse: name fs_context consistently
  fuse: fix use after free in fuse_read_interrupt()

1  2 
fs/fuse/file.c
fs/fuse/fuse_i.h
fs/fuse/inode.c

diff --combined fs/fuse/file.c
@@@ -198,12 -198,11 +198,11 @@@ void fuse_finish_open(struct inode *ino
        struct fuse_file *ff = file->private_data;
        struct fuse_conn *fc = get_fuse_conn(inode);
  
-       if (!(ff->open_flags & FOPEN_KEEP_CACHE))
-               invalidate_inode_pages2(inode->i_mapping);
        if (ff->open_flags & FOPEN_STREAM)
                stream_open(inode, file);
        else if (ff->open_flags & FOPEN_NONSEEKABLE)
                nonseekable_open(inode, file);
        if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
                struct fuse_inode *fi = get_fuse_inode(inode);
  
                fi->attr_version = atomic64_inc_return(&fc->attr_version);
                i_size_write(inode, 0);
                spin_unlock(&fi->lock);
+               truncate_pagecache(inode, 0);
                fuse_invalidate_attr(inode);
                if (fc->writeback_cache)
                        file_update_time(file);
+       } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
+               invalidate_inode_pages2(inode->i_mapping);
        }
        if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
                fuse_link_write_file(file);
  }
@@@ -243,7 -246,7 +246,7 @@@ int fuse_open_common(struct inode *inod
        }
  
        if (dax_truncate) {
 -              down_write(&get_fuse_inode(inode)->i_mmap_sem);
 +              filemap_invalidate_lock(inode->i_mapping);
                err = fuse_dax_break_layouts(inode, 0, 0);
                if (err)
                        goto out;
  
  out:
        if (dax_truncate)
 -              up_write(&get_fuse_inode(inode)->i_mmap_sem);
 +              filemap_invalidate_unlock(inode->i_mapping);
  
        if (is_wb_truncate | dax_truncate) {
                fuse_release_nowrite(inode);
@@@ -389,6 -392,7 +392,7 @@@ struct fuse_writepage_args 
        struct list_head queue_entry;
        struct fuse_writepage_args *next;
        struct inode *inode;
+       struct fuse_sync_bucket *bucket;
  };
  
  static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
@@@ -1608,6 -1612,9 +1612,9 @@@ static void fuse_writepage_free(struct 
        struct fuse_args_pages *ap = &wpa->ia.ap;
        int i;
  
+       if (wpa->bucket)
+               fuse_sync_bucket_dec(wpa->bucket);
        for (i = 0; i < ap->num_pages; i++)
                __free_page(ap->pages[i]);
  
@@@ -1813,8 -1820,7 +1820,7 @@@ static void fuse_writepage_end(struct f
        fuse_writepage_free(wpa);
  }
  
- static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
-                                              struct fuse_inode *fi)
+ static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi)
  {
        struct fuse_file *ff = NULL;
  
        return ff;
  }
  
- static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
-                                            struct fuse_inode *fi)
+ static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi)
  {
-       struct fuse_file *ff = __fuse_write_file_get(fc, fi);
+       struct fuse_file *ff = __fuse_write_file_get(fi);
        WARN_ON(!ff);
        return ff;
  }
  
  int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
  {
-       struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_inode *fi = get_fuse_inode(inode);
        struct fuse_file *ff;
        int err;
  
-       ff = __fuse_write_file_get(fc, fi);
+       ff = __fuse_write_file_get(fi);
        err = fuse_flush_times(inode, ff);
        if (ff)
                fuse_file_put(ff, false, false);
@@@ -1871,6 -1875,20 +1875,20 @@@ static struct fuse_writepage_args *fuse
  
  }
  
+ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
+                                        struct fuse_writepage_args *wpa)
+ {
+       if (!fc->sync_fs)
+               return;
+       rcu_read_lock();
+       /* Prevent resurrection of dead bucket in unlikely race with syncfs */
+       do {
+               wpa->bucket = rcu_dereference(fc->curr_bucket);
+       } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
+       rcu_read_unlock();
+ }
  static int fuse_writepage_locked(struct page *page)
  {
        struct address_space *mapping = page->mapping;
                goto err_free;
  
        error = -EIO;
-       wpa->ia.ff = fuse_write_file_get(fc, fi);
+       wpa->ia.ff = fuse_write_file_get(fi);
        if (!wpa->ia.ff)
                goto err_nofile;
  
+       fuse_writepage_add_to_bucket(fc, wpa);
        fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
  
        copy_highpage(tmp_page, page);
@@@ -2113,7 -2132,7 +2132,7 @@@ static int fuse_writepages_fill(struct 
  
        if (!data->ff) {
                err = -EIO;
-               data->ff = fuse_write_file_get(fc, fi);
+               data->ff = fuse_write_file_get(fi);
                if (!data->ff)
                        goto out_unlock;
        }
                        __free_page(tmp_page);
                        goto out_unlock;
                }
+               fuse_writepage_add_to_bucket(fc, wpa);
                data->max_pages = 1;
  
                ap = &wpa->ia.ap;
@@@ -2881,7 -2902,7 +2902,7 @@@ fuse_direct_IO(struct kiocb *iocb, stru
  
  static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
  {
-       int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+       int err = filemap_write_and_wait_range(inode->i_mapping, start, -1);
  
        if (!err)
                fuse_sync_writes(inode);
@@@ -2920,7 -2941,7 +2941,7 @@@ static long fuse_file_fallocate(struct 
        if (lock_inode) {
                inode_lock(inode);
                if (block_faults) {
 -                      down_write(&fi->i_mmap_sem);
 +                      filemap_invalidate_lock(inode->i_mapping);
                        err = fuse_dax_break_layouts(inode, 0, 0);
                        if (err)
                                goto out;
@@@ -2976,7 -2997,7 +2997,7 @@@ out
                clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
  
        if (block_faults)
 -              up_write(&fi->i_mmap_sem);
 +              filemap_invalidate_unlock(inode->i_mapping);
  
        if (lock_inode)
                inode_unlock(inode);
@@@ -3045,7 -3066,7 +3066,7 @@@ static ssize_t __fuse_copy_file_range(s
         * modifications.  Yet this does give less guarantees than if the
         * copying was performed with write(2).
         *
 -       * To fix this a i_mmap_sem style lock could be used to prevent new
 +       * To fix this a mapping->invalidate_lock could be used to prevent new
         * faults while the copy is ongoing.
         */
        err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
diff --combined fs/fuse/fuse_i.h
@@@ -149,6 -149,13 +149,6 @@@ struct fuse_inode 
        /** Lock to protect write related fields */
        spinlock_t lock;
  
 -      /**
 -       * Can't take inode lock in fault path (leads to circular dependency).
 -       * Introduce another semaphore which can be taken in fault path and
 -       * then other filesystem paths can take this to block faults.
 -       */
 -      struct rw_semaphore i_mmap_sem;
 -
  #ifdef CONFIG_FUSE_DAX
        /*
         * Dax specific inode data
@@@ -482,6 -489,7 +482,7 @@@ struct fuse_dev 
  
  struct fuse_fs_context {
        int fd;
+       struct file *file;
        unsigned int rootmode;
        kuid_t user_id;
        kgid_t group_id;
        void **fudptr;
  };
  
+ struct fuse_sync_bucket {
+       /* count is a possible scalability bottleneck */
+       atomic_t count;
+       wait_queue_head_t waitq;
+       struct rcu_head rcu;
+ };
  /**
   * A Fuse connection.
   *
@@@ -800,6 -815,9 +808,9 @@@ struct fuse_conn 
  
        /** List of filesystems using this connection */
        struct list_head mounts;
+       /* New writepages go into this bucket */
+       struct fuse_sync_bucket __rcu *curr_bucket;
  };
  
  /*
@@@ -903,6 -921,15 +914,15 @@@ static inline void fuse_page_descs_leng
                descs[i].length = PAGE_SIZE - descs[i].offset;
  }
  
+ static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket)
+ {
+       /* Need RCU protection to prevent use after free after the decrement */
+       rcu_read_lock();
+       if (atomic_dec_and_test(&bucket->count))
+               wake_up(&bucket->waitq);
+       rcu_read_unlock();
+ }
  /** Device operations */
  extern const struct file_operations fuse_dev_operations;
  
@@@ -1209,7 -1236,7 +1229,7 @@@ extern const struct xattr_handler *fuse
  extern const struct xattr_handler *fuse_no_acl_xattr_handlers[];
  
  struct posix_acl;
 -struct posix_acl *fuse_get_acl(struct inode *inode, int type);
 +struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu);
  int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
                 struct posix_acl *acl, int type);
  
diff --combined fs/fuse/inode.c
@@@ -85,6 -85,7 +85,6 @@@ static struct inode *fuse_alloc_inode(s
        fi->orig_ino = 0;
        fi->state = 0;
        mutex_init(&fi->mutex);
 -      init_rwsem(&fi->i_mmap_sem);
        spin_lock_init(&fi->lock);
        fi->forget = fuse_alloc_forget();
        if (!fi->forget)
@@@ -137,12 -138,12 +137,12 @@@ static void fuse_evict_inode(struct ino
        }
  }
  
- static int fuse_reconfigure(struct fs_context *fc)
+ static int fuse_reconfigure(struct fs_context *fsc)
  {
-       struct super_block *sb = fc->root->d_sb;
+       struct super_block *sb = fsc->root->d_sb;
  
        sync_filesystem(sb);
-       if (fc->sb_flags & SB_MANDLOCK)
+       if (fsc->sb_flags & SB_MANDLOCK)
                return -EINVAL;
  
        return 0;
@@@ -505,6 -506,57 +505,57 @@@ static int fuse_statfs(struct dentry *d
        return err;
  }
  
+ static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void)
+ {
+       struct fuse_sync_bucket *bucket;
+       bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL);
+       if (bucket) {
+               init_waitqueue_head(&bucket->waitq);
+               /* Initial active count */
+               atomic_set(&bucket->count, 1);
+       }
+       return bucket;
+ }
+ static void fuse_sync_fs_writes(struct fuse_conn *fc)
+ {
+       struct fuse_sync_bucket *bucket, *new_bucket;
+       int count;
+       new_bucket = fuse_sync_bucket_alloc();
+       spin_lock(&fc->lock);
+       bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+       count = atomic_read(&bucket->count);
+       WARN_ON(count < 1);
+       /* No outstanding writes? */
+       if (count == 1) {
+               spin_unlock(&fc->lock);
+               kfree(new_bucket);
+               return;
+       }
+       /*
+        * Completion of new bucket depends on completion of this bucket, so add
+        * one more count.
+        */
+       atomic_inc(&new_bucket->count);
+       rcu_assign_pointer(fc->curr_bucket, new_bucket);
+       spin_unlock(&fc->lock);
+       /*
+        * Drop initial active count.  At this point if all writes in this and
+        * ancestor buckets complete, the count will go to zero and this task
+        * will be woken up.
+        */
+       atomic_dec(&bucket->count);
+       wait_event(bucket->waitq, atomic_read(&bucket->count) == 0);
+       /* Drop temp count on descendant bucket */
+       fuse_sync_bucket_dec(new_bucket);
+       kfree_rcu(bucket, rcu);
+ }
  static int fuse_sync_fs(struct super_block *sb, int wait)
  {
        struct fuse_mount *fm = get_fuse_mount_super(sb);
        if (!fc->sync_fs)
                return 0;
  
+       fuse_sync_fs_writes(fc);
        memset(&inarg, 0, sizeof(inarg));
        args.in_numargs = 1;
        args.in_args[0].size = sizeof(inarg);
@@@ -572,38 -626,38 +625,38 @@@ static const struct fs_parameter_spec f
        {}
  };
  
- static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param)
+ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param)
  {
        struct fs_parse_result result;
-       struct fuse_fs_context *ctx = fc->fs_private;
+       struct fuse_fs_context *ctx = fsc->fs_private;
        int opt;
  
-       if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+       if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
                /*
                 * Ignore options coming from mount(MS_REMOUNT) for backward
                 * compatibility.
                 */
-               if (fc->oldapi)
+               if (fsc->oldapi)
                        return 0;
  
-               return invalfc(fc, "No changes allowed in reconfigure");
+               return invalfc(fsc, "No changes allowed in reconfigure");
        }
  
-       opt = fs_parse(fc, fuse_fs_parameters, param, &result);
+       opt = fs_parse(fsc, fuse_fs_parameters, param, &result);
        if (opt < 0)
                return opt;
  
        switch (opt) {
        case OPT_SOURCE:
-               if (fc->source)
-                       return invalfc(fc, "Multiple sources specified");
-               fc->source = param->string;
+               if (fsc->source)
+                       return invalfc(fsc, "Multiple sources specified");
+               fsc->source = param->string;
                param->string = NULL;
                break;
  
        case OPT_SUBTYPE:
                if (ctx->subtype)
-                       return invalfc(fc, "Multiple subtypes specified");
+                       return invalfc(fsc, "Multiple subtypes specified");
                ctx->subtype = param->string;
                param->string = NULL;
                return 0;
  
        case OPT_ROOTMODE:
                if (!fuse_valid_type(result.uint_32))
-                       return invalfc(fc, "Invalid rootmode");
+                       return invalfc(fsc, "Invalid rootmode");
                ctx->rootmode = result.uint_32;
                ctx->rootmode_present = true;
                break;
  
        case OPT_USER_ID:
-               ctx->user_id = make_kuid(fc->user_ns, result.uint_32);
+               ctx->user_id = make_kuid(fsc->user_ns, result.uint_32);
                if (!uid_valid(ctx->user_id))
-                       return invalfc(fc, "Invalid user_id");
+                       return invalfc(fsc, "Invalid user_id");
                ctx->user_id_present = true;
                break;
  
        case OPT_GROUP_ID:
-               ctx->group_id = make_kgid(fc->user_ns, result.uint_32);
+               ctx->group_id = make_kgid(fsc->user_ns, result.uint_32);
                if (!gid_valid(ctx->group_id))
-                       return invalfc(fc, "Invalid group_id");
+                       return invalfc(fsc, "Invalid group_id");
                ctx->group_id_present = true;
                break;
  
  
        case OPT_BLKSIZE:
                if (!ctx->is_bdev)
-                       return invalfc(fc, "blksize only supported for fuseblk");
+                       return invalfc(fsc, "blksize only supported for fuseblk");
                ctx->blksize = result.uint_32;
                break;
  
        return 0;
  }
  
- static void fuse_free_fc(struct fs_context *fc)
+ static void fuse_free_fsc(struct fs_context *fsc)
  {
-       struct fuse_fs_context *ctx = fc->fs_private;
+       struct fuse_fs_context *ctx = fsc->fs_private;
  
        if (ctx) {
                kfree(ctx->subtype);
@@@ -762,6 -816,7 +815,7 @@@ void fuse_conn_put(struct fuse_conn *fc
  {
        if (refcount_dec_and_test(&fc->count)) {
                struct fuse_iqueue *fiq = &fc->iq;
+               struct fuse_sync_bucket *bucket;
  
                if (IS_ENABLED(CONFIG_FUSE_DAX))
                        fuse_dax_conn_free(fc);
                        fiq->ops->release(fiq);
                put_pid_ns(fc->pid_ns);
                put_user_ns(fc->user_ns);
+               bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+               if (bucket) {
+                       WARN_ON(atomic_read(&bucket->count) != 1);
+                       kfree(bucket);
+               }
                fc->release(fc);
        }
  }
@@@ -1417,6 -1477,7 +1476,7 @@@ int fuse_fill_super_common(struct super
        if (sb->s_flags & SB_MANDLOCK)
                goto err;
  
+       rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc());
        fuse_sb_defaults(sb);
  
        if (ctx->is_bdev) {
@@@ -1508,34 -1569,33 +1568,33 @@@ EXPORT_SYMBOL_GPL(fuse_fill_super_commo
  static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
  {
        struct fuse_fs_context *ctx = fsc->fs_private;
-       struct file *file;
        int err;
        struct fuse_conn *fc;
        struct fuse_mount *fm;
  
-       err = -EINVAL;
-       file = fget(ctx->fd);
-       if (!file)
-               goto err;
+       if (!ctx->file || !ctx->rootmode_present ||
+           !ctx->user_id_present || !ctx->group_id_present)
+               return -EINVAL;
  
        /*
         * Require mount to happen from the same user namespace which
         * opened /dev/fuse to prevent potential attacks.
         */
-       if ((file->f_op != &fuse_dev_operations) ||
-           (file->f_cred->user_ns != sb->s_user_ns))
-               goto err_fput;
-       ctx->fudptr = &file->private_data;
+       err = -EINVAL;
+       if ((ctx->file->f_op != &fuse_dev_operations) ||
+           (ctx->file->f_cred->user_ns != sb->s_user_ns))
+               goto err;
+       ctx->fudptr = &ctx->file->private_data;
  
        fc = kmalloc(sizeof(*fc), GFP_KERNEL);
        err = -ENOMEM;
        if (!fc)
-               goto err_fput;
+               goto err;
  
        fm = kzalloc(sizeof(*fm), GFP_KERNEL);
        if (!fm) {
                kfree(fc);
-               goto err_fput;
+               goto err;
        }
  
        fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
        err = fuse_fill_super_common(sb, ctx);
        if (err)
                goto err_put_conn;
-       /*
-        * atomic_dec_and_test() in fput() provides the necessary
-        * memory barrier for file->private_data to be visible on all
-        * CPUs after this
-        */
-       fput(file);
+       /* file->private_data shall be visible on all CPUs after this */
+       smp_mb();
        fuse_send_init(get_fuse_mount_super(sb));
        return 0;
  
        fuse_conn_put(fc);
        kfree(fm);
        sb->s_fs_info = NULL;
-  err_fput:
-       fput(file);
   err:
        return err;
  }
  
- static int fuse_get_tree(struct fs_context *fc)
+ /*
+  * This is the path where user supplied an already initialized fuse dev.  In
+  * this case never create a new super if the old one is gone.
+  */
+ static int fuse_set_no_super(struct super_block *sb, struct fs_context *fsc)
  {
-       struct fuse_fs_context *ctx = fc->fs_private;
+       return -ENOTCONN;
+ }
  
-       if (!ctx->fd_present || !ctx->rootmode_present ||
-           !ctx->user_id_present || !ctx->group_id_present)
-               return -EINVAL;
+ static int fuse_test_super(struct super_block *sb, struct fs_context *fsc)
+ {
  
- #ifdef CONFIG_BLOCK
-       if (ctx->is_bdev)
-               return get_tree_bdev(fc, fuse_fill_super);
- #endif
+       return fsc->sget_key == get_fuse_conn_super(sb);
+ }
+ static int fuse_get_tree(struct fs_context *fsc)
+ {
+       struct fuse_fs_context *ctx = fsc->fs_private;
+       struct fuse_dev *fud;
+       struct super_block *sb;
+       int err;
  
-       return get_tree_nodev(fc, fuse_fill_super);
+       if (ctx->fd_present)
+               ctx->file = fget(ctx->fd);
+       if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) {
+               err = get_tree_bdev(fsc, fuse_fill_super);
+               goto out_fput;
+       }
+       /*
+        * While block dev mount can be initialized with a dummy device fd
+        * (found by device name), normal fuse mounts can't
+        */
+       if (!ctx->file)
+               return -EINVAL;
+       /*
+        * Allow creating a fuse mount with an already initialized fuse
+        * connection
+        */
+       fud = READ_ONCE(ctx->file->private_data);
+       if (ctx->file->f_op == &fuse_dev_operations && fud) {
+               fsc->sget_key = fud->fc;
+               sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super);
+               err = PTR_ERR_OR_ZERO(sb);
+               if (!IS_ERR(sb))
+                       fsc->root = dget(sb->s_root);
+       } else {
+               err = get_tree_nodev(fsc, fuse_fill_super);
+       }
+ out_fput:
+       if (ctx->file)
+               fput(ctx->file);
+       return err;
  }
  
  static const struct fs_context_operations fuse_context_ops = {
-       .free           = fuse_free_fc,
+       .free           = fuse_free_fsc,
        .parse_param    = fuse_parse_param,
        .reconfigure    = fuse_reconfigure,
        .get_tree       = fuse_get_tree,
  /*
   * Set up the filesystem mount context.
   */
- static int fuse_init_fs_context(struct fs_context *fc)
+ static int fuse_init_fs_context(struct fs_context *fsc)
  {
        struct fuse_fs_context *ctx;
  
        ctx->legacy_opts_show = true;
  
  #ifdef CONFIG_BLOCK
-       if (fc->fs_type == &fuseblk_fs_type) {
+       if (fsc->fs_type == &fuseblk_fs_type) {
                ctx->is_bdev = true;
                ctx->destroy = true;
        }
  #endif
  
-       fc->fs_private = ctx;
-       fc->ops = &fuse_context_ops;
+       fsc->fs_private = ctx;
+       fsc->ops = &fuse_context_ops;
        return 0;
  }