fuse: wait for writepages in syncfs
author: Miklos Szeredi <mszeredi@redhat.com>
Wed, 1 Sep 2021 10:39:02 +0000 (12:39 +0200)
committer: Miklos Szeredi <mszeredi@redhat.com>
Mon, 6 Sep 2021 11:37:02 +0000 (13:37 +0200)
In case of fuse the MM subsystem doesn't guarantee that page writeback
completes by the time ->sync_fs() is called.  This is because fuse
completes page writeback immediately to prevent DoS of memory reclaim by
the userspace file server.

This means that fuse itself must ensure that writes are synced before
sending the SYNCFS request to the server.

Introduce sync buckets, that hold a counter for the number of outstanding
write requests.  On syncfs replace the current bucket with a new one and
wait until the old bucket's counter goes down to zero.

It is possible to have multiple syncfs calls in parallel, in which case
there could be more than one waited-on bucket.  Descendant buckets must
not complete until the parent completes.  Add a count to the child (new)
bucket until the (parent) old bucket completes.

Use RCU protection to dereference the current bucket and to wake up an
emptied bucket.  Use fc->lock to protect against parallel assignments to
the current bucket.

This leaves just the counter to be a possible scalability issue.  The
fc->num_waiting counter has a similar issue, so both should be addressed at
the same time.

Reported-by: Amir Goldstein <amir73il@gmail.com>
Fixes: 2d82ab251ef0 ("virtiofs: propagate sync() to file server")
Cc: <stable@vger.kernel.org> # v5.14
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
fs/fuse/file.c
fs/fuse/fuse_i.h
fs/fuse/inode.c

index 88be26e..2bca7ed 100644 (file)
@@ -392,6 +392,7 @@ struct fuse_writepage_args {
        struct list_head queue_entry;
        struct fuse_writepage_args *next;
        struct inode *inode;
+       struct fuse_sync_bucket *bucket;
 };
 
 static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
@@ -1611,6 +1612,9 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
        struct fuse_args_pages *ap = &wpa->ia.ap;
        int i;
 
+       if (wpa->bucket)
+               fuse_sync_bucket_dec(wpa->bucket);
+
        for (i = 0; i < ap->num_pages; i++)
                __free_page(ap->pages[i]);
 
@@ -1874,6 +1878,20 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
 
 }
 
+static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
+                                        struct fuse_writepage_args *wpa)
+{
+       if (!fc->sync_fs)
+               return; /* fuse_sync_fs() skips the bucket wait in this case too */
+
+       rcu_read_lock();
+       /*
+        * Take one count on the connection's current sync bucket and record
+        * that bucket in wpa->bucket (the count is dropped again through
+        * fuse_sync_bucket_dec() in fuse_writepage_free()).
+        *
+        * Prevent resurrection of dead bucket in unlikely race with syncfs:
+        * atomic_inc_not_zero() refuses to bump a count that has already
+        * reached zero, in which case fc->curr_bucket is read again and the
+        * increment is retried on whatever bucket is current by then.
+        */
+       do {
+               wpa->bucket = rcu_dereference(fc->curr_bucket);
+       } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
+       rcu_read_unlock();
+}
+
 static int fuse_writepage_locked(struct page *page)
 {
        struct address_space *mapping = page->mapping;
@@ -1901,6 +1919,7 @@ static int fuse_writepage_locked(struct page *page)
        if (!wpa->ia.ff)
                goto err_nofile;
 
+       fuse_writepage_add_to_bucket(fc, wpa);
        fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
 
        copy_highpage(tmp_page, page);
@@ -2151,6 +2170,8 @@ static int fuse_writepages_fill(struct page *page,
                        __free_page(tmp_page);
                        goto out_unlock;
                }
+               fuse_writepage_add_to_bucket(fc, wpa);
+
                data->max_pages = 1;
 
                ap = &wpa->ia.ap;
index a784809..f166e24 100644 (file)
@@ -516,6 +516,13 @@ struct fuse_fs_context {
        void **fudptr;
 };
 
+struct fuse_sync_bucket {
+       /*
+        * Starts at 1 (the initial active count set by
+        * fuse_sync_bucket_alloc()); fuse_writepage_add_to_bucket() adds one
+        * per write request and fuse_sync_fs_writes() adds a temporary one
+        * to a new bucket while its predecessor is still being waited on.
+        *
+        * count is a possible scalability bottleneck
+        */
+       atomic_t count;
+       wait_queue_head_t waitq;        /* woken when count drops to zero */
+       struct rcu_head rcu;            /* for kfree_rcu() of a drained bucket */
+};
+
 /**
  * A Fuse connection.
  *
@@ -808,6 +815,9 @@ struct fuse_conn {
 
        /** List of filesystems using this connection */
        struct list_head mounts;
+
+       /* New writepages go into this bucket */
+       struct fuse_sync_bucket __rcu *curr_bucket;
 };
 
 /*
@@ -911,6 +921,15 @@ static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
                descs[i].length = PAGE_SIZE - descs[i].offset;
 }
 
+static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket)
+{
+       /*
+        * Drop one count on @bucket and, if that was the last one, wake up
+        * whoever sleeps on bucket->waitq.
+        *
+        * Need RCU protection to prevent use after free after the decrement:
+        * fuse_sync_fs_writes() releases a drained bucket with kfree_rcu(),
+        * so the wake_up() below has to stay inside this read-side section.
+        */
+       rcu_read_lock();
+       if (atomic_dec_and_test(&bucket->count))
+               wake_up(&bucket->waitq);
+       rcu_read_unlock();
+}
+
 /** Device operations */
 extern const struct file_operations fuse_dev_operations;
 
index a3e7fb4..2187211 100644 (file)
@@ -506,6 +506,57 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
        return err;
 }
 
+/*
+ * Allocate a zeroed sync bucket and set it up for use: the wait queue is
+ * initialised and the count starts at one.  The allocation is requested with
+ * __GFP_NOFAIL; a NULL result is nevertheless passed straight back to the
+ * caller without touching it.
+ */
+static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void)
+{
+       struct fuse_sync_bucket *fresh;
+
+       fresh = kzalloc(sizeof(*fresh), GFP_KERNEL | __GFP_NOFAIL);
+       if (!fresh)
+               return NULL;
+
+       /* Initial active count */
+       atomic_set(&fresh->count, 1);
+       init_waitqueue_head(&fresh->waitq);
+
+       return fresh;
+}
+
+static void fuse_sync_fs_writes(struct fuse_conn *fc)
+{
+       struct fuse_sync_bucket *bucket, *new_bucket;
+       int count;
+       /*
+        * Wait until every count taken on the current sync bucket has been
+        * dropped: publish a freshly allocated bucket as fc->curr_bucket and
+        * sleep until the old bucket's count reaches zero.  Returns without
+        * waiting when the current bucket holds nothing but its initial
+        * count.  The new bucket is allocated before fc->lock is taken.
+        */
+       new_bucket = fuse_sync_bucket_alloc();
+       spin_lock(&fc->lock);
+       bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+       count = atomic_read(&bucket->count);
+       WARN_ON(count < 1);
+       /*
+        * No outstanding writes?  Then only the initial active count is
+        * held: keep the current bucket in place and discard the new one,
+        * which was never published.
+        */
+       if (count == 1) {
+               spin_unlock(&fc->lock);
+               kfree(new_bucket);
+               return;
+       }
+
+       /*
+        * Completion of new bucket depends on completion of this bucket, so add
+        * one more count.
+        */
+       atomic_inc(&new_bucket->count);
+       rcu_assign_pointer(fc->curr_bucket, new_bucket);
+       spin_unlock(&fc->lock);
+       /*
+        * Drop initial active count.  At this point if all writes in this and
+        * ancestor buckets complete, the count will go to zero and this task
+        * will be woken up.
+        */
+       atomic_dec(&bucket->count);
+
+       wait_event(bucket->waitq, atomic_read(&bucket->count) == 0);
+
+       /*
+        * Drop temp count on descendant bucket, then hand the drained old
+        * bucket to kfree_rcu() for freeing.
+        */
+       fuse_sync_bucket_dec(new_bucket);
+       kfree_rcu(bucket, rcu);
+}
+
 static int fuse_sync_fs(struct super_block *sb, int wait)
 {
        struct fuse_mount *fm = get_fuse_mount_super(sb);
@@ -528,6 +579,8 @@ static int fuse_sync_fs(struct super_block *sb, int wait)
        if (!fc->sync_fs)
                return 0;
 
+       fuse_sync_fs_writes(fc);
+
        memset(&inarg, 0, sizeof(inarg));
        args.in_numargs = 1;
        args.in_args[0].size = sizeof(inarg);
@@ -763,6 +816,7 @@ void fuse_conn_put(struct fuse_conn *fc)
 {
        if (refcount_dec_and_test(&fc->count)) {
                struct fuse_iqueue *fiq = &fc->iq;
+               struct fuse_sync_bucket *bucket;
 
                if (IS_ENABLED(CONFIG_FUSE_DAX))
                        fuse_dax_conn_free(fc);
@@ -770,6 +824,11 @@ void fuse_conn_put(struct fuse_conn *fc)
                        fiq->ops->release(fiq);
                put_pid_ns(fc->pid_ns);
                put_user_ns(fc->user_ns);
+               bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+               if (bucket) {
+                       WARN_ON(atomic_read(&bucket->count) != 1);
+                       kfree(bucket);
+               }
                fc->release(fc);
        }
 }
@@ -1418,6 +1477,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
        if (sb->s_flags & SB_MANDLOCK)
                goto err;
 
+       rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc());
        fuse_sb_defaults(sb);
 
        if (ctx->is_bdev) {