struct list_head queue_entry;
struct fuse_writepage_args *next;
struct inode *inode;
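+ /* Sync bucket this write is accounted in; NULL when fc->sync_fs is unset */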
+ struct fuse_sync_bucket *bucket;
};
static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
struct fuse_args_pages *ap = &wpa->ia.ap;
int i;
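+ /* This write request is being freed: drop its count on the syncfs bucket, if any */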
+ if (wpa->bucket)
+ fuse_sync_bucket_dec(wpa->bucket);
+
for (i = 0; i < ap->num_pages; i++)
__free_page(ap->pages[i]);
}
+static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
+ struct fuse_writepage_args *wpa)
+{
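+ /*
+ * Register this writepage in the current sync bucket so that syncfs can
+ * wait for its completion.  Without ->sync_fs, fuse_sync_fs() returns
+ * early and nobody waits on buckets, so the accounting is skipped.
+ */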
+ if (!fc->sync_fs)
+ return;
+
+ rcu_read_lock();
+ /* Prevent resurrection of dead bucket in unlikely race with syncfs */
+ do {
+ wpa->bucket = rcu_dereference(fc->curr_bucket);
+ } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
+ rcu_read_unlock();
+}
+
static int fuse_writepage_locked(struct page *page)
{
struct address_space *mapping = page->mapping;
if (!wpa->ia.ff)
goto err_nofile;
+ fuse_writepage_add_to_bucket(fc, wpa);
fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
copy_highpage(tmp_page, page);
__free_page(tmp_page);
goto out_unlock;
}
+ fuse_writepage_add_to_bucket(fc, wpa);
+
data->max_pages = 1;
ap = &wpa->ia.ap;
void **fudptr;
};
+struct fuse_sync_bucket {
+ /* count is a possible scalability bottleneck */
+ atomic_t count;
+ /* syncfs waits here for count to drop to zero */
+ wait_queue_head_t waitq;
+ /* for kfree_rcu() in fuse_sync_fs_writes() */
+ struct rcu_head rcu;
+};
+
/**
* A Fuse connection.
*
/** List of filesystems using this connection */
struct list_head mounts;
+
+ /* New writepages go into this bucket */
+ struct fuse_sync_bucket __rcu *curr_bucket;
};
descs[i].length = PAGE_SIZE - descs[i].offset;
}
+static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket)
+{
+ /* Need RCU protection to prevent use after free after the decrement */
+ rcu_read_lock();
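+ /* Dropping the last count wakes the syncfs waiter in fuse_sync_fs_writes() */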
+ if (atomic_dec_and_test(&bucket->count))
+ wake_up(&bucket->waitq);
+ rcu_read_unlock();
+}
+
/** Device operations */
extern const struct file_operations fuse_dev_operations;
return err;
}
+static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void)
+{
+ struct fuse_sync_bucket *bucket;
+
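+ /* __GFP_NOFAIL: kzalloc() will not return NULL here; the check below is defensive */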
+ bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL);
+ if (bucket) {
+ init_waitqueue_head(&bucket->waitq);
+ /* Initial active count */
+ atomic_set(&bucket->count, 1);
+ }
+ return bucket;
+}
+
+static void fuse_sync_fs_writes(struct fuse_conn *fc)
+{
+ struct fuse_sync_bucket *bucket, *new_bucket;
+ int count;
+
+ new_bucket = fuse_sync_bucket_alloc();
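+ /* fc->lock serializes replacement of curr_bucket against concurrent syncfs calls */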
+ spin_lock(&fc->lock);
+ bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+ count = atomic_read(&bucket->count);
+ WARN_ON(count < 1);
+ /* No outstanding writes? */
+ if (count == 1) {
+ spin_unlock(&fc->lock);
+ kfree(new_bucket);
+ return;
+ }
+
+ /*
+ * Completion of new bucket depends on completion of this bucket, so add
+ * one more count.
+ */
+ atomic_inc(&new_bucket->count);
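+ /* Switch buckets: writepages started after this point account against new_bucket */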
+ rcu_assign_pointer(fc->curr_bucket, new_bucket);
+ spin_unlock(&fc->lock);
+ /*
+ * Drop initial active count. At this point if all writes in this and
+ * ancestor buckets complete, the count will go to zero and this task
+ * will be woken up.
+ */
+ atomic_dec(&bucket->count);
+
+ wait_event(bucket->waitq, atomic_read(&bucket->count) == 0);
+
+ /* Drop temp count on descendant bucket */
+ fuse_sync_bucket_dec(new_bucket);
+ kfree_rcu(bucket, rcu);
+}
+
static int fuse_sync_fs(struct super_block *sb, int wait)
{
struct fuse_mount *fm = get_fuse_mount_super(sb);
if (!fc->sync_fs)
return 0;
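+ /* Wait for all in-flight writepages to complete before sending SYNCFS */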
+ fuse_sync_fs_writes(fc);
+
memset(&inarg, 0, sizeof(inarg));
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
{
if (refcount_dec_and_test(&fc->count)) {
struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_sync_bucket *bucket;
if (IS_ENABLED(CONFIG_FUSE_DAX))
fuse_dax_conn_free(fc);
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
put_user_ns(fc->user_ns);
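+ /* Free the final bucket; only its initial count should remain by now */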
+ bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+ if (bucket) {
+ WARN_ON(atomic_read(&bucket->count) != 1);
+ kfree(bucket);
+ }
fc->release(fc);
}
}
if (sb->s_flags & SB_MANDLOCK)
goto err;
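+ /* Install the connection's initial sync bucket */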
+ rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc());
fuse_sb_defaults(sb);
if (ctx->is_bdev) {