virtiofs: schedule blocking async replies in separate worker
fs/fuse/virtio_fs.c [linux-2.6-microblaze.git]
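This listing shows the file after the change: replies to requests whose args are flagged may_block are no longer ended directly in the virtqueue completion work function. Instead, virtio_fs_requests_done_work() hands them off to a separate virtio_fs_req_work item, so the completion path is free to sleep (see virtio_fs_complete_req_work() below).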
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * virtio-fs: Virtio Filesystem
4  * Copyright (C) 2018 Red Hat, Inc.
5  */
6
7 #include <linux/fs.h>
8 #include <linux/module.h>
9 #include <linux/virtio.h>
10 #include <linux/virtio_fs.h>
11 #include <linux/delay.h>
12 #include <linux/fs_context.h>
13 #include <linux/highmem.h>
14 #include "fuse_i.h"
15
16 /* List of virtio-fs device instances and a lock for the list. Also provides
17  * mutual exclusion between the device removal and mounting paths.
18  */
19 static DEFINE_MUTEX(virtio_fs_mutex);
20 static LIST_HEAD(virtio_fs_instances);
21
22 enum {
23         VQ_HIPRIO,
24         VQ_REQUEST
25 };
26
27 /* Per-virtqueue state */
28 struct virtio_fs_vq {
29         spinlock_t lock;
30         struct virtqueue *vq;     /* protected by ->lock */
31         struct work_struct done_work;
32         struct list_head queued_reqs;
33         struct list_head end_reqs;      /* End these requests */
34         struct delayed_work dispatch_work;
35         struct fuse_dev *fud;
36         bool connected;
37         long in_flight;
38         struct completion in_flight_zero; /* No inflight requests */
39         char name[24];
40 } ____cacheline_aligned_in_smp;
41
42 /* A virtio-fs device instance */
43 struct virtio_fs {
44         struct kref refcount;
45         struct list_head list;    /* on virtio_fs_instances */
46         char *tag;
47         struct virtio_fs_vq *vqs;
48         unsigned int nvqs;               /* number of virtqueues */
49         unsigned int num_request_queues; /* number of request queues */
50 };
51
52 struct virtio_fs_forget_req {
53         struct fuse_in_header ih;
54         struct fuse_forget_in arg;
55 };
56
57 struct virtio_fs_forget {
58         /* This request can be temporarily queued on the virtqueue */
59         struct list_head list;
60         struct virtio_fs_forget_req req;
61 };
62
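/*
 * Deferred completion context for a request whose args are flagged
 * may_block: virtio_fs_requests_done_work() hands such requests to a
 * worker (virtio_fs_complete_req_work()) instead of ending them in the
 * completion work function itself, because ending them may sleep.
 */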
63 struct virtio_fs_req_work {
64         struct fuse_req *req;
65         struct virtio_fs_vq *fsvq;
66         struct work_struct done_work;
67 };
68
69 static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
70                                  struct fuse_req *req, bool in_flight);
71
72 static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
73 {
74         struct virtio_fs *fs = vq->vdev->priv;
75
76         return &fs->vqs[vq->index];
77 }
78
79 static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq)
80 {
81         return &vq_to_fsvq(vq)->fud->pq;
82 }
83
84 /* Should be called with fsvq->lock held. */
85 static inline void inc_in_flight_req(struct virtio_fs_vq *fsvq)
86 {
87         fsvq->in_flight++;
88 }
89
90 /* Should be called with fsvq->lock held. */
91 static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq)
92 {
93         WARN_ON(fsvq->in_flight <= 0);
94         fsvq->in_flight--;
95         if (!fsvq->in_flight)
96                 complete(&fsvq->in_flight_zero);
97 }
98
99 static void release_virtio_fs_obj(struct kref *ref)
100 {
101         struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount);
102
103         kfree(vfs->vqs);
104         kfree(vfs);
105 }
106
107 /* Caller must hold virtio_fs_mutex */
108 static void virtio_fs_put(struct virtio_fs *fs)
109 {
110         kref_put(&fs->refcount, release_virtio_fs_obj);
111 }
112
113 static void virtio_fs_fiq_release(struct fuse_iqueue *fiq)
114 {
115         struct virtio_fs *vfs = fiq->priv;
116
117         mutex_lock(&virtio_fs_mutex);
118         virtio_fs_put(vfs);
119         mutex_unlock(&virtio_fs_mutex);
120 }
121
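/*
 * Wait until fsvq->in_flight drops to zero, then flush the completion
 * and dispatch work items. Callers serialize on virtio_fs_mutex, so only
 * one thread reinits or waits on in_flight_zero at a time.
 */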
122 static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq)
123 {
124         WARN_ON(fsvq->in_flight < 0);
125
126         /* Wait for in-flight requests to finish. */
127         spin_lock(&fsvq->lock);
128         if (fsvq->in_flight) {
129                 /* We are holding virtio_fs_mutex. There should not be any
130                  * waiters waiting for completion.
131                  */
132                 reinit_completion(&fsvq->in_flight_zero);
133                 spin_unlock(&fsvq->lock);
134                 wait_for_completion(&fsvq->in_flight_zero);
135         } else {
136                 spin_unlock(&fsvq->lock);
137         }
138
139         flush_work(&fsvq->done_work);
140         flush_delayed_work(&fsvq->dispatch_work);
141 }
142
143 static void virtio_fs_drain_all_queues_locked(struct virtio_fs *fs)
144 {
145         struct virtio_fs_vq *fsvq;
146         int i;
147
148         for (i = 0; i < fs->nvqs; i++) {
149                 fsvq = &fs->vqs[i];
150                 virtio_fs_drain_queue(fsvq);
151         }
152 }
153
154 static void virtio_fs_drain_all_queues(struct virtio_fs *fs)
155 {
156         /* Provides mutual exclusion between the ->remove and ->kill_sb
157          * paths. We don't want both of them draining the queues at the
158          * same time. The drain logic reinits the completion, which
159          * requires that no other thread is doing a reinit or waiting
160          * for the completion already.
161          */
162         mutex_lock(&virtio_fs_mutex);
163         virtio_fs_drain_all_queues_locked(fs);
164         mutex_unlock(&virtio_fs_mutex);
165 }
166
167 static void virtio_fs_start_all_queues(struct virtio_fs *fs)
168 {
169         struct virtio_fs_vq *fsvq;
170         int i;
171
172         for (i = 0; i < fs->nvqs; i++) {
173                 fsvq = &fs->vqs[i];
174                 spin_lock(&fsvq->lock);
175                 fsvq->connected = true;
176                 spin_unlock(&fsvq->lock);
177         }
178 }
179
180 /* Add a new instance to the list, or return -EEXIST if the tag name already exists */
181 static int virtio_fs_add_instance(struct virtio_fs *fs)
182 {
183         struct virtio_fs *fs2;
184         bool duplicate = false;
185
186         mutex_lock(&virtio_fs_mutex);
187
188         list_for_each_entry(fs2, &virtio_fs_instances, list) {
189                 if (strcmp(fs->tag, fs2->tag) == 0)
190                         duplicate = true;
191         }
192
193         if (!duplicate)
194                 list_add_tail(&fs->list, &virtio_fs_instances);
195
196         mutex_unlock(&virtio_fs_mutex);
197
198         if (duplicate)
199                 return -EEXIST;
200         return 0;
201 }
202
203 /* Return the virtio_fs with a given tag, or NULL */
204 static struct virtio_fs *virtio_fs_find_instance(const char *tag)
205 {
206         struct virtio_fs *fs;
207
208         mutex_lock(&virtio_fs_mutex);
209
210         list_for_each_entry(fs, &virtio_fs_instances, list) {
211                 if (strcmp(fs->tag, tag) == 0) {
212                         kref_get(&fs->refcount);
213                         goto found;
214                 }
215         }
216
217         fs = NULL; /* not found */
218
219 found:
220         mutex_unlock(&virtio_fs_mutex);
221
222         return fs;
223 }
224
225 static void virtio_fs_free_devs(struct virtio_fs *fs)
226 {
227         unsigned int i;
228
229         for (i = 0; i < fs->nvqs; i++) {
230                 struct virtio_fs_vq *fsvq = &fs->vqs[i];
231
232                 if (!fsvq->fud)
233                         continue;
234
235                 fuse_dev_free(fsvq->fud);
236                 fsvq->fud = NULL;
237         }
238 }
239
240 /* Read the filesystem tag from the virtio config space into fs->tag (devm-allocated) */
241 static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs)
242 {
243         char tag_buf[sizeof_field(struct virtio_fs_config, tag)];
244         char *end;
245         size_t len;
246
247         virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag),
248                            &tag_buf, sizeof(tag_buf));
249         end = memchr(tag_buf, '\0', sizeof(tag_buf));
250         if (end == tag_buf)
251                 return -EINVAL; /* empty tag */
252         if (!end)
253                 end = &tag_buf[sizeof(tag_buf)];
254
255         len = end - tag_buf;
256         fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL);
257         if (!fs->tag)
258                 return -ENOMEM;
259         memcpy(fs->tag, tag_buf, len);
260         fs->tag[len] = '\0';
261         return 0;
262 }
263
264 /* Work function for hiprio completion */
265 static void virtio_fs_hiprio_done_work(struct work_struct *work)
266 {
267         struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
268                                                  done_work);
269         struct virtqueue *vq = fsvq->vq;
270
271         /* Free completed FUSE_FORGET requests */
272         spin_lock(&fsvq->lock);
273         do {
274                 unsigned int len;
275                 void *req;
276
277                 virtqueue_disable_cb(vq);
278
279                 while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
280                         kfree(req);
281                         dec_in_flight_req(fsvq);
282                 }
283         } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
284         spin_unlock(&fsvq->lock);
285 }
286
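/*
 * Dispatch worker for request queues: first end requests parked on
 * ->end_reqs (those that failed submission), then retry submission of
 * requests on ->queued_reqs, rescheduling itself if the virtqueue is
 * still full (-ENOMEM/-ENOSPC).
 */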
287 static void virtio_fs_request_dispatch_work(struct work_struct *work)
288 {
289         struct fuse_req *req;
290         struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
291                                                  dispatch_work.work);
292         struct fuse_conn *fc = fsvq->fud->fc;
293         int ret;
294
295         pr_debug("virtio-fs: worker %s called.\n", __func__);
296         while (1) {
297                 spin_lock(&fsvq->lock);
298                 req = list_first_entry_or_null(&fsvq->end_reqs, struct fuse_req,
299                                                list);
300                 if (!req) {
301                         spin_unlock(&fsvq->lock);
302                         break;
303                 }
304
305                 list_del_init(&req->list);
306                 spin_unlock(&fsvq->lock);
307                 fuse_request_end(fc, req);
308         }
309
310         /* Dispatch pending requests */
311         while (1) {
312                 spin_lock(&fsvq->lock);
313                 req = list_first_entry_or_null(&fsvq->queued_reqs,
314                                                struct fuse_req, list);
315                 if (!req) {
316                         spin_unlock(&fsvq->lock);
317                         return;
318                 }
319                 list_del_init(&req->list);
320                 spin_unlock(&fsvq->lock);
321
322                 ret = virtio_fs_enqueue_req(fsvq, req, true);
323                 if (ret < 0) {
324                         if (ret == -ENOMEM || ret == -ENOSPC) {
325                                 spin_lock(&fsvq->lock);
326                                 list_add_tail(&req->list, &fsvq->queued_reqs);
327                                 schedule_delayed_work(&fsvq->dispatch_work,
328                                                       msecs_to_jiffies(1));
329                                 spin_unlock(&fsvq->lock);
330                                 return;
331                         }
332                         req->out.h.error = ret;
333                         spin_lock(&fsvq->lock);
334                         dec_in_flight_req(fsvq);
335                         spin_unlock(&fsvq->lock);
336                         pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n",
337                                ret);
338                         fuse_request_end(fc, req);
339                 }
340         }
341 }
342
343 /*
344  * Returns 1 if the queue is full and the sender should wait a bit before
345  * sending the next request, 0 otherwise.
346  */
347 static int send_forget_request(struct virtio_fs_vq *fsvq,
348                                struct virtio_fs_forget *forget,
349                                bool in_flight)
350 {
351         struct scatterlist sg;
352         struct virtqueue *vq;
353         int ret = 0;
354         bool notify;
355         struct virtio_fs_forget_req *req = &forget->req;
356
357         spin_lock(&fsvq->lock);
358         if (!fsvq->connected) {
359                 if (in_flight)
360                         dec_in_flight_req(fsvq);
361                 kfree(forget);
362                 goto out;
363         }
364
365         sg_init_one(&sg, req, sizeof(*req));
366         vq = fsvq->vq;
367         dev_dbg(&vq->vdev->dev, "%s\n", __func__);
368
369         ret = virtqueue_add_outbuf(vq, &sg, 1, forget, GFP_ATOMIC);
370         if (ret < 0) {
371                 if (ret == -ENOMEM || ret == -ENOSPC) {
372                         pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n",
373                                  ret);
374                         list_add_tail(&forget->list, &fsvq->queued_reqs);
375                         schedule_delayed_work(&fsvq->dispatch_work,
376                                               msecs_to_jiffies(1));
377                         if (!in_flight)
378                                 inc_in_flight_req(fsvq);
379                         /* Queue is full */
380                         ret = 1;
381                 } else {
382                         pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
383                                  ret);
384                         kfree(forget);
385                         if (in_flight)
386                                 dec_in_flight_req(fsvq);
387                 }
388                 goto out;
389         }
390
391         if (!in_flight)
392                 inc_in_flight_req(fsvq);
393         notify = virtqueue_kick_prepare(vq);
394         spin_unlock(&fsvq->lock);
395
396         if (notify)
397                 virtqueue_notify(vq);
398         return ret;
399 out:
400         spin_unlock(&fsvq->lock);
401         return ret;
402 }
403
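/*
 * Dispatch worker for the hiprio queue: resubmit FORGET requests that
 * could not be queued earlier, stopping as soon as send_forget_request()
 * reports the virtqueue is full again (the work is then rescheduled).
 */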
404 static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
405 {
406         struct virtio_fs_forget *forget;
407         struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
408                                                  dispatch_work.work);
409         pr_debug("virtio-fs: worker %s called.\n", __func__);
410         while (1) {
411                 spin_lock(&fsvq->lock);
412                 forget = list_first_entry_or_null(&fsvq->queued_reqs,
413                                         struct virtio_fs_forget, list);
414                 if (!forget) {
415                         spin_unlock(&fsvq->lock);
416                         return;
417                 }
418
419                 list_del(&forget->list);
420                 spin_unlock(&fsvq->lock);
421                 if (send_forget_request(fsvq, forget, true))
422                         return;
423         }
424 }
425
426 /* Allocate and copy args into req->argbuf */
427 static int copy_args_to_argbuf(struct fuse_req *req)
428 {
429         struct fuse_args *args = req->args;
430         unsigned int offset = 0;
431         unsigned int num_in;
432         unsigned int num_out;
433         unsigned int len;
434         unsigned int i;
435
436         num_in = args->in_numargs - args->in_pages;
437         num_out = args->out_numargs - args->out_pages;
438         len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) +
439               fuse_len_args(num_out, args->out_args);
440
441         req->argbuf = kmalloc(len, GFP_ATOMIC);
442         if (!req->argbuf)
443                 return -ENOMEM;
444
445         for (i = 0; i < num_in; i++) {
446                 memcpy(req->argbuf + offset,
447                        args->in_args[i].value,
448                        args->in_args[i].size);
449                 offset += args->in_args[i].size;
450         }
451
452         return 0;
453 }
454
455 /* Copy args out of and free req->argbuf */
456 static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req)
457 {
458         unsigned int remaining;
459         unsigned int offset;
460         unsigned int num_in;
461         unsigned int num_out;
462         unsigned int i;
463
464         remaining = req->out.h.len - sizeof(req->out.h);
465         num_in = args->in_numargs - args->in_pages;
466         num_out = args->out_numargs - args->out_pages;
467         offset = fuse_len_args(num_in, (struct fuse_arg *)args->in_args);
468
469         for (i = 0; i < num_out; i++) {
470                 unsigned int argsize = args->out_args[i].size;
471
472                 if (args->out_argvar &&
473                     i == args->out_numargs - 1 &&
474                     argsize > remaining) {
475                         argsize = remaining;
476                 }
477
478                 memcpy(args->out_args[i].value, req->argbuf + offset, argsize);
479                 offset += argsize;
480
481                 if (i != args->out_numargs - 1)
482                         remaining -= argsize;
483         }
484
485         /* Store the actual size of the variable-length arg */
486         if (args->out_argvar)
487                 args->out_args[args->out_numargs - 1].size = remaining;
488
489         kfree(req->argbuf);
490         req->argbuf = NULL;
491 }
492
493 /* Complete a request: copy out the reply args, zero unused reply pages, end it */
494 static void virtio_fs_request_complete(struct fuse_req *req,
495                                        struct virtio_fs_vq *fsvq)
496 {
497         struct fuse_pqueue *fpq = &fsvq->fud->pq;
498         struct fuse_conn *fc = fsvq->fud->fc;
499         struct fuse_args *args;
500         struct fuse_args_pages *ap;
501         unsigned int len, i, thislen;
502         struct page *page;
503
504         /*
505          * TODO verify that server properly follows FUSE protocol
506          * (oh.uniq, oh.len)
507          */
508         args = req->args;
509         copy_args_from_argbuf(args, req);
510
511         if (args->out_pages && args->page_zeroing) {
512                 len = args->out_args[args->out_numargs - 1].size;
513                 ap = container_of(args, typeof(*ap), args);
514                 for (i = 0; i < ap->num_pages; i++) {
515                         thislen = ap->descs[i].length;
516                         if (len < thislen) {
517                                 WARN_ON(ap->descs[i].offset);
518                                 page = ap->pages[i];
519                                 zero_user_segment(page, len, thislen);
520                                 len = 0;
521                         } else {
522                                 len -= thislen;
523                         }
524                 }
525         }
526
527         spin_lock(&fpq->lock);
528         clear_bit(FR_SENT, &req->flags);
529         spin_unlock(&fpq->lock);
530
531         fuse_request_end(fc, req);
532         spin_lock(&fsvq->lock);
533         dec_in_flight_req(fsvq);
534         spin_unlock(&fsvq->lock);
535 }
536
537 static void virtio_fs_complete_req_work(struct work_struct *work)
538 {
539         struct virtio_fs_req_work *w =
540                 container_of(work, typeof(*w), done_work);
541
542         virtio_fs_request_complete(w->req, w->fsvq);
543         kfree(w);
544 }
545
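/* Work function for request completion on the request virtqueues */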
546 static void virtio_fs_requests_done_work(struct work_struct *work)
547 {
548         struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
549                                                  done_work);
550         struct fuse_pqueue *fpq = &fsvq->fud->pq;
551         struct virtqueue *vq = fsvq->vq;
552         struct fuse_req *req;
553         struct fuse_req *next;
554         unsigned int len;
555         LIST_HEAD(reqs);
556
557         /* Collect completed requests off the virtqueue */
558         spin_lock(&fsvq->lock);
559         do {
560                 virtqueue_disable_cb(vq);
561
562                 while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
563                         spin_lock(&fpq->lock);
564                         list_move_tail(&req->list, &reqs);
565                         spin_unlock(&fpq->lock);
566                 }
567         } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
568         spin_unlock(&fsvq->lock);
569
570         /* End requests */
571         list_for_each_entry_safe(req, next, &reqs, list) {
572                 list_del_init(&req->list);
573
574                 /* blocking async request completes in a worker context */
575                 if (req->args->may_block) {
576                         struct virtio_fs_req_work *w;
577
578                         w = kzalloc(sizeof(*w), GFP_NOFS | __GFP_NOFAIL);
579                         INIT_WORK(&w->done_work, virtio_fs_complete_req_work);
580                         w->fsvq = fsvq;
581                         w->req = req;
582                         schedule_work(&w->done_work);
583                 } else {
584                         virtio_fs_request_complete(req, fsvq);
585                 }
586         }
587 }
588
589 /* Virtqueue interrupt handler */
590 static void virtio_fs_vq_done(struct virtqueue *vq)
591 {
592         struct virtio_fs_vq *fsvq = vq_to_fsvq(vq);
593
594         dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name);
595
596         schedule_work(&fsvq->done_work);
597 }
598
599 /* Initialize virtqueues */
600 static int virtio_fs_setup_vqs(struct virtio_device *vdev,
601                                struct virtio_fs *fs)
602 {
603         struct virtqueue **vqs;
604         vq_callback_t **callbacks;
605         const char **names;
606         unsigned int i;
607         int ret = 0;
608
609         virtio_cread(vdev, struct virtio_fs_config, num_request_queues,
610                      &fs->num_request_queues);
611         if (fs->num_request_queues == 0)
612                 return -EINVAL;
613
614         fs->nvqs = 1 + fs->num_request_queues;
615         fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
616         if (!fs->vqs)
617                 return -ENOMEM;
618
619         vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL);
620         callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]),
621                                         GFP_KERNEL);
622         names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL);
623         if (!vqs || !callbacks || !names) {
624                 ret = -ENOMEM;
625                 goto out;
626         }
627
628         callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
629         snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name),
630                         "hiprio");
631         names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
632         INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work);
633         INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs);
634         INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs);
635         INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work,
636                         virtio_fs_hiprio_dispatch_work);
637         init_completion(&fs->vqs[VQ_HIPRIO].in_flight_zero);
638         spin_lock_init(&fs->vqs[VQ_HIPRIO].lock);
639
640         /* Initialize the requests virtqueues */
641         for (i = VQ_REQUEST; i < fs->nvqs; i++) {
642                 spin_lock_init(&fs->vqs[i].lock);
643                 INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work);
644                 INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work,
645                                   virtio_fs_request_dispatch_work);
646                 INIT_LIST_HEAD(&fs->vqs[i].queued_reqs);
647                 INIT_LIST_HEAD(&fs->vqs[i].end_reqs);
648                 init_completion(&fs->vqs[i].in_flight_zero);
649                 snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name),
650                          "requests.%u", i - VQ_REQUEST);
651                 callbacks[i] = virtio_fs_vq_done;
652                 names[i] = fs->vqs[i].name;
653         }
654
655         ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL);
656         if (ret < 0)
657                 goto out;
658
659         for (i = 0; i < fs->nvqs; i++)
660                 fs->vqs[i].vq = vqs[i];
661
662         virtio_fs_start_all_queues(fs);
663 out:
664         kfree(names);
665         kfree(callbacks);
666         kfree(vqs);
667         if (ret)
668                 kfree(fs->vqs);
669         return ret;
670 }
671
672 /* Free virtqueues (device must already be reset) */
673 static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
674                                   struct virtio_fs *fs)
675 {
676         vdev->config->del_vqs(vdev);
677 }
678
679 static int virtio_fs_probe(struct virtio_device *vdev)
680 {
681         struct virtio_fs *fs;
682         int ret;
683
684         fs = kzalloc(sizeof(*fs), GFP_KERNEL);
685         if (!fs)
686                 return -ENOMEM;
687         kref_init(&fs->refcount);
688         vdev->priv = fs;
689
690         ret = virtio_fs_read_tag(vdev, fs);
691         if (ret < 0)
692                 goto out;
693
694         ret = virtio_fs_setup_vqs(vdev, fs);
695         if (ret < 0)
696                 goto out;
697
698         /* TODO vq affinity */
699
700         /* Bring the device online in case the filesystem is mounted and
701          * requests need to be sent before we return.
702          */
703         virtio_device_ready(vdev);
704
705         ret = virtio_fs_add_instance(fs);
706         if (ret < 0)
707                 goto out_vqs;
708
709         return 0;
710
711 out_vqs:
712         vdev->config->reset(vdev);
713         virtio_fs_cleanup_vqs(vdev, fs);
714
715 out:
716         vdev->priv = NULL;
717         kfree(fs);
718         return ret;
719 }
720
721 static void virtio_fs_stop_all_queues(struct virtio_fs *fs)
722 {
723         struct virtio_fs_vq *fsvq;
724         int i;
725
726         for (i = 0; i < fs->nvqs; i++) {
727                 fsvq = &fs->vqs[i];
728                 spin_lock(&fsvq->lock);
729                 fsvq->connected = false;
730                 spin_unlock(&fsvq->lock);
731         }
732 }
733
734 static void virtio_fs_remove(struct virtio_device *vdev)
735 {
736         struct virtio_fs *fs = vdev->priv;
737
738         mutex_lock(&virtio_fs_mutex);
739         /* This device is going away. No one should get a new reference */
740         list_del_init(&fs->list);
741         virtio_fs_stop_all_queues(fs);
742         virtio_fs_drain_all_queues_locked(fs);
743         vdev->config->reset(vdev);
744         virtio_fs_cleanup_vqs(vdev, fs);
745
746         vdev->priv = NULL;
747         /* Put device reference on virtio_fs object */
748         virtio_fs_put(fs);
749         mutex_unlock(&virtio_fs_mutex);
750 }
751
752 #ifdef CONFIG_PM_SLEEP
753 static int virtio_fs_freeze(struct virtio_device *vdev)
754 {
755         /* TODO need to save state here */
756         pr_warn("virtio-fs: suspend/resume not yet supported\n");
757         return -EOPNOTSUPP;
758 }
759
760 static int virtio_fs_restore(struct virtio_device *vdev)
761 {
762         /* TODO need to restore state here */
763         return 0;
764 }
765 #endif /* CONFIG_PM_SLEEP */
766
767 static const struct virtio_device_id id_table[] = {
768         { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID },
769         {},
770 };
771
772 static const unsigned int feature_table[] = {};
773
774 static struct virtio_driver virtio_fs_driver = {
775         .driver.name            = KBUILD_MODNAME,
776         .driver.owner           = THIS_MODULE,
777         .id_table               = id_table,
778         .feature_table          = feature_table,
779         .feature_table_size     = ARRAY_SIZE(feature_table),
780         .probe                  = virtio_fs_probe,
781         .remove                 = virtio_fs_remove,
782 #ifdef CONFIG_PM_SLEEP
783         .freeze                 = virtio_fs_freeze,
784         .restore                = virtio_fs_restore,
785 #endif
786 };
787
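/*
 * Dequeue one forget and submit it as a FUSE_FORGET request on the
 * hiprio virtqueue. The buffer is allocated with __GFP_NOFAIL because a
 * forget has no error path back to the caller.
 */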
788 static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq)
789 __releases(fiq->lock)
790 {
791         struct fuse_forget_link *link;
792         struct virtio_fs_forget *forget;
793         struct virtio_fs_forget_req *req;
794         struct virtio_fs *fs;
795         struct virtio_fs_vq *fsvq;
796         u64 unique;
797
798         link = fuse_dequeue_forget(fiq, 1, NULL);
799         unique = fuse_get_unique(fiq);
800
801         fs = fiq->priv;
802         fsvq = &fs->vqs[VQ_HIPRIO];
803         spin_unlock(&fiq->lock);
804
805         /* Allocate a buffer for the request */
806         forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL);
807         req = &forget->req;
808
809         req->ih = (struct fuse_in_header){
810                 .opcode = FUSE_FORGET,
811                 .nodeid = link->forget_one.nodeid,
812                 .unique = unique,
813                 .len = sizeof(*req),
814         };
815         req->arg = (struct fuse_forget_in){
816                 .nlookup = link->forget_one.nlookup,
817         };
818
819         send_forget_request(fsvq, forget, false);
820         kfree(link);
821 }
822
823 static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
824 __releases(fiq->lock)
825 {
826         /*
827          * TODO interrupts.
828          *
829  * Normal fs operations on a local filesystem aren't interruptible.
830          * Exceptions are blocking lock operations; for example fcntl(F_SETLKW)
831          * with shared lock between host and guest.
832          */
833         spin_unlock(&fiq->lock);
834 }
835
836 /* Return the number of scatter-gather list elements required */
837 static unsigned int sg_count_fuse_req(struct fuse_req *req)
838 {
839         struct fuse_args *args = req->args;
840         struct fuse_args_pages *ap = container_of(args, typeof(*ap), args);
841         unsigned int total_sgs = 1 /* fuse_in_header */;
842
843         if (args->in_numargs - args->in_pages)
844                 total_sgs += 1;
845
846         if (args->in_pages)
847                 total_sgs += ap->num_pages;
848
849         if (!test_bit(FR_ISREPLY, &req->flags))
850                 return total_sgs;
851
852         total_sgs += 1 /* fuse_out_header */;
853
854         if (args->out_numargs - args->out_pages)
855                 total_sgs += 1;
856
857         if (args->out_pages)
858                 total_sgs += ap->num_pages;
859
860         return total_sgs;
861 }
862
863 /* Add pages to scatter-gather list and return number of elements used */
864 static unsigned int sg_init_fuse_pages(struct scatterlist *sg,
865                                        struct page **pages,
866                                        struct fuse_page_desc *page_descs,
867                                        unsigned int num_pages,
868                                        unsigned int total_len)
869 {
870         unsigned int i;
871         unsigned int this_len;
872
873         for (i = 0; i < num_pages && total_len; i++) {
874                 sg_init_table(&sg[i], 1);
875                 this_len = min(page_descs[i].length, total_len);
876                 sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset);
877                 total_len -= this_len;
878         }
879
880         return i;
881 }
882
883 /* Add args to scatter-gather list and return number of elements used */
884 static unsigned int sg_init_fuse_args(struct scatterlist *sg,
885                                       struct fuse_req *req,
886                                       struct fuse_arg *args,
887                                       unsigned int numargs,
888                                       bool argpages,
889                                       void *argbuf,
890                                       unsigned int *len_used)
891 {
892         struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args);
893         unsigned int total_sgs = 0;
894         unsigned int len;
895
896         len = fuse_len_args(numargs - argpages, args);
897         if (len)
898                 sg_init_one(&sg[total_sgs++], argbuf, len);
899
900         if (argpages)
901                 total_sgs += sg_init_fuse_pages(&sg[total_sgs],
902                                                 ap->pages, ap->descs,
903                                                 ap->num_pages,
904                                                 args[numargs - 1].size);
905
906         if (len_used)
907                 *len_used = len;
908
909         return total_sgs;
910 }
911
912 /* Add a request to a virtqueue and kick the device */
913 static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
914                                  struct fuse_req *req, bool in_flight)
915 {
916         /* requests need at least 4 elements */
917         struct scatterlist *stack_sgs[6];
918         struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)];
919         struct scatterlist **sgs = stack_sgs;
920         struct scatterlist *sg = stack_sg;
921         struct virtqueue *vq;
922         struct fuse_args *args = req->args;
923         unsigned int argbuf_used = 0;
924         unsigned int out_sgs = 0;
925         unsigned int in_sgs = 0;
926         unsigned int total_sgs;
927         unsigned int i;
928         int ret;
929         bool notify;
930         struct fuse_pqueue *fpq;
931
932         /* Does the sglist fit on the stack? */
933         total_sgs = sg_count_fuse_req(req);
934         if (total_sgs > ARRAY_SIZE(stack_sgs)) {
935                 sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC);
936                 sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC);
937                 if (!sgs || !sg) {
938                         ret = -ENOMEM;
939                         goto out;
940                 }
941         }
942
943         /* Use a bounce buffer since stack args cannot be mapped */
944         ret = copy_args_to_argbuf(req);
945         if (ret < 0)
946                 goto out;
947
948         /* Request elements */
949         sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h));
950         out_sgs += sg_init_fuse_args(&sg[out_sgs], req,
951                                      (struct fuse_arg *)args->in_args,
952                                      args->in_numargs, args->in_pages,
953                                      req->argbuf, &argbuf_used);
954
955         /* Reply elements */
956         if (test_bit(FR_ISREPLY, &req->flags)) {
957                 sg_init_one(&sg[out_sgs + in_sgs++],
958                             &req->out.h, sizeof(req->out.h));
959                 in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req,
960                                             args->out_args, args->out_numargs,
961                                             args->out_pages,
962                                             req->argbuf + argbuf_used, NULL);
963         }
964
965         WARN_ON(out_sgs + in_sgs != total_sgs);
966
967         for (i = 0; i < total_sgs; i++)
968                 sgs[i] = &sg[i];
969
970         spin_lock(&fsvq->lock);
971
972         if (!fsvq->connected) {
973                 spin_unlock(&fsvq->lock);
974                 ret = -ENOTCONN;
975                 goto out;
976         }
977
978         vq = fsvq->vq;
979         ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC);
980         if (ret < 0) {
981                 spin_unlock(&fsvq->lock);
982                 goto out;
983         }
984
985         /* Request successfully sent. */
986         fpq = &fsvq->fud->pq;
987         spin_lock(&fpq->lock);
988         list_add_tail(&req->list, fpq->processing);
989         spin_unlock(&fpq->lock);
990         set_bit(FR_SENT, &req->flags);
991         /* matches barrier in request_wait_answer() */
992         smp_mb__after_atomic();
993
994         if (!in_flight)
995                 inc_in_flight_req(fsvq);
996         notify = virtqueue_kick_prepare(vq);
997
998         spin_unlock(&fsvq->lock);
999
1000         if (notify)
1001                 virtqueue_notify(vq);
1002
1003 out:
1004         if (ret < 0 && req->argbuf) {
1005                 kfree(req->argbuf);
1006                 req->argbuf = NULL;
1007         }
1008         if (sgs != stack_sgs) {
1009                 kfree(sgs);
1010                 kfree(sg);
1011         }
1012
1013         return ret;
1014 }
1015
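/*
 * Called with fiq->lock held (released before returning) to submit the
 * single pending request to the request virtqueue. On -ENOMEM/-ENOSPC
 * the request is parked on ->queued_reqs and retried from the dispatch
 * worker; other errors are ended from the worker via ->end_reqs.
 */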
1016 static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
1017 __releases(fiq->lock)
1018 {
1019         unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
1020         struct virtio_fs *fs;
1021         struct fuse_req *req;
1022         struct virtio_fs_vq *fsvq;
1023         int ret;
1024
1025         WARN_ON(list_empty(&fiq->pending));
1026         req = list_last_entry(&fiq->pending, struct fuse_req, list);
1027         clear_bit(FR_PENDING, &req->flags);
1028         list_del_init(&req->list);
1029         WARN_ON(!list_empty(&fiq->pending));
1030         spin_unlock(&fiq->lock);
1031
1032         fs = fiq->priv;
1033
1034         pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
1035                  __func__, req->in.h.opcode, req->in.h.unique,
1036                  req->in.h.nodeid, req->in.h.len,
1037                  fuse_len_args(req->args->out_numargs, req->args->out_args));
1038
1039         fsvq = &fs->vqs[queue_id];
1040         ret = virtio_fs_enqueue_req(fsvq, req, false);
1041         if (ret < 0) {
1042                 if (ret == -ENOMEM || ret == -ENOSPC) {
1043                         /*
1044                          * Virtqueue full. Retry submission from worker
1045                          * context as we might be holding fc->bg_lock.
1046                          */
1047                         spin_lock(&fsvq->lock);
1048                         list_add_tail(&req->list, &fsvq->queued_reqs);
1049                         inc_in_flight_req(fsvq);
1050                         schedule_delayed_work(&fsvq->dispatch_work,
1051                                                 msecs_to_jiffies(1));
1052                         spin_unlock(&fsvq->lock);
1053                         return;
1054                 }
1055                 req->out.h.error = ret;
1056                 pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret);
1057
1058                 /* Can't end request in submission context. Use a worker */
1059                 spin_lock(&fsvq->lock);
1060                 list_add_tail(&req->list, &fsvq->end_reqs);
1061                 schedule_delayed_work(&fsvq->dispatch_work, 0);
1062                 spin_unlock(&fsvq->lock);
1063                 return;
1064         }
1065 }
1066
1067 static const struct fuse_iqueue_ops virtio_fs_fiq_ops = {
1068         .wake_forget_and_unlock         = virtio_fs_wake_forget_and_unlock,
1069         .wake_interrupt_and_unlock      = virtio_fs_wake_interrupt_and_unlock,
1070         .wake_pending_and_unlock        = virtio_fs_wake_pending_and_unlock,
1071         .release                        = virtio_fs_fiq_release,
1072 };
1073
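/*
 * Fill the FUSE super block with fixed virtiofs mount options, allocate
 * a fuse_dev per virtqueue, start the queues, and send the FUSE init
 * request. Runs under virtio_fs_mutex to serialize against ->remove.
 */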
1074 static int virtio_fs_fill_super(struct super_block *sb)
1075 {
1076         struct fuse_conn *fc = get_fuse_conn_super(sb);
1077         struct virtio_fs *fs = fc->iq.priv;
1078         unsigned int i;
1079         int err;
1080         struct fuse_fs_context ctx = {
1081                 .rootmode = S_IFDIR,
1082                 .default_permissions = 1,
1083                 .allow_other = 1,
1084                 .max_read = UINT_MAX,
1085                 .blksize = 512,
1086                 .destroy = true,
1087                 .no_control = true,
1088                 .no_force_umount = true,
1089                 .no_mount_options = true,
1090         };
1091
1092         mutex_lock(&virtio_fs_mutex);
1093
1094         /* After taking the mutex, make sure the virtiofs device is still
1095          * there. Though we hold a reference to it, the driver's ->remove
1096          * might still have cleaned up the virtqueues. In that case bail out.
1097          */
1098         err = -EINVAL;
1099         if (list_empty(&fs->list)) {
1100                 pr_info("virtio-fs: tag <%s> not found\n", fs->tag);
1101                 goto err;
1102         }
1103
1104         err = -ENOMEM;
1105         /* Allocate a fuse_dev for each queue before VQ_REQUEST (currently just hiprio) */
1106         for (i = 0; i < VQ_REQUEST; i++) {
1107                 struct virtio_fs_vq *fsvq = &fs->vqs[i];
1108
1109                 fsvq->fud = fuse_dev_alloc();
1110                 if (!fsvq->fud)
1111                         goto err_free_fuse_devs;
1112         }
1113
1114         ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud;
1115         err = fuse_fill_super_common(sb, &ctx);
1116         if (err < 0)
1117                 goto err_free_fuse_devs;
1118
1119         fc = fs->vqs[VQ_REQUEST].fud->fc;
1120
1121         for (i = 0; i < fs->nvqs; i++) {
1122                 struct virtio_fs_vq *fsvq = &fs->vqs[i];
1123
1124                 if (i == VQ_REQUEST)
1125                         continue; /* already initialized */
1126                 fuse_dev_install(fsvq->fud, fc);
1127         }
1128
1129         /* A previous unmount will have stopped all queues. Start them again */
1130         virtio_fs_start_all_queues(fs);
1131         fuse_send_init(fc);
1132         mutex_unlock(&virtio_fs_mutex);
1133         return 0;
1134
1135 err_free_fuse_devs:
1136         virtio_fs_free_devs(fs);
1137 err:
1138         mutex_unlock(&virtio_fs_mutex);
1139         return err;
1140 }
1141
1142 static void virtio_kill_sb(struct super_block *sb)
1143 {
1144         struct fuse_conn *fc = get_fuse_conn_super(sb);
1145         struct virtio_fs *vfs;
1146         struct virtio_fs_vq *fsvq;
1147
1148         /* If mount failed, we can still be called without any fc */
1149         if (!fc)
1150                 return fuse_kill_sb_anon(sb);
1151
1152         vfs = fc->iq.priv;
1153         fsvq = &vfs->vqs[VQ_HIPRIO];
1154
1155         /* Stop the forget queue. The destroy request will be sent soon */
1156         spin_lock(&fsvq->lock);
1157         fsvq->connected = false;
1158         spin_unlock(&fsvq->lock);
1159         virtio_fs_drain_all_queues(vfs);
1160
1161         fuse_kill_sb_anon(sb);
1162
1163         /* fuse_kill_sb_anon() must have sent the destroy request. Stop all
1164          * queues, drain one more time, and free the fuse devices. Freeing
1165          * the fuse devices drops their references on the fuse_conn, which
1166          * in turn drops its reference on the virtio_fs object.
1167          */
1168         virtio_fs_stop_all_queues(vfs);
1169         virtio_fs_drain_all_queues(vfs);
1170         virtio_fs_free_devs(vfs);
1171 }
1172
1173 static int virtio_fs_test_super(struct super_block *sb,
1174                                 struct fs_context *fsc)
1175 {
1176         struct fuse_conn *fc = fsc->s_fs_info;
1177
1178         return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv;
1179 }
1180
1181 static int virtio_fs_set_super(struct super_block *sb,
1182                                struct fs_context *fsc)
1183 {
1184         int err;
1185
1186         err = get_anon_bdev(&sb->s_dev);
1187         if (!err)
1188                 fuse_conn_get(fsc->s_fs_info);
1189
1190         return err;
1191 }
1192
1193 static int virtio_fs_get_tree(struct fs_context *fsc)
1194 {
1195         struct virtio_fs *fs;
1196         struct super_block *sb;
1197         struct fuse_conn *fc;
1198         int err;
1199
1200         /* This takes a reference on the virtio_fs object. The pointer is
1201          * installed in fc->iq.priv. When the fuse_conn goes away, the fiq
1202          * ->release() callback (virtio_fs_fiq_release()) drops the reference.
1203          */
1204         fs = virtio_fs_find_instance(fsc->source);
1205         if (!fs) {
1206                 pr_info("virtio-fs: tag <%s> not found\n", fsc->source);
1207                 return -EINVAL;
1208         }
1209
1210         fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL);
1211         if (!fc) {
1212                 mutex_lock(&virtio_fs_mutex);
1213                 virtio_fs_put(fs);
1214                 mutex_unlock(&virtio_fs_mutex);
1215                 return -ENOMEM;
1216         }
1217
1218         fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops,
1219                        fs);
1220         fc->release = fuse_free_conn;
1221         fc->delete_stale = true;
1222
1223         fsc->s_fs_info = fc;
1224         sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super);
1225         fuse_conn_put(fc);
1226         if (IS_ERR(sb))
1227                 return PTR_ERR(sb);
1228
1229         if (!sb->s_root) {
1230                 err = virtio_fs_fill_super(sb);
1231                 if (err) {
1232                         deactivate_locked_super(sb);
1233                         return err;
1234                 }
1235
1236                 sb->s_flags |= SB_ACTIVE;
1237         }
1238
1239         WARN_ON(fsc->root);
1240         fsc->root = dget(sb->s_root);
1241         return 0;
1242 }
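
/*
 * Example (guest side), assuming a virtiofs device that exposes the tag
 * "myfs":
 *
 *     mount -t virtiofs myfs /mnt
 *
 * The tag arrives here as fsc->source and is matched against registered
 * instances by virtio_fs_find_instance().
 */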
1243
1244 static const struct fs_context_operations virtio_fs_context_ops = {
1245         .get_tree       = virtio_fs_get_tree,
1246 };
1247
1248 static int virtio_fs_init_fs_context(struct fs_context *fsc)
1249 {
1250         fsc->ops = &virtio_fs_context_ops;
1251         return 0;
1252 }
1253
1254 static struct file_system_type virtio_fs_type = {
1255         .owner          = THIS_MODULE,
1256         .name           = "virtiofs",
1257         .init_fs_context = virtio_fs_init_fs_context,
1258         .kill_sb        = virtio_kill_sb,
1259 };
1260
1261 static int __init virtio_fs_init(void)
1262 {
1263         int ret;
1264
1265         ret = register_virtio_driver(&virtio_fs_driver);
1266         if (ret < 0)
1267                 return ret;
1268
1269         ret = register_filesystem(&virtio_fs_type);
1270         if (ret < 0) {
1271                 unregister_virtio_driver(&virtio_fs_driver);
1272                 return ret;
1273         }
1274
1275         return 0;
1276 }
1277 module_init(virtio_fs_init);
1278
1279 static void __exit virtio_fs_exit(void)
1280 {
1281         unregister_filesystem(&virtio_fs_type);
1282         unregister_virtio_driver(&virtio_fs_driver);
1283 }
1284 module_exit(virtio_fs_exit);
1285
1286 MODULE_AUTHOR("Stefan Hajnoczi <stefanha@redhat.com>");
1287 MODULE_DESCRIPTION("Virtio Filesystem");
1288 MODULE_LICENSE("GPL");
1289 MODULE_ALIAS_FS(KBUILD_MODNAME);
1290 MODULE_DEVICE_TABLE(virtio, id_table);