// SPDX-License-Identifier: GPL-2.0-only
/*
 * VDUSE: vDPA Device in Userspace
 *
 * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
 *
 * Author: Xie Yongji <xieyongji@bytedance.com>
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/dma-map-ops.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/vdpa.h>
#include <linux/nospec.h>
#include <uapi/linux/vduse.h>
#include <uapi/linux/vdpa.h>
#include <uapi/linux/virtio_config.h>
#include <uapi/linux/virtio_ids.h>
#include <uapi/linux/virtio_blk.h>
#include <linux/mod_devicetable.h>

#include "iova_domain.h"

#define DRV_AUTHOR "Yongji Xie <xieyongji@bytedance.com>"
#define DRV_DESC "vDPA Device in Userspace"
#define DRV_LICENSE "GPL v2"

#define VDUSE_DEV_MAX (1U << MINORBITS)
#define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
#define VDUSE_IOVA_SIZE (128 * 1024 * 1024)
#define VDUSE_MSG_DEFAULT_TIMEOUT 30
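/*
 * Per-virtqueue state kept on behalf of the userspace device: ring
 * addresses and sizes programmed by the vDPA driver, an eventfd used to
 * relay kicks to userspace, and a callback used to inject interrupts
 * back into the driver.
 */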
struct vduse_virtqueue {
	u16 index;
	u16 num_max;
	u32 num;
	u64 desc_addr;
	u64 driver_addr;
	u64 device_addr;
	struct vdpa_vq_state state;
	bool ready;
	bool kicked;
	spinlock_t kick_lock;
	spinlock_t irq_lock;
	struct eventfd_ctx *kickfd;
	struct vdpa_callback cb;
	struct work_struct inject;
	struct work_struct kick;
};
struct vduse_dev;

struct vduse_vdpa {
	struct vdpa_device vdpa;
	struct vduse_dev *dev;
};
struct vduse_dev {
	struct vduse_vdpa *vdev;
	struct device *dev;
	struct vduse_virtqueue *vqs;
	struct vduse_iova_domain *domain;
	char *name;
	struct mutex lock;
	spinlock_t msg_lock;
	u64 msg_unique;
	u32 msg_timeout;
	wait_queue_head_t waitq;
	struct list_head send_list;
	struct list_head recv_list;
	struct vdpa_callback config_cb;
	struct work_struct inject;
	spinlock_t irq_lock;
	int minor;
	bool broken;
	bool connected;
	u64 api_version;
	u64 device_features;
	u64 driver_features;
	u32 device_id;
	u32 vendor_id;
	u32 generation;
	u32 config_size;
	void *config;
	u8 status;
	u32 vq_num;
	u32 vq_align;
};
struct vduse_dev_msg {
	struct vduse_dev_request req;
	struct vduse_dev_response resp;
	struct list_head list;
	wait_queue_head_t waitq;
	bool completed;
};

struct vduse_control {
	u64 api_version;
};
static DEFINE_MUTEX(vduse_lock);
static DEFINE_IDR(vduse_idr);

static dev_t vduse_major;
static struct class *vduse_class;
static struct cdev vduse_ctrl_cdev;
static struct cdev vduse_cdev;
static struct workqueue_struct *vduse_irq_wq;

static u32 allowed_device_id[] = {
	VIRTIO_ID_BLOCK,
};
static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa)
{
	struct vduse_vdpa *vdev = container_of(vdpa, struct vduse_vdpa, vdpa);

	return vdev->dev;
}

static inline struct vduse_dev *dev_to_vduse(struct device *dev)
{
	struct vdpa_device *vdpa = dev_to_vdpa(dev);

	return vdpa_to_vduse(vdpa);
}
static struct vduse_dev_msg *vduse_find_msg(struct list_head *head,
					    uint32_t request_id)
{
	struct vduse_dev_msg *msg;

	list_for_each_entry(msg, head, list) {
		if (msg->req.request_id == request_id) {
			list_del(&msg->list);
			return msg;
		}
	}

	return NULL;
}

static struct vduse_dev_msg *vduse_dequeue_msg(struct list_head *head)
{
	struct vduse_dev_msg *msg = NULL;

	if (!list_empty(head)) {
		msg = list_first_entry(head, struct vduse_dev_msg, list);
		list_del(&msg->list);
	}

	return msg;
}

static void vduse_enqueue_msg(struct list_head *head,
			      struct vduse_dev_msg *msg)
{
	list_add_tail(&msg->list, head);
}
static void vduse_dev_broken(struct vduse_dev *dev)
{
	struct vduse_dev_msg *msg, *tmp;

	if (unlikely(dev->broken))
		return;

	list_splice_init(&dev->recv_list, &dev->send_list);
	list_for_each_entry_safe(msg, tmp, &dev->send_list, list) {
		list_del(&msg->list);
		msg->completed = 1;
		msg->resp.result = VDUSE_REQ_RESULT_FAILED;
		wake_up(&msg->waitq);
	}
	dev->broken = true;
	wake_up(&dev->waitq);
}
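/*
 * Post a request on the send list and block until userspace acknowledges
 * it with a matching response. A request that times out marks the whole
 * device as broken, since the userspace side can no longer be trusted.
 */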
static int vduse_dev_msg_sync(struct vduse_dev *dev,
			      struct vduse_dev_msg *msg)
{
	int ret;

	if (unlikely(dev->broken))
		return -EIO;

	init_waitqueue_head(&msg->waitq);
	spin_lock(&dev->msg_lock);
	if (unlikely(dev->broken)) {
		spin_unlock(&dev->msg_lock);
		return -EIO;
	}
	msg->req.request_id = dev->msg_unique++;
	vduse_enqueue_msg(&dev->send_list, msg);
	wake_up(&dev->waitq);
	spin_unlock(&dev->msg_lock);
	if (dev->msg_timeout)
		ret = wait_event_killable_timeout(msg->waitq, msg->completed,
						  (long)dev->msg_timeout * HZ);
	else
		ret = wait_event_killable(msg->waitq, msg->completed);

	spin_lock(&dev->msg_lock);
	if (!msg->completed) {
		list_del(&msg->list);
		msg->resp.result = VDUSE_REQ_RESULT_FAILED;
		/* Mark the device as malfunctioning when there is a timeout */
		if (!ret)
			vduse_dev_broken(dev);
	}
	ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 0 : -EIO;
	spin_unlock(&dev->msg_lock);

	return ret;
}
static int vduse_dev_get_vq_state_packed(struct vduse_dev *dev,
					 struct vduse_virtqueue *vq,
					 struct vdpa_vq_state_packed *packed)
{
	struct vduse_dev_msg msg = { 0 };
	int ret;

	msg.req.type = VDUSE_GET_VQ_STATE;
	msg.req.vq_state.index = vq->index;

	ret = vduse_dev_msg_sync(dev, &msg);
	if (ret)
		return ret;

	packed->last_avail_counter =
		msg.resp.vq_state.packed.last_avail_counter & 0x0001;
	packed->last_avail_idx =
		msg.resp.vq_state.packed.last_avail_idx & 0x7FFF;
	packed->last_used_counter =
		msg.resp.vq_state.packed.last_used_counter & 0x0001;
	packed->last_used_idx =
		msg.resp.vq_state.packed.last_used_idx & 0x7FFF;

	return 0;
}

static int vduse_dev_get_vq_state_split(struct vduse_dev *dev,
					struct vduse_virtqueue *vq,
					struct vdpa_vq_state_split *split)
{
	struct vduse_dev_msg msg = { 0 };
	int ret;

	msg.req.type = VDUSE_GET_VQ_STATE;
	msg.req.vq_state.index = vq->index;

	ret = vduse_dev_msg_sync(dev, &msg);
	if (ret)
		return ret;

	split->avail_index = msg.resp.vq_state.split.avail_index;

	return 0;
}
static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
{
	struct vduse_dev_msg msg = { 0 };

	msg.req.type = VDUSE_SET_STATUS;
	msg.req.s.status = status;

	return vduse_dev_msg_sync(dev, &msg);
}

static int vduse_dev_update_iotlb(struct vduse_dev *dev,
				  u64 start, u64 last)
{
	struct vduse_dev_msg msg = { 0 };

	if (last < start)
		return -EINVAL;

	msg.req.type = VDUSE_UPDATE_IOTLB;
	msg.req.iova.start = start;
	msg.req.iova.last = last;

	return vduse_dev_msg_sync(dev, &msg);
}
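/*
 * The userspace daemon consumes pending requests by read()ing the device
 * fd: a dequeued request is parked on recv_list until the matching
 * response arrives via write().
 */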
static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct vduse_dev *dev = file->private_data;
	struct vduse_dev_msg *msg;
	int size = sizeof(struct vduse_dev_request);
	ssize_t ret;

	if (iov_iter_count(to) < size)
		return -EINVAL;

	spin_lock(&dev->msg_lock);
	while (1) {
		msg = vduse_dequeue_msg(&dev->send_list);
		if (msg)
			break;

		ret = -EAGAIN;
		if (file->f_flags & O_NONBLOCK)
			goto unlock;

		spin_unlock(&dev->msg_lock);
		ret = wait_event_interruptible_exclusive(dev->waitq,
					!list_empty(&dev->send_list));
		if (ret)
			return ret;

		spin_lock(&dev->msg_lock);
	}
	spin_unlock(&dev->msg_lock);
	ret = copy_to_iter(&msg->req, size, to);
	spin_lock(&dev->msg_lock);
	if (ret != size) {
		ret = -EFAULT;
		vduse_enqueue_msg(&dev->send_list, msg);
		goto unlock;
	}
	vduse_enqueue_msg(&dev->recv_list, msg);
unlock:
	spin_unlock(&dev->msg_lock);

	return ret;
}
static bool is_mem_zero(const char *ptr, int size)
{
	int i;

	for (i = 0; i < size; i++) {
		if (ptr[i])
			return false;
	}
	return true;
}
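/*
 * Complete an in-flight request: match the response against recv_list by
 * request_id, copy it into the waiting message and wake up the sender.
 */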
static ssize_t vduse_dev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct vduse_dev *dev = file->private_data;
	struct vduse_dev_response resp;
	struct vduse_dev_msg *msg;
	size_t ret;

	ret = copy_from_iter(&resp, sizeof(resp), from);
	if (ret != sizeof(resp))
		return -EINVAL;

	if (!is_mem_zero((const char *)resp.reserved, sizeof(resp.reserved)))
		return -EINVAL;

	spin_lock(&dev->msg_lock);
	msg = vduse_find_msg(&dev->recv_list, resp.request_id);
	if (!msg) {
		ret = -ENOENT;
		goto unlock;
	}

	memcpy(&msg->resp, &resp, sizeof(resp));
	msg->completed = 1;
	wake_up(&msg->waitq);
unlock:
	spin_unlock(&dev->msg_lock);

	return ret;
}
static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
{
	struct vduse_dev *dev = file->private_data;
	__poll_t mask = 0;

	poll_wait(file, &dev->waitq, wait);

	spin_lock(&dev->msg_lock);

	if (unlikely(dev->broken))
		mask |= EPOLLERR;
	if (!list_empty(&dev->send_list))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (!list_empty(&dev->recv_list))
		mask |= EPOLLOUT | EPOLLWRNORM;

	spin_unlock(&dev->msg_lock);

	return mask;
}
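/*
 * Bring the device back to its initial state on virtio reset: drop the
 * bounce mappings, forget all callbacks and eventfds, and flush any
 * interrupt-injection work that may still be queued.
 */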
static void vduse_dev_reset(struct vduse_dev *dev)
{
	int i;
	struct vduse_iova_domain *domain = dev->domain;

	/* The coherent mappings are handled in vduse_dev_free_coherent() */
	if (domain->bounce_map)
		vduse_domain_reset_bounce_map(domain);

	dev->status = 0;
	dev->driver_features = 0;
	dev->generation++;
	spin_lock(&dev->irq_lock);
	dev->config_cb.callback = NULL;
	dev->config_cb.private = NULL;
	spin_unlock(&dev->irq_lock);
	flush_work(&dev->inject);

	for (i = 0; i < dev->vq_num; i++) {
		struct vduse_virtqueue *vq = &dev->vqs[i];

		vq->ready = false;
		vq->desc_addr = 0;
		vq->driver_addr = 0;
		vq->device_addr = 0;
		vq->num = 0;
		memset(&vq->state, 0, sizeof(vq->state));

		spin_lock(&vq->kick_lock);
		vq->kicked = false;
		if (vq->kickfd)
			eventfd_ctx_put(vq->kickfd);
		vq->kickfd = NULL;
		spin_unlock(&vq->kick_lock);

		spin_lock(&vq->irq_lock);
		vq->cb.callback = NULL;
		vq->cb.private = NULL;
		spin_unlock(&vq->irq_lock);
		flush_work(&vq->inject);
		flush_work(&vq->kick);
	}
}
static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
				u64 desc_area, u64 driver_area,
				u64 device_area)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	vq->desc_addr = desc_area;
	vq->driver_addr = driver_area;
	vq->device_addr = device_area;

	return 0;
}
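/*
 * Relay a driver kick to userspace through the per-queue eventfd. If no
 * eventfd is installed yet, remember the kick and replay it once one is
 * set up in vduse_kickfd_setup().
 */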
static void vduse_vq_kick(struct vduse_virtqueue *vq)
{
	spin_lock(&vq->kick_lock);
	if (!vq->ready)
		goto unlock;

	if (vq->kickfd)
		eventfd_signal(vq->kickfd, 1);
	else
		vq->kicked = true;
unlock:
	spin_unlock(&vq->kick_lock);
}

static void vduse_vq_kick_work(struct work_struct *work)
{
	struct vduse_virtqueue *vq = container_of(work,
					struct vduse_virtqueue, kick);

	vduse_vq_kick(vq);
}

static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	/*
	 * Defer to a workqueue when we are already inside an eventfd
	 * wakeup path, to avoid recursive eventfd signalling.
	 */
	if (eventfd_signal_count()) {
		schedule_work(&vq->kick);
		return;
	}
	vduse_vq_kick(vq);
}
static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
				struct vdpa_callback *cb)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	spin_lock(&vq->irq_lock);
	vq->cb.callback = cb->callback;
	vq->cb.private = cb->private;
	spin_unlock(&vq->irq_lock);
}

static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	vq->num = num;
}

static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
					u16 idx, bool ready)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	vq->ready = ready;
}

static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	return vq->ready;
}

static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
				const struct vdpa_vq_state *state)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
		vq->state.packed.last_avail_counter =
			state->packed.last_avail_counter;
		vq->state.packed.last_avail_idx = state->packed.last_avail_idx;
		vq->state.packed.last_used_counter =
			state->packed.last_used_counter;
		vq->state.packed.last_used_idx = state->packed.last_used_idx;
	} else
		vq->state.split.avail_index = state->split.avail_index;

	return 0;
}

static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
				struct vdpa_vq_state *state)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))
		return vduse_dev_get_vq_state_packed(dev, vq, &state->packed);

	return vduse_dev_get_vq_state_split(dev, vq, &state->split);
}
static u32 vduse_vdpa_get_vq_align(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->vq_align;
}

static u64 vduse_vdpa_get_features(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->device_features;
}

static int vduse_vdpa_set_features(struct vdpa_device *vdpa, u64 features)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	dev->driver_features = features;
	return 0;
}

static void vduse_vdpa_set_config_cb(struct vdpa_device *vdpa,
				struct vdpa_callback *cb)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	spin_lock(&dev->irq_lock);
	dev->config_cb.callback = cb->callback;
	dev->config_cb.private = cb->private;
	spin_unlock(&dev->irq_lock);
}

static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	int i;
	u16 num_max = 0;

	for (i = 0; i < dev->vq_num; i++)
		if (num_max < dev->vqs[i].num_max)
			num_max = dev->vqs[i].num_max;

	return num_max;
}

static u32 vduse_vdpa_get_device_id(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->device_id;
}

static u32 vduse_vdpa_get_vendor_id(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->vendor_id;
}

static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->status;
}

static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	if (vduse_dev_set_status(dev, status))
		return;

	dev->status = status;
}

static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->config_size;
}
static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset,
				  void *buf, unsigned int len)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	/* Check the offset first to avoid unsigned underflow below */
	if (offset > dev->config_size ||
	    len > dev->config_size - offset)
		return;

	memcpy(buf, dev->config + offset, len);
}

static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
				  const void *buf, unsigned int len)
{
	/* Now we only support read-only configuration space */
}
static int vduse_vdpa_reset(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	if (vduse_dev_set_status(dev, 0))
		return -EIO;

	vduse_dev_reset(dev);

	return 0;
}

static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->generation;
}

static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
			      struct vhost_iotlb *iotlb)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	int ret;

	ret = vduse_domain_set_map(dev->domain, iotlb);
	if (ret)
		return ret;

	ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
	if (ret) {
		vduse_domain_clear_map(dev->domain, iotlb);
		return ret;
	}

	return 0;
}

static void vduse_vdpa_free(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	dev->vdev = NULL;
}
static const struct vdpa_config_ops vduse_vdpa_config_ops = {
	.set_vq_address		= vduse_vdpa_set_vq_address,
	.kick_vq		= vduse_vdpa_kick_vq,
	.set_vq_cb		= vduse_vdpa_set_vq_cb,
	.set_vq_num		= vduse_vdpa_set_vq_num,
	.set_vq_ready		= vduse_vdpa_set_vq_ready,
	.get_vq_ready		= vduse_vdpa_get_vq_ready,
	.set_vq_state		= vduse_vdpa_set_vq_state,
	.get_vq_state		= vduse_vdpa_get_vq_state,
	.get_vq_align		= vduse_vdpa_get_vq_align,
	.get_features		= vduse_vdpa_get_features,
	.set_features		= vduse_vdpa_set_features,
	.set_config_cb		= vduse_vdpa_set_config_cb,
	.get_vq_num_max		= vduse_vdpa_get_vq_num_max,
	.get_device_id		= vduse_vdpa_get_device_id,
	.get_vendor_id		= vduse_vdpa_get_vendor_id,
	.get_status		= vduse_vdpa_get_status,
	.set_status		= vduse_vdpa_set_status,
	.get_config_size	= vduse_vdpa_get_config_size,
	.get_config		= vduse_vdpa_get_config,
	.set_config		= vduse_vdpa_set_config,
	.get_generation		= vduse_vdpa_get_generation,
	.reset			= vduse_vdpa_reset,
	.set_map		= vduse_vdpa_set_map,
	.free			= vduse_vdpa_free,
};
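/*
 * vDPA devices created through VDUSE are backed by a vduse_iova_domain:
 * the DMA operations below redirect page mappings and coherent
 * allocations into that domain so the userspace daemon can access the
 * buffers through its bounce pages.
 */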
static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page,
				     unsigned long offset, size_t size,
				     enum dma_data_direction dir,
				     unsigned long attrs)
{
	struct vduse_dev *vdev = dev_to_vduse(dev);
	struct vduse_iova_domain *domain = vdev->domain;

	return vduse_domain_map_page(domain, page, offset, size, dir, attrs);
}

static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr,
				size_t size, enum dma_data_direction dir,
				unsigned long attrs)
{
	struct vduse_dev *vdev = dev_to_vduse(dev);
	struct vduse_iova_domain *domain = vdev->domain;

	return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
}

static void *vduse_dev_alloc_coherent(struct device *dev, size_t size,
				      dma_addr_t *dma_addr, gfp_t flag,
				      unsigned long attrs)
{
	struct vduse_dev *vdev = dev_to_vduse(dev);
	struct vduse_iova_domain *domain = vdev->domain;
	unsigned long iova;
	void *addr;

	*dma_addr = DMA_MAPPING_ERROR;
	addr = vduse_domain_alloc_coherent(domain, size,
				(dma_addr_t *)&iova, flag, attrs);
	if (!addr)
		return NULL;

	*dma_addr = (dma_addr_t)iova;

	return addr;
}

static void vduse_dev_free_coherent(struct device *dev, size_t size,
				    void *vaddr, dma_addr_t dma_addr,
				    unsigned long attrs)
{
	struct vduse_dev *vdev = dev_to_vduse(dev);
	struct vduse_iova_domain *domain = vdev->domain;

	vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs);
}

static size_t vduse_dev_max_mapping_size(struct device *dev)
{
	struct vduse_dev *vdev = dev_to_vduse(dev);
	struct vduse_iova_domain *domain = vdev->domain;

	return domain->bounce_size;
}

static const struct dma_map_ops vduse_dev_dma_ops = {
	.map_page = vduse_dev_map_page,
	.unmap_page = vduse_dev_unmap_page,
	.alloc = vduse_dev_alloc_coherent,
	.free = vduse_dev_free_coherent,
	.max_mapping_size = vduse_dev_max_mapping_size,
};
static unsigned int perm_to_file_flags(u8 perm)
{
	unsigned int flags = 0;

	switch (perm) {
	case VDUSE_ACCESS_WO:
		flags |= O_WRONLY;
		break;
	case VDUSE_ACCESS_RO:
		flags |= O_RDONLY;
		break;
	case VDUSE_ACCESS_RW:
		flags |= O_RDWR;
		break;
	default:
		WARN(1, "invalid vhost IOTLB permission\n");
		break;
	}

	return flags;
}
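/*
 * Install (or tear down, via VDUSE_EVENTFD_DEASSIGN) the eventfd that
 * userspace polls for virtqueue kicks. A kick that arrived while no
 * eventfd was assigned is replayed here.
 */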
static int vduse_kickfd_setup(struct vduse_dev *dev,
			struct vduse_vq_eventfd *eventfd)
{
	struct eventfd_ctx *ctx = NULL;
	struct vduse_virtqueue *vq;
	u32 index;

	if (eventfd->index >= dev->vq_num)
		return -EINVAL;

	index = array_index_nospec(eventfd->index, dev->vq_num);
	vq = &dev->vqs[index];
	if (eventfd->fd >= 0) {
		ctx = eventfd_ctx_fdget(eventfd->fd);
		if (IS_ERR(ctx))
			return PTR_ERR(ctx);
	} else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN)
		return 0;

	spin_lock(&vq->kick_lock);
	if (vq->kickfd)
		eventfd_ctx_put(vq->kickfd);
	vq->kickfd = ctx;
	if (vq->ready && vq->kicked && vq->kickfd) {
		eventfd_signal(vq->kickfd, 1);
		vq->kicked = false;
	}
	spin_unlock(&vq->kick_lock);

	return 0;
}
static bool vduse_dev_is_ready(struct vduse_dev *dev)
{
	int i;

	for (i = 0; i < dev->vq_num; i++)
		if (!dev->vqs[i].num_max)
			return false;

	return true;
}
static void vduse_dev_irq_inject(struct work_struct *work)
{
	struct vduse_dev *dev = container_of(work, struct vduse_dev, inject);

	spin_lock_irq(&dev->irq_lock);
	if (dev->config_cb.callback)
		dev->config_cb.callback(dev->config_cb.private);
	spin_unlock_irq(&dev->irq_lock);
}

static void vduse_vq_irq_inject(struct work_struct *work)
{
	struct vduse_virtqueue *vq = container_of(work,
					struct vduse_virtqueue, inject);

	spin_lock_irq(&vq->irq_lock);
	if (vq->ready && vq->cb.callback)
		vq->cb.callback(vq->cb.private);
	spin_unlock_irq(&vq->irq_lock);
}
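/*
 * Per-device ioctls issued by the userspace daemon on /dev/vduse/$NAME:
 * exporting IOTLB regions as file descriptors, updating the config
 * space, wiring up kick eventfds and injecting interrupts.
 */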
static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
			unsigned long arg)
{
	struct vduse_dev *dev = file->private_data;
	void __user *argp = (void __user *)arg;
	int ret;

	if (unlikely(dev->broken))
		return -EPERM;

	switch (cmd) {
	case VDUSE_IOTLB_GET_FD: {
		struct vduse_iotlb_entry entry;
		struct vhost_iotlb_map *map;
		struct vdpa_map_file *map_file;
		struct vduse_iova_domain *domain = dev->domain;
		struct file *f = NULL;

		ret = -EFAULT;
		if (copy_from_user(&entry, argp, sizeof(entry)))
			break;

		ret = -EINVAL;
		if (entry.start > entry.last)
			break;

		spin_lock(&domain->iotlb_lock);
		map = vhost_iotlb_itree_first(domain->iotlb,
					      entry.start, entry.last);
		if (map) {
			map_file = (struct vdpa_map_file *)map->opaque;
			f = get_file(map_file->file);
			entry.offset = map_file->offset;
			entry.start = map->start;
			entry.last = map->last;
			entry.perm = map->perm;
		}
		spin_unlock(&domain->iotlb_lock);
		ret = -EINVAL;
		if (!f)
			break;

		ret = -EFAULT;
		if (copy_to_user(argp, &entry, sizeof(entry))) {
			fput(f);
			break;
		}
		ret = receive_fd(f, perm_to_file_flags(entry.perm));
		fput(f);
		break;
	}
	case VDUSE_DEV_GET_FEATURES:
		/*
		 * Just mirror what the driver wrote here.
		 * The driver is expected to check FEATURE_OK later.
		 */
		ret = put_user(dev->driver_features, (u64 __user *)argp);
		break;
	case VDUSE_DEV_SET_CONFIG: {
		struct vduse_config_data config;
		unsigned long size = offsetof(struct vduse_config_data,
					      buffer);

		ret = -EFAULT;
		if (copy_from_user(&config, argp, size))
			break;

		ret = -EINVAL;
		/* Check the offset first to avoid unsigned underflow below */
		if (config.offset > dev->config_size ||
		    config.length == 0 ||
		    config.length > dev->config_size - config.offset)
			break;

		ret = -EFAULT;
		if (copy_from_user(dev->config + config.offset, argp + size,
				   config.length))
			break;

		ret = 0;
		break;
	}
	case VDUSE_DEV_INJECT_CONFIG_IRQ:
		ret = 0;
		queue_work(vduse_irq_wq, &dev->inject);
		break;
	case VDUSE_VQ_SETUP: {
		struct vduse_vq_config config;
		u32 index;

		ret = -EFAULT;
		if (copy_from_user(&config, argp, sizeof(config)))
			break;

		ret = -EINVAL;
		if (config.index >= dev->vq_num)
			break;

		if (!is_mem_zero((const char *)config.reserved,
				 sizeof(config.reserved)))
			break;

		index = array_index_nospec(config.index, dev->vq_num);
		dev->vqs[index].num_max = config.max_size;
		ret = 0;
		break;
	}
	case VDUSE_VQ_GET_INFO: {
		struct vduse_vq_info vq_info;
		struct vduse_virtqueue *vq;
		u32 index;

		ret = -EFAULT;
		if (copy_from_user(&vq_info, argp, sizeof(vq_info)))
			break;

		ret = -EINVAL;
		if (vq_info.index >= dev->vq_num)
			break;

		index = array_index_nospec(vq_info.index, dev->vq_num);
		vq = &dev->vqs[index];
		vq_info.desc_addr = vq->desc_addr;
		vq_info.driver_addr = vq->driver_addr;
		vq_info.device_addr = vq->device_addr;
		vq_info.num = vq->num;

		if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
			vq_info.packed.last_avail_counter =
				vq->state.packed.last_avail_counter;
			vq_info.packed.last_avail_idx =
				vq->state.packed.last_avail_idx;
			vq_info.packed.last_used_counter =
				vq->state.packed.last_used_counter;
			vq_info.packed.last_used_idx =
				vq->state.packed.last_used_idx;
		} else
			vq_info.split.avail_index =
				vq->state.split.avail_index;

		vq_info.ready = vq->ready;

		ret = -EFAULT;
		if (copy_to_user(argp, &vq_info, sizeof(vq_info)))
			break;

		ret = 0;
		break;
	}
	case VDUSE_VQ_SETUP_KICKFD: {
		struct vduse_vq_eventfd eventfd;

		ret = -EFAULT;
		if (copy_from_user(&eventfd, argp, sizeof(eventfd)))
			break;

		ret = vduse_kickfd_setup(dev, &eventfd);
		break;
	}
	case VDUSE_VQ_INJECT_IRQ: {
		u32 index;

		ret = -EFAULT;
		if (get_user(index, (u32 __user *)argp))
			break;

		ret = -EINVAL;
		if (index >= dev->vq_num)
			break;

		ret = 0;
		index = array_index_nospec(index, dev->vq_num);
		queue_work(vduse_irq_wq, &dev->vqs[index].inject);
		break;
	}
	default:
		ret = -ENOIOCTLCMD;
		break;
	}

	return ret;
}
static int vduse_dev_release(struct inode *inode, struct file *file)
{
	struct vduse_dev *dev = file->private_data;

	spin_lock(&dev->msg_lock);
	/* Make sure the inflight messages can be processed after reconnection */
	list_splice_init(&dev->recv_list, &dev->send_list);
	spin_unlock(&dev->msg_lock);
	dev->connected = false;

	return 0;
}
static struct vduse_dev *vduse_dev_get_from_minor(int minor)
{
	struct vduse_dev *dev;

	mutex_lock(&vduse_lock);
	dev = idr_find(&vduse_idr, minor);
	mutex_unlock(&vduse_lock);

	return dev;
}
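/* Only one userspace connection per VDUSE device is allowed at a time. */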
static int vduse_dev_open(struct inode *inode, struct file *file)
{
	int ret;
	struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode));

	if (!dev)
		return -ENODEV;

	ret = -EBUSY;
	mutex_lock(&dev->lock);
	if (dev->connected)
		goto unlock;

	ret = 0;
	dev->connected = true;
	file->private_data = dev;
unlock:
	mutex_unlock(&dev->lock);

	return ret;
}
static const struct file_operations vduse_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= vduse_dev_open,
	.release	= vduse_dev_release,
	.read_iter	= vduse_dev_read_iter,
	.write_iter	= vduse_dev_write_iter,
	.poll		= vduse_dev_poll,
	.unlocked_ioctl	= vduse_dev_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};
static struct vduse_dev *vduse_dev_create(void)
{
	struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);

	if (!dev)
		return NULL;

	mutex_init(&dev->lock);
	spin_lock_init(&dev->msg_lock);
	INIT_LIST_HEAD(&dev->send_list);
	INIT_LIST_HEAD(&dev->recv_list);
	spin_lock_init(&dev->irq_lock);

	INIT_WORK(&dev->inject, vduse_dev_irq_inject);
	init_waitqueue_head(&dev->waitq);

	return dev;
}

static void vduse_dev_destroy(struct vduse_dev *dev)
{
	kfree(dev);
}
static struct vduse_dev *vduse_find_dev(const char *name)
{
	struct vduse_dev *dev;
	int id;

	idr_for_each_entry(&vduse_idr, dev, id)
		if (!strcmp(dev->name, name))
			return dev;

	return NULL;
}

static int vduse_destroy_dev(char *name)
{
	struct vduse_dev *dev = vduse_find_dev(name);

	if (!dev)
		return -EINVAL;

	mutex_lock(&dev->lock);
	if (dev->vdev || dev->connected) {
		mutex_unlock(&dev->lock);
		return -EBUSY;
	}
	/* Prevent a new connection while the device is going away */
	dev->connected = true;
	mutex_unlock(&dev->lock);

	vduse_dev_reset(dev);
	device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
	idr_remove(&vduse_idr, dev->minor);
	kvfree(dev->config);
	kfree(dev->vqs);
	vduse_domain_destroy(dev->domain);
	kfree(dev->name);
	vduse_dev_destroy(dev);
	module_put(THIS_MODULE);

	return 0;
}
static bool device_is_allowed(u32 device_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(allowed_device_id); i++)
		if (allowed_device_id[i] == device_id)
			return true;

	return false;
}

static bool features_is_valid(u64 features)
{
	if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM)))
		return false;

	/* Now we only support read-only configuration space */
	if (features & (1ULL << VIRTIO_BLK_F_CONFIG_WCE))
		return false;

	return true;
}

static bool vduse_validate_config(struct vduse_dev_config *config)
{
	if (!is_mem_zero((const char *)config->reserved,
			 sizeof(config->reserved)))
		return false;

	if (config->vq_align > PAGE_SIZE)
		return false;

	if (config->config_size > PAGE_SIZE)
		return false;

	if (!device_is_allowed(config->device_id))
		return false;

	if (!features_is_valid(config->features))
		return false;

	return true;
}
static ssize_t msg_timeout_show(struct device *device,
				struct device_attribute *attr, char *buf)
{
	struct vduse_dev *dev = dev_get_drvdata(device);

	return sysfs_emit(buf, "%u\n", dev->msg_timeout);
}

static ssize_t msg_timeout_store(struct device *device,
				 struct device_attribute *attr,
				 const char *buf, size_t count)
{
	struct vduse_dev *dev = dev_get_drvdata(device);
	int ret;

	ret = kstrtouint(buf, 10, &dev->msg_timeout);
	if (ret < 0)
		return ret;

	return count;
}

static DEVICE_ATTR_RW(msg_timeout);

static struct attribute *vduse_dev_attrs[] = {
	&dev_attr_msg_timeout.attr,
	NULL
};

ATTRIBUTE_GROUPS(vduse_dev);
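/*
 * Instantiate a VDUSE device: allocate the IOVA domain and virtqueue
 * array, register a minor in the IDR and create the /dev/vduse/$NAME
 * char device for the daemon to attach to.
 */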
static int vduse_create_dev(struct vduse_dev_config *config,
			    void *config_buf, u64 api_version)
{
	int i, ret;
	struct vduse_dev *dev;

	ret = -EEXIST;
	if (vduse_find_dev(config->name))
		goto err;

	ret = -ENOMEM;
	dev = vduse_dev_create();
	if (!dev)
		goto err;

	dev->api_version = api_version;
	dev->device_features = config->features;
	dev->device_id = config->device_id;
	dev->vendor_id = config->vendor_id;
	dev->name = kstrdup(config->name, GFP_KERNEL);
	if (!dev->name)
		goto err_str;

	dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
					  VDUSE_BOUNCE_SIZE);
	if (!dev->domain)
		goto err_domain;

	dev->config = config_buf;
	dev->config_size = config->config_size;
	dev->vq_align = config->vq_align;
	dev->vq_num = config->vq_num;
	dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
	if (!dev->vqs)
		goto err_vqs;

	for (i = 0; i < dev->vq_num; i++) {
		dev->vqs[i].index = i;
		INIT_WORK(&dev->vqs[i].inject, vduse_vq_irq_inject);
		INIT_WORK(&dev->vqs[i].kick, vduse_vq_kick_work);
		spin_lock_init(&dev->vqs[i].kick_lock);
		spin_lock_init(&dev->vqs[i].irq_lock);
	}

	ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
	if (ret < 0)
		goto err_idr;

	dev->minor = ret;
	dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT;
	dev->dev = device_create(vduse_class, NULL,
				 MKDEV(MAJOR(vduse_major), dev->minor),
				 dev, "%s", config->name);
	if (IS_ERR(dev->dev)) {
		ret = PTR_ERR(dev->dev);
		goto err_dev;
	}
	__module_get(THIS_MODULE);

	return 0;
err_dev:
	idr_remove(&vduse_idr, dev->minor);
err_idr:
	kfree(dev->vqs);
err_vqs:
	vduse_domain_destroy(dev->domain);
err_domain:
	kfree(dev->name);
err_str:
	vduse_dev_destroy(dev);
err:
	return ret;
}
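/*
 * ioctls on /dev/vduse/control: API version negotiation plus device
 * creation and destruction, all serialized by vduse_lock.
 */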
static long vduse_ioctl(struct file *file, unsigned int cmd,
			unsigned long arg)
{
	int ret;
	void __user *argp = (void __user *)arg;
	struct vduse_control *control = file->private_data;

	mutex_lock(&vduse_lock);
	switch (cmd) {
	case VDUSE_GET_API_VERSION:
		ret = put_user(control->api_version, (u64 __user *)argp);
		break;
	case VDUSE_SET_API_VERSION: {
		u64 api_version;

		ret = -EFAULT;
		if (get_user(api_version, (u64 __user *)argp))
			break;

		ret = -EINVAL;
		if (api_version > VDUSE_API_VERSION)
			break;

		ret = 0;
		control->api_version = api_version;
		break;
	}
	case VDUSE_CREATE_DEV: {
		struct vduse_dev_config config;
		unsigned long size = offsetof(struct vduse_dev_config, config);
		void *buf;

		ret = -EFAULT;
		if (copy_from_user(&config, argp, size))
			break;

		ret = -EINVAL;
		if (!vduse_validate_config(&config))
			break;

		buf = vmemdup_user(argp + size, config.config_size);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			break;
		}
		config.name[VDUSE_NAME_MAX - 1] = '\0';
		ret = vduse_create_dev(&config, buf, control->api_version);
		break;
	}
	case VDUSE_DESTROY_DEV: {
		char name[VDUSE_NAME_MAX];

		ret = -EFAULT;
		if (copy_from_user(name, argp, VDUSE_NAME_MAX))
			break;

		name[VDUSE_NAME_MAX - 1] = '\0';
		ret = vduse_destroy_dev(name);
		break;
	}
	default:
		ret = -EINVAL;
		break;
	}
	mutex_unlock(&vduse_lock);

	return ret;
}
static int vduse_release(struct inode *inode, struct file *file)
{
	struct vduse_control *control = file->private_data;

	kfree(control);
	return 0;
}

static int vduse_open(struct inode *inode, struct file *file)
{
	struct vduse_control *control;

	control = kmalloc(sizeof(struct vduse_control), GFP_KERNEL);
	if (!control)
		return -ENOMEM;

	control->api_version = VDUSE_API_VERSION;
	file->private_data = control;

	return 0;
}

static const struct file_operations vduse_ctrl_fops = {
	.owner		= THIS_MODULE,
	.open		= vduse_open,
	.release	= vduse_release,
	.unlocked_ioctl	= vduse_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};
static char *vduse_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev));
}

static void vduse_mgmtdev_release(struct device *dev)
{
}

static struct device vduse_mgmtdev = {
	.init_name = "vduse",
	.release = vduse_mgmtdev_release,
};

static struct vdpa_mgmt_dev mgmt_dev;
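/*
 * Attach a vDPA device to an existing VDUSE device: allocate the vdpa
 * structure, give it a 64-bit DMA mask and point its DMA operations at
 * the VDUSE IOVA domain.
 */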
static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
{
	struct vduse_vdpa *vdev;
	int ret;

	if (dev->vdev)
		return -EEXIST;

	vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
				 &vduse_vdpa_config_ops, name, true);
	if (IS_ERR(vdev))
		return PTR_ERR(vdev);

	dev->vdev = vdev;
	vdev->dev = dev;
	vdev->vdpa.dev.dma_mask = &vdev->vdpa.dev.coherent_dma_mask;
	ret = dma_set_mask_and_coherent(&vdev->vdpa.dev, DMA_BIT_MASK(64));
	if (ret) {
		put_device(&vdev->vdpa.dev);
		return ret;
	}
	set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops);
	vdev->vdpa.dma_dev = &vdev->vdpa.dev;
	vdev->vdpa.mdev = &mgmt_dev;

	return 0;
}
static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name)
{
	struct vduse_dev *dev;
	int ret;

	mutex_lock(&vduse_lock);
	dev = vduse_find_dev(name);
	if (!dev || !vduse_dev_is_ready(dev)) {
		mutex_unlock(&vduse_lock);
		return -EINVAL;
	}
	ret = vduse_dev_init_vdpa(dev, name);
	mutex_unlock(&vduse_lock);
	if (ret)
		return ret;

	ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
	if (ret) {
		put_device(&dev->vdev->vdpa.dev);
		return ret;
	}

	return 0;
}

static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
{
	_vdpa_unregister_device(dev);
}

static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = {
	.dev_add = vdpa_dev_add,
	.dev_del = vdpa_dev_del,
};

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

static struct vdpa_mgmt_dev mgmt_dev = {
	.device = &vduse_mgmtdev,
	.id_table = id_table,
	.ops = &vdpa_dev_mgmtdev_ops,
};
static int vduse_mgmtdev_init(void)
{
	int ret;

	ret = device_register(&vduse_mgmtdev);
	if (ret)
		return ret;

	ret = vdpa_mgmtdev_register(&mgmt_dev);
	if (ret)
		goto err;

	return 0;
err:
	device_unregister(&vduse_mgmtdev);
	return ret;
}

static void vduse_mgmtdev_exit(void)
{
	vdpa_mgmtdev_unregister(&mgmt_dev);
	device_unregister(&vduse_mgmtdev);
}
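/*
 * Module init: one control char device for device lifecycle management
 * plus VDUSE_DEV_MAX - 1 per-device minors, a high-priority workqueue
 * for interrupt injection, and the vdpa management device.
 */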
static int vduse_init(void)
{
	int ret;
	struct device *dev;

	vduse_class = class_create(THIS_MODULE, "vduse");
	if (IS_ERR(vduse_class))
		return PTR_ERR(vduse_class);

	vduse_class->devnode = vduse_devnode;
	vduse_class->dev_groups = vduse_dev_groups;

	ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse");
	if (ret)
		goto err_chardev_region;

	/* /dev/vduse/control */
	cdev_init(&vduse_ctrl_cdev, &vduse_ctrl_fops);
	vduse_ctrl_cdev.owner = THIS_MODULE;
	ret = cdev_add(&vduse_ctrl_cdev, vduse_major, 1);
	if (ret)
		goto err_ctrl_cdev;

	dev = device_create(vduse_class, NULL, vduse_major, NULL, "control");
	if (IS_ERR(dev)) {
		ret = PTR_ERR(dev);
		goto err_device;
	}

	/* /dev/vduse/$DEVICE */
	cdev_init(&vduse_cdev, &vduse_dev_fops);
	vduse_cdev.owner = THIS_MODULE;
	ret = cdev_add(&vduse_cdev, MKDEV(MAJOR(vduse_major), 1),
		       VDUSE_DEV_MAX - 1);
	if (ret)
		goto err_cdev;

	vduse_irq_wq = alloc_workqueue("vduse-irq",
				WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0);
	if (!vduse_irq_wq) {
		ret = -ENOMEM;
		goto err_wq;
	}

	ret = vduse_domain_init();
	if (ret)
		goto err_domain;

	ret = vduse_mgmtdev_init();
	if (ret)
		goto err_mgmtdev;

	return 0;
err_mgmtdev:
	vduse_domain_exit();
err_domain:
	destroy_workqueue(vduse_irq_wq);
err_wq:
	cdev_del(&vduse_cdev);
err_cdev:
	device_destroy(vduse_class, vduse_major);
err_device:
	cdev_del(&vduse_ctrl_cdev);
err_ctrl_cdev:
	unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
err_chardev_region:
	class_destroy(vduse_class);
	return ret;
}
module_init(vduse_init);
static void vduse_exit(void)
{
	vduse_mgmtdev_exit();
	vduse_domain_exit();
	destroy_workqueue(vduse_irq_wq);
	cdev_del(&vduse_cdev);
	device_destroy(vduse_class, vduse_major);
	cdev_del(&vduse_ctrl_cdev);
	unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
	class_destroy(vduse_class);
}
module_exit(vduse_exit);

MODULE_LICENSE(DRV_LICENSE);
MODULE_AUTHOR(DRV_AUTHOR);
MODULE_DESCRIPTION(DRV_DESC);