// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */
#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"
static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct idr			group_idr;
	struct mutex			group_lock;
	struct cdev			group_cdev;
	dev_t				group_devt;
	wait_queue_head_t		release_q;
} vfio;
struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};

struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};

struct vfio_unbound_dev {
	struct device			*dev;
	struct list_head		unbound_next;
};

struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct device			*dev;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
	struct list_head		unbound_list;
	struct mutex			unbound_lock;
	atomic_t			opened;
	wait_queue_head_t		container_q;
	bool				noiommu;
	unsigned int			dev_counter;
	struct kvm			*kvm;
	struct blocking_notifier_head	notifier;
};

struct vfio_device {
	struct kref			kref;
	struct device			*dev;
	const struct vfio_device_ops	*ops;
	struct vfio_group		*group;
	struct list_head		group_next;
	void				*device_data;
};
#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
/*
 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 * and remove functions; any use cases other than acquiring the first
 * reference for the purpose of calling vfio_add_group_dev() or removing
 * that symmetric reference after vfio_del_group_dev() should use the raw
 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 * removes the device from the dummy group and cannot be nested.
 */
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
	struct iommu_group *group;
	int __maybe_unused ret;

	group = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
	/*
	 * With noiommu enabled, an IOMMU group will be created for a device
	 * that doesn't already have one and doesn't have iommu_ops on its
	 * bus.  We set iommudata simply to be able to identify these groups
	 * as special use and for reclamation later.
	 */
	if (group || !noiommu || iommu_present(dev->bus))
		return group;

	group = iommu_group_alloc();
	if (IS_ERR(group))
		return NULL;

	iommu_group_set_name(group, "vfio-noiommu");
	iommu_group_set_iommudata(group, &noiommu, NULL);
	ret = iommu_group_add_device(group, dev);
	if (ret) {
		iommu_group_put(group);
		return NULL;
	}

	/*
	 * Where to taint?  At this point we've added an IOMMU group for a
	 * device that is not backed by iommu_ops, therefore any iommu_
	 * callback using iommu_ops can legitimately Oops.  So, while we may
	 * be about to give a DMA capable device to a user without IOMMU
	 * protection, which is clearly taint-worthy, let's go ahead and do
	 * it here.
	 */
	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

	return group;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	if (iommu_group_get_iommudata(group) == &noiommu)
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
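
/*
 * Example (illustrative sketch, not part of this file): a VFIO bus driver
 * is expected to hold exactly one vfio_iommu_group_get() reference across
 * the lifetime of the added device, paired across probe and remove.  The
 * driver name, ops and private data below are hypothetical placeholders.
 *
 *	static int my_vfio_probe(struct device *dev)
 *	{
 *		struct iommu_group *group;
 *		int ret;
 *
 *		group = vfio_iommu_group_get(dev);
 *		if (!group)
 *			return -EINVAL;
 *
 *		ret = vfio_add_group_dev(dev, &my_vfio_dev_ops, my_data);
 *		if (ret)
 *			vfio_iommu_group_put(group, dev);
 *		return ret;
 *	}
 *
 *	static void my_vfio_remove(struct device *dev)
 *	{
 *		void *data = vfio_del_group_dev(dev);
 *
 *		vfio_iommu_group_put(dev->iommu_group, dev);
 *		kfree(data);
 *	}
 */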
#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
#endif
/**
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
/**
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}
static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);
/**
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}
static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}
/**
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	INIT_LIST_HEAD(&group->unbound_list);
	mutex_init(&group->unbound_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	init_waitqueue_head(&group->container_q);
	group->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback.  Therefore, need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}

	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return ERR_CAST(dev);
	}

	group->minor = minor;
	group->dev = dev;

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);

	return group;
}
/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
	struct vfio_unbound_dev *unbound, *tmp;
	struct iommu_group *iommu_group = group->iommu_group;

	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->notifier.head);

	list_for_each_entry_safe(unbound, tmp,
				 &group->unbound_list, unbound_next) {
		list_del(&unbound->unbound_next);
		kfree(unbound);
	}

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	vfio_group_unlock_and_free(group);
	iommu_group_put(iommu_group);
}
static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}

struct vfio_group_put_work {
	struct work_struct work;
	struct vfio_group *group;
};

static void vfio_group_put_bg(struct work_struct *work)
{
	struct vfio_group_put_work *do_work;

	do_work = container_of(work, struct vfio_group_put_work, work);

	vfio_group_put(do_work->group);
	kfree(do_work);
}

static void vfio_group_schedule_put(struct vfio_group *group)
{
	struct vfio_group_put_work *do_work;

	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
	if (WARN_ON(!do_work))
		return;

	INIT_WORK(&do_work->work, vfio_group_put_bg);
	do_work->group = group;
	schedule_work(&do_work->work);
}
/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}

/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
	struct vfio_group *target = group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group == target) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}
static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}
static struct vfio_group *vfio_group_get_from_minor(int minor)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = idr_find(&vfio.group_idr, minor);
	if (!group) {
		mutex_unlock(&vfio.group_lock);
		return NULL;
	}
	vfio_group_get(group);
	mutex_unlock(&vfio.group_lock);

	return group;
}
static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return NULL;

	group = vfio_group_get_from_iommu(iommu_group);
	iommu_group_put(iommu_group);

	return group;
}
/**
 * Device objects - create, release, get, put, search
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
					     struct device *dev,
					     const struct vfio_device_ops *ops,
					     void *device_data)
{
	struct vfio_device *device;

	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	kref_init(&device->kref);
	device->dev = dev;
	/* Our reference on group is moved to the device */
	device->group = group;
	device->ops = ops;
	device->device_data = device_data;
	dev_set_drvdata(dev, device);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	group->dev_counter++;
	mutex_unlock(&group->device_lock);

	return device;
}
static void vfio_device_release(struct kref *kref)
{
	struct vfio_device *device = container_of(kref,
						  struct vfio_device, kref);
	struct vfio_group *group = device->group;

	list_del(&device->group_next);
	group->dev_counter--;
	mutex_unlock(&group->device_lock);

	dev_set_drvdata(device->dev, NULL);

	kfree(device);

	/* vfio_del_group_dev may be waiting for this device */
	wake_up(&vfio.release_q);
}
/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
	kref_get(&device->kref);
}
static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev) {
			vfio_device_get(device);
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);

	return NULL;
}
/*
 * Some drivers, like pci-stub, are only used to prevent other drivers from
 * claiming a device and are therefore perfectly legitimate for a user owned
 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 * of the device, but it does prevent the user from having direct access to
 * the device, which is useful in some circumstances.
 *
 * We also assume that we can include PCI interconnect devices, ie. bridges.
 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 * then all of the downstream devices will be part of the same IOMMU group as
 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 * breaks anything, it only does so for user owned devices downstream.  Note
 * that error notification via MSI can be affected for platforms that handle
 * MSI within the same IOVA space as DMA.
 */
static const char * const vfio_driver_allowed[] = { "pci-stub" };

static bool vfio_dev_driver_allowed(struct device *dev,
				    struct device_driver *drv)
{
	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
			return true;
	}

	return match_string(vfio_driver_allowed,
			    ARRAY_SIZE(vfio_driver_allowed),
			    drv->name) >= 0;
}
/*
 * A vfio group is viable for use by userspace if all devices are in
 * one of the following states:
 *  - driver-less
 *  - bound to a vfio driver
 *  - bound to an otherwise allowed driver
 *  - a PCI interconnect device
 *
 * We use two methods to determine whether a device is bound to a vfio
 * driver.  The first is to test whether the device exists in the vfio
 * group.  The second is to test if the device exists on the group
 * unbound_list, indicating it's in the middle of transitioning from
 * a vfio driver to driver-less.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
	struct vfio_group *group = data;
	struct vfio_device *device;
	struct device_driver *drv = READ_ONCE(dev->driver);
	struct vfio_unbound_dev *unbound;
	int ret = -EINVAL;

	mutex_lock(&group->unbound_lock);
	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
		if (dev == unbound->dev) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&group->unbound_lock);

	if (!ret || !drv || vfio_dev_driver_allowed(dev, drv))
		return 0;

	device = vfio_group_get_device(group, dev);
	if (device) {
		vfio_device_put(device);
		return 0;
	}

	return ret;
}
/*
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/* Do we already know about it?  We shouldn't */
	device = vfio_group_get_device(group, dev);
	if (WARN_ON_ONCE(device)) {
		vfio_device_put(device);
		return 0;
	}

	/* Nothing to do for idle groups */
	if (!atomic_read(&group->container_users))
		return 0;

	/* TODO Prevent device auto probing */
	dev_WARN(dev, "Device added to live group %d!\n",
		 iommu_group_id(group->iommu_group));

	return 0;
}

static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
	/* We don't care what happens when the group isn't in use */
	if (!atomic_read(&group->container_users))
		return 0;

	return vfio_dev_viable(dev, group);
}
static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;
	struct vfio_unbound_dev *unbound;

	/*
	 * Need to go through a group_lock lookup to get a reference or we
	 * risk racing a group being removed.  Ignore spurious notifies.
	 */
	group = vfio_group_try_get(group);
	if (!group)
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		/*
		 * Nothing to do here.  If the device is in use, then the
		 * vfio sub-driver should block the remove callback until
		 * it is unused.  If the device is unused or attached to a
		 * stub driver, then it should be released and we don't
		 * care that it will be going away.
		 */
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
			iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
			iommu_group_id(group->iommu_group), dev->driver->name);
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
			__func__, iommu_group_id(group->iommu_group),
			dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
			iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */

		mutex_lock(&group->unbound_lock);
		list_for_each_entry(unbound,
				    &group->unbound_list, unbound_next) {
			if (dev == unbound->dev) {
				list_del(&unbound->unbound_next);
				kfree(unbound);
				break;
			}
		}
		mutex_unlock(&group->unbound_lock);
		break;
	}

	/*
	 * If we're the last reference to the group, the group will be
	 * released, which includes unregistering the iommu group notifier.
	 * We hold a read-lock on that notifier list, unregistering needs
	 * a write-lock... deadlock.  Release our reference asynchronously
	 * to avoid that situation.
	 */
	vfio_group_schedule_put(group);
	return NOTIFY_OK;
}
/**
 * VFIO driver API
 */
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group) {
		group = vfio_create_group(iommu_group);
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	} else {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group.  A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_group);
	}

	device = vfio_group_get_device(group, dev);
	if (device) {
		dev_WARN(dev, "Device already exists on group %d\n",
			 iommu_group_id(iommu_group));
		vfio_device_put(device);
		vfio_group_put(group);
		return -EBUSY;
	}

	device = vfio_group_create_device(group, dev, ops, device_data);
	if (IS_ERR(device)) {
		vfio_group_put(group);
		return PTR_ERR(device);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
/**
 * Get a reference to the vfio_device for a device.  Even if the
 * caller thinks they own the device, they could be racing with a
 * release call path, so we can't trust drvdata for the shortcut.
 * Go the long way around, from the iommu_group to the vfio_group
 * to the vfio_device.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct vfio_group *group;
	struct vfio_device *device;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return NULL;

	device = vfio_group_get_device(group, dev);
	vfio_group_put(group);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = ERR_PTR(-ENODEV);

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		int ret;

		if (it->ops->match) {
			ret = it->ops->match(it->device_data, buf);
			if (ret < 0) {
				device = ERR_PTR(ret);
				break;
			}
		} else {
			ret = !strcmp(dev_name(it->dev), buf);
		}

		if (ret) {
			device = it;
			vfio_device_get(device);
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}
/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void *vfio_del_group_dev(struct device *dev)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	void *device_data = device->device_data;
	struct vfio_unbound_dev *unbound;
	unsigned int i = 0;
	bool interrupted = false;

	/*
	 * When the device is removed from the group, the group suddenly
	 * becomes non-viable; the device has a driver (until the unbind
	 * completes), but it's not present in the group.  This is bad news
	 * for any external users that need to re-acquire a group reference
	 * in order to match and release their existing reference.  To
	 * solve this, we track such devices on the unbound_list to bridge
	 * the gap until they're fully unbound.
	 */
	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
	if (unbound) {
		unbound->dev = dev;
		mutex_lock(&group->unbound_lock);
		list_add(&unbound->unbound_next, &group->unbound_list);
		mutex_unlock(&group->unbound_lock);
	}
	WARN_ON(!unbound);

	vfio_device_put(device);

	/*
	 * If the device is still present in the group after the above
	 * 'put', then it is in use and we need to request it from the
	 * bus driver.  The driver may in turn need to request the
	 * device from the user.  We send the request on an arbitrary
	 * interval with counter to allow the driver to take escalating
	 * measures to release the device if it has the ability to do so.
	 */
	add_wait_queue(&vfio.release_q, &wait);

	do {
		device = vfio_group_get_device(group, dev);
		if (!device)
			break;

		if (device->ops->request)
			device->ops->request(device_data, i++);

		vfio_device_put(device);

		if (interrupted) {
			wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
		} else {
			wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10);
			if (signal_pending(current)) {
				interrupted = true;
				dev_warn(dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}

	} while (1);

	remove_wait_queue(&vfio.release_q, &wait);
	/*
	 * In order to support multiple devices per group, devices can be
	 * plucked from the group while other devices in the group are still
	 * in use.  The container persists with this group and those remaining
	 * devices still attached.  If the user creates an isolation violation
	 * by binding this device to another driver while the group is still in
	 * use, that's their fault.  However, in the case of removing the last,
	 * or potentially the only, device in the group there can be no other
	 * in-use devices in the group.  The user has done their due diligence
	 * and we should lay no claims to those devices.  In order to do that,
	 * we need to make sure the group is detached from the container.
	 * Without this stall, we're potentially racing with a user process
	 * that may attempt to immediately bind this device to another driver.
	 */
	if (list_empty(&group->device_list))
		wait_event(group->container_q, !group->container);

	/* Matches the get in vfio_group_create_device() */
	vfio_group_put(group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);
/**
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

#ifdef CONFIG_VFIO_NOIOMMU
				if (!list_empty(&container->group_list) &&
				    (container->noiommu !=
				     (driver->ops == &vfio_noiommu_ops)))
					continue;
#endif

				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}
/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}
static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

#ifdef CONFIG_VFIO_NOIOMMU
		/*
		 * Only noiommu containers can use vfio-noiommu and noiommu
		 * containers can only use vfio-noiommu.
		 */
		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
			continue;
#endif

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}
static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);
	}

	return ret;
}
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver = container->iommu_driver;

	if (driver && driver->ops->notify)
		driver->ops->notify(container->iommu_data,
				    VFIO_IOMMU_CONTAINER_CLOSE);

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}
/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->read))
		ret = driver->ops->read(container->iommu_data,
					buf, count, ppos);

	return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->write))
		ret = driver->ops->write(container->iommu_data,
					 buf, count, ppos);

	return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	int ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->mmap))
		ret = driver->ops->mmap(container->iommu_data, vma);

	return ret;
}
static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_fops_mmap,
};
/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	wake_up(&group->container_q);
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}
/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}
static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != group->noiommu) {
		ret = -EPERM;
		goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	container->noiommu = group->noiommu;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}
static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}

static int vfio_group_add_container_user(struct vfio_group *group)
{
	if (!atomic_inc_not_zero(&group->container_users))
		return -EINVAL;

	if (group->noiommu) {
		atomic_dec(&group->container_users);
		return -EPERM;
	}
	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
		atomic_dec(&group->container_users);
		return -EINVAL;
	}

	return 0;
}
static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int ret;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	device = vfio_device_get_from_name(group, buf);
	if (IS_ERR(device))
		return PTR_ERR(device);

	ret = device->ops->open(device->device_data);
	if (ret) {
		vfio_device_put(device);
		return ret;
	}

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret < 0) {
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		put_unused_fd(ret);
		ret = PTR_ERR(filep);
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	atomic_inc(&group->container_users);

	fd_install(ret, filep);

	if (group->noiommu)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));

	return ret;
}
static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;
	int opened;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
		vfio_group_put(group);
		return -EPERM;
	}

	/* Do we need multiple instances of the group open?  Seems not. */
	opened = atomic_cmpxchg(&group->opened, 0, 1);
	if (opened) {
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Is something still in use from a previous open? */
	if (group->container) {
		atomic_dec(&group->opened);
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Warn if previous user didn't cleanup and re-init to drop them */
	if (WARN_ON(group->notifier.head))
		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	filep->private_data = group;

	return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	vfio_group_try_dissolve_container(group);

	atomic_dec(&group->opened);

	vfio_group_put(group);

	return 0;
}
static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};
/**
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	device->ops->release(device->device_data);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device->device_data, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device->device_data, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device->device_data, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device->device_data, vma);
}

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
};
/**
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. do normal VFIO init operation:
 *	- opening a new container;
 *	- attaching group(s) to it;
 *	- setting an IOMMU driver for a container.
 * When IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * to verify that:
 *	- the group is initialized;
 *	- IOMMU is set for it.
 * If both checks passed, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from disposal before KVM exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to know an IOMMU ID.
 *
 * 4. When the external KVM finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
	struct vfio_group *group = filep->private_data;
	int ret;

	if (filep->f_op != &vfio_group_fops)
		return ERR_PTR(-EINVAL);

	ret = vfio_group_add_container_user(group);
	if (ret)
		return ERR_PTR(ret);

	vfio_group_get(group);

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
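
/*
 * Illustrative userspace sketch of step 1 of the protocol above (group
 * number 26 and the type1 IOMMU are example choices, not requirements):
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *
 * The group fd may then be passed to an external user such as KVM, which
 * calls vfio_group_get_external_user() on it as described above.
 */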
/*
 * External user API, exported by symbols to be linked dynamically.
 * The external user passes in a device pointer
 * to verify that:
 *	- A VFIO group is associated with the device;
 *	- IOMMU is set for the group.
 * If both checks passed, vfio_group_get_external_user_from_dev()
 * increments the container user counter to prevent the VFIO group
 * from disposal before external user exits and returns the pointer
 * to the VFIO group.
 *
 * When the external user finishes using the VFIO group, it calls
 * vfio_group_put_external_user() to release the VFIO group and
 * decrement the container user counter.
 *
 * @dev [in]	: device
 * Return error PTR or pointer to VFIO group.
 */
struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
{
	struct vfio_group *group;
	int ret;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return ERR_PTR(-ENODEV);

	ret = vfio_group_add_container_user(group);
	if (ret) {
		vfio_group_put(group);
		return ERR_PTR(ret);
	}

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);
void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);

bool vfio_external_group_match_file(struct vfio_group *test_group,
				    struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	return (filep->f_op == &vfio_group_fops) && (group == test_group);
}
EXPORT_SYMBOL_GPL(vfio_external_group_match_file);

int vfio_external_user_iommu_id(struct vfio_group *group)
{
	return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);

long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
{
	return vfio_ioctl_check_extension(group->container, arg);
}
EXPORT_SYMBOL_GPL(vfio_external_check_extension);
/**
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);
void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
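
/*
 * Typical caller pattern for the two helpers above (a condensed sketch of
 * how an iommu backend builds a capability chain for a *_GET_INFO ioctl;
 * the info/caps variable names are illustrative):
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	// one vfio_info_cap_add()/vfio_info_add_capability() per capability
 *	if (caps.size) {
 *		info.flags |= VFIO_IOMMU_INFO_CAPS;
 *		if (info.argsz >= sizeof(info) + caps.size) {
 *			info.cap_offset = sizeof(info);
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			// copy_to_user() info, then caps.buf at cap_offset
 *		}
 *		kfree(caps.buf);
 *	}
 */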
int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
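
/*
 * Sketch of the intended calling convention (a condensed view of what a
 * vfio bus driver might do in its VFIO_DEVICE_SET_IRQS handler; hdr, arg,
 * minsz and max_irq_type stand for the handler's own locals/constants):
 *
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
 *						 max_irq_type, &data_size);
 *	if (!ret && data_size)
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 */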
/*
 * Pin a set of guest PFNs and return their associated host PFNs for local
 * domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
		   int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !phys_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	if (group->dev_counter > 1) {
		ret = -EINVAL;
		goto err_pin_pages;
	}

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_pin_pages;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data,
					     group->iommu_group, user_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

err_pin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);
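
/*
 * Illustrative mdev-style caller of vfio_pin_pages()/vfio_unpin_pages()
 * (gfn/pfn here are a single guest and host page frame number; a real
 * caller may batch up to VFIO_PIN_PAGES_MAX_ENTRIES entries per call):
 *
 *	unsigned long gfn = gpa >> PAGE_SHIFT, pfn;
 *	int ret;
 *
 *	ret = vfio_pin_pages(mdev_dev(mdev), &gfn, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &pfn);
 *	if (ret == 1) {
 *		// access the page at pfn, then release the pin
 *		vfio_unpin_pages(mdev_dev(mdev), &gfn, 1);
 *	}
 */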
/*
 * Unpin a set of host PFNs for local domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be unpinned.  Number of
 *		   user/guest PFNs should not be greater than
 *		   VFIO_PIN_PAGES_MAX_ENTRIES.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_unpin_pages;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
					       npage);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

err_unpin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);
/*
 * Pin a set of guest IOVA PFNs and return their associated host PFNs for a
 * VFIO group.
 *
 * The caller needs to call vfio_group_get_external_user() or
 * vfio_group_get_external_user_from_dev() prior to calling this interface,
 * so as to prevent the VFIO group from disposal in the middle of the call.
 * But it can keep the reference to the VFIO group for several calls into
 * this interface.
 * When finished using the VFIO group, the caller needs to release the
 * VFIO group by calling vfio_group_put_external_user().
 *
 * @group [in]		: VFIO group
 * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be pinned.
 * @npage [in]		: count of elements in user_iova_pfn array.
 *			  This count should not be greater than
 *			  VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]		: protection flags
 * @phys_pfn [out]	: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_group_pin_pages(struct vfio_group *group,
			 unsigned long *user_iova_pfn, int npage,
			 int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!group || !user_iova_pfn || !phys_pfn || !npage)
		return -EINVAL;

	if (group->dev_counter > 1)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data,
					     group->iommu_group, user_iova_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_group_pin_pages);
/*
 * Unpin a set of guest IOVA PFNs for a VFIO group.
 *
 * The caller needs to call vfio_group_get_external_user() or
 * vfio_group_get_external_user_from_dev() prior to calling this interface,
 * so as to prevent the VFIO group from disposal in the middle of the call.
 * But it can keep the reference to the VFIO group for several calls into
 * this interface.
 * When finished using the VFIO group, the caller needs to release the
 * VFIO group by calling vfio_group_put_external_user().
 *
 * @group [in]		: vfio group
 * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be unpinned.
 * @npage [in]		: count of elements in user_iova_pfn array.
 *			  This count should not be greater than
 *			  VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_group_unpin_pages(struct vfio_group *group,
			   unsigned long *user_iova_pfn, int npage)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!group || !user_iova_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data,
					       user_iova_pfn, npage);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_group_unpin_pages);
/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * The caller needs to call vfio_group_get_external_user() or
 * vfio_group_get_external_user_from_dev() prior to calling this interface,
 * so as to prevent the VFIO group from disposal in the middle of the call.
 * But it can keep the reference to the VFIO group for several calls into
 * this interface.
 * When finished using the VFIO group, the caller needs to release the
 * VFIO group by calling vfio_group_put_external_user().
 *
 * @group [in]		: VFIO group
 * @user_iova [in]	: base IOVA of a user space buffer
 * @data [in]		: pointer to kernel buffer
 * @len [in]		: kernel buffer length
 * @write		: indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
		void *data, size_t len, bool write)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (!group || !data || len <= 0)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;

	if (likely(driver && driver->ops->dma_rw))
		ret = driver->ops->dma_rw(container->iommu_data,
					  user_iova, data, len, write);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_dma_rw);
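
/*
 * Sketch of the external-user calling sequence described above (the device
 * pointer, iova and buffer are illustrative):
 *
 *	struct vfio_group *group;
 *
 *	group = vfio_group_get_external_user_from_dev(dev);
 *	if (IS_ERR(group))
 *		return PTR_ERR(group);
 *
 *	ret = vfio_dma_rw(group, iova, buf, len, false);	// read
 *
 *	vfio_group_put_external_user(group);
 */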
static int vfio_register_iommu_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->register_notifier))
		ret = driver->ops->register_notifier(container->iommu_data,
						     events, nb);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_iommu_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unregister_notifier))
		ret = driver->ops->unregister_notifier(container->iommu_data,
						       nb);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

	return ret;
}
void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
{
	group->kvm = kvm;
	blocking_notifier_call_chain(&group->notifier,
				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
}
EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
static int vfio_register_group_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	int ret;
	bool set_kvm = false;

	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
		set_kvm = true;

	/* clear known events */
	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;

	/* refuse to continue if still events remaining */
	if (*events)
		return -EINVAL;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	ret = blocking_notifier_chain_register(&group->notifier, nb);

	/*
	 * The attaching of kvm and vfio_group might already happen, so
	 * here we replay once upon registration.
	 */
	if (!ret && set_kvm && group->kvm)
		blocking_notifier_call_chain(&group->notifier,
					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);

	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_group_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	ret = blocking_notifier_chain_unregister(&group->notifier, nb);

	vfio_group_try_dissolve_container(group);

	return ret;
}
int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
			   unsigned long *events, struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb || !events || (*events == 0))
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_register_iommu_notifier(group, events, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_register_group_notifier(group, events, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);

int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
			     struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb)
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_unregister_iommu_notifier(group, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_unregister_group_notifier(group, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);
struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;

	if (!group)
		return ERR_PTR(-EINVAL);

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->group_iommu_domain))
		return driver->ops->group_iommu_domain(container->iommu_data,
						       group->iommu_group);

	return ERR_PTR(-ENOTTY);
}
EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
/**
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};
static int __init vfio_init(void)
{
	int ret;

	idr_init(&vfio.group_idr);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
	init_waitqueue_head(&vfio.release_q);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	cdev_init(&vfio.group_cdev, &vfio_group_fops);
	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
	if (ret)
		goto err_cdev_add;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	return 0;

err_cdev_add:
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	misc_deregister(&vfio_dev);
	return ret;
}
static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	idr_destroy(&vfio.group_idr);
	cdev_del(&vfio.group_cdev);
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
}
module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");