// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */
#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"
static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct mutex			group_lock; /* locks group_list */
	struct ida			group_ida;
	dev_t				group_devt;
} vfio;
struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};
struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};
struct vfio_group {
	struct device			dev;
	struct cdev			cdev;
	refcount_t			users;
	unsigned int			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct list_head		vfio_next;
	struct list_head		container_next;
	enum vfio_group_type		type;
	unsigned int			dev_counter;
	struct rw_semaphore		group_rwsem;
	struct kvm			*kvm;
	struct file			*opened_file;
	struct blocking_notifier_head	notifier;
};
#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
#endif
static DEFINE_XARRAY(vfio_device_set_xa);
static const struct file_operations vfio_group_fops;
int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
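
/*
 * Illustrative sketch, compiled out: how a driver opts into a shared set.
 * Any stable pointer works as set_id; devices that must be reset together
 * pass the same one so they share dev_set->lock (vfio-pci, loosely, keys
 * the set on the PCI slot or bus).  my_register() is an assumption, not
 * part of the VFIO API.
 */
#if 0
static int my_register(struct vfio_device *vdev, struct pci_dev *pdev)
{
	int ret;

	/* Co-resettable functions land in the same dev_set */
	ret = vfio_assign_device_set(vdev, pdev->slot ? (void *)pdev->slot
						      : (void *)pdev->bus);
	if (ret)
		return ret;
	return vfio_register_group_dev(vdev);
}
#endif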
static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}
#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group, enum vfio_group_type type)
{
	return 0;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
/*
 * Only noiommu containers can use vfio-noiommu and noiommu containers can only
 * use vfio-noiommu.
 */
static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
		const struct vfio_iommu_driver *driver)
{
	return container->noiommu == (driver->ops == &vfio_noiommu_ops);
}
#else
static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
		const struct vfio_iommu_driver *driver)
{
	return true;
}
#endif /* CONFIG_VFIO_NOIOMMU */
/*
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
static void vfio_group_get(struct vfio_group *group);

/*
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}
/*
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *
__vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			return group;
		}
	}
	return NULL;
}

static struct vfio_group *
vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = __vfio_group_get_from_iommu(iommu_group);
	mutex_unlock(&vfio.group_lock);
	return group;
}
static void vfio_group_release(struct device *dev)
{
	struct vfio_group *group = container_of(dev, struct vfio_group, dev);

	mutex_destroy(&group->device_lock);
	iommu_group_put(group->iommu_group);
	ida_free(&vfio.group_ida, MINOR(group->dev.devt));
	kfree(group);
}
static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
					   enum vfio_group_type type)
{
	struct vfio_group *group;
	int minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
	if (minor < 0) {
		kfree(group);
		return ERR_PTR(minor);
	}

	device_initialize(&group->dev);
	group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
	group->dev.class = vfio.class;
	group->dev.release = vfio_group_release;
	cdev_init(&group->cdev, &vfio_group_fops);
	group->cdev.owner = THIS_MODULE;

	refcount_set(&group->users, 1);
	init_rwsem(&group->group_rwsem);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	group->iommu_group = iommu_group;
	/* put in vfio_group_release() */
	iommu_group_ref_get(iommu_group);
	group->type = type;
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	return group;
}
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
		enum vfio_group_type type)
{
	struct vfio_group *group;
	struct vfio_group *ret;
	int err;

	group = vfio_group_alloc(iommu_group, type);
	if (IS_ERR(group))
		return group;

	err = dev_set_name(&group->dev, "%s%d",
			   group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
			   iommu_group_id(iommu_group));
	if (err) {
		ret = ERR_PTR(err);
		goto err_put;
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	ret = __vfio_group_get_from_iommu(iommu_group);
	if (ret)
		goto err_unlock;

	err = cdev_device_add(&group->cdev, &group->dev);
	if (err) {
		ret = ERR_PTR(err);
		goto err_unlock;
	}

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);
	return group;

err_unlock:
	mutex_unlock(&vfio.group_lock);
err_put:
	put_device(&group->dev);
	return ret;
}
static void vfio_group_put(struct vfio_group *group)
{
	if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
		return;

	/*
	 * These data structures all have paired operations that can only be
	 * undone when the caller holds a live reference on the group. Since all
	 * pairs must be undone these WARN_ON's indicate some caller did not
	 * properly hold the group reference.
	 */
	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->container || group->container_users);
	WARN_ON(group->notifier.head);

	list_del(&group->vfio_next);
	cdev_device_del(&group->cdev, &group->dev);
	mutex_unlock(&vfio.group_lock);

	put_device(&group->dev);
}

static void vfio_group_get(struct vfio_group *group)
{
	refcount_inc(&group->users);
}
/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
static void vfio_device_put(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

static bool vfio_device_try_get(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev && vfio_device_try_get(device)) {
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}

/*
 * VFIO driver API
 */
void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
			 const struct vfio_device_ops *ops)
{
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;
}
EXPORT_SYMBOL_GPL(vfio_init_group_dev);

void vfio_uninit_group_dev(struct vfio_device *device)
{
	vfio_release_device_set(device);
}
EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);
static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
		enum vfio_group_type type)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	int ret;

	iommu_group = iommu_group_alloc();
	if (IS_ERR(iommu_group))
		return ERR_CAST(iommu_group);

	iommu_group_set_name(iommu_group, "vfio-noiommu");
	ret = iommu_group_add_device(iommu_group, dev);
	if (ret)
		goto out_put_group;

	group = vfio_create_group(iommu_group, type);
	if (IS_ERR(group)) {
		ret = PTR_ERR(group);
		goto out_remove_device;
	}
	iommu_group_put(iommu_group);
	return group;

out_remove_device:
	iommu_group_remove_device(dev);
out_put_group:
	iommu_group_put(iommu_group);
	return ERR_PTR(ret);
}
static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
#ifdef CONFIG_VFIO_NOIOMMU
	if (!iommu_group && noiommu) {
		/*
		 * With noiommu enabled, create an IOMMU group for devices that
		 * don't already have one, implying no IOMMU hardware/driver
		 * exists.  Taint the kernel because we're about to give a DMA
		 * capable device to a user without IOMMU protection.
		 */
		group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
		if (!IS_ERR(group)) {
			add_taint(TAINT_USER, LOCKDEP_STILL_OK);
			dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
		}
		return group;
	}
#endif
	if (!iommu_group)
		return ERR_PTR(-EINVAL);

	/*
	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
	 * restore cache coherency. It has to be checked here because it is only
	 * valid for cases where we are using iommu groups.
	 */
	if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) {
		iommu_group_put(iommu_group);
		return ERR_PTR(-EINVAL);
	}

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group)
		group = vfio_create_group(iommu_group, VFIO_IOMMU);

	/* The vfio_group holds a reference to the iommu_group */
	iommu_group_put(iommu_group);
	return group;
}
static int __vfio_register_dev(struct vfio_device *device,
		struct vfio_group *group)
{
	struct vfio_device *existing_device;

	if (IS_ERR(group))
		return PTR_ERR(group);

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	existing_device = vfio_group_get_device(group, device->dev);
	if (existing_device) {
		dev_WARN(device->dev, "Device already exists on group %d\n",
			 iommu_group_id(group->iommu_group));
		vfio_device_put(existing_device);
		if (group->type == VFIO_NO_IOMMU ||
		    group->type == VFIO_EMULATED_IOMMU)
			iommu_group_remove_device(device->dev);
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Our reference on group is moved to the device */
	device->group = group;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	group->dev_counter++;
	mutex_unlock(&group->device_lock);

	return 0;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device,
		vfio_group_find_or_alloc(device->dev));
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing.  The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device,
		vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
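
/*
 * Illustrative sketch, compiled out: the minimal registration life cycle a
 * driver follows against the API above.  my_device, my_ops and my_probe are
 * assumptions for illustration, not part of VFIO.
 */
#if 0
static const struct vfio_device_ops my_ops;

struct my_device {
	struct vfio_device vdev;
};

static int my_probe(struct device *dev)
{
	struct my_device *my;
	int ret;

	my = kzalloc(sizeof(*my), GFP_KERNEL);
	if (!my)
		return -ENOMEM;

	vfio_init_group_dev(&my->vdev, dev, &my_ops);
	ret = vfio_register_group_dev(&my->vdev);
	if (ret) {
		vfio_uninit_group_dev(&my->vdev);
		kfree(my);
		return ret;
	}
	/* on remove: vfio_unregister_group_dev(), then uninit + free */
	dev_set_drvdata(dev, my);
	return 0;
}
#endif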
static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = ERR_PTR(-ENODEV);

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		int ret;

		if (it->ops->match) {
			ret = it->ops->match(it, buf);
			if (ret < 0) {
				device = ERR_PTR(ret);
				break;
			}
		} else {
			ret = !strcmp(dev_name(it->dev), buf);
		}

		if (ret && vfio_device_try_get(it)) {
			device = it;
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	vfio_device_put(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	mutex_lock(&group->device_lock);
	list_del(&device->group_next);
	group->dev_counter--;
	mutex_unlock(&group->device_lock);

	if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
		iommu_group_remove_device(device->dev);

	/* Matches the get in vfio_register_group_dev() */
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
/*
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

				if (!list_empty(&container->group_list) &&
				    !vfio_iommu_driver_allowed(container,
							       driver))
					continue;
				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}
/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group,
						group->type);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}
static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

		if (!vfio_iommu_driver_allowed(container, driver))
			continue;
		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}
static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);
	}

	return ret;
}
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver = container->iommu_driver;

	if (driver && driver->ops->notify)
		driver->ops->notify(container->iommu_data,
				    VFIO_IOMMU_CONTAINER_CLOSE);

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}

static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};
/*
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	lockdep_assert_held_write(&group->group_rwsem);

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	if (group->type == VFIO_IOMMU)
		iommu_group_release_dma_owner(group->iommu_group);

	group->container = NULL;
	group->container_users = 0;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}
/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	lockdep_assert_held_write(&group->group_rwsem);

	if (!group->container)
		return -EINVAL;
	if (group->container_users != 1)
		return -EBUSY;
	__vfio_group_unset_container(group);
	return 0;
}
static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	lockdep_assert_held_write(&group->group_rwsem);

	if (group->container || WARN_ON(group->container_users))
		return -EINVAL;

	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != (group->type == VFIO_NO_IOMMU)) {
		ret = -EPERM;
		goto out_unlock;
	}

	if (group->type == VFIO_IOMMU) {
		ret = iommu_group_claim_dma_owner(group->iommu_group, f.file);
		if (ret)
			goto out_unlock;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group,
						group->type);
		if (ret) {
			if (group->type == VFIO_IOMMU)
				iommu_group_release_dma_owner(
					group->iommu_group);
			goto out_unlock;
		}
	}

	group->container = container;
	group->container_users = 1;
	container->noiommu = (group->type == VFIO_NO_IOMMU);
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);

out_unlock:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}
static const struct file_operations vfio_device_fops;

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

static int vfio_device_assign_container(struct vfio_device *device)
{
	struct vfio_group *group = device->group;

	lockdep_assert_held_write(&group->group_rwsem);

	if (!group->container || !group->container->iommu_driver ||
	    WARN_ON(!group->container_users))
		return -EINVAL;

	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	get_file(group->opened_file);
	group->container_users++;
	return 0;
}

static void vfio_device_unassign_container(struct vfio_device *device)
{
	down_write(&device->group->group_rwsem);
	WARN_ON(device->group->container_users <= 1);
	device->group->container_users--;
	fput(device->group->opened_file);
	up_write(&device->group->group_rwsem);
}
static struct file *vfio_device_open(struct vfio_device *device)
{
	struct file *filep;
	int ret;

	down_write(&device->group->group_rwsem);
	ret = vfio_device_assign_container(device);
	up_write(&device->group->group_rwsem);
	if (ret)
		return ERR_PTR(ret);

	if (!try_module_get(device->dev->driver->owner)) {
		ret = -ENODEV;
		goto err_unassign_container;
	}

	mutex_lock(&device->dev_set->lock);
	device->open_count++;
	if (device->open_count == 1) {
		/*
		 * Here we pass the KVM pointer with the group under the read
		 * lock.  If the device driver will use it, it must obtain a
		 * reference and release it during close_device.
		 */
		down_read(&device->group->group_rwsem);
		device->kvm = device->group->kvm;

		if (device->ops->open_device) {
			ret = device->ops->open_device(device);
			if (ret)
				goto err_undo_count;
		}
		up_read(&device->group->group_rwsem);
	}
	mutex_unlock(&device->dev_set->lock);

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		ret = PTR_ERR(filep);
		goto err_close_device;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);

	if (device->group->type == VFIO_NO_IOMMU)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));
	/*
	 * On success the ref of device is moved to the file and
	 * put in vfio_device_fops_release()
	 */
	return filep;

err_close_device:
	mutex_lock(&device->dev_set->lock);
	down_read(&device->group->group_rwsem);
	if (device->open_count == 1 && device->ops->close_device)
		device->ops->close_device(device);
err_undo_count:
	device->open_count--;
	if (device->open_count == 0 && device->kvm)
		device->kvm = NULL;
	up_read(&device->group->group_rwsem);
	mutex_unlock(&device->dev_set->lock);
	module_put(device->dev->driver->owner);
err_unassign_container:
	vfio_device_unassign_container(device);
	return ERR_PTR(ret);
}
static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int fdno;
	int ret;

	device = vfio_device_get_from_name(group, buf);
	if (IS_ERR(device))
		return PTR_ERR(device);

	fdno = get_unused_fd_flags(O_CLOEXEC);
	if (fdno < 0) {
		ret = fdno;
		goto err_put_device;
	}

	filep = vfio_device_open(device);
	if (IS_ERR(filep)) {
		ret = PTR_ERR(filep);
		goto err_put_fdno;
	}

	fd_install(fdno, filep);
	return fdno;

err_put_fdno:
	put_unused_fd(fdno);
err_put_device:
	vfio_device_put(device);
	return ret;
}
static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		down_read(&group->group_rwsem);
		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
					VFIO_GROUP_FLAGS_VIABLE;
		else if (!iommu_group_dma_owner_claimed(group->iommu_group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
		up_read(&group->group_rwsem);

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		down_write(&group->group_rwsem);
		ret = vfio_group_set_container(group, fd);
		up_write(&group->group_rwsem);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		down_write(&group->group_rwsem);
		ret = vfio_group_unset_container(group);
		up_write(&group->group_rwsem);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group =
		container_of(inode->i_cdev, struct vfio_group, cdev);
	int ret;

	down_write(&group->group_rwsem);

	/* users can be zero if this races with vfio_group_put() */
	if (!refcount_inc_not_zero(&group->users)) {
		ret = -ENODEV;
		goto err_unlock;
	}

	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
		ret = -EPERM;
		goto err_put;
	}

	/*
	 * Do we need multiple instances of the group open?  Seems not.
	 */
	if (group->opened_file) {
		ret = -EBUSY;
		goto err_put;
	}
	group->opened_file = filep;
	filep->private_data = group;

	up_write(&group->group_rwsem);
	return 0;
err_put:
	vfio_group_put(group);
err_unlock:
	up_write(&group->group_rwsem);
	return ret;
}
static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	down_write(&group->group_rwsem);
	/*
	 * Device FDs hold a group file reference, therefore the group release
	 * is only called when there are no open devices.
	 */
	WARN_ON(group->notifier.head);
	if (group->container) {
		WARN_ON(group->container_users != 1);
		__vfio_group_unset_container(group);
	}
	group->opened_file = NULL;
	up_write(&group->group_rwsem);

	vfio_group_put(group);

	return 0;
}
static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};
/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	mutex_lock(&device->dev_set->lock);
	vfio_assert_device_open(device);
	down_read(&device->group->group_rwsem);
	if (device->open_count == 1 && device->ops->close_device)
		device->ops->close_device(device);
	up_read(&device->group->group_rwsem);
	device->open_count--;
	if (device->open_count == 0)
		device->kvm = NULL;
	mutex_unlock(&device->dev_set->lock);

	module_put(device->dev->driver->owner);

	vfio_device_unassign_container(device);

	vfio_device_put(device);

	return 0;
}
/**
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 *
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 * Without P2P the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};
	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
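
/*
 * Illustrative sketch, compiled out: how a driver's migration_set_state()
 * might walk the FSM with the helper above.  my_device, my_do_arc() and the
 * driver-private mig_state field are assumptions; real migration drivers
 * follow roughly this shape.
 */
#if 0
struct my_device {
	struct vfio_device vdev;
	enum vfio_device_mig_state mig_state;
};

static int my_do_arc(struct my_device *my, enum vfio_device_mig_state next);

static struct file *my_set_state(struct vfio_device *vdev,
				 enum vfio_device_mig_state new_state)
{
	struct my_device *my = container_of(vdev, struct my_device, vdev);
	enum vfio_device_mig_state next;
	int ret;

	/* A combination transition is broken down into single FSM arcs */
	while (my->mig_state != new_state) {
		ret = vfio_mig_get_next_state(vdev, my->mig_state,
					      new_state, &next);
		if (ret)
			return ERR_PTR(ret);
		ret = my_do_arc(my, next);	/* device-specific step */
		if (ret)
			return ERR_PTR(ret);
		my->mig_state = next;
	}
	return NULL;	/* no data-transfer FD produced in this sketch */
}
#endif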
/*
 * Convert the driver's struct file into a FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}
static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->ops->migration_set_state ||
	    !device->ops->migration_get_state)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->ops->migration_get_state(device, &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}
static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->ops->migration_set_state ||
	    !device->ops->migration_get_state)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}
static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}
static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		return vfio_ioctl_device_feature(device, (void __user *)arg);
	default:
		if (unlikely(!device->ops->ioctl))
			return -EINVAL;
		return device->ops->ioctl(device, cmd, arg);
	}
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
};
/**
 * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
 * @file: VFIO group file
 *
 * The returned iommu_group is valid as long as a ref is held on the file.
 */
struct iommu_group *vfio_file_iommu_group(struct file *file)
{
	struct vfio_group *group = file->private_data;

	if (file->f_op != &vfio_group_fops)
		return NULL;
	return group->iommu_group;
}
EXPORT_SYMBOL_GPL(vfio_file_iommu_group);
/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file
 *
 * Enforced coherency means that the IOMMU ignores things like the PCIe
 * no-snoop bit in DMA transactions. A return of false indicates that the user
 * has rights to access additional instructions such as wbinvd on x86.
 */
bool vfio_file_enforced_coherent(struct file *file)
{
	struct vfio_group *group = file->private_data;
	bool ret;

	if (file->f_op != &vfio_group_fops)
		return true;

	down_read(&group->group_rwsem);
	if (group->container) {
		ret = vfio_ioctl_check_extension(group->container,
						 VFIO_DMA_CC_IOMMU);
	} else {
		/*
		 * Since the coherency state is determined only once a
		 * container is attached the user must do so before they can
		 * prove they have permission.
		 */
		ret = true;
	}
	up_read(&group->group_rwsem);
	return ret;
}
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the group.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group = file->private_data;

	if (file->f_op != &vfio_group_fops)
		return;

	down_write(&group->group_rwsem);
	group->kvm = kvm;
	up_write(&group->group_rwsem);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
/**
 * vfio_file_has_dev - True if the VFIO file is a handle for device
 * @file: VFIO file to check
 * @device: Device that must be part of the file
 *
 * Returns true if given file has permission to manipulate the given device.
 */
bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
{
	struct vfio_group *group = file->private_data;

	if (file->f_op != &vfio_group_fops)
		return false;

	return group == device->group;
}
EXPORT_SYMBOL_GPL(vfio_file_has_dev);
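
/*
 * Illustrative sketch, compiled out: how an external consumer (e.g. a
 * hypervisor module handed a group FD by userspace) might use the
 * vfio_file_* helpers above.  struct my_vm, my_add_group() and
 * my_enable_noncoherent_dma() are assumptions, not part of VFIO.
 */
#if 0
struct my_vm {
	struct kvm *kvm;
};

static void my_enable_noncoherent_dma(struct my_vm *vm);

static int my_add_group(struct my_vm *vm, struct file *file)
{
	if (!vfio_file_iommu_group(file))	/* not a VFIO group file */
		return -EBADF;

	/* Devices opened through this group will see vm->kvm */
	vfio_file_set_kvm(file, vm->kvm);

	/* Without enforced coherency the guest may need wbinvd on x86 */
	if (!vfio_file_enforced_coherent(file))
		my_enable_noncoherent_dma(vm);
	return 0;
}
#endif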
/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);
void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
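
/*
 * Illustrative sketch, compiled out: building and returning a capability
 * chain from an INFO ioctl reply, in the style of the vfio-pci region info
 * path.  MY_CAP_ID and my_fill_region_info() are assumptions.
 */
#if 0
#define MY_CAP_ID 11	/* hypothetical capability id */

static int my_fill_region_info(struct vfio_region_info *info,
			       void __user *uarg)
{
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	struct vfio_info_cap_header hdr = { .id = MY_CAP_ID, .version = 1 };
	int ret;

	ret = vfio_info_add_capability(&caps, &hdr, sizeof(hdr));
	if (ret)
		return ret;

	/* The chain is copied out after the fixed struct, so shift the
	 * intra-buffer next offsets to user-visible offsets first. */
	info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
	info->cap_offset = sizeof(*info);
	vfio_info_cap_shift(&caps, sizeof(*info));

	if (copy_to_user(uarg + sizeof(*info), caps.buf, caps.size))
		ret = -EFAULT;
	kfree(caps.buf);
	return ret;
}
#endif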
int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
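
/*
 * Illustrative sketch, compiled out: the caller pattern this validator
 * expects, loosely following the vfio-pci SET_IRQS ioctl path.  MY_NUM_IRQS,
 * MY_NUM_IRQ_TYPES and my_set_irqs_ioctl() are assumptions.
 */
#if 0
#define MY_NUM_IRQS	4	/* hypothetical */
#define MY_NUM_IRQ_TYPES 1	/* hypothetical */

static int my_set_irqs_ioctl(struct vfio_device *vdev, unsigned long arg)
{
	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
	struct vfio_irq_set hdr;
	size_t data_size = 0;
	u8 *data = NULL;
	int ret;

	if (copy_from_user(&hdr, (void __user *)arg, minsz))
		return -EFAULT;

	ret = vfio_set_irqs_validate_and_prepare(&hdr, MY_NUM_IRQS,
						 MY_NUM_IRQ_TYPES, &data_size);
	if (ret)
		return ret;

	if (data_size) {
		data = memdup_user((void __user *)(arg + minsz), data_size);
		if (IS_ERR(data))
			return PTR_ERR(data);
	}

	/* ... dispatch on the ACTION and DATA type bits in hdr.flags ... */

	kfree(data);
	return 0;
}
#endif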
/*
 * Pin a set of guest PFNs and return their associated host PFNs for local
 * domain only.
 * @device [in]  : device
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
		   int npage, int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_group *group = device->group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!user_pfn || !phys_pfn || !npage ||
	    !vfio_assert_device_open(device))
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	if (group->dev_counter > 1)
		return -EINVAL;

	/* group->container cannot change while a vfio device is open */
	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data,
					     group->iommu_group, user_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);
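
/*
 * Illustrative sketch, compiled out: a mediated driver pinning a single
 * guest page before programming it into the device, and unpinning it when
 * the mapping is torn down.  my_mdev_map_page(), my_mdev_unmap_page() and
 * the gfn naming are assumptions.
 */
#if 0
static int my_mdev_map_page(struct vfio_device *vdev, unsigned long gfn,
			    unsigned long *hpfn)
{
	int ret;

	/* Returns the number of pages pinned (1 here) or -errno */
	ret = vfio_pin_pages(vdev, &gfn, 1, IOMMU_READ | IOMMU_WRITE, hpfn);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... program the device with *hpfn << PAGE_SHIFT ... */
	return 0;
}

static void my_mdev_unmap_page(struct vfio_device *vdev, unsigned long gfn)
{
	vfio_unpin_pages(vdev, &gfn, 1);
}
#endif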
/*
 * Unpin set of host PFNs for local domain only.
 * @device [in]  : device
 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
 *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @npage [in]   : count of elements in user_pfn array. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
		     int npage)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!user_pfn || !npage || !vfio_assert_device_open(device))
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	/* group->container cannot change while a vfio device is open */
	container = device->group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
					       npage);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);
/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]		: VFIO device
 * @user_iova [in]	: base IOVA of a user space buffer
 * @data [in]		: pointer to kernel buffer
 * @len [in]		: kernel buffer length
 * @write		: indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
		size_t len, bool write)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	/* group->container cannot change while a vfio device is open */
	container = device->group->container;
	driver = container->iommu_driver;

	if (likely(driver && driver->ops->dma_rw))
		ret = driver->ops->dma_rw(container->iommu_data,
					  user_iova, data, len, write);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_dma_rw);
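
/*
 * Illustrative sketch, compiled out: using vfio_dma_rw() to copy a small
 * guest-resident descriptor through the container mapping without pinning
 * any pages.  struct my_desc and my_read_desc() are assumptions.
 */
#if 0
struct my_desc {
	__le64 addr;
	__le32 len;
	__le32 flags;
};

static int my_read_desc(struct vfio_device *vdev, dma_addr_t iova,
			struct my_desc *desc)
{
	/* write=false selects a read from the IOVA range into the buffer */
	return vfio_dma_rw(vdev, iova, desc, sizeof(*desc), false);
}
#endif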
static int vfio_register_iommu_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	lockdep_assert_held_read(&group->group_rwsem);

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->register_notifier))
		ret = driver->ops->register_notifier(container->iommu_data,
						     events, nb);
	else
		ret = -ENOTTY;

	return ret;
}

static int vfio_unregister_iommu_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	lockdep_assert_held_read(&group->group_rwsem);

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unregister_notifier))
		ret = driver->ops->unregister_notifier(container->iommu_data,
						       nb);
	else
		ret = -ENOTTY;

	return ret;
}
int vfio_register_notifier(struct vfio_device *device,
			   enum vfio_notify_type type, unsigned long *events,
			   struct notifier_block *nb)
{
	struct vfio_group *group = device->group;
	int ret;

	if (!nb || !events || (*events == 0) ||
	    !vfio_assert_device_open(device))
		return -EINVAL;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_register_iommu_notifier(group, events, nb);
		break;
	default:
		ret = -EINVAL;
	}
	return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);

int vfio_unregister_notifier(struct vfio_device *device,
			     enum vfio_notify_type type,
			     struct notifier_block *nb)
{
	struct vfio_group *group = device->group;
	int ret;

	if (!nb || !vfio_assert_device_open(device))
		return -EINVAL;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_unregister_iommu_notifier(group, nb);
		break;
	default:
		ret = -EINVAL;
	}
	return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);
/*
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.group_ida);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_group_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	return 0;

err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_group_class:
	misc_deregister(&vfio_dev);
	return ret;
}
static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	ida_destroy(&vfio.group_ida);
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
	xa_destroy(&vfio_device_set_xa);
}
module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");