// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */
#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct mutex			group_lock; /* locks group_list */
	struct ida			group_ida;
	dev_t				group_devt;
} vfio;
struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};
struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};

struct vfio_group {
	struct device			dev;
	struct cdev			cdev;
	refcount_t			users;
	unsigned int			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct list_head		vfio_next;
	struct list_head		container_next;
	enum vfio_group_type		type;
	unsigned int			dev_counter;
	struct rw_semaphore		group_rwsem;
	struct kvm			*kvm;
	struct file			*opened_file;
	struct blocking_notifier_head	notifier;
};
#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
static DEFINE_XARRAY(vfio_device_set_xa);
static const struct file_operations vfio_group_fops;
int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
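
/*
 * Example (illustrative sketch, not part of this file): a driver whose
 * functions must reset together can share one set by passing a common,
 * stable pointer as set_id from its probe path; any unique pointer works:
 *
 *	ret = vfio_assign_device_set(&vdev->vfio_device, pci_physfn(pdev));
 *	if (ret)
 *		return ret;
 *
 * vdev and pdev are hypothetical driver variables.  Without such a call the
 * device later gets a singleton set of its own (see __vfio_register_dev()).
 */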
static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}
#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group, enum vfio_group_type type)
{
	return 0;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};

/*
 * Only noiommu containers can use vfio-noiommu and noiommu containers can only
 * use vfio-noiommu.
 */
static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
		const struct vfio_iommu_driver *driver)
{
	return container->noiommu == (driver->ops == &vfio_noiommu_ops);
}
#else
static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
		const struct vfio_iommu_driver *driver)
{
	return true;
}
#endif /* CONFIG_VFIO_NOIOMMU */
/*
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	if (WARN_ON(!ops->register_notifier != !ops->unregister_notifier))
		return -EINVAL;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
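
/*
 * Registration sketch (illustrative): an IOMMU backend such as
 * vfio_iommu_type1 registers its ops from module init and unregisters on
 * exit, roughly:
 *
 *	static int __init my_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_driver_ops);
 *	}
 *
 *	static void __exit my_iommu_cleanup(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_driver_ops);
 *	}
 *
 * my_iommu_driver_ops is a hypothetical vfio_iommu_driver_ops instance.
 */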
static void vfio_group_get(struct vfio_group *group);

/*
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}
/*
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *
__vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			return group;
		}
	}
	return NULL;
}

static struct vfio_group *
vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = __vfio_group_get_from_iommu(iommu_group);
	mutex_unlock(&vfio.group_lock);
	return group;
}

static void vfio_group_release(struct device *dev)
{
	struct vfio_group *group = container_of(dev, struct vfio_group, dev);

	mutex_destroy(&group->device_lock);
	iommu_group_put(group->iommu_group);
	ida_free(&vfio.group_ida, MINOR(group->dev.devt));
	kfree(group);
}
static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
					   enum vfio_group_type type)
{
	struct vfio_group *group;
	int minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
	if (minor < 0) {
		kfree(group);
		return ERR_PTR(minor);
	}

	device_initialize(&group->dev);
	group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
	group->dev.class = vfio.class;
	group->dev.release = vfio_group_release;
	cdev_init(&group->cdev, &vfio_group_fops);
	group->cdev.owner = THIS_MODULE;

	refcount_set(&group->users, 1);
	init_rwsem(&group->group_rwsem);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	group->iommu_group = iommu_group;
	/* put in vfio_group_release() */
	iommu_group_ref_get(iommu_group);
	group->type = type;
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	return group;
}
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
		enum vfio_group_type type)
{
	struct vfio_group *group;
	struct vfio_group *ret;
	int err;

	group = vfio_group_alloc(iommu_group, type);
	if (IS_ERR(group))
		return group;

	err = dev_set_name(&group->dev, "%s%d",
			   group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
			   iommu_group_id(iommu_group));
	if (err) {
		ret = ERR_PTR(err);
		goto err_put;
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	ret = __vfio_group_get_from_iommu(iommu_group);
	if (ret)
		goto err_unlock;

	err = cdev_device_add(&group->cdev, &group->dev);
	if (err) {
		ret = ERR_PTR(err);
		goto err_unlock;
	}

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);
	return group;

err_unlock:
	mutex_unlock(&vfio.group_lock);
err_put:
	put_device(&group->dev);
	return ret;
}
static void vfio_group_put(struct vfio_group *group)
{
	if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
		return;

	/*
	 * These data structures all have paired operations that can only be
	 * undone when the caller holds a live reference on the group.  Since
	 * all pairs must be undone these WARN_ON's indicate some caller did
	 * not properly hold the group reference.
	 */
	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->container || group->container_users);
	WARN_ON(group->notifier.head);

	list_del(&group->vfio_next);
	cdev_device_del(&group->cdev, &group->dev);
	mutex_unlock(&vfio.group_lock);

	put_device(&group->dev);
}

static void vfio_group_get(struct vfio_group *group)
{
	refcount_inc(&group->users);
}
/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
static void vfio_device_put(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

static bool vfio_device_try_get(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev && vfio_device_try_get(device)) {
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}
/*
 * VFIO driver API
 */
void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
			 const struct vfio_device_ops *ops)
{
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;
}
EXPORT_SYMBOL_GPL(vfio_init_group_dev);

void vfio_uninit_group_dev(struct vfio_device *device)
{
	vfio_release_device_set(device);
}
EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);
static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
		enum vfio_group_type type)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	int ret;

	iommu_group = iommu_group_alloc();
	if (IS_ERR(iommu_group))
		return ERR_CAST(iommu_group);

	ret = iommu_group_set_name(iommu_group, "vfio-noiommu");
	if (ret)
		goto out_put_group;
	ret = iommu_group_add_device(iommu_group, dev);
	if (ret)
		goto out_put_group;

	group = vfio_create_group(iommu_group, type);
	if (IS_ERR(group)) {
		ret = PTR_ERR(group);
		goto out_remove_device;
	}
	iommu_group_put(iommu_group);
	return group;

out_remove_device:
	iommu_group_remove_device(dev);
out_put_group:
	iommu_group_put(iommu_group);
	return ERR_PTR(ret);
}
static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
#ifdef CONFIG_VFIO_NOIOMMU
	if (!iommu_group && noiommu) {
		/*
		 * With noiommu enabled, create an IOMMU group for devices that
		 * don't already have one, implying no IOMMU hardware/driver
		 * exists.  Taint the kernel because we're about to give a DMA
		 * capable device to a user without IOMMU protection.
		 */
		group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
		if (!IS_ERR(group)) {
			add_taint(TAINT_USER, LOCKDEP_STILL_OK);
			dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
		}
		return group;
	}
#endif
	if (!iommu_group)
		return ERR_PTR(-EINVAL);

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group)
		group = vfio_create_group(iommu_group, VFIO_IOMMU);

	/* The vfio_group holds a reference to the iommu_group */
	iommu_group_put(iommu_group);
	return group;
}
static int __vfio_register_dev(struct vfio_device *device,
		struct vfio_group *group)
{
	struct vfio_device *existing_device;

	if (IS_ERR(group))
		return PTR_ERR(group);

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	existing_device = vfio_group_get_device(group, device->dev);
	if (existing_device) {
		dev_WARN(device->dev, "Device already exists on group %d\n",
			 iommu_group_id(group->iommu_group));
		vfio_device_put(existing_device);
		if (group->type == VFIO_NO_IOMMU ||
		    group->type == VFIO_EMULATED_IOMMU)
			iommu_group_remove_device(device->dev);
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Our reference on group is moved to the device */
	device->group = group;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	group->dev_counter++;
	mutex_unlock(&group->device_lock);

	return 0;
}
int vfio_register_group_dev(struct vfio_device *device)
{
	/*
	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
	 * restore cache coherency.
	 */
	if (!device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY))
		return -EINVAL;

	return __vfio_register_dev(device,
		vfio_group_find_or_alloc(device->dev));
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing.  The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device,
		vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
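
/*
 * Registration sketch (illustrative): a typical device driver's probe path
 * initializes the embedded vfio_device and then registers it:
 *
 *	vfio_init_group_dev(&vdev->vfio_device, &pdev->dev, &my_vfio_ops);
 *	ret = vfio_register_group_dev(&vdev->vfio_device);
 *	if (ret) {
 *		vfio_uninit_group_dev(&vdev->vfio_device);
 *		return ret;
 *	}
 *
 * vdev and my_vfio_ops are hypothetical; mediated devices without IOMMU
 * backing call vfio_register_emulated_iommu_dev() instead.
 */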
static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = ERR_PTR(-ENODEV);

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		int ret;

		if (it->ops->match) {
			ret = it->ops->match(it, buf);
			if (ret < 0) {
				device = ERR_PTR(ret);
				break;
			}
		} else {
			ret = !strcmp(dev_name(it->dev), buf);
		}

		if (ret && vfio_device_try_get(it)) {
			device = it;
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	vfio_device_put(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	mutex_lock(&group->device_lock);
	list_del(&device->group_next);
	group->dev_counter--;
	mutex_unlock(&group->device_lock);

	if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
		iommu_group_remove_device(device->dev);

	/* Matches the get in vfio_register_group_dev() */
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
/*
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
					unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

				if (!list_empty(&container->group_list) &&
				    !vfio_iommu_driver_allowed(container,
							       driver))
					continue;
				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}
/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group,
						group->type);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}
static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

		if (!vfio_iommu_driver_allowed(container, driver))
			continue;
		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}
static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);
	}

	return ret;
}
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver = container->iommu_driver;

	if (driver && driver->ops->notify)
		driver->ops->notify(container->iommu_data,
				    VFIO_IOMMU_CONTAINER_CLOSE);

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}

static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};
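
/*
 * For orientation, the userspace side of this interface (documented in
 * Documentation/driver-api/vfio.rst) drives these fops roughly as follows;
 * the group number, device name and type1 choice are examples, not
 * requirements:
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	ioctl(container, VFIO_GET_API_VERSION);		// VFIO_API_VERSION
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */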
/*
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	lockdep_assert_held_write(&group->group_rwsem);

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	if (group->type == VFIO_IOMMU)
		iommu_group_release_dma_owner(group->iommu_group);

	group->container = NULL;
	group->container_users = 0;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}
/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	lockdep_assert_held_write(&group->group_rwsem);

	if (!group->container)
		return -EINVAL;
	if (group->container_users != 1)
		return -EBUSY;
	__vfio_group_unset_container(group);
	return 0;
}
static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	lockdep_assert_held_write(&group->group_rwsem);

	if (group->container || WARN_ON(group->container_users))
		return -EINVAL;

	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != (group->type == VFIO_NO_IOMMU)) {
		ret = -EPERM;
		goto out_unlock_container;
	}

	if (group->type == VFIO_IOMMU) {
		ret = iommu_group_claim_dma_owner(group->iommu_group, f.file);
		if (ret)
			goto out_unlock_container;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group,
						group->type);
		if (ret) {
			if (group->type == VFIO_IOMMU)
				iommu_group_release_dma_owner(
					group->iommu_group);
			goto out_unlock_container;
		}
	}

	group->container = container;
	group->container_users = 1;
	container->noiommu = (group->type == VFIO_NO_IOMMU);
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);

out_unlock_container:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}
static const struct file_operations vfio_device_fops;

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

static int vfio_device_assign_container(struct vfio_device *device)
{
	struct vfio_group *group = device->group;

	lockdep_assert_held_write(&group->group_rwsem);

	if (!group->container || !group->container->iommu_driver ||
	    WARN_ON(!group->container_users))
		return -EINVAL;

	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	get_file(group->opened_file);
	group->container_users++;
	return 0;
}

static void vfio_device_unassign_container(struct vfio_device *device)
{
	down_write(&device->group->group_rwsem);
	WARN_ON(device->group->container_users <= 1);
	device->group->container_users--;
	fput(device->group->opened_file);
	up_write(&device->group->group_rwsem);
}
static int vfio_iommu_notifier(struct notifier_block *nb, unsigned long action,
			       void *data)
{
	struct vfio_device *vfio_device =
		container_of(nb, struct vfio_device, iommu_nb);
	struct vfio_iommu_type1_dma_unmap *unmap = data;

	vfio_device->ops->dma_unmap(vfio_device, unmap->iova, unmap->size);
	return NOTIFY_OK;
}
static struct file *vfio_device_open(struct vfio_device *device)
{
	struct vfio_iommu_driver *iommu_driver;
	struct file *filep;
	int ret;

	down_write(&device->group->group_rwsem);
	ret = vfio_device_assign_container(device);
	up_write(&device->group->group_rwsem);
	if (ret)
		return ERR_PTR(ret);

	if (!try_module_get(device->dev->driver->owner)) {
		ret = -ENODEV;
		goto err_unassign_container;
	}

	mutex_lock(&device->dev_set->lock);
	device->open_count++;
	if (device->open_count == 1) {
		/*
		 * Here we pass the KVM pointer with the group under the read
		 * lock.  If the device driver will use it, it must obtain a
		 * reference and release it during close_device.
		 */
		down_read(&device->group->group_rwsem);
		device->kvm = device->group->kvm;

		if (device->ops->open_device) {
			ret = device->ops->open_device(device);
			if (ret)
				goto err_undo_count;
		}

		iommu_driver = device->group->container->iommu_driver;
		if (device->ops->dma_unmap && iommu_driver &&
		    iommu_driver->ops->register_notifier) {
			unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;

			device->iommu_nb.notifier_call = vfio_iommu_notifier;
			iommu_driver->ops->register_notifier(
				device->group->container->iommu_data, &events,
				&device->iommu_nb);
		}

		up_read(&device->group->group_rwsem);
	}
	mutex_unlock(&device->dev_set->lock);

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		ret = PTR_ERR(filep);
		goto err_close_device;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	if (device->group->type == VFIO_NO_IOMMU)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));
	/*
	 * On success the ref of device is moved to the file and
	 * put in vfio_device_fops_release()
	 */
	return filep;

err_close_device:
	mutex_lock(&device->dev_set->lock);
	down_read(&device->group->group_rwsem);
	if (device->open_count == 1 && device->ops->close_device) {
		device->ops->close_device(device);

		iommu_driver = device->group->container->iommu_driver;
		if (device->ops->dma_unmap && iommu_driver &&
		    iommu_driver->ops->unregister_notifier)
			iommu_driver->ops->unregister_notifier(
				device->group->container->iommu_data,
				&device->iommu_nb);
	}
err_undo_count:
	up_read(&device->group->group_rwsem);
	device->open_count--;
	if (device->open_count == 0 && device->kvm)
		device->kvm = NULL;
	mutex_unlock(&device->dev_set->lock);
	module_put(device->dev->driver->owner);
err_unassign_container:
	vfio_device_unassign_container(device);
	return ERR_PTR(ret);
}
static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int fdno;
	int ret;

	device = vfio_device_get_from_name(group, buf);
	if (IS_ERR(device))
		return PTR_ERR(device);

	fdno = get_unused_fd_flags(O_CLOEXEC);
	if (fdno < 0) {
		ret = fdno;
		goto err_put_device;
	}

	filep = vfio_device_open(device);
	if (IS_ERR(filep)) {
		ret = PTR_ERR(filep);
		goto err_put_fdno;
	}

	fd_install(fdno, filep);
	return fdno;

err_put_fdno:
	put_unused_fd(fdno);
err_put_device:
	vfio_device_put(device);
	return ret;
}
static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		down_read(&group->group_rwsem);
		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
					VFIO_GROUP_FLAGS_VIABLE;
		else if (!iommu_group_dma_owner_claimed(group->iommu_group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
		up_read(&group->group_rwsem);

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		down_write(&group->group_rwsem);
		ret = vfio_group_set_container(group, fd);
		up_write(&group->group_rwsem);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		down_write(&group->group_rwsem);
		ret = vfio_group_unset_container(group);
		up_write(&group->group_rwsem);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group =
		container_of(inode->i_cdev, struct vfio_group, cdev);
	int ret;

	down_write(&group->group_rwsem);

	/* users can be zero if this races with vfio_group_put() */
	if (!refcount_inc_not_zero(&group->users)) {
		ret = -ENODEV;
		goto err_unlock;
	}

	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
		ret = -EPERM;
		goto err_put;
	}

	/*
	 * Do we need multiple instances of the group open?  Seems not.
	 */
	if (group->opened_file) {
		ret = -EBUSY;
		goto err_put;
	}
	group->opened_file = filep;
	filep->private_data = group;

	up_write(&group->group_rwsem);
	return 0;
err_put:
	vfio_group_put(group);
err_unlock:
	up_write(&group->group_rwsem);
	return ret;
}
static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	down_write(&group->group_rwsem);
	/*
	 * Device FDs hold a group file reference, therefore the group release
	 * is only called when there are no open devices.
	 */
	WARN_ON(group->notifier.head);
	if (group->container) {
		WARN_ON(group->container_users != 1);
		__vfio_group_unset_container(group);
	}
	group->opened_file = NULL;
	up_write(&group->group_rwsem);

	vfio_group_put(group);

	return 0;
}

static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};
/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;
	struct vfio_iommu_driver *iommu_driver;

	mutex_lock(&device->dev_set->lock);
	vfio_assert_device_open(device);
	down_read(&device->group->group_rwsem);
	if (device->open_count == 1 && device->ops->close_device)
		device->ops->close_device(device);

	iommu_driver = device->group->container->iommu_driver;
	if (device->ops->dma_unmap && iommu_driver &&
	    iommu_driver->ops->unregister_notifier)
		iommu_driver->ops->unregister_notifier(
			device->group->container->iommu_data,
			&device->iommu_nb);
	up_read(&device->group->group_rwsem);
	device->open_count--;
	if (device->open_count == 0)
		device->kvm = NULL;
	mutex_unlock(&device->dev_set->lock);

	module_put(device->dev->driver->owner);

	vfio_device_unassign_container(device);

	vfio_device_put(device);

	return 0;
}
/**
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 *
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 * Without P2P the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};
	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
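
/*
 * Typical use (illustrative sketch): a driver's migration_set_state()
 * implementation walks the FSM one hop at a time until the requested state
 * is reached:
 *
 *	while (cur_state != new_state) {
 *		enum vfio_device_mig_state next;
 *
 *		if (vfio_mig_get_next_state(vdev, cur_state, new_state, &next))
 *			return ERR_PTR(-EINVAL);
 *		// perform the single device-level arc cur_state -> next here
 *		cur_state = next;
 *	}
 *
 * vdev, cur_state and new_state are hypothetical driver-side variables.
 */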
/*
 * Convert the driver's struct file into a FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}
static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}
static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}
static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}
static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		return vfio_ioctl_device_feature(device, (void __user *)arg);
	default:
		if (unlikely(!device->ops->ioctl))
			return -EINVAL;
		return device->ops->ioctl(device, cmd, arg);
	}
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
};
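
/*
 * Userspace sketch (illustrative): once a device fd is obtained via
 * VFIO_GROUP_GET_DEVICE_FD, these fops dispatch into the driver's
 * vfio_device_ops, e.g.:
 *
 *	struct vfio_device_info info = { .argsz = sizeof(info) };
 *
 *	ioctl(device, VFIO_DEVICE_GET_INFO, &info);
 *	// region reads/writes and mmap land in ops->read/write/mmap
 */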
/**
 * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
 * @file: VFIO group file
 *
 * The returned iommu_group is valid as long as a ref is held on the file.
 */
struct iommu_group *vfio_file_iommu_group(struct file *file)
{
	struct vfio_group *group = file->private_data;

	if (file->f_op != &vfio_group_fops)
		return NULL;
	return group->iommu_group;
}
EXPORT_SYMBOL_GPL(vfio_file_iommu_group);
/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file
 *
 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
 * bit in DMA transactions. A return of false indicates that the user has
 * rights to access additional instructions such as wbinvd on x86.
 */
bool vfio_file_enforced_coherent(struct file *file)
{
	struct vfio_group *group = file->private_data;
	bool ret;

	if (file->f_op != &vfio_group_fops)
		return true;

	down_read(&group->group_rwsem);
	if (group->container) {
		ret = vfio_ioctl_check_extension(group->container,
						 VFIO_DMA_CC_IOMMU);
	} else {
		/*
		 * Since the coherency state is determined only once a container
		 * is attached the user must do so before they can prove they
		 * have permission.
		 */
		ret = true;
	}
	up_read(&group->group_rwsem);
	return ret;
}
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the group.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group = file->private_data;

	if (file->f_op != &vfio_group_fops)
		return;

	down_write(&group->group_rwsem);
	group->kvm = kvm;
	up_write(&group->group_rwsem);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
/**
 * vfio_file_has_dev - True if the VFIO file is a handle for device
 * @file: VFIO file to check
 * @device: Device that must be part of the file
 *
 * Returns true if given file has permission to manipulate the given device.
 */
bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
{
	struct vfio_group *group = file->private_data;

	if (file->f_op != &vfio_group_fops)
		return false;

	return group == device->group;
}
EXPORT_SYMBOL_GPL(vfio_file_has_dev);
/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);
void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
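
/*
 * Usage sketch (illustrative): a driver filling a *_GET_INFO ioctl builds
 * the chain at offset 0, then shifts it to sit after the fixed-size info
 * struct before copying out:
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &my_cap.header, sizeof(my_cap));
 *	if (!ret && caps.size && info.argsz >= sizeof(info) + caps.size) {
 *		info.cap_offset = sizeof(info);
 *		vfio_info_cap_shift(&caps, sizeof(info));
 *		// copy info, then caps.buf at info.cap_offset, to userspace
 *	}
 *	kfree(caps.buf);
 *
 * my_cap is a hypothetical capability struct with an embedded
 * struct vfio_info_cap_header.
 */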
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
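
/*
 * Usage sketch (illustrative): a driver's VFIO_DEVICE_SET_IRQS handler
 * validates the header first, then copies in exactly data_size bytes:
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
 *						 VFIO_PCI_NUM_IRQS, &data_size);
 *	if (ret)
 *		return ret;
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 *
 * hdr, num_irqs, arg and minsz are hypothetical locals of that handler;
 * VFIO_PCI_NUM_IRQS is the PCI case, other buses pass their own maximum.
 */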
/*
 * Pin a set of guest PFNs and return their associated host PFNs for local
 * domain only.
 * @device [in]  : device
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
		   int npage, int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_group *group = device->group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!user_pfn || !phys_pfn || !npage ||
	    !vfio_assert_device_open(device))
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	if (group->dev_counter > 1)
		return -EINVAL;

	/* group->container cannot change while a vfio device is open */
	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data,
					     group->iommu_group, user_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);
/*
 * Unpin set of host PFNs for local domain only.
 * @device [in]  : device
 * @user_pfn [in]: array of user/guest PFNs to be unpinned.  Number of
 *		   user/guest PFNs should not be greater than
 *		   VFIO_PIN_PAGES_MAX_ENTRIES.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
		     int npage)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!user_pfn || !npage || !vfio_assert_device_open(device))
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	/* group->container cannot change while a vfio device is open */
	container = device->group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
					       npage);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);
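
/*
 * Usage sketch (illustrative, mdev-style driver): pin a guest page before
 * programming its host PFN into device context, then unpin when done:
 *
 *	unsigned long gfn = gpa >> PAGE_SHIFT, hpfn;
 *
 *	ret = vfio_pin_pages(&mds->vdev, &gfn, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &hpfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *	// ... use hpfn ...
 *	vfio_unpin_pages(&mds->vdev, &gfn, 1);
 *
 * mds is a hypothetical driver state structure embedding a vfio_device.
 */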
/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]		: VFIO device
 * @user_iova [in]	: base IOVA of a user space buffer
 * @data [in]		: pointer to kernel buffer
 * @len [in]		: kernel buffer length
 * @write		: indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
		size_t len, bool write)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	/* group->container cannot change while a vfio device is open */
	container = device->group->container;
	driver = container->iommu_driver;

	if (likely(driver && driver->ops->dma_rw))
		ret = driver->ops->dma_rw(container->iommu_data,
					  user_iova, data, len, write);
	else
		ret = -ENOTTY;
	return ret;
}
EXPORT_SYMBOL(vfio_dma_rw);
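
/*
 * Usage sketch (illustrative): read a guest-resident descriptor into a
 * kernel buffer without pinning the backing page:
 *
 *	struct my_desc desc;	// hypothetical descriptor layout
 *
 *	ret = vfio_dma_rw(&mds->vdev, desc_iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 *
 * desc_iova is the IOVA the guest programmed into the device.
 */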
/*
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.group_ida);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_group_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
	if (ret)
		goto err_alloc_chrdev;

#ifdef CONFIG_VFIO_NOIOMMU
	ret = vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	if (ret)
		goto err_driver_register;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_driver_register:
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_group_class:
	misc_deregister(&vfio_dev);
	return ret;
}
static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	ida_destroy(&vfio.group_ida);
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
	xa_destroy(&vfio_device_set_xa);
}
module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");