// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct mutex			group_lock; /* locks group_list */
	struct ida			group_ida;
	dev_t				group_devt;
} vfio;

struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};

struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};

struct vfio_group {
	struct device			dev;
	struct cdev			cdev;
	refcount_t			users;
	unsigned int			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct list_head		vfio_next;
	struct list_head		container_next;
	enum vfio_group_type		type;
	unsigned int			dev_counter;
	struct rw_semaphore		group_rwsem;
	struct kvm			*kvm;
	struct file			*opened_file;
	struct blocking_notifier_head	notifier;
};

#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif

static DEFINE_XARRAY(vfio_device_set_xa);
static const struct file_operations vfio_group_fops;

int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);

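/*
 * Illustrative sketch (not part of the upstream file): a driver whose
 * devices share a reset domain can place them in one vfio_device_set from
 * its probe path.  Any stable pointer common to the co-dependent devices
 * works as @set_id; "vdev" and "shared_token" below are hypothetical names.
 *
 *	static int my_probe(...)
 *	{
 *		int ret;
 *
 *		ret = vfio_assign_device_set(&vdev->vdev, shared_token);
 *		if (ret)
 *			return ret;
 *		...
 *	}
 *
 * Drivers that skip this call get a singleton set assigned for them in
 * __vfio_register_dev() below.
 */
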
static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group, enum vfio_group_type type)
{
	return 0;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};

/*
 * Only noiommu containers can use vfio-noiommu and noiommu containers can only
 * use vfio-noiommu.
 */
static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
		const struct vfio_iommu_driver *driver)
{
	return container->noiommu == (driver->ops == &vfio_noiommu_ops);
}
#else
static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
		const struct vfio_iommu_driver *driver)
{
	return true;
}
#endif /* CONFIG_VFIO_NOIOMMU */

/*
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	if (WARN_ON(!ops->register_device != !ops->unregister_device))
		return -EINVAL;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);

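/*
 * Illustrative sketch (not part of the upstream file): an IOMMU backend
 * module (vfio_iommu_type1 is the in-tree example) pairs these calls in its
 * module init/exit.  "my_iommu_ops" is hypothetical; note the WARN_ON above
 * requires it to provide both register_device and unregister_device, or
 * neither.
 *
 *	static int __init my_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_ops);
 *	}
 *
 *	static void __exit my_iommu_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_ops);
 *	}
 */
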
static void vfio_group_get(struct vfio_group *group);

/*
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

/*
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *
__vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			return group;
		}
	}
	return NULL;
}

static struct vfio_group *
vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = __vfio_group_get_from_iommu(iommu_group);
	mutex_unlock(&vfio.group_lock);
	return group;
}

static void vfio_group_release(struct device *dev)
{
	struct vfio_group *group = container_of(dev, struct vfio_group, dev);

	mutex_destroy(&group->device_lock);
	iommu_group_put(group->iommu_group);
	ida_free(&vfio.group_ida, MINOR(group->dev.devt));
	kfree(group);
}

static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
					   enum vfio_group_type type)
{
	struct vfio_group *group;
	int minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
	if (minor < 0) {
		kfree(group);
		return ERR_PTR(minor);
	}

	device_initialize(&group->dev);
	group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
	group->dev.class = vfio.class;
	group->dev.release = vfio_group_release;
	cdev_init(&group->cdev, &vfio_group_fops);
	group->cdev.owner = THIS_MODULE;

	refcount_set(&group->users, 1);
	init_rwsem(&group->group_rwsem);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	group->iommu_group = iommu_group;
	/* put in vfio_group_release() */
	iommu_group_ref_get(iommu_group);
	group->type = type;
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	return group;
}

static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
		enum vfio_group_type type)
{
	struct vfio_group *group;
	struct vfio_group *ret;
	int err;

	group = vfio_group_alloc(iommu_group, type);
	if (IS_ERR(group))
		return group;

	err = dev_set_name(&group->dev, "%s%d",
			   group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
			   iommu_group_id(iommu_group));
	if (err) {
		ret = ERR_PTR(err);
		goto err_put;
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	ret = __vfio_group_get_from_iommu(iommu_group);
	if (ret)
		goto err_unlock;

	err = cdev_device_add(&group->cdev, &group->dev);
	if (err) {
		ret = ERR_PTR(err);
		goto err_unlock;
	}

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);
	return group;

err_unlock:
	mutex_unlock(&vfio.group_lock);
err_put:
	put_device(&group->dev);
	return ret;
}

static void vfio_group_put(struct vfio_group *group)
{
	if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
		return;

	/*
	 * These data structures all have paired operations that can only be
	 * undone when the caller holds a live reference on the group. Since
	 * all pairs must be undone these WARN_ON's indicate some caller did
	 * not properly hold the group reference.
	 */
	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->container || group->container_users);
	WARN_ON(group->notifier.head);

	list_del(&group->vfio_next);
	cdev_device_del(&group->cdev, &group->dev);
	mutex_unlock(&vfio.group_lock);

	put_device(&group->dev);
}

static void vfio_group_get(struct vfio_group *group)
{
	refcount_inc(&group->users);
}

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
static void vfio_device_put(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

static bool vfio_device_try_get(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev && vfio_device_try_get(device)) {
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}

/*
 * VFIO driver API
 */
void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
			 const struct vfio_device_ops *ops)
{
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;
}
EXPORT_SYMBOL_GPL(vfio_init_group_dev);

void vfio_uninit_group_dev(struct vfio_device *device)
{
	vfio_release_device_set(device);
}
EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);

static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
		enum vfio_group_type type)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	int ret;

	iommu_group = iommu_group_alloc();
	if (IS_ERR(iommu_group))
		return ERR_CAST(iommu_group);

	ret = iommu_group_set_name(iommu_group, "vfio-noiommu");
	if (ret)
		goto out_put_group;
	ret = iommu_group_add_device(iommu_group, dev);
	if (ret)
		goto out_put_group;

	group = vfio_create_group(iommu_group, type);
	if (IS_ERR(group)) {
		ret = PTR_ERR(group);
		goto out_remove_device;
	}
	iommu_group_put(iommu_group);
	return group;

out_remove_device:
	iommu_group_remove_device(dev);
out_put_group:
	iommu_group_put(iommu_group);
	return ERR_PTR(ret);
}

static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
#ifdef CONFIG_VFIO_NOIOMMU
	if (!iommu_group && noiommu) {
		/*
		 * With noiommu enabled, create an IOMMU group for devices that
		 * don't already have one, implying no IOMMU hardware/driver
		 * exists.  Taint the kernel because we're about to give a DMA
		 * capable device to a user without IOMMU protection.
		 */
		group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
		if (!IS_ERR(group)) {
			add_taint(TAINT_USER, LOCKDEP_STILL_OK);
			dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
		}
		return group;
	}
#endif
	if (!iommu_group)
		return ERR_PTR(-EINVAL);

	/*
	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace
	 * to restore cache coherency. It has to be checked here because it is
	 * only valid for cases where we are using iommu groups.
	 */
	if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) {
		iommu_group_put(iommu_group);
		return ERR_PTR(-EINVAL);
	}

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group)
		group = vfio_create_group(iommu_group, VFIO_IOMMU);

	/* The vfio_group holds a reference to the iommu_group */
	iommu_group_put(iommu_group);
	return group;
}

static int __vfio_register_dev(struct vfio_device *device,
		struct vfio_group *group)
{
	struct vfio_device *existing_device;

	if (IS_ERR(group))
		return PTR_ERR(group);

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	existing_device = vfio_group_get_device(group, device->dev);
	if (existing_device) {
		dev_WARN(device->dev, "Device already exists on group %d\n",
			 iommu_group_id(group->iommu_group));
		vfio_device_put(existing_device);
		if (group->type == VFIO_NO_IOMMU ||
		    group->type == VFIO_EMULATED_IOMMU)
			iommu_group_remove_device(device->dev);
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Our reference on group is moved to the device */
	device->group = group;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	group->dev_counter++;
	mutex_unlock(&group->device_lock);

	return 0;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device,
		vfio_group_find_or_alloc(device->dev));
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing.  The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device,
		vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);

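/*
 * Illustrative sketch (not part of the upstream file) of the driver-side
 * sequence these entry points expect; "my_vdev" and "my_vfio_ops" are
 * hypothetical.  A driver embeds a struct vfio_device, initializes it, and
 * registers it; removal mirrors this with vfio_unregister_group_dev() and
 * vfio_uninit_group_dev().
 *
 *	vfio_init_group_dev(&my_vdev->vdev, &pdev->dev, &my_vfio_ops);
 *	ret = vfio_register_group_dev(&my_vdev->vdev);
 *	if (ret) {
 *		vfio_uninit_group_dev(&my_vdev->vdev);
 *		return ret;
 *	}
 */
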
static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = ERR_PTR(-ENODEV);

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		int ret;

		if (it->ops->match) {
			ret = it->ops->match(it, buf);
			if (ret < 0) {
				device = ERR_PTR(ret);
				break;
			}
		} else {
			ret = !strcmp(dev_name(it->dev), buf);
		}

		if (ret && vfio_device_try_get(it)) {
			device = it;
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}

/*
 * Decrement the device reference count and wait for the device to be
 * removed. Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	vfio_device_put(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	mutex_lock(&group->device_lock);
	list_del(&device->group_next);
	group->dev_counter--;
	mutex_unlock(&group->device_lock);

	if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
		iommu_group_remove_device(device->dev);

	/* Matches the get in vfio_register_group_dev() */
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

/*
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

				if (!list_empty(&container->group_list) &&
				    !vfio_iommu_driver_allowed(container,
							       driver))
					continue;
				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}

/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group,
						group->type);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}

static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

		if (!vfio_iommu_driver_allowed(container, driver))
			continue;
		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}

static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);
	}

	return ret;
}

static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver = container->iommu_driver;

	if (driver && driver->ops->notify)
		driver->ops->notify(container->iommu_data,
				    VFIO_IOMMU_CONTAINER_CLOSE);

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}

static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};

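/*
 * Illustrative userspace sketch (not part of the upstream file) of the
 * container/group handshake these file operations implement, following the
 * flow described in Documentation/driver-api/vfio.rst; the group number and
 * device name are examples, and error handling is elided:
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	ioctl(container, VFIO_GET_API_VERSION);	// == VFIO_API_VERSION
 *	group = open("/dev/vfio/26", O_RDWR);
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */
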
/*
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	lockdep_assert_held_write(&group->group_rwsem);

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	if (group->type == VFIO_IOMMU)
		iommu_group_release_dma_owner(group->iommu_group);

	group->container = NULL;
	group->container_users = 0;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	lockdep_assert_held_write(&group->group_rwsem);

	if (!group->container)
		return -EINVAL;
	if (group->container_users != 1)
		return -EBUSY;
	__vfio_group_unset_container(group);
	return 0;
}

static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	lockdep_assert_held_write(&group->group_rwsem);

	if (group->container || WARN_ON(group->container_users))
		return -EINVAL;

	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != (group->type == VFIO_NO_IOMMU)) {
		ret = -EPERM;
		goto unlock_out;
	}

	if (group->type == VFIO_IOMMU) {
		ret = iommu_group_claim_dma_owner(group->iommu_group, f.file);
		if (ret)
			goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group,
						group->type);
		if (ret) {
			if (group->type == VFIO_IOMMU)
				iommu_group_release_dma_owner(
					group->iommu_group);
			goto unlock_out;
		}
	}

	group->container = container;
	group->container_users = 1;
	container->noiommu = (group->type == VFIO_NO_IOMMU);
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}

static const struct file_operations vfio_device_fops;

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

static int vfio_device_assign_container(struct vfio_device *device)
{
	struct vfio_group *group = device->group;

	lockdep_assert_held_write(&group->group_rwsem);

	if (!group->container || !group->container->iommu_driver ||
	    WARN_ON(!group->container_users))
		return -EINVAL;

	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	get_file(group->opened_file);
	group->container_users++;
	return 0;
}

static void vfio_device_unassign_container(struct vfio_device *device)
{
	down_write(&device->group->group_rwsem);
	WARN_ON(device->group->container_users <= 1);
	device->group->container_users--;
	fput(device->group->opened_file);
	up_write(&device->group->group_rwsem);
}

static struct file *vfio_device_open(struct vfio_device *device)
{
	struct vfio_iommu_driver *iommu_driver;
	struct file *filep;
	int ret;

	down_write(&device->group->group_rwsem);
	ret = vfio_device_assign_container(device);
	up_write(&device->group->group_rwsem);
	if (ret)
		return ERR_PTR(ret);

	if (!try_module_get(device->dev->driver->owner)) {
		ret = -ENODEV;
		goto err_unassign_container;
	}

	mutex_lock(&device->dev_set->lock);
	device->open_count++;
	if (device->open_count == 1) {
		/*
		 * Here we pass the KVM pointer with the group under the read
		 * lock.  If the device driver will use it, it must obtain a
		 * reference and release it during close_device.
		 */
		down_read(&device->group->group_rwsem);
		device->kvm = device->group->kvm;

		if (device->ops->open_device) {
			ret = device->ops->open_device(device);
			if (ret)
				goto err_undo_count;
		}

		iommu_driver = device->group->container->iommu_driver;
		if (iommu_driver && iommu_driver->ops->register_device)
			iommu_driver->ops->register_device(
				device->group->container->iommu_data, device);

		up_read(&device->group->group_rwsem);
	}
	mutex_unlock(&device->dev_set->lock);

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		ret = PTR_ERR(filep);
		goto err_close_device;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);

	if (device->group->type == VFIO_NO_IOMMU)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));
	/*
	 * On success the ref of device is moved to the file and
	 * put in vfio_device_fops_release()
	 */
	return filep;

err_close_device:
	mutex_lock(&device->dev_set->lock);
	down_read(&device->group->group_rwsem);
	if (device->open_count == 1 && device->ops->close_device) {
		device->ops->close_device(device);

		iommu_driver = device->group->container->iommu_driver;
		if (iommu_driver && iommu_driver->ops->unregister_device)
			iommu_driver->ops->unregister_device(
				device->group->container->iommu_data, device);
	}
err_undo_count:
	up_read(&device->group->group_rwsem);
	device->open_count--;
	if (device->open_count == 0 && device->kvm)
		device->kvm = NULL;
	mutex_unlock(&device->dev_set->lock);
	module_put(device->dev->driver->owner);
err_unassign_container:
	vfio_device_unassign_container(device);
	return ERR_PTR(ret);
}

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int fdno;
	int ret;

	device = vfio_device_get_from_name(group, buf);
	if (IS_ERR(device))
		return PTR_ERR(device);

	fdno = get_unused_fd_flags(O_CLOEXEC);
	if (fdno < 0) {
		ret = fdno;
		goto err_put_device;
	}

	filep = vfio_device_open(device);
	if (IS_ERR(filep)) {
		ret = PTR_ERR(filep);
		goto err_put_fdno;
	}

	fd_install(fdno, filep);
	return fdno;

err_put_fdno:
	put_unused_fd(fdno);
err_put_device:
	vfio_device_put(device);
	return ret;
}

static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		down_read(&group->group_rwsem);
		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
					VFIO_GROUP_FLAGS_VIABLE;
		else if (!iommu_group_dma_owner_claimed(group->iommu_group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
		up_read(&group->group_rwsem);

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		down_write(&group->group_rwsem);
		ret = vfio_group_set_container(group, fd);
		up_write(&group->group_rwsem);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		down_write(&group->group_rwsem);
		ret = vfio_group_unset_container(group);
		up_write(&group->group_rwsem);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group =
		container_of(inode->i_cdev, struct vfio_group, cdev);
	int ret;

	down_write(&group->group_rwsem);

	/* users can be zero if this races with vfio_group_put() */
	if (!refcount_inc_not_zero(&group->users)) {
		ret = -ENODEV;
		goto err_unlock;
	}

	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
		ret = -EPERM;
		goto err_put;
	}

	/*
	 * Do we need multiple instances of the group open?  Seems not.
	 */
	if (group->opened_file) {
		ret = -EBUSY;
		goto err_put;
	}
	group->opened_file = filep;
	filep->private_data = group;

	up_write(&group->group_rwsem);
	return 0;
err_put:
	vfio_group_put(group);
err_unlock:
	up_write(&group->group_rwsem);
	return ret;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	down_write(&group->group_rwsem);
	/*
	 * Device FDs hold a group file reference, therefore the group release
	 * is only called when there are no open devices.
	 */
	WARN_ON(group->notifier.head);
	if (group->container) {
		WARN_ON(group->container_users != 1);
		__vfio_group_unset_container(group);
	}
	group->opened_file = NULL;
	up_write(&group->group_rwsem);

	vfio_group_put(group);

	return 0;
}

static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;
	struct vfio_iommu_driver *iommu_driver;

	mutex_lock(&device->dev_set->lock);
	vfio_assert_device_open(device);
	down_read(&device->group->group_rwsem);
	if (device->open_count == 1 && device->ops->close_device)
		device->ops->close_device(device);

	iommu_driver = device->group->container->iommu_driver;
	if (iommu_driver && iommu_driver->ops->unregister_device)
		iommu_driver->ops->unregister_device(
			device->group->container->iommu_data, device);
	up_read(&device->group->group_rwsem);
	device->open_count--;
	if (device->open_count == 0)
		device->kvm = NULL;
	mutex_unlock(&device->dev_set->lock);

	module_put(device->dev->driver->owner);

	vfio_device_unassign_container(device);

	vfio_device_put(device);

	return 0;
}

/**
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 *
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 * Without P2P the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);

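/*
 * Illustrative sketch (not part of the upstream file): a driver's
 * migration_set_state() implementation typically iterates this helper until
 * the target state is reached, executing one supported arc per step.
 * "my_do_one_arc" is a hypothetical per-driver helper.
 *
 *	while (cur != new_state) {
 *		enum vfio_device_mig_state next;
 *		int ret;
 *
 *		ret = vfio_mig_get_next_state(vdev, cur, new_state, &next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *		ret = my_do_one_arc(vdev, cur, next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *		cur = next;
 *	}
 */
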
/*
 * Convert the driver's struct file into a FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		return vfio_ioctl_device_feature(device, (void __user *)arg);
	default:
		if (unlikely(!device->ops->ioctl))
			return -EINVAL;
		return device->ops->ioctl(device, cmd, arg);
	}
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
};

/**
 * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
 * @file: VFIO group file
 *
 * The returned iommu_group is valid as long as a ref is held on the file.
 */
struct iommu_group *vfio_file_iommu_group(struct file *file)
{
	struct vfio_group *group = file->private_data;

	if (file->f_op != &vfio_group_fops)
		return NULL;
	return group->iommu_group;
}
EXPORT_SYMBOL_GPL(vfio_file_iommu_group);

/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file
 *
 * Enforced coherency means that the IOMMU ignores things like the PCIe
 * no-snoop bit in DMA transactions. A return of false indicates that the user
 * has rights to access additional instructions such as wbinvd on x86.
 */
bool vfio_file_enforced_coherent(struct file *file)
{
	struct vfio_group *group = file->private_data;
	bool ret;

	if (file->f_op != &vfio_group_fops)
		return true;

	down_read(&group->group_rwsem);
	if (group->container) {
		ret = vfio_ioctl_check_extension(group->container,
						 VFIO_DMA_CC_IOMMU);
	} else {
		/*
		 * Since the coherency state is determined only once a
		 * container is attached the user must do so before they can
		 * prove they have permission.
		 */
		ret = true;
	}
	up_read(&group->group_rwsem);
	return ret;
}
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);

/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the group.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group = file->private_data;

	if (file->f_op != &vfio_group_fops)
		return;

	down_write(&group->group_rwsem);
	group->kvm = kvm;
	up_write(&group->group_rwsem);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);

/**
 * vfio_file_has_dev - True if the VFIO file is a handle for device
 * @file: VFIO file to check
 * @device: Device that must be part of the file
 *
 * Returns true if given file has permission to manipulate the given device.
 */
bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
{
	struct vfio_group *group = file->private_data;

	if (file->f_op != &vfio_group_fops)
		return false;

	return group == device->group;
}
EXPORT_SYMBOL_GPL(vfio_file_has_dev);

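/*
 * Illustrative sketch (not part of the upstream file) of how an external
 * consumer such as KVM might use the vfio_file_* helpers on a group fd it
 * was handed; the decision taken on a non-coherent group is a hypothetical
 * example, error handling elided:
 *
 *	if (!vfio_file_enforced_coherent(file))
 *		// e.g. permit wbinvd-style cache management for this VM
 *		...;
 *	vfio_file_set_kvm(file, kvm);
 */
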
/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);

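/*
 * Illustrative sketch (not part of the upstream file): an *_GET_INFO ioctl
 * handler builds a capability chain in a local struct vfio_info_cap, then
 * shifts the chain offsets by the size of its fixed info struct before
 * copying it out after that struct; "my_cap" and "info" are hypothetical:
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	int ret;
 *
 *	ret = vfio_info_add_capability(&caps, &my_cap.header, sizeof(my_cap));
 *	if (ret)
 *		return ret;
 *	vfio_info_cap_shift(&caps, sizeof(info));
 *	if (copy_to_user((void __user *)arg + sizeof(info),
 *			 caps.buf, caps.size))
 *		ret = -EFAULT;
 *	kfree(caps.buf);
 */
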
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);

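/*
 * Illustrative sketch (not part of the upstream file): a bus driver's
 * VFIO_DEVICE_SET_IRQS handler validates the header and sizes the trailing
 * data with this helper before copying it in; "my_num_irqs" and
 * "MY_NUM_IRQ_TYPES" are hypothetical driver values:
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, my_num_irqs,
 *						 MY_NUM_IRQ_TYPES, &data_size);
 *	if (ret)
 *		return ret;
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		...
 *	}
 */
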
/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	struct vfio_container *container;
	struct vfio_group *group = device->group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!pages || !npage || !vfio_assert_device_open(device))
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	if (group->dev_counter > 1)
		return -EINVAL;

	/* group->container cannot change while a vfio device is open */
	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data,
					     group->iommu_group, iova,
					     npage, prot, pages);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);

/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;

	if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES))
		return;

	if (WARN_ON(!vfio_assert_device_open(device)))
		return;

	/* group->container cannot change while a vfio device is open */
	container = device->group->container;
	driver = container->iommu_driver;

	driver->ops->unpin_pages(container->iommu_data, iova, npage);
}
EXPORT_SYMBOL(vfio_unpin_pages);

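/*
 * Illustrative sketch (not part of the upstream file): a mediated driver
 * translates a guest IOVA to a host page for a software-emulated DMA access
 * and releases the pin when done; "page" and "vdev" are hypothetical:
 *
 *	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &page);
 *	if (ret == 1) {
 *		// ... access the page, e.g. via kmap_local_page() ...
 *		vfio_unpin_pages(vdev, iova, 1);
 *	}
 */
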
/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]		: VFIO device
 * @iova [in]		: base IOVA of a user space buffer
 * @data [in]		: pointer to kernel buffer
 * @len [in]		: kernel buffer length
 * @write		: indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	/* group->container cannot change while a vfio device is open */
	container = device->group->container;
	driver = container->iommu_driver;

	if (likely(driver && driver->ops->dma_rw))
		ret = driver->ops->dma_rw(container->iommu_data,
					  iova, data, len, write);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_dma_rw);

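/*
 * Illustrative sketch (not part of the upstream file): reading a
 * guest-resident descriptor into a kernel buffer without pinning, as a
 * mediated driver might; "struct my_desc" and "desc_iova" are hypothetical:
 *
 *	struct my_desc desc;
 *	int ret;
 *
 *	ret = vfio_dma_rw(vdev, desc_iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 */
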
/*
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};

static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.group_ida);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_group_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
	if (ret)
		goto err_alloc_chrdev;

#ifdef CONFIG_VFIO_NOIOMMU
	ret = vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	if (ret)
		goto err_driver_register;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_driver_register:
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_group_class:
	misc_deregister(&vfio_dev);
	return ret;
}

static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	ida_destroy(&vfio.group_ida);
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");