// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */
#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#ifdef CONFIG_HAVE_KVM
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"
#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"
static struct vfio {
	struct class			*device_class;
	struct ida			device_ida;
} vfio;
#ifdef CONFIG_VFIO_NOIOMMU
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
static DEFINE_XARRAY(vfio_device_set_xa);
int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
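
/*
 * Usage sketch (illustrative, not part of this file): drivers group devices
 * that must be reset together by passing a shared set_id pointer.  For
 * example, vfio-pci-core keys the set on the widest reset scope it can
 * prove, roughly:
 *
 *	if (pci_is_root_bus(pdev->bus))
 *		ret = vfio_assign_device_set(&vdev->vdev, vdev);
 *	else if (!pci_probe_reset_slot(pdev->slot))
 *		ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
 *	else
 *		ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
 *
 * Any two devices registered with the same set_id pointer share one
 * vfio_device_set and therefore one dev_set->lock.
 */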
static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}
unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
{
	struct vfio_device *cur;
	unsigned int open_count = 0;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		open_count += cur->open_count;
	return open_count;
}
EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
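
/*
 * Caller sketch (illustrative): a driver performing a reset that affects
 * every device in the set holds dev_set->lock and typically refuses to
 * proceed while any other device in the set is open, e.g.:
 *
 *	mutex_lock(&dev_set->lock);
 *	if (vfio_device_set_open_count(dev_set) > 1) {
 *		ret = -EBUSY;
 *		goto out_unlock;
 *	}
 *	// ...perform the set-wide reset...
 *	mutex_unlock(&dev_set->lock);
 */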
/*
 * Device objects - create, release, get, put, search
 */

/* Device reference always implies a group reference */
void vfio_device_put_registration(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}
/* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	if (device->ops->release)
		device->ops->release(device);

	kvfree(device);
}
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops);
/*
 * Allocate and initialize a vfio_device so it can be registered to vfio
 * core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * Driver may provide an @init callback to cover device private data.
 *
 * Use vfio_put_device() to release the structure after success return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret)
		goto out_free;
	return device;

out_free:
	kvfree(device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
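
/*
 * Allocation sketch (hypothetical driver structure): the vfio_alloc_device()
 * wrapper in <linux/vfio.h> computes @size from a driver structure that
 * embeds struct vfio_device as its *first* member:
 *
 *	struct my_vfio_device {
 *		struct vfio_device vdev;	// must be first
 *		void __iomem *bar;		// driver private data
 *	};
 *
 *	struct my_vfio_device *my;
 *
 *	my = vfio_alloc_device(my_vfio_device, vdev, dev, &my_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 */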
/*
 * Initialize a vfio_device so it can be registered to vfio core.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Failed to allocate device index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = vfio.device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}
static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	if (WARN_ON(device->ops->bind_iommufd &&
		    (!device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	ret = device_add(&device->device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);

	return 0;
err_out:
	vfio_device_remove_group(device);
	return ret;
}
int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);
/*
 * Register a virtual device without IOMMU backing.  The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
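
/*
 * Registration lifecycle sketch (hypothetical mdev-style driver):
 *
 *	my = vfio_alloc_device(my_vfio_device, vdev, dev, &my_ops);
 *	ret = vfio_register_emulated_iommu_dev(&my->vdev);
 *	// ...device is live; on teardown:
 *	vfio_unregister_group_dev(&my->vdev);
 *	vfio_put_device(&my->vdev);
 *
 * Emulated devices rely on vfio_pin_pages()/vfio_dma_rw() below for
 * mediated DMA rather than direct IOMMU mappings.
 */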
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_group_unregister(device);

	/* Balances device_add in register path */
	device_del(&device->device);

	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
#ifdef CONFIG_HAVE_KVM
void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}

void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
#endif
/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}
static int vfio_device_first_open(struct vfio_device *device,
				  struct iommufd_ctx *iommufd)
{
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	if (iommufd)
		ret = vfio_iommufd_bind(device, iommufd);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_iommufd_unbind(device);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}
static void vfio_device_last_close(struct vfio_device *device,
				   struct iommufd_ctx *iommufd)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_iommufd_unbind(device);
	else
		vfio_device_group_unuse_iommu(device);
	module_put(device->dev->driver->owner);
}
int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd)
{
	int ret = 0;

	lockdep_assert_held(&device->dev_set->lock);

	device->open_count++;
	if (device->open_count == 1) {
		ret = vfio_device_first_open(device, iommufd);
		if (ret)
			device->open_count--;
	}

	return ret;
}
void vfio_device_close(struct vfio_device *device,
		       struct iommufd_ctx *iommufd)
{
	lockdep_assert_held(&device->dev_set->lock);

	vfio_assert_device_open(device);
	if (device->open_count == 1)
		vfio_device_last_close(device, iommufd);
	device->open_count--;
}
/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm) {
		int ret;

		ret = pm_runtime_resume_and_get(dev);
		if (ret) {
			dev_info_ratelimited(dev,
				"vfio: runtime resume failed %d\n", ret);
			return -EIO;
		}
	}

	return 0;
}
/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm)
		pm_runtime_put(dev);
}
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	vfio_device_group_close(device);

	vfio_device_put_registration(device);

	return 0;
}
/**
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these additional
	 * FSM arcs:
	 *         RUNNING -> PRE_COPY
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> PRE_COPY_P2P
	 *         PRE_COPY_P2P -> PRE_COPY
	 *         PRE_COPY_P2P -> RUNNING_P2P
	 *         PRE_COPY_P2P -> STOP_COPY
	 *         RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 *
	 * The following transitions are blocked:
	 *         STOP_COPY -> PRE_COPY
	 *         STOP_COPY -> PRE_COPY_P2P
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};
	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};
	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
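
/*
 * Driver-side sketch (illustrative): migration drivers walk toward the
 * requested state one arc at a time, issuing a device command for each
 * step (mlx5 consumes this API in roughly this shape; my_issue_arc() is
 * hypothetical):
 *
 *	while (cur != new_state) {
 *		ret = vfio_mig_get_next_state(vdev, cur, new_state, &next);
 *		if (ret)
 *			break;
 *		res = my_issue_arc(vdev, cur, next);
 *		if (IS_ERR(res)) {
 *			ret = PTR_ERR(res);
 *			break;
 *		}
 *		cur = next;
 *	}
 */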
/*
 * Convert the driver's struct file into a FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);

	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}
static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}
static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	if (copy_to_user(arg, &data_size, sizeof(data_size)))
		return -EFAULT;

	return 0;
}
static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}
/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;
	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
			      GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}
		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}
		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	ret = device->log_ops->log_start(device, &root, nnodes,
					 control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}
static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
				       u32 flags, void __user *arg,
				       size_t argsz)
{
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	return device->log_ops->log_stop(device);
}
static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *device = opaque;

	return device->log_ops->log_read_and_clear(device, iova, length, iter);
}
static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}
static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}
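
/*
 * Userspace view (sketch): a VFIO_DEVICE_FEATURE call carries a struct
 * vfio_device_feature header immediately followed by feature-specific
 * data, e.g. reading the migration state:
 *
 *	struct {
 *		struct vfio_device_feature hdr;
 *		struct vfio_device_feature_mig_state state;
 *	} get = {
 *		.hdr = {
 *			.argsz = sizeof(get),
 *			.flags = VFIO_DEVICE_FEATURE_GET |
 *				 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE,
 *		},
 *	};
 *
 *	ioctl(device_fd, VFIO_DEVICE_FEATURE, &get);
 */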
static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;
	int ret;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, (void __user *)arg);
		break;

	default:
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}

	vfio_device_pm_runtime_put(device);
	return ret;
}
static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}
static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}
static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}
const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
};
/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);
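
/*
 * Sketch of building a capability chain (MY_CAP_ID and my_cap are
 * hypothetical): callers accumulate capabilities into a local struct
 * vfio_info_cap and, once the final offset of the chain within the user
 * buffer is known, fix up the next pointers before copying out:
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	header = vfio_info_cap_add(&caps, sizeof(my_cap), MY_CAP_ID, 1);
 *	if (IS_ERR(header))
 *		return PTR_ERR(header);
 *	// ...fill in the capability body...
 *	vfio_info_cap_shift(&caps, sizeof(info));
 *	if (copy_to_user(arg + sizeof(info), caps.buf, caps.size))
 *		ret = -EFAULT;
 *	kfree(caps.buf);
 */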
void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
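
/*
 * Typical caller (sketch, modeled on vfio-pci): an ioctl handler validates
 * the header first, then copies in the variable-length payload:
 *
 *	minsz = offsetofend(struct vfio_irq_set, count);
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
 *						 VFIO_PCI_NUM_IRQS, &data_size);
 *	if (ret)
 *		return ret;
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */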
/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 *     pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);
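
/*
 * Caller sketch (illustrative emulated driver): pin one page and recover
 * the sub-page offset as the comment above describes.  page_address()
 * assumes a lowmem page; a real driver would use kmap_local_page():
 *
 *	struct page *page;
 *	void *va;
 *
 *	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &page);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *	va = page_address(page) + (iova % PAGE_SIZE);
 *	// ...access the mapping...
 *	vfio_unpin_pages(vdev, iova, 1);
 */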
/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned.  This count should not
 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;

	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);
/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]		: VFIO device
 * @iova [in]		: base IOVA of a user space buffer
 * @data [in]		: pointer to kernel buffer
 * @len [in]		: kernel buffer length
 * @write		: indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	if (device->iommufd_access) {
		unsigned int flags = 0;

		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);
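
/*
 * Caller sketch (illustrative): a mediated driver reads a guest descriptor
 * into a local buffer without pinning, updates it, and writes it back
 * (struct my_desc and MY_DESC_DONE are hypothetical):
 *
 *	struct my_desc desc;
 *
 *	ret = vfio_dma_rw(vdev, desc_iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 *	desc.status |= MY_DESC_DONE;
 *	ret = vfio_dma_rw(vdev, desc_iova, &desc, sizeof(desc), true);
 */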
/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}
static void __exit vfio_cleanup(void)
{
	ida_destroy(&vfio.device_ida);
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}
module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_IMPORT_NS(IOMMUFD);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");