1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * VFIO core
5 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
6 * Author: Alex Williamson <alex.williamson@redhat.com>
8 * Derived from original vfio:
9 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
10 * Author: Tom Lyon, pugs@cisco.com
11 */
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #ifdef CONFIG_HAVE_KVM
20 #include <linux/kvm_host.h>
21 #endif
22 #include <linux/list.h>
23 #include <linux/miscdevice.h>
24 #include <linux/module.h>
25 #include <linux/mutex.h>
26 #include <linux/pci.h>
27 #include <linux/rwsem.h>
28 #include <linux/sched.h>
29 #include <linux/slab.h>
30 #include <linux/stat.h>
31 #include <linux/string.h>
32 #include <linux/uaccess.h>
33 #include <linux/vfio.h>
34 #include <linux/wait.h>
35 #include <linux/sched/signal.h>
36 #include <linux/pm_runtime.h>
37 #include <linux/interval_tree.h>
38 #include <linux/iova_bitmap.h>
39 #include <linux/iommufd.h>
42 #define DRIVER_VERSION "0.3"
43 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
44 #define DRIVER_DESC "VFIO - User Level meta-driver"
46 static struct vfio {
47 struct class *device_class;
48 struct ida device_ida;
49 } vfio;
51 #ifdef CONFIG_VFIO_NOIOMMU
52 bool vfio_noiommu __read_mostly;
53 module_param_named(enable_unsafe_noiommu_mode,
54 vfio_noiommu, bool, S_IRUGO | S_IWUSR);
55 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
58 static DEFINE_XARRAY(vfio_device_set_xa);
60 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
62 unsigned long idx = (unsigned long)set_id;
63 struct vfio_device_set *new_dev_set;
64 struct vfio_device_set *dev_set;
70 * Atomically acquire a singleton object in the xarray for this set_id
72 xa_lock(&vfio_device_set_xa);
73 dev_set = xa_load(&vfio_device_set_xa, idx);
76 xa_unlock(&vfio_device_set_xa);
78 new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
81 mutex_init(&new_dev_set->lock);
82 INIT_LIST_HEAD(&new_dev_set->device_list);
83 new_dev_set->set_id = set_id;
85 xa_lock(&vfio_device_set_xa);
86 dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
89 dev_set = new_dev_set;
94 if (xa_is_err(dev_set)) {
95 xa_unlock(&vfio_device_set_xa);
96 return xa_err(dev_set);
100 dev_set->device_count++;
101 xa_unlock(&vfio_device_set_xa);
102 mutex_lock(&dev_set->lock);
103 device->dev_set = dev_set;
104 list_add_tail(&device->dev_set_list, &dev_set->device_list);
105 mutex_unlock(&dev_set->lock);
108 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
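/*
 * Illustrative sketch only, not part of the VFIO core: a hypothetical driver
 * ("foo") could place all functions behind one upstream bridge into a single
 * vfio_device_set by passing the bridge as the shared set_id, so open_count
 * and reset handling are serialized across the whole set.
 * foo_assign_shared_set() is an assumed helper name.
 */
static int __maybe_unused foo_assign_shared_set(struct vfio_device *vdev,
						struct pci_dev *pdev)
{
	struct pci_dev *bridge = pci_upstream_bridge(pdev);

	/* Devices on the root bus fall back to a per-device singleton set */
	return vfio_assign_device_set(vdev, bridge ? (void *)bridge
						   : (void *)vdev);
}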
110 static void vfio_release_device_set(struct vfio_device *device)
112 struct vfio_device_set *dev_set = device->dev_set;
117 mutex_lock(&dev_set->lock);
118 list_del(&device->dev_set_list);
119 mutex_unlock(&dev_set->lock);
121 xa_lock(&vfio_device_set_xa);
122 if (!--dev_set->device_count) {
123 __xa_erase(&vfio_device_set_xa,
124 (unsigned long)dev_set->set_id);
125 mutex_destroy(&dev_set->lock);
128 xa_unlock(&vfio_device_set_xa);
131 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
133 struct vfio_device *cur;
134 unsigned int open_count = 0;
136 lockdep_assert_held(&dev_set->lock);
138 list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
139 open_count += cur->open_count;
142 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
144 struct vfio_device *
145 vfio_find_device_in_devset(struct vfio_device_set *dev_set,
146 struct device *dev)
148 struct vfio_device *cur;
150 lockdep_assert_held(&dev_set->lock);
152 list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
157 EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
160 * Device objects - create, release, get, put, search
162 /* Device reference always implies a group reference */
163 void vfio_device_put_registration(struct vfio_device *device)
165 if (refcount_dec_and_test(&device->refcount))
166 complete(&device->comp);
169 bool vfio_device_try_get_registration(struct vfio_device *device)
171 return refcount_inc_not_zero(&device->refcount);
177 /* Release helper called by vfio_put_device() */
178 static void vfio_device_release(struct device *dev)
180 struct vfio_device *device =
181 container_of(dev, struct vfio_device, device);
183 vfio_release_device_set(device);
184 ida_free(&vfio.device_ida, device->index);
186 if (device->ops->release)
187 device->ops->release(device);
192 static int vfio_init_device(struct vfio_device *device, struct device *dev,
193 const struct vfio_device_ops *ops);
196 * Allocate and initialize vfio_device so it can be registered to the vfio core.
199 * Drivers should use the wrapper vfio_alloc_device() for allocation.
200 * @size is the size of the structure to be allocated, including any
201 * private data used by the driver.
203 * Driver may provide an @init callback to initialize the device private data.
205 * Use vfio_put_device() to release the structure after a successful return.
207 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
208 const struct vfio_device_ops *ops)
210 struct vfio_device *device;
213 if (WARN_ON(size < sizeof(struct vfio_device)))
214 return ERR_PTR(-EINVAL);
216 device = kvzalloc(size, GFP_KERNEL);
218 return ERR_PTR(-ENOMEM);
220 ret = vfio_init_device(device, dev, ops);
229 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
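/*
 * Illustrative sketch only, not part of the VFIO core: a hypothetical driver
 * ("foo") using the vfio_alloc_device() wrapper mentioned above.  struct
 * foo_vfio_device and the foo_* names are assumptions for the example.  The
 * wrapper expects the embedded struct vfio_device to be the first member so
 * the container_of() conversion is exact.
 */
struct foo_vfio_device {
	struct vfio_device vdev;	/* must come first */
	void __iomem *bar0;		/* driver-private state follows */
};

static __maybe_unused struct foo_vfio_device *
foo_alloc(struct device *dev, const struct vfio_device_ops *foo_ops)
{
	struct foo_vfio_device *foo;

	foo = vfio_alloc_device(foo_vfio_device, vdev, dev, foo_ops);
	if (IS_ERR(foo))
		return foo;	/* ERR_PTR propagated from _vfio_alloc_device() */

	/* ops->init() has already run; finish driver-private setup here */
	return foo;
}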
232 * Initialize a vfio_device so it can be registered to vfio core.
234 static int vfio_init_device(struct vfio_device *device, struct device *dev,
235 const struct vfio_device_ops *ops)
239 ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
241 dev_dbg(dev, "Failed to allocate device index\n");
246 init_completion(&device->comp);
251 ret = ops->init(device);
256 device_initialize(&device->device);
257 device->device.release = vfio_device_release;
258 device->device.class = vfio.device_class;
259 device->device.parent = device->dev;
263 vfio_release_device_set(device);
264 ida_free(&vfio.device_ida, device->index);
268 static int __vfio_register_dev(struct vfio_device *device,
269 enum vfio_group_type type)
273 if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
274 (!device->ops->bind_iommufd ||
275 !device->ops->unbind_iommufd ||
276 !device->ops->attach_ioas ||
277 !device->ops->detach_ioas)))
281 * If the driver doesn't specify a set then the device is added to a
282 * singleton set just for itself.
284 if (!device->dev_set)
285 vfio_assign_device_set(device, device);
287 ret = dev_set_name(&device->device, "vfio%d", device->index);
291 ret = vfio_device_set_group(device, type);
296 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
297 * restore cache coherency. It has to be checked here because it is only
298 * valid for cases where we are using iommu groups.
300 if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
301 !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
306 ret = vfio_device_add(device);
310 /* Refcounting can't start until the driver calls register */
311 refcount_set(&device->refcount, 1);
313 vfio_device_group_register(device);
314 vfio_device_debugfs_init(device);
318 vfio_device_remove_group(device);
322 int vfio_register_group_dev(struct vfio_device *device)
324 return __vfio_register_dev(device, VFIO_IOMMU);
326 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
329 * Register a virtual device without IOMMU backing. The user of this
330 * device must not be able to directly trigger unmediated DMA.
332 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
334 return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
336 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
339 * Decrement the device reference count and wait for the device to be
340 * removed. Open file descriptors for the device... */
341 void vfio_unregister_group_dev(struct vfio_device *device)
344 bool interrupted = false;
348 * Prevent new device fds from being opened by userspace via
349 * VFIO_GROUP_GET_DEVICE_FD in the group path.
351 vfio_device_group_unregister(device);
354 * Balances vfio_device_add() in the register path and also prevents
355 * new device fds from being opened by userspace in the cdev path.
357 vfio_device_del(device);
359 vfio_device_put_registration(device);
360 rc = try_wait_for_completion(&device->comp);
362 if (device->ops->request)
363 device->ops->request(device, i++);
366 rc = wait_for_completion_timeout(&device->comp,
369 rc = wait_for_completion_interruptible_timeout(
370 &device->comp, HZ * 10);
373 dev_warn(device->dev,
374 "Device is currently in use, task"
375 " \"%s\" (%d) "
376 "blocked until device is released",
377 current->comm, task_pid_nr(current));
382 vfio_device_debugfs_exit(device);
383 /* Balances vfio_device_set_group in register path */
384 vfio_device_remove_group(device);
386 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
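/*
 * Illustrative sketch only: the usual pairing of the registration calls above
 * in a hypothetical driver, reusing the foo_vfio_device example structure
 * sketched earlier.  foo_register()/foo_remove() are assumed names, not VFIO
 * core APIs.
 */
static int __maybe_unused foo_register(struct foo_vfio_device *foo)
{
	/* Device becomes visible to userspace; refcounting starts here */
	return vfio_register_group_dev(&foo->vdev);
}

static void __maybe_unused foo_remove(struct foo_vfio_device *foo)
{
	/* Blocks until userspace releases any open fds for the device */
	vfio_unregister_group_dev(&foo->vdev);
	/* Drop the allocation reference taken by vfio_alloc_device() */
	vfio_put_device(&foo->vdev);
}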
388 #ifdef CONFIG_HAVE_KVM
389 void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
391 void (*pfn)(struct kvm *kvm);
392 bool (*fn)(struct kvm *kvm);
395 lockdep_assert_held(&device->dev_set->lock);
400 pfn = symbol_get(kvm_put_kvm);
404 fn = symbol_get(kvm_get_kvm_safe);
406 symbol_put(kvm_put_kvm);
411 symbol_put(kvm_get_kvm_safe);
413 symbol_put(kvm_put_kvm);
417 device->put_kvm = pfn;
421 void vfio_device_put_kvm(struct vfio_device *device)
423 lockdep_assert_held(&device->dev_set->lock);
428 if (WARN_ON(!device->put_kvm))
431 device->put_kvm(device->kvm);
432 device->put_kvm = NULL;
433 symbol_put(kvm_put_kvm);
440 /* true if the vfio_device has open_device() called but not close_device() */
441 static bool vfio_assert_device_open(struct vfio_device *device)
443 return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
446 struct vfio_device_file *
447 vfio_allocate_device_file(struct vfio_device *device)
449 struct vfio_device_file *df;
451 df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
453 return ERR_PTR(-ENOMEM);
456 spin_lock_init(&df->kvm_ref_lock);
461 static int vfio_df_device_first_open(struct vfio_device_file *df)
463 struct vfio_device *device = df->device;
464 struct iommufd_ctx *iommufd = df->iommufd;
467 lockdep_assert_held(&device->dev_set->lock);
469 if (!try_module_get(device->dev->driver->owner))
473 ret = vfio_df_iommufd_bind(df);
475 ret = vfio_device_group_use_iommu(device);
479 if (device->ops->open_device) {
480 ret = device->ops->open_device(device);
482 goto err_unuse_iommu;
488 vfio_df_iommufd_unbind(df);
490 vfio_device_group_unuse_iommu(device);
492 module_put(device->dev->driver->owner);
496 static void vfio_df_device_last_close(struct vfio_device_file *df)
498 struct vfio_device *device = df->device;
499 struct iommufd_ctx *iommufd = df->iommufd;
501 lockdep_assert_held(&device->dev_set->lock);
503 if (device->ops->close_device)
504 device->ops->close_device(device);
506 vfio_df_iommufd_unbind(df);
508 vfio_device_group_unuse_iommu(device);
509 module_put(device->dev->driver->owner);
512 int vfio_df_open(struct vfio_device_file *df)
514 struct vfio_device *device = df->device;
517 lockdep_assert_held(&device->dev_set->lock);
520 * Only the group path allows the device to be opened multiple
521 * times; the cdev path has no secure way to support multiple opens.
523 if (device->open_count != 0 && !df->group)
526 device->open_count++;
527 if (device->open_count == 1) {
528 ret = vfio_df_device_first_open(df);
530 device->open_count--;
536 void vfio_df_close(struct vfio_device_file *df)
538 struct vfio_device *device = df->device;
540 lockdep_assert_held(&device->dev_set->lock);
542 vfio_assert_device_open(device);
543 if (device->open_count == 1)
544 vfio_df_device_last_close(df);
545 device->open_count--;
549 * Wrapper around pm_runtime_resume_and_get().
550 * Return error code on failure or 0 on success.
552 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
554 struct device *dev = device->dev;
556 if (dev->driver && dev->driver->pm) {
559 ret = pm_runtime_resume_and_get(dev);
561 dev_info_ratelimited(dev,
562 "vfio: runtime resume failed %d\n", ret);
571 * Wrapper around pm_runtime_put().
573 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
575 struct device *dev = device->dev;
577 if (dev->driver && dev->driver->pm)
584 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
586 struct vfio_device_file *df = filep->private_data;
587 struct vfio_device *device = df->device;
590 vfio_df_group_close(df);
592 vfio_df_unbind_iommufd(df);
594 vfio_device_put_registration(device);
602 * vfio_mig_get_next_state - Compute the next step in the FSM
603 * @cur_fsm - The current state the device is in
604 * @new_fsm - The target state to reach
605 * @next_fsm - Pointer to the next step to get to new_fsm
607 * Return 0 upon success, otherwise -errno
608 * Upon success the next step in the state progression between cur_fsm and
609 * new_fsm will be set in next_fsm.
611 * This breaks down requests for combination transitions into smaller steps and
612 * returns the next step to get to new_fsm. The function may need to be called
613 * multiple times before reaching new_fsm.
616 int vfio_mig_get_next_state(struct vfio_device *device,
617 enum vfio_device_mig_state cur_fsm,
618 enum vfio_device_mig_state new_fsm,
619 enum vfio_device_mig_state *next_fsm)
621 enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
623 * The coding in this table requires the driver to implement the
624 * following FSM arcs:
625 * RESUMING -> STOP
626 * STOP -> RESUMING
627 * STOP -> STOP_COPY
628 * STOP_COPY -> STOP
630 * If P2P is supported then the driver must also implement these FSM
631 * arcs:
632 * RUNNING -> RUNNING_P2P
633 * RUNNING_P2P -> RUNNING
634 * RUNNING_P2P -> STOP
635 * STOP -> RUNNING_P2P
637 * If precopy is supported then the driver must support these additional
638 * FSM arcs:
639 * RUNNING -> PRE_COPY
640 * PRE_COPY -> RUNNING
641 * PRE_COPY -> STOP_COPY
642 * However, if precopy and P2P are supported together then the driver
643 * must support these additional arcs beyond the P2P arcs above:
644 * PRE_COPY -> RUNNING
645 * PRE_COPY -> PRE_COPY_P2P
646 * PRE_COPY_P2P -> PRE_COPY
647 * PRE_COPY_P2P -> RUNNING_P2P
648 * PRE_COPY_P2P -> STOP_COPY
649 * RUNNING -> PRE_COPY
650 * RUNNING_P2P -> PRE_COPY_P2P
652 * Without P2P and precopy the driver must implement:
653 * RUNNING -> STOP
654 * STOP -> RUNNING
656 * The coding will step through multiple states for some combination
657 * transitions; if all optional features are supported, this means the
658 * following ones:
659 * PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
660 * PRE_COPY -> RUNNING -> RUNNING_P2P
661 * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
662 * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
663 * PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
664 * PRE_COPY_P2P -> RUNNING_P2P -> STOP
665 * PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
666 * RESUMING -> STOP -> RUNNING_P2P
667 * RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
668 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING
669 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
670 * RESUMING -> STOP -> STOP_COPY
671 * RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
672 * RUNNING -> RUNNING_P2P -> STOP
673 * RUNNING -> RUNNING_P2P -> STOP -> RESUMING
674 * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
675 * RUNNING_P2P -> RUNNING -> PRE_COPY
676 * RUNNING_P2P -> STOP -> RESUMING
677 * RUNNING_P2P -> STOP -> STOP_COPY
678 * STOP -> RUNNING_P2P -> PRE_COPY_P2P
679 * STOP -> RUNNING_P2P -> RUNNING
680 * STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
681 * STOP_COPY -> STOP -> RESUMING
682 * STOP_COPY -> STOP -> RUNNING_P2P
683 * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
685 * The following transitions are blocked:
686 * STOP_COPY -> PRE_COPY
687 * STOP_COPY -> PRE_COPY_P2P
689 static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
690 [VFIO_DEVICE_STATE_STOP] = {
691 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
692 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
693 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
694 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
695 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
696 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
697 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
698 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
700 [VFIO_DEVICE_STATE_RUNNING] = {
701 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
702 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
703 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
704 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
705 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
706 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
707 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
708 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
710 [VFIO_DEVICE_STATE_PRE_COPY] = {
711 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
712 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
713 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
714 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
715 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
716 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
717 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
718 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
720 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
721 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
722 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
723 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
724 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
725 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
726 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
727 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
728 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
730 [VFIO_DEVICE_STATE_STOP_COPY] = {
731 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
732 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
733 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
734 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
735 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
736 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
737 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
738 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
740 [VFIO_DEVICE_STATE_RESUMING] = {
741 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
742 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
743 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
744 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
745 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
746 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
747 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
748 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
750 [VFIO_DEVICE_STATE_RUNNING_P2P] = {
751 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
752 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
753 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
754 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
755 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
756 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
757 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
758 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
760 [VFIO_DEVICE_STATE_ERROR] = {
761 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
762 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
763 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
764 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
765 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
766 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
767 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
768 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
772 static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
773 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
774 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
775 [VFIO_DEVICE_STATE_PRE_COPY] =
776 VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
777 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
779 VFIO_MIGRATION_PRE_COPY,
780 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
781 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
782 [VFIO_DEVICE_STATE_RUNNING_P2P] =
783 VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
784 [VFIO_DEVICE_STATE_ERROR] = ~0U,
787 if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
788 (state_flags_table[cur_fsm] & device->migration_flags) !=
789 state_flags_table[cur_fsm]))
792 if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
793 (state_flags_table[new_fsm] & device->migration_flags) !=
794 state_flags_table[new_fsm])
798 * Arcs touching optional and unsupported states are skipped over. The
799 * driver will instead see an arc from the original state to the next
800 * logical state, as per the above comment.
802 *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
803 while ((state_flags_table[*next_fsm] & device->migration_flags) !=
804 state_flags_table[*next_fsm])
805 *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
807 return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
809 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
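/*
 * Illustrative sketch only: how a driver's migration_set_state() handler
 * might walk a combination transition one arc at a time, calling
 * vfio_mig_get_next_state() repeatedly as described above.
 * foo_step_one_arc() is an assumed driver callback, not a VFIO core API.
 */
static int __maybe_unused
foo_walk_mig_fsm(struct vfio_device *device, enum vfio_device_mig_state cur,
		 enum vfio_device_mig_state target,
		 int (*foo_step_one_arc)(struct vfio_device *device,
					 enum vfio_device_mig_state next))
{
	enum vfio_device_mig_state next;
	int ret;

	while (cur != target) {
		ret = vfio_mig_get_next_state(device, cur, target, &next);
		if (ret)
			return ret;	/* blocked or unsupported transition */
		ret = foo_step_one_arc(device, next);
		if (ret)
			return ret;
		cur = next;
	}
	return 0;
}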
812 * Convert the driver's struct file into an FD number and return it to userspace
814 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
815 struct vfio_device_feature_mig_state *mig)
820 fd = get_unused_fd_flags(O_CLOEXEC);
827 if (copy_to_user(arg, mig, sizeof(*mig))) {
831 fd_install(fd, filp);
842 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
843 u32 flags, void __user *arg,
847 offsetofend(struct vfio_device_feature_mig_state, data_fd);
848 struct vfio_device_feature_mig_state mig;
849 struct file *filp = NULL;
852 if (!device->mig_ops)
855 ret = vfio_check_feature(flags, argsz,
856 VFIO_DEVICE_FEATURE_SET |
857 VFIO_DEVICE_FEATURE_GET,
862 if (copy_from_user(&mig, arg, minsz))
865 if (flags & VFIO_DEVICE_FEATURE_GET) {
866 enum vfio_device_mig_state curr_state;
868 ret = device->mig_ops->migration_get_state(device,
872 mig.device_state = curr_state;
876 /* Handle the VFIO_DEVICE_FEATURE_SET */
877 filp = device->mig_ops->migration_set_state(device, mig.device_state);
878 if (IS_ERR(filp) || !filp)
881 return vfio_ioct_mig_return_fd(filp, arg, &mig);
884 if (copy_to_user(arg, &mig, sizeof(mig)))
887 return PTR_ERR(filp);
892 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
893 u32 flags, void __user *arg,
896 struct vfio_device_feature_mig_data_size data_size = {};
897 unsigned long stop_copy_length;
900 if (!device->mig_ops)
903 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
908 ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
912 data_size.stop_copy_length = stop_copy_length;
913 if (copy_to_user(arg, &data_size, sizeof(data_size)))
919 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
920 u32 flags, void __user *arg,
923 struct vfio_device_feature_migration mig = {
924 .flags = device->migration_flags,
928 if (!device->mig_ops)
931 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
935 if (copy_to_user(arg, &mig, sizeof(mig)))
940 void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
941 u32 req_nodes)
943 struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
944 unsigned long min_gap, curr_gap;
946 /* Special shortcut when a single range is required */
947 if (req_nodes == 1) {
950 comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
953 if (WARN_ON_ONCE(!comb_start))
960 curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
961 if (prev != comb_start)
962 interval_tree_remove(prev, root);
964 comb_start->last = last;
968 /* Combine ranges which have the smallest gap */
969 while (cur_nodes > req_nodes) {
972 curr = interval_tree_iter_first(root, 0, ULONG_MAX);
975 curr_gap = curr->start - prev->last;
976 if (curr_gap < min_gap) {
983 curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
986 /* Empty list or no nodes to combine */
987 if (WARN_ON_ONCE(min_gap == ULONG_MAX))
990 comb_start->last = comb_end->last;
991 interval_tree_remove(comb_end, root);
995 EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
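/*
 * Illustrative sketch only: a driver whose hardware can track a limited
 * number of dirty-logging ranges might clamp the interval tree handed to its
 * log_start() callback with the helper above.  foo_max_hw_ranges is an
 * assumed device limit for the example.
 */
static void __maybe_unused
foo_clamp_ranges(struct rb_root_cached *root, u32 nnodes, u32 foo_max_hw_ranges)
{
	/* Merge the ranges separated by the smallest gaps until they fit */
	if (nnodes > foo_max_hw_ranges)
		vfio_combine_iova_ranges(root, nnodes, foo_max_hw_ranges);
}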
997 /* Ranges should fit into a single kernel page */
998 #define LOG_MAX_RANGES \
999 (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1002 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
1003 u32 flags, void __user *arg,
1007 offsetofend(struct vfio_device_feature_dma_logging_control,
1009 struct vfio_device_feature_dma_logging_range __user *ranges;
1010 struct vfio_device_feature_dma_logging_control control;
1011 struct vfio_device_feature_dma_logging_range range;
1012 struct rb_root_cached root = RB_ROOT_CACHED;
1013 struct interval_tree_node *nodes;
1018 if (!device->log_ops)
1021 ret = vfio_check_feature(flags, argsz,
1022 VFIO_DEVICE_FEATURE_SET,
1027 if (copy_from_user(&control, arg, minsz))
1030 nnodes = control.num_ranges;
1034 if (nnodes > LOG_MAX_RANGES)
1037 ranges = u64_to_user_ptr(control.ranges);
1038 nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
1043 for (i = 0; i < nnodes; i++) {
1044 if (copy_from_user(&range, &ranges[i], sizeof(range))) {
1048 if (!IS_ALIGNED(range.iova, control.page_size) ||
1049 !IS_ALIGNED(range.length, control.page_size)) {
1054 if (check_add_overflow(range.iova, range.length, &iova_end) ||
1055 iova_end > ULONG_MAX) {
1060 nodes[i].start = range.iova;
1061 nodes[i].last = range.iova + range.length - 1;
1062 if (interval_tree_iter_first(&root, nodes[i].start,
1064 /* Range overlapping */
1068 interval_tree_insert(nodes + i, &root);
1071 ret = device->log_ops->log_start(device, &root, nnodes,
1072 &control.page_size);
1076 if (copy_to_user(arg, &control, sizeof(control))) {
1078 device->log_ops->log_stop(device);
1087 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1088 u32 flags, void __user *arg,
1093 if (!device->log_ops)
1096 ret = vfio_check_feature(flags, argsz,
1097 VFIO_DEVICE_FEATURE_SET, 0);
1101 return device->log_ops->log_stop(device);
1104 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1105 unsigned long iova, size_t length,
1108 struct vfio_device *device = opaque;
1110 return device->log_ops->log_read_and_clear(device, iova, length, iter);
1114 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1115 u32 flags, void __user *arg,
1119 offsetofend(struct vfio_device_feature_dma_logging_report,
1121 struct vfio_device_feature_dma_logging_report report;
1122 struct iova_bitmap *iter;
1126 if (!device->log_ops)
1129 ret = vfio_check_feature(flags, argsz,
1130 VFIO_DEVICE_FEATURE_GET,
1135 if (copy_from_user(&report, arg, minsz))
1138 if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1141 if (check_add_overflow(report.iova, report.length, &iova_end) ||
1142 iova_end > ULONG_MAX)
1145 iter = iova_bitmap_alloc(report.iova, report.length,
1147 u64_to_user_ptr(report.bitmap));
1149 return PTR_ERR(iter);
1151 ret = iova_bitmap_for_each(iter, device,
1152 vfio_device_log_read_and_clear);
1154 iova_bitmap_free(iter);
1158 static int vfio_ioctl_device_feature(struct vfio_device *device,
1159 struct vfio_device_feature __user *arg)
1161 size_t minsz = offsetofend(struct vfio_device_feature, flags);
1162 struct vfio_device_feature feature;
1164 if (copy_from_user(&feature, arg, minsz))
1167 if (feature.argsz < minsz)
1170 /* Check unknown flags */
1172 ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1173 VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1176 /* GET & SET are mutually exclusive except with PROBE */
1177 if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1178 (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1179 (feature.flags & VFIO_DEVICE_FEATURE_GET))
1182 switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1183 case VFIO_DEVICE_FEATURE_MIGRATION:
1184 return vfio_ioctl_device_feature_migration(
1185 device, feature.flags, arg->data,
1186 feature.argsz - minsz);
1187 case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1188 return vfio_ioctl_device_feature_mig_device_state(
1189 device, feature.flags, arg->data,
1190 feature.argsz - minsz);
1191 case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1192 return vfio_ioctl_device_feature_logging_start(
1193 device, feature.flags, arg->data,
1194 feature.argsz - minsz);
1195 case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1196 return vfio_ioctl_device_feature_logging_stop(
1197 device, feature.flags, arg->data,
1198 feature.argsz - minsz);
1199 case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1200 return vfio_ioctl_device_feature_logging_report(
1201 device, feature.flags, arg->data,
1202 feature.argsz - minsz);
1203 case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1204 return vfio_ioctl_device_feature_migration_data_size(
1205 device, feature.flags, arg->data,
1206 feature.argsz - minsz);
1208 if (unlikely(!device->ops->device_feature))
1210 return device->ops->device_feature(device, feature.flags,
1212 feature.argsz - minsz);
1216 static long vfio_device_fops_unl_ioctl(struct file *filep,
1217 unsigned int cmd, unsigned long arg)
1219 struct vfio_device_file *df = filep->private_data;
1220 struct vfio_device *device = df->device;
1221 void __user *uptr = (void __user *)arg;
1224 if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
1225 return vfio_df_ioctl_bind_iommufd(df, uptr);
1227 /* Paired with smp_store_release() following vfio_df_open() */
1228 if (!smp_load_acquire(&df->access_granted))
1231 ret = vfio_device_pm_runtime_get(device);
1235 /* cdev only ioctls */
1236 if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
1238 case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
1239 ret = vfio_df_ioctl_attach_pt(df, uptr);
1242 case VFIO_DEVICE_DETACH_IOMMUFD_PT:
1243 ret = vfio_df_ioctl_detach_pt(df, uptr);
1249 case VFIO_DEVICE_FEATURE:
1250 ret = vfio_ioctl_device_feature(device, uptr);
1254 if (unlikely(!device->ops->ioctl))
1257 ret = device->ops->ioctl(device, cmd, arg);
1261 vfio_device_pm_runtime_put(device);
1265 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1266 size_t count, loff_t *ppos)
1268 struct vfio_device_file *df = filep->private_data;
1269 struct vfio_device *device = df->device;
1271 /* Paired with smp_store_release() following vfio_df_open() */
1272 if (!smp_load_acquire(&df->access_granted))
1275 if (unlikely(!device->ops->read))
1278 return device->ops->read(device, buf, count, ppos);
1281 static ssize_t vfio_device_fops_write(struct file *filep,
1282 const char __user *buf,
1283 size_t count, loff_t *ppos)
1285 struct vfio_device_file *df = filep->private_data;
1286 struct vfio_device *device = df->device;
1288 /* Paired with smp_store_release() following vfio_df_open() */
1289 if (!smp_load_acquire(&df->access_granted))
1292 if (unlikely(!device->ops->write))
1295 return device->ops->write(device, buf, count, ppos);
1298 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1300 struct vfio_device_file *df = filep->private_data;
1301 struct vfio_device *device = df->device;
1303 /* Paired with smp_store_release() following vfio_df_open() */
1304 if (!smp_load_acquire(&df->access_granted))
1307 if (unlikely(!device->ops->mmap))
1310 return device->ops->mmap(device, vma);
1313 const struct file_operations vfio_device_fops = {
1314 .owner = THIS_MODULE,
1315 .open = vfio_device_fops_cdev_open,
1316 .release = vfio_device_fops_release,
1317 .read = vfio_device_fops_read,
1318 .write = vfio_device_fops_write,
1319 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1320 .compat_ioctl = compat_ptr_ioctl,
1321 .mmap = vfio_device_fops_mmap,
1324 static struct vfio_device *vfio_device_from_file(struct file *file)
1326 struct vfio_device_file *df = file->private_data;
1328 if (file->f_op != &vfio_device_fops)
1334 * vfio_file_is_valid - True if the file is valid vfio file
1335 * @file: VFIO group file or VFIO device file
1337 bool vfio_file_is_valid(struct file *file)
1339 return vfio_group_from_file(file) ||
1340 vfio_device_from_file(file);
1342 EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1345 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1346 * is always CPU cache coherent
1347 * @file: VFIO group file or VFIO device file
1349 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1350 * bit in DMA transactions. A return of false indicates that the user has
1351 * rights to access additional instructions such as wbinvd on x86.
1353 bool vfio_file_enforced_coherent(struct file *file)
1355 struct vfio_device *device;
1356 struct vfio_group *group;
1358 group = vfio_group_from_file(file);
1360 return vfio_group_enforced_coherent(group);
1362 device = vfio_device_from_file(file);
1364 return device_iommu_capable(device->dev,
1365 IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1369 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1371 static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
1373 struct vfio_device_file *df = file->private_data;
1376 * The kvm is first recorded in the vfio_device_file, and will
1377 * be propagated to vfio_device::kvm when the file is bound to
1378 * iommufd successfully in the vfio device cdev path.
1380 spin_lock(&df->kvm_ref_lock);
1382 spin_unlock(&df->kvm_ref_lock);
1386 * vfio_file_set_kvm - Link a kvm with VFIO drivers
1387 * @file: VFIO group file or VFIO device file
1390 * When a VFIO device is first opened the KVM will be available in
1391 * device->kvm if one was associated with the file.
1393 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1395 struct vfio_group *group;
1397 group = vfio_group_from_file(file);
1399 vfio_group_set_kvm(group, kvm);
1401 if (vfio_device_from_file(file))
1402 vfio_device_file_set_kvm(file, kvm);
1404 EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1407 * Sub-module support
1410 * Helper for managing a buffer of info chain capabilities, allocate or
1411 * reallocate a buffer with additional @size, filling in @id and @version
1412 * of the capability. A pointer to the new capability is returned.
1414 * NB. The chain is based at the head of the buffer, so new entries are
1415 * added to the tail; vfio_info_cap_shift() should be called to fix up the
1416 * next offsets prior to copying to the user buffer.
1418 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1419 size_t size, u16 id, u16 version)
1422 struct vfio_info_cap_header *header, *tmp;
1424 /* Ensure that the next capability struct will be aligned */
1425 size = ALIGN(size, sizeof(u64));
1427 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1432 return ERR_PTR(-ENOMEM);
1436 header = buf + caps->size;
1438 /* Eventually copied to user buffer, zero */
1439 memset(header, 0, size);
1442 header->version = version;
1444 /* Add to the end of the capability chain */
1445 for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1448 tmp->next = caps->size;
1453 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1455 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1457 struct vfio_info_cap_header *tmp;
1458 void *buf = (void *)caps->buf;
1460 /* Capability structs should start with proper alignment */
1461 WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
1463 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1464 tmp->next += offset;
1466 EXPORT_SYMBOL(vfio_info_cap_shift);
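/*
 * Illustrative sketch only: finishing an info capability chain built with
 * vfio_info_cap_add()/vfio_info_add_capability() and copying it back to
 * userspace directly after a fixed-size info struct, per the comment above.
 * The "uarg"/info_size layout is an assumption for the example.
 */
static int __maybe_unused
foo_copy_caps_to_user(struct vfio_info_cap *caps, void __user *uarg,
		      size_t info_size)
{
	int ret = 0;

	if (!caps->size)
		return 0;

	/* Chain offsets are buffer relative; rebase them past the info struct */
	vfio_info_cap_shift(caps, info_size);
	if (copy_to_user(uarg + info_size, caps->buf, caps->size))
		ret = -EFAULT;
	kfree(caps->buf);
	return ret;
}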
1468 int vfio_info_add_capability(struct vfio_info_cap *caps,
1469 struct vfio_info_cap_header *cap, size_t size)
1471 struct vfio_info_cap_header *header;
1473 header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1475 return PTR_ERR(header);
1477 memcpy(header + 1, cap + 1, size - sizeof(*header));
1481 EXPORT_SYMBOL(vfio_info_add_capability);
1483 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1484 int max_irq_type, size_t *data_size)
1486 unsigned long minsz;
1489 minsz = offsetofend(struct vfio_irq_set, count);
1491 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1492 (hdr->count >= (U32_MAX - hdr->start)) ||
1493 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1494 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1500 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1503 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1504 case VFIO_IRQ_SET_DATA_NONE:
1507 case VFIO_IRQ_SET_DATA_BOOL:
1508 size = sizeof(uint8_t);
1510 case VFIO_IRQ_SET_DATA_EVENTFD:
1511 size = sizeof(int32_t);
1518 if (hdr->argsz - minsz < hdr->count * size)
1524 *data_size = hdr->count * size;
1529 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
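/*
 * Illustrative sketch only: typical driver-side use of the helper above in a
 * VFIO_DEVICE_SET_IRQS handler.  foo_num_irqs (the interrupt count for
 * hdr->index) and the PCI irq-type limit are assumptions for the example;
 * the returned buffer is NULL for DATA_NONE or an ERR_PTR on failure.
 */
static __maybe_unused void *
foo_prepare_set_irqs(struct vfio_irq_set *hdr, void __user *udata,
		     int foo_num_irqs)
{
	size_t data_size = 0;
	int ret;

	ret = vfio_set_irqs_validate_and_prepare(hdr, foo_num_irqs,
						 VFIO_PCI_NUM_IRQS, &data_size);
	if (ret)
		return ERR_PTR(ret);

	if (!data_size)
		return NULL;	/* VFIO_IRQ_SET_DATA_NONE carries no payload */

	/* hdr->count eventfds (s32) or bools (u8) follow the header */
	return memdup_user(udata, data_size);
}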
1532 * Pin contiguous user pages and return their associated host pages for local domain only.
1534 * @device [in] : device
1535 * @iova [in] : starting IOVA of user pages to be pinned.
1536 * @npage [in] : count of pages to be pinned. This count should not
1537 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1538 * @prot [in] : protection flags
1539 * @pages[out] : array of host pages
1540 * Return error or number of pages pinned.
1542 * A driver may only call this function if the vfio_device was created
1543 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1545 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1546 int npage, int prot, struct page **pages)
1548 /* group->container cannot change while a vfio device is open */
1549 if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1551 if (!device->ops->dma_unmap)
1553 if (vfio_device_has_container(device))
1554 return vfio_device_container_pin_pages(device, iova,
1555 npage, prot, pages);
1556 if (device->iommufd_access) {
1559 if (iova > ULONG_MAX)
1562 * VFIO ignores the sub page offset, npages is from the start of
1563 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1564 * the sub page offset by doing:
1565 * pages[0] + (iova % PAGE_SIZE)
1567 ret = iommufd_access_pin_pages(
1568 device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1569 npage * PAGE_SIZE, pages,
1570 (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1577 EXPORT_SYMBOL(vfio_pin_pages);
1580 * Unpin contiguous host pages for local domain only.
1581 * @device [in] : device
1582 * @iova [in] : starting address of user pages to be unpinned.
1583 * @npage [in] : count of pages to be unpinned. This count should not
1584 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1586 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1588 if (WARN_ON(!vfio_assert_device_open(device)))
1590 if (WARN_ON(!device->ops->dma_unmap))
1593 if (vfio_device_has_container(device)) {
1594 vfio_device_container_unpin_pages(device, iova, npage);
1597 if (device->iommufd_access) {
1598 if (WARN_ON(iova > ULONG_MAX))
1600 iommufd_access_unpin_pages(device->iommufd_access,
1601 ALIGN_DOWN(iova, PAGE_SIZE),
1606 EXPORT_SYMBOL(vfio_unpin_pages);
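/*
 * Illustrative sketch only: an emulated-IOMMU (mdev-style) driver pinning and
 * unpinning a single page, recovering the sub-page offset exactly as the
 * vfio_pin_pages() comment describes.  The foo_* names are assumptions.
 */
static int __maybe_unused
foo_pin_one_page(struct vfio_device *device, dma_addr_t iova,
		 struct page **page, unsigned long *offset)
{
	int ret;

	ret = vfio_pin_pages(device, iova, 1, IOMMU_READ | IOMMU_WRITE, page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* pages[0] + (iova % PAGE_SIZE) addresses the original byte */
	*offset = iova % PAGE_SIZE;
	return 0;
}

static void __maybe_unused
foo_unpin_one_page(struct vfio_device *device, dma_addr_t iova)
{
	vfio_unpin_pages(device, iova, 1);
}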
1609 * This interface allows the CPUs to perform some sort of virtual DMA on
1610 * behalf of the device.
1612 * The CPUs read from or write to a range of IOVAs pointing to user space
1613 * memory, using a kernel buffer as the source or destination.
1615 * As the read/write of user space memory is conducted via the CPUs and is
1616 * not a real device DMA, it is not necessary to pin the user space memory.
1618 * @device [in] : VFIO device
1619 * @iova [in] : base IOVA of a user space buffer
1620 * @data [in] : pointer to kernel buffer
1621 * @len [in] : kernel buffer length
1622 * @write : indicate read or write
1623 * Return error code on failure or 0 on success.
1625 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1626 size_t len, bool write)
1628 if (!data || len <= 0 || !vfio_assert_device_open(device))
1631 if (vfio_device_has_container(device))
1632 return vfio_device_container_dma_rw(device, iova,
1635 if (device->iommufd_access) {
1636 unsigned int flags = 0;
1638 if (iova > ULONG_MAX)
1641 /* VFIO historically tries to auto-detect a kthread */
1643 flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1645 flags |= IOMMUFD_ACCESS_RW_WRITE;
1646 return iommufd_access_rw(device->iommufd_access, iova, data,
1651 EXPORT_SYMBOL(vfio_dma_rw);
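/*
 * Illustrative sketch only: reading a device-visible descriptor out of the
 * user's IOVA space with vfio_dma_rw() instead of pinning the page.
 * struct foo_desc is an assumed layout for the example.
 */
struct foo_desc {
	__le64 addr;
	__le32 len;
	__le32 flags;
};

static int __maybe_unused
foo_read_desc(struct vfio_device *device, dma_addr_t iova,
	      struct foo_desc *desc)
{
	/* write=false: copy from the IOVA range into the kernel buffer */
	return vfio_dma_rw(device, iova, desc, sizeof(*desc), false);
}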
1654 * Module/class support
1656 static int __init vfio_init(void)
1660 ida_init(&vfio.device_ida);
1662 ret = vfio_group_init();
1666 ret = vfio_virqfd_init();
1670 /* /sys/class/vfio-dev/vfioX */
1671 vfio.device_class = class_create("vfio-dev");
1672 if (IS_ERR(vfio.device_class)) {
1673 ret = PTR_ERR(vfio.device_class);
1677 ret = vfio_cdev_init(vfio.device_class);
1679 goto err_alloc_dev_chrdev;
1681 vfio_debugfs_create_root();
1682 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1685 err_alloc_dev_chrdev:
1686 class_destroy(vfio.device_class);
1687 vfio.device_class = NULL;
1691 vfio_group_cleanup();
1695 static void __exit vfio_cleanup(void)
1697 vfio_debugfs_remove_root();
1698 ida_destroy(&vfio.device_ida);
1699 vfio_cdev_cleanup();
1700 class_destroy(vfio.device_class);
1701 vfio.device_class = NULL;
1703 vfio_group_cleanup();
1704 xa_destroy(&vfio_device_set_xa);
1707 module_init(vfio_init);
1708 module_exit(vfio_cleanup);
1710 MODULE_IMPORT_NS(IOMMUFD);
1711 MODULE_VERSION(DRIVER_VERSION);
1712 MODULE_LICENSE("GPL v2");
1713 MODULE_AUTHOR(DRIVER_AUTHOR);
1714 MODULE_DESCRIPTION(DRIVER_DESC);
1715 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");