drivers/vfio/vfio_main.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #ifdef CONFIG_HAVE_KVM
20 #include <linux/kvm_host.h>
21 #endif
22 #include <linux/list.h>
23 #include <linux/miscdevice.h>
24 #include <linux/module.h>
25 #include <linux/mutex.h>
26 #include <linux/pci.h>
27 #include <linux/rwsem.h>
28 #include <linux/sched.h>
29 #include <linux/slab.h>
30 #include <linux/stat.h>
31 #include <linux/string.h>
32 #include <linux/uaccess.h>
33 #include <linux/vfio.h>
34 #include <linux/wait.h>
35 #include <linux/sched/signal.h>
36 #include <linux/pm_runtime.h>
37 #include <linux/interval_tree.h>
38 #include <linux/iova_bitmap.h>
39 #include <linux/iommufd.h>
40 #include "vfio.h"
41
42 #define DRIVER_VERSION  "0.3"
43 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
44 #define DRIVER_DESC     "VFIO - User Level meta-driver"
45
46 static struct vfio {
47         struct class                    *device_class;
48         struct ida                      device_ida;
49 } vfio;
50
51 #ifdef CONFIG_VFIO_NOIOMMU
52 bool vfio_noiommu __read_mostly;
53 module_param_named(enable_unsafe_noiommu_mode,
54                    vfio_noiommu, bool, S_IRUGO | S_IWUSR);
55 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
56 #endif
57
58 static DEFINE_XARRAY(vfio_device_set_xa);
59
60 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
61 {
62         unsigned long idx = (unsigned long)set_id;
63         struct vfio_device_set *new_dev_set;
64         struct vfio_device_set *dev_set;
65
66         if (WARN_ON(!set_id))
67                 return -EINVAL;
68
69         /*
70          * Atomically acquire a singleton object in the xarray for this set_id
71          */
72         xa_lock(&vfio_device_set_xa);
73         dev_set = xa_load(&vfio_device_set_xa, idx);
74         if (dev_set)
75                 goto found_get_ref;
76         xa_unlock(&vfio_device_set_xa);
77
78         new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
79         if (!new_dev_set)
80                 return -ENOMEM;
81         mutex_init(&new_dev_set->lock);
82         INIT_LIST_HEAD(&new_dev_set->device_list);
83         new_dev_set->set_id = set_id;
84
85         xa_lock(&vfio_device_set_xa);
86         dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
87                                GFP_KERNEL);
88         if (!dev_set) {
89                 dev_set = new_dev_set;
90                 goto found_get_ref;
91         }
92
93         kfree(new_dev_set);
94         if (xa_is_err(dev_set)) {
95                 xa_unlock(&vfio_device_set_xa);
96                 return xa_err(dev_set);
97         }
98
99 found_get_ref:
100         dev_set->device_count++;
101         xa_unlock(&vfio_device_set_xa);
102         mutex_lock(&dev_set->lock);
103         device->dev_set = dev_set;
104         list_add_tail(&device->dev_set_list, &dev_set->device_list);
105         mutex_unlock(&dev_set->lock);
106         return 0;
107 }
108 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
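
/*
 * Illustrative sketch (not part of this file): a driver whose devices must
 * be reset together can place them in one vfio_device_set by passing a
 * shared pointer as the set_id, e.g. the common parent device.  The "my_"
 * names below are hypothetical:
 *
 *	static int my_vfio_init(struct vfio_device *vdev)
 *	{
 *		// All functions under the same parent share one dev_set
 *		return vfio_assign_device_set(vdev, vdev->dev->parent);
 *	}
 *
 * If no set is assigned before registration, __vfio_register_dev() below
 * falls back to a singleton set keyed by the vfio_device itself.
 */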
109
110 static void vfio_release_device_set(struct vfio_device *device)
111 {
112         struct vfio_device_set *dev_set = device->dev_set;
113
114         if (!dev_set)
115                 return;
116
117         mutex_lock(&dev_set->lock);
118         list_del(&device->dev_set_list);
119         mutex_unlock(&dev_set->lock);
120
121         xa_lock(&vfio_device_set_xa);
122         if (!--dev_set->device_count) {
123                 __xa_erase(&vfio_device_set_xa,
124                            (unsigned long)dev_set->set_id);
125                 mutex_destroy(&dev_set->lock);
126                 kfree(dev_set);
127         }
128         xa_unlock(&vfio_device_set_xa);
129 }
130
131 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
132 {
133         struct vfio_device *cur;
134         unsigned int open_count = 0;
135
136         lockdep_assert_held(&dev_set->lock);
137
138         list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
139                 open_count += cur->open_count;
140         return open_count;
141 }
142 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
143
144 /*
145  * Device objects - create, release, get, put, search
146  */
147 /* Device reference always implies a group reference */
148 void vfio_device_put_registration(struct vfio_device *device)
149 {
150         if (refcount_dec_and_test(&device->refcount))
151                 complete(&device->comp);
152 }
153
154 bool vfio_device_try_get_registration(struct vfio_device *device)
155 {
156         return refcount_inc_not_zero(&device->refcount);
157 }
158
159 /*
160  * VFIO driver API
161  */
162 /* Release helper called by vfio_put_device() */
163 static void vfio_device_release(struct device *dev)
164 {
165         struct vfio_device *device =
166                         container_of(dev, struct vfio_device, device);
167
168         vfio_release_device_set(device);
169         ida_free(&vfio.device_ida, device->index);
170
171         if (device->ops->release)
172                 device->ops->release(device);
173
174         kvfree(device);
175 }
176
177 static int vfio_init_device(struct vfio_device *device, struct device *dev,
178                             const struct vfio_device_ops *ops);
179
180 /*
 181  * Allocate and initialize a vfio_device so it can be registered with the
 182  * vfio core.
 183  *
 184  * Drivers should use the wrapper vfio_alloc_device() for allocation.
 185  * @size is the size of the structure to be allocated, including any
 186  * private data used by the driver.
 187  *
 188  * Drivers may provide an @init callback to initialize device private data.
 189  *
 190  * Use vfio_put_device() to release the structure after a successful return.
191  */
192 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
193                                        const struct vfio_device_ops *ops)
194 {
195         struct vfio_device *device;
196         int ret;
197
198         if (WARN_ON(size < sizeof(struct vfio_device)))
199                 return ERR_PTR(-EINVAL);
200
201         device = kvzalloc(size, GFP_KERNEL);
202         if (!device)
203                 return ERR_PTR(-ENOMEM);
204
205         ret = vfio_init_device(device, dev, ops);
206         if (ret)
207                 goto out_free;
208         return device;
209
210 out_free:
211         kvfree(device);
212         return ERR_PTR(ret);
213 }
214 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
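
/*
 * Illustrative sketch (not part of this file; the "my_" names are
 * hypothetical): drivers embed struct vfio_device as the first member of
 * their own state and allocate it through the vfio_alloc_device() wrapper,
 * which sizes the allocation and returns the containing structure:
 *
 *	struct my_device {
 *		struct vfio_device vdev;	// must be the first member
 *		void __iomem *bar;		// driver private data
 *	};
 *
 *	my = vfio_alloc_device(my_device, vdev, &pdev->dev, &my_vfio_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *
 * The matching release is vfio_put_device(&my->vdev), which drops the
 * reference taken here and lands in vfio_device_release() above.
 */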
215
216 /*
217  * Initialize a vfio_device so it can be registered to vfio core.
218  */
219 static int vfio_init_device(struct vfio_device *device, struct device *dev,
220                             const struct vfio_device_ops *ops)
221 {
222         int ret;
223
224         ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
225         if (ret < 0) {
 226                 dev_dbg(dev, "Failed to allocate device index\n");
227                 return ret;
228         }
229
230         device->index = ret;
231         init_completion(&device->comp);
232         device->dev = dev;
233         device->ops = ops;
234
235         if (ops->init) {
236                 ret = ops->init(device);
237                 if (ret)
238                         goto out_uninit;
239         }
240
241         device_initialize(&device->device);
242         device->device.release = vfio_device_release;
243         device->device.class = vfio.device_class;
244         device->device.parent = device->dev;
245         return 0;
246
247 out_uninit:
248         vfio_release_device_set(device);
249         ida_free(&vfio.device_ida, device->index);
250         return ret;
251 }
252
253 static int __vfio_register_dev(struct vfio_device *device,
254                                enum vfio_group_type type)
255 {
256         int ret;
257
258         if (WARN_ON(device->ops->bind_iommufd &&
259                     (!device->ops->unbind_iommufd ||
260                      !device->ops->attach_ioas)))
261                 return -EINVAL;
262
263         /*
264          * If the driver doesn't specify a set then the device is added to a
265          * singleton set just for itself.
266          */
267         if (!device->dev_set)
268                 vfio_assign_device_set(device, device);
269
270         ret = dev_set_name(&device->device, "vfio%d", device->index);
271         if (ret)
272                 return ret;
273
274         ret = vfio_device_set_group(device, type);
275         if (ret)
276                 return ret;
277
278         ret = device_add(&device->device);
279         if (ret)
280                 goto err_out;
281
282         /* Refcounting can't start until the driver calls register */
283         refcount_set(&device->refcount, 1);
284
285         vfio_device_group_register(device);
286
287         return 0;
288 err_out:
289         vfio_device_remove_group(device);
290         return ret;
291 }
292
293 int vfio_register_group_dev(struct vfio_device *device)
294 {
295         return __vfio_register_dev(device, VFIO_IOMMU);
296 }
297 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
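
/*
 * Illustrative probe flow (not part of this file; the "my_" and "pdev"
 * names are hypothetical): allocate, register, and unwind with
 * vfio_put_device() on failure:
 *
 *	my = vfio_alloc_device(my_device, vdev, &pdev->dev, &my_vfio_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *
 *	ret = vfio_register_group_dev(&my->vdev);
 *	if (ret) {
 *		vfio_put_device(&my->vdev);
 *		return ret;
 *	}
 *	pci_set_drvdata(pdev, my);
 */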
298
299 /*
300  * Register a virtual device without IOMMU backing.  The user of this
301  * device must not be able to directly trigger unmediated DMA.
302  */
303 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
304 {
305         return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
306 }
307 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
308
309 /*
310  * Decrement the device reference count and wait for the device to be
311  * removed.  Open file descriptors for the device... */
312 void vfio_unregister_group_dev(struct vfio_device *device)
313 {
314         unsigned int i = 0;
315         bool interrupted = false;
316         long rc;
317
318         vfio_device_put_registration(device);
319         rc = try_wait_for_completion(&device->comp);
320         while (rc <= 0) {
321                 if (device->ops->request)
322                         device->ops->request(device, i++);
323
324                 if (interrupted) {
325                         rc = wait_for_completion_timeout(&device->comp,
326                                                          HZ * 10);
327                 } else {
328                         rc = wait_for_completion_interruptible_timeout(
329                                 &device->comp, HZ * 10);
330                         if (rc < 0) {
331                                 interrupted = true;
 332                                 dev_warn(device->dev,
 333                                          "Device is currently in use, task \"%s\" (%d) blocked until device is released",
 334                                          current->comm, task_pid_nr(current));
337                         }
338                 }
339         }
340
341         vfio_device_group_unregister(device);
342
343         /* Balances device_add in register path */
344         device_del(&device->device);
345
346         /* Balances vfio_device_set_group in register path */
347         vfio_device_remove_group(device);
348 }
349 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
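
/*
 * Illustrative remove flow (not part of this file; the "my_" names are
 * hypothetical): unregister first, which waits for users as described
 * above, then drop the allocation reference:
 *
 *	static void my_remove(struct pci_dev *pdev)
 *	{
 *		struct my_device *my = pci_get_drvdata(pdev);
 *
 *		vfio_unregister_group_dev(&my->vdev);
 *		vfio_put_device(&my->vdev);
 *	}
 */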
350
351 #ifdef CONFIG_HAVE_KVM
352 void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
353 {
354         void (*pfn)(struct kvm *kvm);
355         bool (*fn)(struct kvm *kvm);
356         bool ret;
357
358         lockdep_assert_held(&device->dev_set->lock);
359
360         pfn = symbol_get(kvm_put_kvm);
361         if (WARN_ON(!pfn))
362                 return;
363
364         fn = symbol_get(kvm_get_kvm_safe);
365         if (WARN_ON(!fn)) {
366                 symbol_put(kvm_put_kvm);
367                 return;
368         }
369
370         ret = fn(kvm);
371         symbol_put(kvm_get_kvm_safe);
372         if (!ret) {
373                 symbol_put(kvm_put_kvm);
374                 return;
375         }
376
377         device->put_kvm = pfn;
378         device->kvm = kvm;
379 }
380
381 void vfio_device_put_kvm(struct vfio_device *device)
382 {
383         lockdep_assert_held(&device->dev_set->lock);
384
385         if (!device->kvm)
386                 return;
387
388         if (WARN_ON(!device->put_kvm))
389                 goto clear;
390
391         device->put_kvm(device->kvm);
392         device->put_kvm = NULL;
393         symbol_put(kvm_put_kvm);
394
395 clear:
396         device->kvm = NULL;
397 }
398 #endif
399
400 /* true if the vfio_device has open_device() called but not close_device() */
401 static bool vfio_assert_device_open(struct vfio_device *device)
402 {
403         return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
404 }
405
406 static int vfio_device_first_open(struct vfio_device *device,
407                                   struct iommufd_ctx *iommufd)
408 {
409         int ret;
410
411         lockdep_assert_held(&device->dev_set->lock);
412
413         if (!try_module_get(device->dev->driver->owner))
414                 return -ENODEV;
415
416         if (iommufd)
417                 ret = vfio_iommufd_bind(device, iommufd);
418         else
419                 ret = vfio_device_group_use_iommu(device);
420         if (ret)
421                 goto err_module_put;
422
423         if (device->ops->open_device) {
424                 ret = device->ops->open_device(device);
425                 if (ret)
426                         goto err_unuse_iommu;
427         }
428         return 0;
429
430 err_unuse_iommu:
431         if (iommufd)
432                 vfio_iommufd_unbind(device);
433         else
434                 vfio_device_group_unuse_iommu(device);
435 err_module_put:
436         module_put(device->dev->driver->owner);
437         return ret;
438 }
439
440 static void vfio_device_last_close(struct vfio_device *device,
441                                    struct iommufd_ctx *iommufd)
442 {
443         lockdep_assert_held(&device->dev_set->lock);
444
445         if (device->ops->close_device)
446                 device->ops->close_device(device);
447         if (iommufd)
448                 vfio_iommufd_unbind(device);
449         else
450                 vfio_device_group_unuse_iommu(device);
451         module_put(device->dev->driver->owner);
452 }
453
454 int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd)
455 {
456         int ret = 0;
457
458         lockdep_assert_held(&device->dev_set->lock);
459
460         device->open_count++;
461         if (device->open_count == 1) {
462                 ret = vfio_device_first_open(device, iommufd);
463                 if (ret)
464                         device->open_count--;
465         }
466
467         return ret;
468 }
469
470 void vfio_device_close(struct vfio_device *device,
471                        struct iommufd_ctx *iommufd)
472 {
473         lockdep_assert_held(&device->dev_set->lock);
474
475         vfio_assert_device_open(device);
476         if (device->open_count == 1)
477                 vfio_device_last_close(device, iommufd);
478         device->open_count--;
479 }
480
481 /*
482  * Wrapper around pm_runtime_resume_and_get().
483  * Return error code on failure or 0 on success.
484  */
485 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
486 {
487         struct device *dev = device->dev;
488
489         if (dev->driver && dev->driver->pm) {
490                 int ret;
491
492                 ret = pm_runtime_resume_and_get(dev);
493                 if (ret) {
494                         dev_info_ratelimited(dev,
495                                 "vfio: runtime resume failed %d\n", ret);
496                         return -EIO;
497                 }
498         }
499
500         return 0;
501 }
502
503 /*
504  * Wrapper around pm_runtime_put().
505  */
506 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
507 {
508         struct device *dev = device->dev;
509
510         if (dev->driver && dev->driver->pm)
511                 pm_runtime_put(dev);
512 }
513
514 /*
515  * VFIO Device fd
516  */
517 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
518 {
519         struct vfio_device *device = filep->private_data;
520
521         vfio_device_group_close(device);
522
523         vfio_device_put_registration(device);
524
525         return 0;
526 }
527
528 /*
529  * vfio_mig_get_next_state - Compute the next step in the FSM
530  * @cur_fsm - The current state the device is in
531  * @new_fsm - The target state to reach
532  * @next_fsm - Pointer to the next step to get to new_fsm
533  *
534  * Return 0 upon success, otherwise -errno
535  * Upon success the next step in the state progression between cur_fsm and
536  * new_fsm will be set in next_fsm.
537  *
538  * This breaks down requests for combination transitions into smaller steps and
539  * returns the next step to get to new_fsm. The function may need to be called
540  * multiple times before reaching new_fsm.
541  *
542  */
543 int vfio_mig_get_next_state(struct vfio_device *device,
544                             enum vfio_device_mig_state cur_fsm,
545                             enum vfio_device_mig_state new_fsm,
546                             enum vfio_device_mig_state *next_fsm)
547 {
548         enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
549         /*
550          * The coding in this table requires the driver to implement the
551          * following FSM arcs:
552          *         RESUMING -> STOP
553          *         STOP -> RESUMING
554          *         STOP -> STOP_COPY
555          *         STOP_COPY -> STOP
556          *
557          * If P2P is supported then the driver must also implement these FSM
558          * arcs:
559          *         RUNNING -> RUNNING_P2P
560          *         RUNNING_P2P -> RUNNING
561          *         RUNNING_P2P -> STOP
562          *         STOP -> RUNNING_P2P
563          *
564          * If precopy is supported then the driver must support these additional
565          * FSM arcs:
566          *         RUNNING -> PRE_COPY
567          *         PRE_COPY -> RUNNING
568          *         PRE_COPY -> STOP_COPY
569          * However, if precopy and P2P are supported together then the driver
570          * must support these additional arcs beyond the P2P arcs above:
571          *         PRE_COPY -> RUNNING
572          *         PRE_COPY -> PRE_COPY_P2P
573          *         PRE_COPY_P2P -> PRE_COPY
574          *         PRE_COPY_P2P -> RUNNING_P2P
575          *         PRE_COPY_P2P -> STOP_COPY
576          *         RUNNING -> PRE_COPY
577          *         RUNNING_P2P -> PRE_COPY_P2P
578          *
579          * Without P2P and precopy the driver must implement:
580          *         RUNNING -> STOP
581          *         STOP -> RUNNING
582          *
583          * The coding will step through multiple states for some combination
584          * transitions; if all optional features are supported, this means the
585          * following ones:
586          *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
587          *         PRE_COPY -> RUNNING -> RUNNING_P2P
588          *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
589          *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
590          *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
591          *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
592          *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
593          *         RESUMING -> STOP -> RUNNING_P2P
594          *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
595          *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
596          *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
597          *         RESUMING -> STOP -> STOP_COPY
598          *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
599          *         RUNNING -> RUNNING_P2P -> STOP
600          *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
601          *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
602          *         RUNNING_P2P -> RUNNING -> PRE_COPY
603          *         RUNNING_P2P -> STOP -> RESUMING
604          *         RUNNING_P2P -> STOP -> STOP_COPY
605          *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
606          *         STOP -> RUNNING_P2P -> RUNNING
607          *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
608          *         STOP_COPY -> STOP -> RESUMING
609          *         STOP_COPY -> STOP -> RUNNING_P2P
610          *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
611          *
612          *  The following transitions are blocked:
613          *         STOP_COPY -> PRE_COPY
614          *         STOP_COPY -> PRE_COPY_P2P
615          */
616         static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
617                 [VFIO_DEVICE_STATE_STOP] = {
618                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
619                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
620                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
621                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
622                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
623                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
624                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
625                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
626                 },
627                 [VFIO_DEVICE_STATE_RUNNING] = {
628                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
629                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
630                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
631                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
632                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
633                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
634                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
635                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
636                 },
637                 [VFIO_DEVICE_STATE_PRE_COPY] = {
638                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
639                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
640                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
641                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
642                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
643                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
644                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
645                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
646                 },
647                 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
648                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
649                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
650                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
651                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
652                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
653                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
654                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
655                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
656                 },
657                 [VFIO_DEVICE_STATE_STOP_COPY] = {
658                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
659                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
660                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
661                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
662                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
663                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
664                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
665                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
666                 },
667                 [VFIO_DEVICE_STATE_RESUMING] = {
668                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
669                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
670                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
671                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
672                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
673                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
674                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
675                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
676                 },
677                 [VFIO_DEVICE_STATE_RUNNING_P2P] = {
678                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
679                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
680                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
681                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
682                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
683                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
684                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
685                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
686                 },
687                 [VFIO_DEVICE_STATE_ERROR] = {
688                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
689                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
690                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
691                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
692                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
693                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
694                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
695                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
696                 },
697         };
698
699         static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
700                 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
701                 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
702                 [VFIO_DEVICE_STATE_PRE_COPY] =
703                         VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
704                 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
705                                                    VFIO_MIGRATION_P2P |
706                                                    VFIO_MIGRATION_PRE_COPY,
707                 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
708                 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
709                 [VFIO_DEVICE_STATE_RUNNING_P2P] =
710                         VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
711                 [VFIO_DEVICE_STATE_ERROR] = ~0U,
712         };
713
714         if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
715                     (state_flags_table[cur_fsm] & device->migration_flags) !=
716                         state_flags_table[cur_fsm]))
717                 return -EINVAL;
718
719         if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
720            (state_flags_table[new_fsm] & device->migration_flags) !=
721                         state_flags_table[new_fsm])
722                 return -EINVAL;
723
724         /*
725          * Arcs touching optional and unsupported states are skipped over. The
726          * driver will instead see an arc from the original state to the next
727          * logical state, as per the above comment.
728          */
729         *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
730         while ((state_flags_table[*next_fsm] & device->migration_flags) !=
731                         state_flags_table[*next_fsm])
732                 *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
733
734         return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
735 }
736 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
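
/*
 * Illustrative sketch of how a driver's migration_set_state() op typically
 * uses this helper (not part of this file; the "my_" names are
 * hypothetical).  Each loop iteration performs exactly one of the FSM arcs
 * listed in the table above:
 *
 *	static struct file *
 *	my_set_state(struct my_device *my, enum vfio_device_mig_state new_state)
 *	{
 *		enum vfio_device_mig_state next_state;
 *		struct file *res = NULL;
 *		int ret;
 *
 *		while (new_state != my->mig_state) {
 *			ret = vfio_mig_get_next_state(&my->vdev, my->mig_state,
 *						      new_state, &next_state);
 *			if (ret) {
 *				res = ERR_PTR(ret);
 *				break;
 *			}
 *			res = my_step_state(my, next_state);	// one FSM arc
 *			if (IS_ERR(res))
 *				break;
 *			my->mig_state = next_state;
 *		}
 *		return res;
 *	}
 */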
737
738 /*
 739  * Convert the driver's struct file into an FD number and return it to userspace
740  */
741 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
742                                    struct vfio_device_feature_mig_state *mig)
743 {
744         int ret;
745         int fd;
746
747         fd = get_unused_fd_flags(O_CLOEXEC);
748         if (fd < 0) {
749                 ret = fd;
750                 goto out_fput;
751         }
752
753         mig->data_fd = fd;
754         if (copy_to_user(arg, mig, sizeof(*mig))) {
755                 ret = -EFAULT;
756                 goto out_put_unused;
757         }
758         fd_install(fd, filp);
759         return 0;
760
761 out_put_unused:
762         put_unused_fd(fd);
763 out_fput:
764         fput(filp);
765         return ret;
766 }
767
768 static int
769 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
770                                            u32 flags, void __user *arg,
771                                            size_t argsz)
772 {
773         size_t minsz =
774                 offsetofend(struct vfio_device_feature_mig_state, data_fd);
775         struct vfio_device_feature_mig_state mig;
776         struct file *filp = NULL;
777         int ret;
778
779         if (!device->mig_ops)
780                 return -ENOTTY;
781
782         ret = vfio_check_feature(flags, argsz,
783                                  VFIO_DEVICE_FEATURE_SET |
784                                  VFIO_DEVICE_FEATURE_GET,
785                                  sizeof(mig));
786         if (ret != 1)
787                 return ret;
788
789         if (copy_from_user(&mig, arg, minsz))
790                 return -EFAULT;
791
792         if (flags & VFIO_DEVICE_FEATURE_GET) {
793                 enum vfio_device_mig_state curr_state;
794
795                 ret = device->mig_ops->migration_get_state(device,
796                                                            &curr_state);
797                 if (ret)
798                         return ret;
799                 mig.device_state = curr_state;
800                 goto out_copy;
801         }
802
803         /* Handle the VFIO_DEVICE_FEATURE_SET */
804         filp = device->mig_ops->migration_set_state(device, mig.device_state);
805         if (IS_ERR(filp) || !filp)
806                 goto out_copy;
807
808         return vfio_ioct_mig_return_fd(filp, arg, &mig);
809 out_copy:
810         mig.data_fd = -1;
811         if (copy_to_user(arg, &mig, sizeof(mig)))
812                 return -EFAULT;
813         if (IS_ERR(filp))
814                 return PTR_ERR(filp);
815         return 0;
816 }
817
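/*
 * Illustrative userspace sketch of driving this ioctl (not part of this
 * file; "device_fd" is an assumed open VFIO device fd, uapi structs come
 * from <linux/vfio.h>).  Moving the device to STOP_COPY returns a data_fd
 * that streams the migration data:
 *
 *	struct {
 *		struct vfio_device_feature hdr;
 *		struct vfio_device_feature_mig_state st;
 *	} set = {
 *		.hdr = {
 *			.argsz = sizeof(set),
 *			.flags = VFIO_DEVICE_FEATURE_SET |
 *				 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE,
 *		},
 *		.st.device_state = VFIO_DEVICE_STATE_STOP_COPY,
 *	};
 *
 *	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, &set))
 *		return -errno;
 *	// set.st.data_fd can now be read to save the device state
 */
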
818 static int
819 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
820                                               u32 flags, void __user *arg,
821                                               size_t argsz)
822 {
823         struct vfio_device_feature_mig_data_size data_size = {};
824         unsigned long stop_copy_length;
825         int ret;
826
827         if (!device->mig_ops)
828                 return -ENOTTY;
829
830         ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
831                                  sizeof(data_size));
832         if (ret != 1)
833                 return ret;
834
835         ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
836         if (ret)
837                 return ret;
838
839         data_size.stop_copy_length = stop_copy_length;
840         if (copy_to_user(arg, &data_size, sizeof(data_size)))
841                 return -EFAULT;
842
843         return 0;
844 }
845
846 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
847                                                u32 flags, void __user *arg,
848                                                size_t argsz)
849 {
850         struct vfio_device_feature_migration mig = {
851                 .flags = device->migration_flags,
852         };
853         int ret;
854
855         if (!device->mig_ops)
856                 return -ENOTTY;
857
858         ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
859                                  sizeof(mig));
860         if (ret != 1)
861                 return ret;
862         if (copy_to_user(arg, &mig, sizeof(mig)))
863                 return -EFAULT;
864         return 0;
865 }
866
867 /* Ranges should fit into a single kernel page */
868 #define LOG_MAX_RANGES \
869         (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
870
871 static int
872 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
873                                         u32 flags, void __user *arg,
874                                         size_t argsz)
875 {
876         size_t minsz =
877                 offsetofend(struct vfio_device_feature_dma_logging_control,
878                             ranges);
879         struct vfio_device_feature_dma_logging_range __user *ranges;
880         struct vfio_device_feature_dma_logging_control control;
881         struct vfio_device_feature_dma_logging_range range;
882         struct rb_root_cached root = RB_ROOT_CACHED;
883         struct interval_tree_node *nodes;
884         u64 iova_end;
885         u32 nnodes;
886         int i, ret;
887
888         if (!device->log_ops)
889                 return -ENOTTY;
890
891         ret = vfio_check_feature(flags, argsz,
892                                  VFIO_DEVICE_FEATURE_SET,
893                                  sizeof(control));
894         if (ret != 1)
895                 return ret;
896
897         if (copy_from_user(&control, arg, minsz))
898                 return -EFAULT;
899
900         nnodes = control.num_ranges;
901         if (!nnodes)
902                 return -EINVAL;
903
904         if (nnodes > LOG_MAX_RANGES)
905                 return -E2BIG;
906
907         ranges = u64_to_user_ptr(control.ranges);
908         nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
909                               GFP_KERNEL);
910         if (!nodes)
911                 return -ENOMEM;
912
913         for (i = 0; i < nnodes; i++) {
914                 if (copy_from_user(&range, &ranges[i], sizeof(range))) {
915                         ret = -EFAULT;
916                         goto end;
917                 }
918                 if (!IS_ALIGNED(range.iova, control.page_size) ||
919                     !IS_ALIGNED(range.length, control.page_size)) {
920                         ret = -EINVAL;
921                         goto end;
922                 }
923
924                 if (check_add_overflow(range.iova, range.length, &iova_end) ||
925                     iova_end > ULONG_MAX) {
926                         ret = -EOVERFLOW;
927                         goto end;
928                 }
929
930                 nodes[i].start = range.iova;
931                 nodes[i].last = range.iova + range.length - 1;
932                 if (interval_tree_iter_first(&root, nodes[i].start,
933                                              nodes[i].last)) {
934                         /* Range overlapping */
935                         ret = -EINVAL;
936                         goto end;
937                 }
938                 interval_tree_insert(nodes + i, &root);
939         }
940
941         ret = device->log_ops->log_start(device, &root, nnodes,
942                                          &control.page_size);
943         if (ret)
944                 goto end;
945
946         if (copy_to_user(arg, &control, sizeof(control))) {
947                 ret = -EFAULT;
948                 device->log_ops->log_stop(device);
949         }
950
951 end:
952         kfree(nodes);
953         return ret;
954 }
955
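/*
 * Illustrative userspace sketch of starting dirty tracking (not part of
 * this file; "device_fd", "ram_iova" and "ram_size" are assumed, uapi
 * structs come from <linux/vfio.h>):
 *
 *	struct vfio_device_feature_dma_logging_range range = {
 *		.iova = ram_iova,
 *		.length = ram_size,
 *	};
 *	struct {
 *		struct vfio_device_feature hdr;
 *		struct vfio_device_feature_dma_logging_control ctrl;
 *	} start = {
 *		.hdr = {
 *			.argsz = sizeof(start),
 *			.flags = VFIO_DEVICE_FEATURE_SET |
 *				 VFIO_DEVICE_FEATURE_DMA_LOGGING_START,
 *		},
 *		.ctrl = {
 *			.page_size = 4096,
 *			.num_ranges = 1,
 *			.ranges = (uintptr_t)&range,
 *		},
 *	};
 *
 *	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, &start))
 *		return -errno;
 */
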
956 static int
957 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
958                                        u32 flags, void __user *arg,
959                                        size_t argsz)
960 {
961         int ret;
962
963         if (!device->log_ops)
964                 return -ENOTTY;
965
966         ret = vfio_check_feature(flags, argsz,
967                                  VFIO_DEVICE_FEATURE_SET, 0);
968         if (ret != 1)
969                 return ret;
970
971         return device->log_ops->log_stop(device);
972 }
973
974 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
975                                           unsigned long iova, size_t length,
976                                           void *opaque)
977 {
978         struct vfio_device *device = opaque;
979
980         return device->log_ops->log_read_and_clear(device, iova, length, iter);
981 }
982
983 static int
984 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
985                                          u32 flags, void __user *arg,
986                                          size_t argsz)
987 {
988         size_t minsz =
989                 offsetofend(struct vfio_device_feature_dma_logging_report,
990                             bitmap);
991         struct vfio_device_feature_dma_logging_report report;
992         struct iova_bitmap *iter;
993         u64 iova_end;
994         int ret;
995
996         if (!device->log_ops)
997                 return -ENOTTY;
998
999         ret = vfio_check_feature(flags, argsz,
1000                                  VFIO_DEVICE_FEATURE_GET,
1001                                  sizeof(report));
1002         if (ret != 1)
1003                 return ret;
1004
1005         if (copy_from_user(&report, arg, minsz))
1006                 return -EFAULT;
1007
1008         if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1009                 return -EINVAL;
1010
1011         if (check_add_overflow(report.iova, report.length, &iova_end) ||
1012             iova_end > ULONG_MAX)
1013                 return -EOVERFLOW;
1014
1015         iter = iova_bitmap_alloc(report.iova, report.length,
1016                                  report.page_size,
1017                                  u64_to_user_ptr(report.bitmap));
1018         if (IS_ERR(iter))
1019                 return PTR_ERR(iter);
1020
1021         ret = iova_bitmap_for_each(iter, device,
1022                                    vfio_device_log_read_and_clear);
1023
1024         iova_bitmap_free(iter);
1025         return ret;
1026 }
1027
1028 static int vfio_ioctl_device_feature(struct vfio_device *device,
1029                                      struct vfio_device_feature __user *arg)
1030 {
1031         size_t minsz = offsetofend(struct vfio_device_feature, flags);
1032         struct vfio_device_feature feature;
1033
1034         if (copy_from_user(&feature, arg, minsz))
1035                 return -EFAULT;
1036
1037         if (feature.argsz < minsz)
1038                 return -EINVAL;
1039
1040         /* Check unknown flags */
1041         if (feature.flags &
1042             ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1043               VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1044                 return -EINVAL;
1045
1046         /* GET & SET are mutually exclusive except with PROBE */
1047         if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1048             (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1049             (feature.flags & VFIO_DEVICE_FEATURE_GET))
1050                 return -EINVAL;
1051
1052         switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1053         case VFIO_DEVICE_FEATURE_MIGRATION:
1054                 return vfio_ioctl_device_feature_migration(
1055                         device, feature.flags, arg->data,
1056                         feature.argsz - minsz);
1057         case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1058                 return vfio_ioctl_device_feature_mig_device_state(
1059                         device, feature.flags, arg->data,
1060                         feature.argsz - minsz);
1061         case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1062                 return vfio_ioctl_device_feature_logging_start(
1063                         device, feature.flags, arg->data,
1064                         feature.argsz - minsz);
1065         case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1066                 return vfio_ioctl_device_feature_logging_stop(
1067                         device, feature.flags, arg->data,
1068                         feature.argsz - minsz);
1069         case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1070                 return vfio_ioctl_device_feature_logging_report(
1071                         device, feature.flags, arg->data,
1072                         feature.argsz - minsz);
1073         case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1074                 return vfio_ioctl_device_feature_migration_data_size(
1075                         device, feature.flags, arg->data,
1076                         feature.argsz - minsz);
1077         default:
1078                 if (unlikely(!device->ops->device_feature))
1079                         return -EINVAL;
1080                 return device->ops->device_feature(device, feature.flags,
1081                                                    arg->data,
1082                                                    feature.argsz - minsz);
1083         }
1084 }
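
/*
 * Illustrative userspace sketch of the feature ioctl (not part of this
 * file; "device_fd" is an assumed open VFIO device fd).  Querying the
 * MIGRATION feature returns the migration_flags reported by the handler
 * above:
 *
 *	struct {
 *		struct vfio_device_feature hdr;
 *		struct vfio_device_feature_migration mig;
 *	} query = {
 *		.hdr = {
 *			.argsz = sizeof(query),
 *			.flags = VFIO_DEVICE_FEATURE_GET |
 *				 VFIO_DEVICE_FEATURE_MIGRATION,
 *		},
 *	};
 *
 *	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, &query) == 0 &&
 *	    (query.mig.flags & VFIO_MIGRATION_STOP_COPY))
 *		;	// device supports STOP_COPY based migration
 */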
1085
1086 static long vfio_device_fops_unl_ioctl(struct file *filep,
1087                                        unsigned int cmd, unsigned long arg)
1088 {
1089         struct vfio_device *device = filep->private_data;
1090         int ret;
1091
1092         ret = vfio_device_pm_runtime_get(device);
1093         if (ret)
1094                 return ret;
1095
1096         switch (cmd) {
1097         case VFIO_DEVICE_FEATURE:
1098                 ret = vfio_ioctl_device_feature(device, (void __user *)arg);
1099                 break;
1100
1101         default:
1102                 if (unlikely(!device->ops->ioctl))
1103                         ret = -EINVAL;
1104                 else
1105                         ret = device->ops->ioctl(device, cmd, arg);
1106                 break;
1107         }
1108
1109         vfio_device_pm_runtime_put(device);
1110         return ret;
1111 }
1112
1113 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1114                                      size_t count, loff_t *ppos)
1115 {
1116         struct vfio_device *device = filep->private_data;
1117
1118         if (unlikely(!device->ops->read))
1119                 return -EINVAL;
1120
1121         return device->ops->read(device, buf, count, ppos);
1122 }
1123
1124 static ssize_t vfio_device_fops_write(struct file *filep,
1125                                       const char __user *buf,
1126                                       size_t count, loff_t *ppos)
1127 {
1128         struct vfio_device *device = filep->private_data;
1129
1130         if (unlikely(!device->ops->write))
1131                 return -EINVAL;
1132
1133         return device->ops->write(device, buf, count, ppos);
1134 }
1135
1136 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1137 {
1138         struct vfio_device *device = filep->private_data;
1139
1140         if (unlikely(!device->ops->mmap))
1141                 return -EINVAL;
1142
1143         return device->ops->mmap(device, vma);
1144 }
1145
1146 const struct file_operations vfio_device_fops = {
1147         .owner          = THIS_MODULE,
1148         .release        = vfio_device_fops_release,
1149         .read           = vfio_device_fops_read,
1150         .write          = vfio_device_fops_write,
1151         .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1152         .compat_ioctl   = compat_ptr_ioctl,
1153         .mmap           = vfio_device_fops_mmap,
1154 };
1155
1156 /*
1157  * Sub-module support
1158  */
1159 /*
 1160  * Helper for managing a buffer of info chain capabilities; allocates or
 1161  * reallocates a buffer with additional @size, filling in @id and @version
 1162  * of the capability.  A pointer to the new capability is returned.
 1163  *
 1164  * NB. The chain is based at the head of the buffer, so new entries are
 1165  * added to the tail; vfio_info_cap_shift() should be called to fix up the
1166  * next offsets prior to copying to the user buffer.
1167  */
1168 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1169                                                size_t size, u16 id, u16 version)
1170 {
1171         void *buf;
1172         struct vfio_info_cap_header *header, *tmp;
1173
1174         buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1175         if (!buf) {
1176                 kfree(caps->buf);
1177                 caps->buf = NULL;
1178                 caps->size = 0;
1179                 return ERR_PTR(-ENOMEM);
1180         }
1181
1182         caps->buf = buf;
1183         header = buf + caps->size;
1184
1185         /* Eventually copied to user buffer, zero */
1186         memset(header, 0, size);
1187
1188         header->id = id;
1189         header->version = version;
1190
1191         /* Add to the end of the capability chain */
1192         for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1193                 ; /* nothing */
1194
1195         tmp->next = caps->size;
1196         caps->size += size;
1197
1198         return header;
1199 }
1200 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1201
1202 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1203 {
1204         struct vfio_info_cap_header *tmp;
1205         void *buf = (void *)caps->buf;
1206
1207         for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1208                 tmp->next += offset;
1209 }
1210 EXPORT_SYMBOL(vfio_info_cap_shift);
1211
1212 int vfio_info_add_capability(struct vfio_info_cap *caps,
1213                              struct vfio_info_cap_header *cap, size_t size)
1214 {
1215         struct vfio_info_cap_header *header;
1216
1217         header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1218         if (IS_ERR(header))
1219                 return PTR_ERR(header);
1220
1221         memcpy(header + 1, cap + 1, size - sizeof(*header));
1222
1223         return 0;
1224 }
1225 EXPORT_SYMBOL(vfio_info_add_capability);
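
/*
 * Illustrative sketch of how a driver builds and returns a capability
 * chain from a VFIO_DEVICE_GET_REGION_INFO handler (not part of this file;
 * "info", "arg" (the ioctl's void __user * payload), "cap_hdr" and
 * "cap_size" are assumed, following the vfio-pci pattern):
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &cap_hdr, cap_size);
 *	if (ret)
 *		return ret;
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		if (info.argsz < sizeof(info) + caps.size) {
 *			info.argsz = sizeof(info) + caps.size;
 *			info.cap_offset = 0;
 *		} else {
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user(arg + sizeof(info), caps.buf,
 *					 caps.size)) {
 *				kfree(caps.buf);
 *				return -EFAULT;
 *			}
 *			info.cap_offset = sizeof(info);
 *		}
 *		kfree(caps.buf);
 *	}
 */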
1226
1227 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1228                                        int max_irq_type, size_t *data_size)
1229 {
1230         unsigned long minsz;
1231         size_t size;
1232
1233         minsz = offsetofend(struct vfio_irq_set, count);
1234
1235         if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1236             (hdr->count >= (U32_MAX - hdr->start)) ||
1237             (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1238                                 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1239                 return -EINVAL;
1240
1241         if (data_size)
1242                 *data_size = 0;
1243
1244         if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1245                 return -EINVAL;
1246
1247         switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1248         case VFIO_IRQ_SET_DATA_NONE:
1249                 size = 0;
1250                 break;
1251         case VFIO_IRQ_SET_DATA_BOOL:
1252                 size = sizeof(uint8_t);
1253                 break;
1254         case VFIO_IRQ_SET_DATA_EVENTFD:
1255                 size = sizeof(int32_t);
1256                 break;
1257         default:
1258                 return -EINVAL;
1259         }
1260
1261         if (size) {
1262                 if (hdr->argsz - minsz < hdr->count * size)
1263                         return -EINVAL;
1264
1265                 if (!data_size)
1266                         return -EINVAL;
1267
1268                 *data_size = hdr->count * size;
1269         }
1270
1271         return 0;
1272 }
1273 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
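
/*
 * Illustrative sketch of how a driver's VFIO_DEVICE_SET_IRQS handler uses
 * this helper (not part of this file; "MY_NUM_IRQS", "minsz" and "arg" are
 * assumed, following the vfio-pci pattern):
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	int ret;
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, MY_NUM_IRQS,
 *						 VFIO_PCI_NUM_IRQS, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */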
1274
1275 /*
1276  * Pin contiguous user pages and return their associated host pages for local
1277  * domain only.
1278  * @device [in]  : device
1279  * @iova [in]    : starting IOVA of user pages to be pinned.
1280  * @npage [in]   : count of pages to be pinned.  This count should not
1281  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1282  * @prot [in]    : protection flags
1283  * @pages[out]   : array of host pages
1284  * Return error or number of pages pinned.
1285  *
 1286  * A driver may only call this function if the vfio_device was created
 1287  * by vfio_register_emulated_iommu_dev().  Pinning is then serviced by
 1288  * vfio_device_container_pin_pages() or an iommufd emulated access.
1288  */
1289 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1290                    int npage, int prot, struct page **pages)
1291 {
1292         /* group->container cannot change while a vfio device is open */
1293         if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1294                 return -EINVAL;
1295         if (vfio_device_has_container(device))
1296                 return vfio_device_container_pin_pages(device, iova,
1297                                                        npage, prot, pages);
1298         if (device->iommufd_access) {
1299                 int ret;
1300
1301                 if (iova > ULONG_MAX)
1302                         return -EINVAL;
1303                 /*
 1304                  * VFIO ignores the sub-page offset; @npage counts PAGE_SIZE chunks
 1305                  * starting from the page containing @iova.  The caller is expected
 1306                  * to recover the sub-page offset by doing:
1307                  *     pages[0] + (iova % PAGE_SIZE)
1308                  */
1309                 ret = iommufd_access_pin_pages(
1310                         device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1311                         npage * PAGE_SIZE, pages,
1312                         (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1313                 if (ret)
1314                         return ret;
1315                 return npage;
1316         }
1317         return -EINVAL;
1318 }
1319 EXPORT_SYMBOL(vfio_pin_pages);
1320
1321 /*
1322  * Unpin contiguous host pages for local domain only.
1323  * @device [in]  : device
1324  * @iova [in]    : starting address of user pages to be unpinned.
1325  * @npage [in]   : count of pages to be unpinned.  This count should not
1326  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1327  */
1328 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1329 {
1330         if (WARN_ON(!vfio_assert_device_open(device)))
1331                 return;
1332
1333         if (vfio_device_has_container(device)) {
1334                 vfio_device_container_unpin_pages(device, iova, npage);
1335                 return;
1336         }
1337         if (device->iommufd_access) {
1338                 if (WARN_ON(iova > ULONG_MAX))
1339                         return;
1340                 iommufd_access_unpin_pages(device->iommufd_access,
1341                                            ALIGN_DOWN(iova, PAGE_SIZE),
1342                                            npage * PAGE_SIZE);
1343                 return;
1344         }
1345 }
1346 EXPORT_SYMBOL(vfio_unpin_pages);
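
/*
 * Illustrative sketch of an emulated-IOMMU (mdev-style) driver touching one
 * page of guest memory through these helpers (not part of this file;
 * "vdev", "iova", "buf" and "len" are assumed, and the access must not
 * cross a page boundary):
 *
 *	struct page *page;
 *	void *vaddr;
 *	int ret;
 *
 *	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &page);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	vaddr = kmap_local_page(page);
 *	memcpy(buf, vaddr + offset_in_page(iova), len);
 *	kunmap_local(vaddr);
 *
 *	vfio_unpin_pages(vdev, iova, 1);
 */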
1347
1348 /*
 1349  * This interface allows the CPUs to perform DMA on behalf of the device,
 1350  * i.e. the device's DMA is emulated in software by the kernel.
 1351  *
 1352  * The CPU copies data between a kernel buffer and a range of IOVAs that
 1353  * map user space memory.
1354  *
1355  * As the read/write of user space memory is conducted via the CPUs and is
1356  * not a real device DMA, it is not necessary to pin the user space memory.
1357  *
1358  * @device [in]         : VFIO device
1359  * @iova [in]           : base IOVA of a user space buffer
1360  * @data [in]           : pointer to kernel buffer
1361  * @len [in]            : kernel buffer length
 1362  * @write [in]          : true to write @data to @iova, false to read
1363  * Return error code on failure or 0 on success.
1364  */
1365 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1366                 size_t len, bool write)
1367 {
1368         if (!data || len <= 0 || !vfio_assert_device_open(device))
1369                 return -EINVAL;
1370
1371         if (vfio_device_has_container(device))
1372                 return vfio_device_container_dma_rw(device, iova,
1373                                                     data, len, write);
1374
1375         if (device->iommufd_access) {
1376                 unsigned int flags = 0;
1377
1378                 if (iova > ULONG_MAX)
1379                         return -EINVAL;
1380
1381                 /* VFIO historically tries to auto-detect a kthread */
1382                 if (!current->mm)
1383                         flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1384                 if (write)
1385                         flags |= IOMMUFD_ACCESS_RW_WRITE;
1386                 return iommufd_access_rw(device->iommufd_access, iova, data,
1387                                          len, flags);
1388         }
1389         return -EINVAL;
1390 }
1391 EXPORT_SYMBOL(vfio_dma_rw);
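
/*
 * Illustrative sketch (not part of this file): a driver reading a small
 * descriptor that the guest placed at "desc_iova", then writing it back
 * ("vdev", "desc_iova" and struct my_desc are assumed):
 *
 *	struct my_desc desc;
 *	int ret;
 *
 *	ret = vfio_dma_rw(vdev, desc_iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 *
 *	// ... update desc ...
 *
 *	ret = vfio_dma_rw(vdev, desc_iova, &desc, sizeof(desc), true);
 */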
1392
1393 /*
1394  * Module/class support
1395  */
1396 static int __init vfio_init(void)
1397 {
1398         int ret;
1399
1400         ida_init(&vfio.device_ida);
1401
1402         ret = vfio_group_init();
1403         if (ret)
1404                 return ret;
1405
1406         ret = vfio_virqfd_init();
1407         if (ret)
1408                 goto err_virqfd;
1409
1410         /* /sys/class/vfio-dev/vfioX */
1411         vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
1412         if (IS_ERR(vfio.device_class)) {
1413                 ret = PTR_ERR(vfio.device_class);
1414                 goto err_dev_class;
1415         }
1416
1417         pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1418         return 0;
1419
1420 err_dev_class:
1421         vfio_virqfd_exit();
1422 err_virqfd:
1423         vfio_group_cleanup();
1424         return ret;
1425 }
1426
1427 static void __exit vfio_cleanup(void)
1428 {
1429         ida_destroy(&vfio.device_ida);
1430         class_destroy(vfio.device_class);
1431         vfio.device_class = NULL;
1432         vfio_virqfd_exit();
1433         vfio_group_cleanup();
1434         xa_destroy(&vfio_device_set_xa);
1435 }
1436
1437 module_init(vfio_init);
1438 module_exit(vfio_cleanup);
1439
1440 MODULE_VERSION(DRIVER_VERSION);
1441 MODULE_LICENSE("GPL v2");
1442 MODULE_AUTHOR(DRIVER_AUTHOR);
1443 MODULE_DESCRIPTION(DRIVER_DESC);
1444 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");