drivers/vfio/vfio_main.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #include <linux/list.h>
20 #include <linux/miscdevice.h>
21 #include <linux/module.h>
22 #include <linux/mutex.h>
23 #include <linux/pci.h>
24 #include <linux/rwsem.h>
25 #include <linux/sched.h>
26 #include <linux/slab.h>
27 #include <linux/stat.h>
28 #include <linux/string.h>
29 #include <linux/uaccess.h>
30 #include <linux/vfio.h>
31 #include <linux/wait.h>
32 #include <linux/sched/signal.h>
33 #include <linux/pm_runtime.h>
34 #include <linux/interval_tree.h>
35 #include <linux/iova_bitmap.h>
36 #include <linux/iommufd.h>
37 #include "vfio.h"
38
39 #define DRIVER_VERSION  "0.3"
40 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
41 #define DRIVER_DESC     "VFIO - User Level meta-driver"
42
43 static struct vfio {
44         struct class                    *device_class;
45         struct ida                      device_ida;
46 } vfio;
47
48 #ifdef CONFIG_VFIO_NOIOMMU
49 bool vfio_noiommu __read_mostly;
50 module_param_named(enable_unsafe_noiommu_mode,
51                    vfio_noiommu, bool, S_IRUGO | S_IWUSR);
52 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
53 #endif
54
55 static DEFINE_XARRAY(vfio_device_set_xa);
56
57 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
58 {
59         unsigned long idx = (unsigned long)set_id;
60         struct vfio_device_set *new_dev_set;
61         struct vfio_device_set *dev_set;
62
63         if (WARN_ON(!set_id))
64                 return -EINVAL;
65
66         /*
67          * Atomically acquire a singleton object in the xarray for this set_id
68          */
69         xa_lock(&vfio_device_set_xa);
70         dev_set = xa_load(&vfio_device_set_xa, idx);
71         if (dev_set)
72                 goto found_get_ref;
73         xa_unlock(&vfio_device_set_xa);
74
75         new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
76         if (!new_dev_set)
77                 return -ENOMEM;
78         mutex_init(&new_dev_set->lock);
79         INIT_LIST_HEAD(&new_dev_set->device_list);
80         new_dev_set->set_id = set_id;
81
82         xa_lock(&vfio_device_set_xa);
83         dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
84                                GFP_KERNEL);
85         if (!dev_set) {
86                 dev_set = new_dev_set;
87                 goto found_get_ref;
88         }
89
90         kfree(new_dev_set);
91         if (xa_is_err(dev_set)) {
92                 xa_unlock(&vfio_device_set_xa);
93                 return xa_err(dev_set);
94         }
95
96 found_get_ref:
97         dev_set->device_count++;
98         xa_unlock(&vfio_device_set_xa);
99         mutex_lock(&dev_set->lock);
100         device->dev_set = dev_set;
101         list_add_tail(&device->dev_set_list, &dev_set->device_list);
102         mutex_unlock(&dev_set->lock);
103         return 0;
104 }
105 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
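/*
 * Editor's sketch (illustrative, not part of this file): how a driver might
 * group several vfio_devices that must be handled together into a single
 * vfio_device_set by passing the same set_id pointer for each of them before
 * registration, e.g. from its @init callback.  Devices sharing a reset
 * domain (such as two functions of one physical device) are the typical
 * candidates; here the shared parent device pointer serves as the set_id.
 * "my_init_dev_set" is hypothetical.
 */
static int my_init_dev_set(struct vfio_device *vdev)
{
        /*
         * Every function with the same parent lands in the same dev_set; a
         * driver that skips this gets a singleton set per device instead,
         * as __vfio_register_dev() below arranges.
         */
        return vfio_assign_device_set(vdev, vdev->dev->parent);
}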
106
107 static void vfio_release_device_set(struct vfio_device *device)
108 {
109         struct vfio_device_set *dev_set = device->dev_set;
110
111         if (!dev_set)
112                 return;
113
114         mutex_lock(&dev_set->lock);
115         list_del(&device->dev_set_list);
116         mutex_unlock(&dev_set->lock);
117
118         xa_lock(&vfio_device_set_xa);
119         if (!--dev_set->device_count) {
120                 __xa_erase(&vfio_device_set_xa,
121                            (unsigned long)dev_set->set_id);
122                 mutex_destroy(&dev_set->lock);
123                 kfree(dev_set);
124         }
125         xa_unlock(&vfio_device_set_xa);
126 }
127
128 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
129 {
130         struct vfio_device *cur;
131         unsigned int open_count = 0;
132
133         lockdep_assert_held(&dev_set->lock);
134
135         list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
136                 open_count += cur->open_count;
137         return open_count;
138 }
139 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
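/*
 * Editor's sketch: vfio_device_set_open_count() must be called with
 * dev_set->lock held.  vfio_device_open()/vfio_device_close() below hold
 * that lock around the open_device()/close_device() callbacks, so a driver
 * can use the helper there to detect the first open or last close across
 * the whole set, e.g. to perform a set-wide reset exactly once.
 * "my_set_reset" and "my_open_device" are hypothetical.
 */
static void my_set_reset(struct vfio_device_set *dev_set);      /* hypothetical */

static int my_open_device(struct vfio_device *vdev)
{
        /* dev_set->lock is already held by vfio_device_open() */
        if (vfio_device_set_open_count(vdev->dev_set) == 1)
                my_set_reset(vdev->dev_set);
        return 0;
}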
140
141 /*
142  * Device objects - create, release, get, put, search
143  */
144 /* Device reference always implies a group reference */
145 void vfio_device_put_registration(struct vfio_device *device)
146 {
147         if (refcount_dec_and_test(&device->refcount))
148                 complete(&device->comp);
149 }
150
151 bool vfio_device_try_get_registration(struct vfio_device *device)
152 {
153         return refcount_inc_not_zero(&device->refcount);
154 }
155
156 /*
157  * VFIO driver API
158  */
159 /* Release helper called by vfio_put_device() */
160 static void vfio_device_release(struct device *dev)
161 {
162         struct vfio_device *device =
163                         container_of(dev, struct vfio_device, device);
164
165         vfio_release_device_set(device);
166         ida_free(&vfio.device_ida, device->index);
167
168         if (device->ops->release)
169                 device->ops->release(device);
170
171         kvfree(device);
172 }
173
174 static int vfio_init_device(struct vfio_device *device, struct device *dev,
175                             const struct vfio_device_ops *ops);
176
177 /*
178  * Allocate and initialize vfio_device so it can be registered to vfio
179  * core.
180  *
181  * Drivers should use the wrapper vfio_alloc_device() for allocation.
182  * @size is the size of the structure to be allocated, including any
183  * private data used by the driver.
184  *
185  * Driver may provide an @init callback to cover device private data.
186  *
187  * Use vfio_put_device() to release the structure after a successful return.
188  */
189 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
190                                        const struct vfio_device_ops *ops)
191 {
192         struct vfio_device *device;
193         int ret;
194
195         if (WARN_ON(size < sizeof(struct vfio_device)))
196                 return ERR_PTR(-EINVAL);
197
198         device = kvzalloc(size, GFP_KERNEL);
199         if (!device)
200                 return ERR_PTR(-ENOMEM);
201
202         ret = vfio_init_device(device, dev, ops);
203         if (ret)
204                 goto out_free;
205         return device;
206
207 out_free:
208         kvfree(device);
209         return ERR_PTR(ret);
210 }
211 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
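/*
 * Editor's sketch of the allocation pattern described above: a driver embeds
 * struct vfio_device in its own structure, lets vfio_alloc_device() size and
 * initialize it (calling the optional @init callback for the private data),
 * and releases it with vfio_put_device() if registration fails.
 * "struct my_vfio_device", "my_dev_ops" and "my_probe" are hypothetical.
 */
struct my_vfio_device {
        struct vfio_device vdev;        /* embedded, passed as "member" below */
        void *priv;                     /* driver private data set up in @init */
};

static int my_init(struct vfio_device *vdev)
{
        struct my_vfio_device *my =
                container_of(vdev, struct my_vfio_device, vdev);

        my->priv = NULL;                /* initialize private state here */
        return 0;
}

static const struct vfio_device_ops my_dev_ops = {
        .init = my_init,
        /* .release, .open_device, .close_device, ... */
};

static int my_probe(struct device *dev)
{
        struct my_vfio_device *my;
        int ret;

        my = vfio_alloc_device(my_vfio_device, vdev, dev, &my_dev_ops);
        if (IS_ERR(my))
                return PTR_ERR(my);

        ret = vfio_register_group_dev(&my->vdev);
        if (ret)
                vfio_put_device(&my->vdev);     /* drops the allocation ref */
        return ret;
}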
212
213 /*
214  * Initialize a vfio_device so it can be registered to vfio core.
215  */
216 static int vfio_init_device(struct vfio_device *device, struct device *dev,
217                             const struct vfio_device_ops *ops)
218 {
219         int ret;
220
221         ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
222         if (ret < 0) {
223                 dev_dbg(dev, "Failed to alloc index\n");
224                 return ret;
225         }
226
227         device->index = ret;
228         init_completion(&device->comp);
229         device->dev = dev;
230         device->ops = ops;
231
232         if (ops->init) {
233                 ret = ops->init(device);
234                 if (ret)
235                         goto out_uninit;
236         }
237
238         device_initialize(&device->device);
239         device->device.release = vfio_device_release;
240         device->device.class = vfio.device_class;
241         device->device.parent = device->dev;
242         return 0;
243
244 out_uninit:
245         vfio_release_device_set(device);
246         ida_free(&vfio.device_ida, device->index);
247         return ret;
248 }
249
250 static int __vfio_register_dev(struct vfio_device *device,
251                                enum vfio_group_type type)
252 {
253         int ret;
254
255         if (WARN_ON(device->ops->bind_iommufd &&
256                     (!device->ops->unbind_iommufd ||
257                      !device->ops->attach_ioas)))
258                 return -EINVAL;
259
260         /*
261          * If the driver doesn't specify a set then the device is added to a
262          * singleton set just for itself.
263          */
264         if (!device->dev_set)
265                 vfio_assign_device_set(device, device);
266
267         ret = dev_set_name(&device->device, "vfio%d", device->index);
268         if (ret)
269                 return ret;
270
271         ret = vfio_device_set_group(device, type);
272         if (ret)
273                 return ret;
274
275         ret = device_add(&device->device);
276         if (ret)
277                 goto err_out;
278
279         /* Refcounting can't start until the driver calls register */
280         refcount_set(&device->refcount, 1);
281
282         vfio_device_group_register(device);
283
284         return 0;
285 err_out:
286         vfio_device_remove_group(device);
287         return ret;
288 }
289
290 int vfio_register_group_dev(struct vfio_device *device)
291 {
292         return __vfio_register_dev(device, VFIO_IOMMU);
293 }
294 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
295
296 /*
297  * Register a virtual device without IOMMU backing.  The user of this
298  * device must not be able to directly trigger unmediated DMA.
299  */
300 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
301 {
302         return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
303 }
304 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
305
306 /*
307  * Decrement the device reference count and wait for the device to be
308  * removed.  Open file descriptors for the device... */
309 void vfio_unregister_group_dev(struct vfio_device *device)
310 {
311         unsigned int i = 0;
312         bool interrupted = false;
313         long rc;
314
315         vfio_device_put_registration(device);
316         rc = try_wait_for_completion(&device->comp);
317         while (rc <= 0) {
318                 if (device->ops->request)
319                         device->ops->request(device, i++);
320
321                 if (interrupted) {
322                         rc = wait_for_completion_timeout(&device->comp,
323                                                          HZ * 10);
324                 } else {
325                         rc = wait_for_completion_interruptible_timeout(
326                                 &device->comp, HZ * 10);
327                         if (rc < 0) {
328                                 interrupted = true;
329                                 dev_warn(device->dev,
330                                          "Device is currently in use, task"
331                                          " \"%s\" (%d) "
332                                          "blocked until device is released",
333                                          current->comm, task_pid_nr(current));
334                         }
335                 }
336         }
337
338         vfio_device_group_unregister(device);
339
340         /* Balances device_add in register path */
341         device_del(&device->device);
342
343         /* Balances vfio_device_set_group in register path */
344         vfio_device_remove_group(device);
345 }
346 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
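/*
 * Editor's sketch: the driver remove path that pairs with a probe like the
 * sketch after _vfio_alloc_device() above.  vfio_unregister_group_dev()
 * blocks until the last user releases the device (see the comment above it),
 * after which the final vfio_put_device() triggers vfio_device_release()
 * and the driver's ->release() callback.  "my_remove" and
 * "struct my_vfio_device" are hypothetical.
 */
static void my_remove(struct my_vfio_device *my)
{
        vfio_unregister_group_dev(&my->vdev);
        vfio_put_device(&my->vdev);     /* final reference */
}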
347
348 /* true if the vfio_device has open_device() called but not close_device() */
349 static bool vfio_assert_device_open(struct vfio_device *device)
350 {
351         return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
352 }
353
354 static int vfio_device_first_open(struct vfio_device *device,
355                                   struct iommufd_ctx *iommufd, struct kvm *kvm)
356 {
357         int ret;
358
359         lockdep_assert_held(&device->dev_set->lock);
360
361         if (!try_module_get(device->dev->driver->owner))
362                 return -ENODEV;
363
364         if (iommufd)
365                 ret = vfio_iommufd_bind(device, iommufd);
366         else
367                 ret = vfio_device_group_use_iommu(device);
368         if (ret)
369                 goto err_module_put;
370
371         device->kvm = kvm;
372         if (device->ops->open_device) {
373                 ret = device->ops->open_device(device);
374                 if (ret)
375                         goto err_unuse_iommu;
376         }
377         return 0;
378
379 err_unuse_iommu:
380         device->kvm = NULL;
381         if (iommufd)
382                 vfio_iommufd_unbind(device);
383         else
384                 vfio_device_group_unuse_iommu(device);
385 err_module_put:
386         module_put(device->dev->driver->owner);
387         return ret;
388 }
389
390 static void vfio_device_last_close(struct vfio_device *device,
391                                    struct iommufd_ctx *iommufd)
392 {
393         lockdep_assert_held(&device->dev_set->lock);
394
395         if (device->ops->close_device)
396                 device->ops->close_device(device);
397         device->kvm = NULL;
398         if (iommufd)
399                 vfio_iommufd_unbind(device);
400         else
401                 vfio_device_group_unuse_iommu(device);
402         module_put(device->dev->driver->owner);
403 }
404
405 int vfio_device_open(struct vfio_device *device,
406                      struct iommufd_ctx *iommufd, struct kvm *kvm)
407 {
408         int ret = 0;
409
410         mutex_lock(&device->dev_set->lock);
411         device->open_count++;
412         if (device->open_count == 1) {
413                 ret = vfio_device_first_open(device, iommufd, kvm);
414                 if (ret)
415                         device->open_count--;
416         }
417         mutex_unlock(&device->dev_set->lock);
418
419         return ret;
420 }
421
422 void vfio_device_close(struct vfio_device *device,
423                        struct iommufd_ctx *iommufd)
424 {
425         mutex_lock(&device->dev_set->lock);
426         vfio_assert_device_open(device);
427         if (device->open_count == 1)
428                 vfio_device_last_close(device, iommufd);
429         device->open_count--;
430         mutex_unlock(&device->dev_set->lock);
431 }
432
433 /*
434  * Wrapper around pm_runtime_resume_and_get().
435  * Return error code on failure or 0 on success.
436  */
437 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
438 {
439         struct device *dev = device->dev;
440
441         if (dev->driver && dev->driver->pm) {
442                 int ret;
443
444                 ret = pm_runtime_resume_and_get(dev);
445                 if (ret) {
446                         dev_info_ratelimited(dev,
447                                 "vfio: runtime resume failed %d\n", ret);
448                         return -EIO;
449                 }
450         }
451
452         return 0;
453 }
454
455 /*
456  * Wrapper around pm_runtime_put().
457  */
458 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
459 {
460         struct device *dev = device->dev;
461
462         if (dev->driver && dev->driver->pm)
463                 pm_runtime_put(dev);
464 }
465
466 /*
467  * VFIO Device fd
468  */
469 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
470 {
471         struct vfio_device *device = filep->private_data;
472
473         vfio_device_group_close(device);
474
475         vfio_device_put_registration(device);
476
477         return 0;
478 }
479
480 /*
481  * vfio_mig_get_next_state - Compute the next step in the FSM
482  * @cur_fsm - The current state the device is in
483  * @new_fsm - The target state to reach
484  * @next_fsm - Pointer to the next step to get to new_fsm
485  *
486  * Return 0 upon success, otherwise -errno
487  * Upon success the next step in the state progression between cur_fsm and
488  * new_fsm will be set in next_fsm.
489  *
490  * This breaks down requests for combination transitions into smaller steps and
491  * returns the next step to get to new_fsm. The function may need to be called
492  * multiple times before reaching new_fsm.
493  *
494  */
495 int vfio_mig_get_next_state(struct vfio_device *device,
496                             enum vfio_device_mig_state cur_fsm,
497                             enum vfio_device_mig_state new_fsm,
498                             enum vfio_device_mig_state *next_fsm)
499 {
500         enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
501         /*
502          * The coding in this table requires the driver to implement the
503          * following FSM arcs:
504          *         RESUMING -> STOP
505          *         STOP -> RESUMING
506          *         STOP -> STOP_COPY
507          *         STOP_COPY -> STOP
508          *
509          * If P2P is supported then the driver must also implement these FSM
510          * arcs:
511          *         RUNNING -> RUNNING_P2P
512          *         RUNNING_P2P -> RUNNING
513          *         RUNNING_P2P -> STOP
514          *         STOP -> RUNNING_P2P
515          *
516          * If precopy is supported then the driver must support these additional
517          * FSM arcs:
518          *         RUNNING -> PRE_COPY
519          *         PRE_COPY -> RUNNING
520          *         PRE_COPY -> STOP_COPY
521          * However, if precopy and P2P are supported together then the driver
522          * must support these additional arcs beyond the P2P arcs above:
523          *         PRE_COPY -> RUNNING
524          *         PRE_COPY -> PRE_COPY_P2P
525          *         PRE_COPY_P2P -> PRE_COPY
526          *         PRE_COPY_P2P -> RUNNING_P2P
527          *         PRE_COPY_P2P -> STOP_COPY
528          *         RUNNING -> PRE_COPY
529          *         RUNNING_P2P -> PRE_COPY_P2P
530          *
531          * Without P2P and precopy the driver must implement:
532          *         RUNNING -> STOP
533          *         STOP -> RUNNING
534          *
535          * The coding will step through multiple states for some combination
536          * transitions; if all optional features are supported, this means the
537          * following ones:
538          *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
539          *         PRE_COPY -> RUNNING -> RUNNING_P2P
540          *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
541          *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
542          *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
543          *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
544          *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
545          *         RESUMING -> STOP -> RUNNING_P2P
546          *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
547          *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
548          *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
549          *         RESUMING -> STOP -> STOP_COPY
550          *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
551          *         RUNNING -> RUNNING_P2P -> STOP
552          *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
553          *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
554          *         RUNNING_P2P -> RUNNING -> PRE_COPY
555          *         RUNNING_P2P -> STOP -> RESUMING
556          *         RUNNING_P2P -> STOP -> STOP_COPY
557          *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
558          *         STOP -> RUNNING_P2P -> RUNNING
559          *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
560          *         STOP_COPY -> STOP -> RESUMING
561          *         STOP_COPY -> STOP -> RUNNING_P2P
562          *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
563          *
564          *  The following transitions are blocked:
565          *         STOP_COPY -> PRE_COPY
566          *         STOP_COPY -> PRE_COPY_P2P
567          */
568         static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
569                 [VFIO_DEVICE_STATE_STOP] = {
570                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
571                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
572                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
573                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
574                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
575                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
576                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
577                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
578                 },
579                 [VFIO_DEVICE_STATE_RUNNING] = {
580                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
581                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
582                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
583                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
584                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
585                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
586                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
587                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
588                 },
589                 [VFIO_DEVICE_STATE_PRE_COPY] = {
590                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
591                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
592                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
593                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
594                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
595                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
596                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
597                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
598                 },
599                 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
600                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
601                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
602                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
603                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
604                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
605                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
606                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
607                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
608                 },
609                 [VFIO_DEVICE_STATE_STOP_COPY] = {
610                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
611                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
612                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
613                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
614                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
615                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
616                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
617                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
618                 },
619                 [VFIO_DEVICE_STATE_RESUMING] = {
620                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
621                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
622                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
623                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
624                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
625                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
626                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
627                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
628                 },
629                 [VFIO_DEVICE_STATE_RUNNING_P2P] = {
630                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
631                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
632                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
633                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
634                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
635                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
636                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
637                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
638                 },
639                 [VFIO_DEVICE_STATE_ERROR] = {
640                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
641                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
642                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
643                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
644                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
645                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
646                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
647                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
648                 },
649         };
650
651         static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
652                 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
653                 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
654                 [VFIO_DEVICE_STATE_PRE_COPY] =
655                         VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
656                 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
657                                                    VFIO_MIGRATION_P2P |
658                                                    VFIO_MIGRATION_PRE_COPY,
659                 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
660                 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
661                 [VFIO_DEVICE_STATE_RUNNING_P2P] =
662                         VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
663                 [VFIO_DEVICE_STATE_ERROR] = ~0U,
664         };
665
666         if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
667                     (state_flags_table[cur_fsm] & device->migration_flags) !=
668                         state_flags_table[cur_fsm]))
669                 return -EINVAL;
670
671         if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
672            (state_flags_table[new_fsm] & device->migration_flags) !=
673                         state_flags_table[new_fsm])
674                 return -EINVAL;
675
676         /*
677          * Arcs touching optional and unsupported states are skipped over. The
678          * driver will instead see an arc from the original state to the next
679          * logical state, as per the above comment.
680          */
681         *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
682         while ((state_flags_table[*next_fsm] & device->migration_flags) !=
683                         state_flags_table[*next_fsm])
684                 *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
685
686         return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
687 }
688 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
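/*
 * Editor's sketch: how a migration driver's migration_set_state() callback
 * typically consumes vfio_mig_get_next_state(), stepping one supported FSM
 * arc at a time until the requested state is reached.  "struct my_mig_dev"
 * and my_step_state() are hypothetical; my_step_state() performs exactly one
 * arc and returns a data_fd struct file for the STOP_COPY/RESUMING arcs, or
 * NULL otherwise.
 */
struct my_mig_dev {
        struct vfio_device vdev;
        enum vfio_device_mig_state mig_state;
};

static struct file *my_step_state(struct my_mig_dev *my,
                                  enum vfio_device_mig_state next);    /* hypothetical */

static struct file *my_set_state(struct vfio_device *vdev,
                                 enum vfio_device_mig_state new_state)
{
        struct my_mig_dev *my = container_of(vdev, struct my_mig_dev, vdev);
        enum vfio_device_mig_state next_state;
        struct file *res = NULL;
        int ret;

        while (my->mig_state != new_state) {
                ret = vfio_mig_get_next_state(vdev, my->mig_state, new_state,
                                              &next_state);
                if (ret)
                        return ERR_PTR(ret);

                res = my_step_state(my, next_state);
                if (IS_ERR(res))
                        return res;
                my->mig_state = next_state;
        }
        return res;
}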
689
690 /*
691  * Convert the driver's struct file into an FD number and return it to userspace
692  */
693 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
694                                    struct vfio_device_feature_mig_state *mig)
695 {
696         int ret;
697         int fd;
698
699         fd = get_unused_fd_flags(O_CLOEXEC);
700         if (fd < 0) {
701                 ret = fd;
702                 goto out_fput;
703         }
704
705         mig->data_fd = fd;
706         if (copy_to_user(arg, mig, sizeof(*mig))) {
707                 ret = -EFAULT;
708                 goto out_put_unused;
709         }
710         fd_install(fd, filp);
711         return 0;
712
713 out_put_unused:
714         put_unused_fd(fd);
715 out_fput:
716         fput(filp);
717         return ret;
718 }
719
720 static int
721 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
722                                            u32 flags, void __user *arg,
723                                            size_t argsz)
724 {
725         size_t minsz =
726                 offsetofend(struct vfio_device_feature_mig_state, data_fd);
727         struct vfio_device_feature_mig_state mig;
728         struct file *filp = NULL;
729         int ret;
730
731         if (!device->mig_ops)
732                 return -ENOTTY;
733
734         ret = vfio_check_feature(flags, argsz,
735                                  VFIO_DEVICE_FEATURE_SET |
736                                  VFIO_DEVICE_FEATURE_GET,
737                                  sizeof(mig));
738         if (ret != 1)
739                 return ret;
740
741         if (copy_from_user(&mig, arg, minsz))
742                 return -EFAULT;
743
744         if (flags & VFIO_DEVICE_FEATURE_GET) {
745                 enum vfio_device_mig_state curr_state;
746
747                 ret = device->mig_ops->migration_get_state(device,
748                                                            &curr_state);
749                 if (ret)
750                         return ret;
751                 mig.device_state = curr_state;
752                 goto out_copy;
753         }
754
755         /* Handle the VFIO_DEVICE_FEATURE_SET */
756         filp = device->mig_ops->migration_set_state(device, mig.device_state);
757         if (IS_ERR(filp) || !filp)
758                 goto out_copy;
759
760         return vfio_ioct_mig_return_fd(filp, arg, &mig);
761 out_copy:
762         mig.data_fd = -1;
763         if (copy_to_user(arg, &mig, sizeof(mig)))
764                 return -EFAULT;
765         if (IS_ERR(filp))
766                 return PTR_ERR(filp);
767         return 0;
768 }
769
770 static int
771 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
772                                               u32 flags, void __user *arg,
773                                               size_t argsz)
774 {
775         struct vfio_device_feature_mig_data_size data_size = {};
776         unsigned long stop_copy_length;
777         int ret;
778
779         if (!device->mig_ops)
780                 return -ENOTTY;
781
782         ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
783                                  sizeof(data_size));
784         if (ret != 1)
785                 return ret;
786
787         ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
788         if (ret)
789                 return ret;
790
791         data_size.stop_copy_length = stop_copy_length;
792         if (copy_to_user(arg, &data_size, sizeof(data_size)))
793                 return -EFAULT;
794
795         return 0;
796 }
797
798 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
799                                                u32 flags, void __user *arg,
800                                                size_t argsz)
801 {
802         struct vfio_device_feature_migration mig = {
803                 .flags = device->migration_flags,
804         };
805         int ret;
806
807         if (!device->mig_ops)
808                 return -ENOTTY;
809
810         ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
811                                  sizeof(mig));
812         if (ret != 1)
813                 return ret;
814         if (copy_to_user(arg, &mig, sizeof(mig)))
815                 return -EFAULT;
816         return 0;
817 }
818
819 /* Ranges should fit into a single kernel page */
820 #define LOG_MAX_RANGES \
821         (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
822
823 static int
824 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
825                                         u32 flags, void __user *arg,
826                                         size_t argsz)
827 {
828         size_t minsz =
829                 offsetofend(struct vfio_device_feature_dma_logging_control,
830                             ranges);
831         struct vfio_device_feature_dma_logging_range __user *ranges;
832         struct vfio_device_feature_dma_logging_control control;
833         struct vfio_device_feature_dma_logging_range range;
834         struct rb_root_cached root = RB_ROOT_CACHED;
835         struct interval_tree_node *nodes;
836         u64 iova_end;
837         u32 nnodes;
838         int i, ret;
839
840         if (!device->log_ops)
841                 return -ENOTTY;
842
843         ret = vfio_check_feature(flags, argsz,
844                                  VFIO_DEVICE_FEATURE_SET,
845                                  sizeof(control));
846         if (ret != 1)
847                 return ret;
848
849         if (copy_from_user(&control, arg, minsz))
850                 return -EFAULT;
851
852         nnodes = control.num_ranges;
853         if (!nnodes)
854                 return -EINVAL;
855
856         if (nnodes > LOG_MAX_RANGES)
857                 return -E2BIG;
858
859         ranges = u64_to_user_ptr(control.ranges);
860         nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
861                               GFP_KERNEL);
862         if (!nodes)
863                 return -ENOMEM;
864
865         for (i = 0; i < nnodes; i++) {
866                 if (copy_from_user(&range, &ranges[i], sizeof(range))) {
867                         ret = -EFAULT;
868                         goto end;
869                 }
870                 if (!IS_ALIGNED(range.iova, control.page_size) ||
871                     !IS_ALIGNED(range.length, control.page_size)) {
872                         ret = -EINVAL;
873                         goto end;
874                 }
875
876                 if (check_add_overflow(range.iova, range.length, &iova_end) ||
877                     iova_end > ULONG_MAX) {
878                         ret = -EOVERFLOW;
879                         goto end;
880                 }
881
882                 nodes[i].start = range.iova;
883                 nodes[i].last = range.iova + range.length - 1;
884                 if (interval_tree_iter_first(&root, nodes[i].start,
885                                              nodes[i].last)) {
886                         /* Range overlapping */
887                         ret = -EINVAL;
888                         goto end;
889                 }
890                 interval_tree_insert(nodes + i, &root);
891         }
892
893         ret = device->log_ops->log_start(device, &root, nnodes,
894                                          &control.page_size);
895         if (ret)
896                 goto end;
897
898         if (copy_to_user(arg, &control, sizeof(control))) {
899                 ret = -EFAULT;
900                 device->log_ops->log_stop(device);
901         }
902
903 end:
904         kfree(nodes);
905         return ret;
906 }
907
908 static int
909 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
910                                        u32 flags, void __user *arg,
911                                        size_t argsz)
912 {
913         int ret;
914
915         if (!device->log_ops)
916                 return -ENOTTY;
917
918         ret = vfio_check_feature(flags, argsz,
919                                  VFIO_DEVICE_FEATURE_SET, 0);
920         if (ret != 1)
921                 return ret;
922
923         return device->log_ops->log_stop(device);
924 }
925
926 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
927                                           unsigned long iova, size_t length,
928                                           void *opaque)
929 {
930         struct vfio_device *device = opaque;
931
932         return device->log_ops->log_read_and_clear(device, iova, length, iter);
933 }
934
935 static int
936 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
937                                          u32 flags, void __user *arg,
938                                          size_t argsz)
939 {
940         size_t minsz =
941                 offsetofend(struct vfio_device_feature_dma_logging_report,
942                             bitmap);
943         struct vfio_device_feature_dma_logging_report report;
944         struct iova_bitmap *iter;
945         u64 iova_end;
946         int ret;
947
948         if (!device->log_ops)
949                 return -ENOTTY;
950
951         ret = vfio_check_feature(flags, argsz,
952                                  VFIO_DEVICE_FEATURE_GET,
953                                  sizeof(report));
954         if (ret != 1)
955                 return ret;
956
957         if (copy_from_user(&report, arg, minsz))
958                 return -EFAULT;
959
960         if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
961                 return -EINVAL;
962
963         if (check_add_overflow(report.iova, report.length, &iova_end) ||
964             iova_end > ULONG_MAX)
965                 return -EOVERFLOW;
966
967         iter = iova_bitmap_alloc(report.iova, report.length,
968                                  report.page_size,
969                                  u64_to_user_ptr(report.bitmap));
970         if (IS_ERR(iter))
971                 return PTR_ERR(iter);
972
973         ret = iova_bitmap_for_each(iter, device,
974                                    vfio_device_log_read_and_clear);
975
976         iova_bitmap_free(iter);
977         return ret;
978 }
979
980 static int vfio_ioctl_device_feature(struct vfio_device *device,
981                                      struct vfio_device_feature __user *arg)
982 {
983         size_t minsz = offsetofend(struct vfio_device_feature, flags);
984         struct vfio_device_feature feature;
985
986         if (copy_from_user(&feature, arg, minsz))
987                 return -EFAULT;
988
989         if (feature.argsz < minsz)
990                 return -EINVAL;
991
992         /* Check unknown flags */
993         if (feature.flags &
994             ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
995               VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
996                 return -EINVAL;
997
998         /* GET & SET are mutually exclusive except with PROBE */
999         if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1000             (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1001             (feature.flags & VFIO_DEVICE_FEATURE_GET))
1002                 return -EINVAL;
1003
1004         switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1005         case VFIO_DEVICE_FEATURE_MIGRATION:
1006                 return vfio_ioctl_device_feature_migration(
1007                         device, feature.flags, arg->data,
1008                         feature.argsz - minsz);
1009         case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1010                 return vfio_ioctl_device_feature_mig_device_state(
1011                         device, feature.flags, arg->data,
1012                         feature.argsz - minsz);
1013         case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1014                 return vfio_ioctl_device_feature_logging_start(
1015                         device, feature.flags, arg->data,
1016                         feature.argsz - minsz);
1017         case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1018                 return vfio_ioctl_device_feature_logging_stop(
1019                         device, feature.flags, arg->data,
1020                         feature.argsz - minsz);
1021         case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1022                 return vfio_ioctl_device_feature_logging_report(
1023                         device, feature.flags, arg->data,
1024                         feature.argsz - minsz);
1025         case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1026                 return vfio_ioctl_device_feature_migration_data_size(
1027                         device, feature.flags, arg->data,
1028                         feature.argsz - minsz);
1029         default:
1030                 if (unlikely(!device->ops->device_feature))
1031                         return -EINVAL;
1032                 return device->ops->device_feature(device, feature.flags,
1033                                                    arg->data,
1034                                                    feature.argsz - minsz);
1035         }
1036 }
1037
1038 static long vfio_device_fops_unl_ioctl(struct file *filep,
1039                                        unsigned int cmd, unsigned long arg)
1040 {
1041         struct vfio_device *device = filep->private_data;
1042         int ret;
1043
1044         ret = vfio_device_pm_runtime_get(device);
1045         if (ret)
1046                 return ret;
1047
1048         switch (cmd) {
1049         case VFIO_DEVICE_FEATURE:
1050                 ret = vfio_ioctl_device_feature(device, (void __user *)arg);
1051                 break;
1052
1053         default:
1054                 if (unlikely(!device->ops->ioctl))
1055                         ret = -EINVAL;
1056                 else
1057                         ret = device->ops->ioctl(device, cmd, arg);
1058                 break;
1059         }
1060
1061         vfio_device_pm_runtime_put(device);
1062         return ret;
1063 }
1064
1065 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1066                                      size_t count, loff_t *ppos)
1067 {
1068         struct vfio_device *device = filep->private_data;
1069
1070         if (unlikely(!device->ops->read))
1071                 return -EINVAL;
1072
1073         return device->ops->read(device, buf, count, ppos);
1074 }
1075
1076 static ssize_t vfio_device_fops_write(struct file *filep,
1077                                       const char __user *buf,
1078                                       size_t count, loff_t *ppos)
1079 {
1080         struct vfio_device *device = filep->private_data;
1081
1082         if (unlikely(!device->ops->write))
1083                 return -EINVAL;
1084
1085         return device->ops->write(device, buf, count, ppos);
1086 }
1087
1088 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1089 {
1090         struct vfio_device *device = filep->private_data;
1091
1092         if (unlikely(!device->ops->mmap))
1093                 return -EINVAL;
1094
1095         return device->ops->mmap(device, vma);
1096 }
1097
1098 const struct file_operations vfio_device_fops = {
1099         .owner          = THIS_MODULE,
1100         .release        = vfio_device_fops_release,
1101         .read           = vfio_device_fops_read,
1102         .write          = vfio_device_fops_write,
1103         .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1104         .compat_ioctl   = compat_ptr_ioctl,
1105         .mmap           = vfio_device_fops_mmap,
1106 };
1107
1108 /*
1109  * Sub-module support
1110  */
1111 /*
1112  * Helper for managing a buffer of info chain capabilities, allocate or
1113  * reallocate a buffer with additional @size, filling in @id and @version
1114  * of the capability.  A pointer to the new capability is returned.
1115  *
1116  * NB. The chain is based at the head of the buffer, so new entries are
1117  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1118  * next offsets prior to copying to the user buffer.
1119  */
1120 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1121                                                size_t size, u16 id, u16 version)
1122 {
1123         void *buf;
1124         struct vfio_info_cap_header *header, *tmp;
1125
1126         buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1127         if (!buf) {
1128                 kfree(caps->buf);
1129                 caps->buf = NULL;
1130                 caps->size = 0;
1131                 return ERR_PTR(-ENOMEM);
1132         }
1133
1134         caps->buf = buf;
1135         header = buf + caps->size;
1136
1137         /* Eventually copied to user buffer, zero */
1138         memset(header, 0, size);
1139
1140         header->id = id;
1141         header->version = version;
1142
1143         /* Add to the end of the capability chain */
1144         for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1145                 ; /* nothing */
1146
1147         tmp->next = caps->size;
1148         caps->size += size;
1149
1150         return header;
1151 }
1152 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1153
1154 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1155 {
1156         struct vfio_info_cap_header *tmp;
1157         void *buf = (void *)caps->buf;
1158
1159         for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1160                 tmp->next += offset;
1161 }
1162 EXPORT_SYMBOL(vfio_info_cap_shift);
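/*
 * Editor's sketch of the usual caller pattern for the two helpers above,
 * modelled on how region-info style ioctls report capability chains: build
 * the chain with vfio_info_cap_add(), shift the chained offsets past the
 * fixed-size info struct with vfio_info_cap_shift(), then copy both to the
 * user buffer.  "struct my_cap", MY_CAP_ID and my_fill_region_info() are
 * hypothetical.
 */
#define MY_CAP_ID 1     /* hypothetical capability id */

struct my_cap {
        struct vfio_info_cap_header header;
        __u64 value;
};

static int my_fill_region_info(struct vfio_region_info *info,
                               void __user *uinfo, u64 value)
{
        struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
        struct my_cap *cap;
        int ret = 0;

        cap = (struct my_cap *)vfio_info_cap_add(&caps, sizeof(*cap),
                                                 MY_CAP_ID, 1);
        if (IS_ERR(cap))
                return PTR_ERR(cap);
        cap->value = value;

        info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
        if (info->argsz < sizeof(*info) + caps.size) {
                /* Not enough room: report the required size, no chain */
                info->argsz = sizeof(*info) + caps.size;
                info->cap_offset = 0;
        } else {
                /* Chain starts right after the fixed-size info struct */
                vfio_info_cap_shift(&caps, sizeof(*info));
                if (copy_to_user(uinfo + sizeof(*info), caps.buf, caps.size))
                        ret = -EFAULT;
                else
                        info->cap_offset = sizeof(*info);
        }
        kfree(caps.buf);
        return ret;
}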
1163
1164 int vfio_info_add_capability(struct vfio_info_cap *caps,
1165                              struct vfio_info_cap_header *cap, size_t size)
1166 {
1167         struct vfio_info_cap_header *header;
1168
1169         header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1170         if (IS_ERR(header))
1171                 return PTR_ERR(header);
1172
1173         memcpy(header + 1, cap + 1, size - sizeof(*header));
1174
1175         return 0;
1176 }
1177 EXPORT_SYMBOL(vfio_info_add_capability);
1178
1179 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1180                                        int max_irq_type, size_t *data_size)
1181 {
1182         unsigned long minsz;
1183         size_t size;
1184
1185         minsz = offsetofend(struct vfio_irq_set, count);
1186
1187         if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1188             (hdr->count >= (U32_MAX - hdr->start)) ||
1189             (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1190                                 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1191                 return -EINVAL;
1192
1193         if (data_size)
1194                 *data_size = 0;
1195
1196         if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1197                 return -EINVAL;
1198
1199         switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1200         case VFIO_IRQ_SET_DATA_NONE:
1201                 size = 0;
1202                 break;
1203         case VFIO_IRQ_SET_DATA_BOOL:
1204                 size = sizeof(uint8_t);
1205                 break;
1206         case VFIO_IRQ_SET_DATA_EVENTFD:
1207                 size = sizeof(int32_t);
1208                 break;
1209         default:
1210                 return -EINVAL;
1211         }
1212
1213         if (size) {
1214                 if (hdr->argsz - minsz < hdr->count * size)
1215                         return -EINVAL;
1216
1217                 if (!data_size)
1218                         return -EINVAL;
1219
1220                 *data_size = hdr->count * size;
1221         }
1222
1223         return 0;
1224 }
1225 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1226
1227 /*
1228  * Pin contiguous user pages and return their associated host pages for local
1229  * domain only.
1230  * @device [in]  : device
1231  * @iova [in]    : starting IOVA of user pages to be pinned.
1232  * @npage [in]   : count of pages to be pinned.  This count should not
1233  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1234  * @prot [in]    : protection flags
1235  * @pages[out]   : array of host pages
1236  * Return error or number of pages pinned.
1237  *
1238  * A driver may only call this function if the vfio_device was created
1239  * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1240  */
1241 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1242                    int npage, int prot, struct page **pages)
1243 {
1244         /* group->container cannot change while a vfio device is open */
1245         if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1246                 return -EINVAL;
1247         if (vfio_device_has_container(device))
1248                 return vfio_device_container_pin_pages(device, iova,
1249                                                        npage, prot, pages);
1250         if (device->iommufd_access) {
1251                 int ret;
1252
1253                 if (iova > ULONG_MAX)
1254                         return -EINVAL;
1255                 /*
1256                  * VFIO ignores the sub page offset, npages is from the start of
1257                  * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1258                  * the sub page offset by doing:
1259                  *     pages[0] + (iova % PAGE_SIZE)
1260                  */
1261                 ret = iommufd_access_pin_pages(
1262                         device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1263                         npage * PAGE_SIZE, pages,
1264                         (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1265                 if (ret)
1266                         return ret;
1267                 return npage;
1268         }
1269         return -EINVAL;
1270 }
1271 EXPORT_SYMBOL(vfio_pin_pages);
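/*
 * Editor's sketch: how an emulated-IOMMU (mdev-style) driver might pin a
 * single guest page and recover the sub-page offset noted in the comment
 * inside vfio_pin_pages() above.  "my_read_guest_u32" is hypothetical and
 * assumes <linux/highmem.h> for kmap_local_page().
 */
static int my_read_guest_u32(struct vfio_device *vdev, dma_addr_t iova, u32 *val)
{
        struct page *page;
        void *va;
        int ret;

        ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ, &page);
        if (ret != 1)
                return ret < 0 ? ret : -EFAULT;

        va = kmap_local_page(page);
        *val = *(u32 *)(va + offset_in_page(iova));     /* sub-page offset */
        kunmap_local(va);

        vfio_unpin_pages(vdev, iova, 1);
        return 0;
}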
1272
1273 /*
1274  * Unpin contiguous host pages for local domain only.
1275  * @device [in]  : device
1276  * @iova [in]    : starting address of user pages to be unpinned.
1277  * @npage [in]   : count of pages to be unpinned.  This count should not
1278  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1279  */
1280 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1281 {
1282         if (WARN_ON(!vfio_assert_device_open(device)))
1283                 return;
1284
1285         if (vfio_device_has_container(device)) {
1286                 vfio_device_container_unpin_pages(device, iova, npage);
1287                 return;
1288         }
1289         if (device->iommufd_access) {
1290                 if (WARN_ON(iova > ULONG_MAX))
1291                         return;
1292                 iommufd_access_unpin_pages(device->iommufd_access,
1293                                            ALIGN_DOWN(iova, PAGE_SIZE),
1294                                            npage * PAGE_SIZE);
1295                 return;
1296         }
1297 }
1298 EXPORT_SYMBOL(vfio_unpin_pages);
1299
1300 /*
1301  * This interface allows the CPUs to perform some sort of virtual DMA on
1302  * behalf of the device.
1303  *
1304  * CPUs read/write from/into a range of IOVAs pointing to user space memory
1305  * into/from a kernel buffer.
1306  *
1307  * As the read/write of user space memory is conducted via the CPUs and is
1308  * not a real device DMA, it is not necessary to pin the user space memory.
1309  *
1310  * @device [in]         : VFIO device
1311  * @iova [in]           : base IOVA of a user space buffer
1312  * @data [in]           : pointer to kernel buffer
1313  * @len [in]            : kernel buffer length
1314  * @write               : indicate read or write
1315  * Return error code on failure or 0 on success.
1316  */
1317 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1318                 size_t len, bool write)
1319 {
1320         if (!data || len <= 0 || !vfio_assert_device_open(device))
1321                 return -EINVAL;
1322
1323         if (vfio_device_has_container(device))
1324                 return vfio_device_container_dma_rw(device, iova,
1325                                                     data, len, write);
1326
1327         if (device->iommufd_access) {
1328                 unsigned int flags = 0;
1329
1330                 if (iova > ULONG_MAX)
1331                         return -EINVAL;
1332
1333                 /* VFIO historically tries to auto-detect a kthread */
1334                 if (!current->mm)
1335                         flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1336                 if (write)
1337                         flags |= IOMMUFD_ACCESS_RW_WRITE;
1338                 return iommufd_access_rw(device->iommufd_access, iova, data,
1339                                          len, flags);
1340         }
1341         return -EINVAL;
1342 }
1343 EXPORT_SYMBOL(vfio_dma_rw);
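/*
 * Editor's sketch: using vfio_dma_rw() to read a small descriptor out of the
 * device's IOVA space into a kernel buffer, per the description above.
 * "struct my_desc" and my_fetch_desc() are hypothetical.
 */
struct my_desc {
        __le64 addr;
        __le32 len;
        __le32 flags;
};

static int my_fetch_desc(struct vfio_device *vdev, dma_addr_t iova,
                         struct my_desc *desc)
{
        /* write == false: copy from the IOVA range into the kernel buffer */
        return vfio_dma_rw(vdev, iova, desc, sizeof(*desc), false);
}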
1344
1345 /*
1346  * Module/class support
1347  */
1348 static int __init vfio_init(void)
1349 {
1350         int ret;
1351
1352         ida_init(&vfio.device_ida);
1353
1354         ret = vfio_group_init();
1355         if (ret)
1356                 return ret;
1357
1358         ret = vfio_virqfd_init();
1359         if (ret)
1360                 goto err_virqfd;
1361
1362         /* /sys/class/vfio-dev/vfioX */
1363         vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
1364         if (IS_ERR(vfio.device_class)) {
1365                 ret = PTR_ERR(vfio.device_class);
1366                 goto err_dev_class;
1367         }
1368
1369         pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1370         return 0;
1371
1372 err_dev_class:
1373         vfio_virqfd_exit();
1374 err_virqfd:
1375         vfio_group_cleanup();
1376         return ret;
1377 }
1378
1379 static void __exit vfio_cleanup(void)
1380 {
1381         ida_destroy(&vfio.device_ida);
1382         class_destroy(vfio.device_class);
1383         vfio.device_class = NULL;
1384         vfio_virqfd_exit();
1385         vfio_group_cleanup();
1386         xa_destroy(&vfio_device_set_xa);
1387 }
1388
1389 module_init(vfio_init);
1390 module_exit(vfio_cleanup);
1391
1392 MODULE_VERSION(DRIVER_VERSION);
1393 MODULE_LICENSE("GPL v2");
1394 MODULE_AUTHOR(DRIVER_AUTHOR);
1395 MODULE_DESCRIPTION(DRIVER_DESC);
1396 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");