drivers/vfio/vfio_main.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #include <linux/list.h>
20 #include <linux/miscdevice.h>
21 #include <linux/module.h>
22 #include <linux/mutex.h>
23 #include <linux/pci.h>
24 #include <linux/rwsem.h>
25 #include <linux/sched.h>
26 #include <linux/slab.h>
27 #include <linux/stat.h>
28 #include <linux/string.h>
29 #include <linux/uaccess.h>
30 #include <linux/vfio.h>
31 #include <linux/wait.h>
32 #include <linux/sched/signal.h>
33 #include <linux/pm_runtime.h>
34 #include <linux/interval_tree.h>
35 #include <linux/iova_bitmap.h>
36 #include <linux/iommufd.h>
37 #include "vfio.h"
38
39 #define DRIVER_VERSION  "0.3"
40 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
41 #define DRIVER_DESC     "VFIO - User Level meta-driver"
42
43 static struct vfio {
44         struct class                    *device_class;
45         struct ida                      device_ida;
46 } vfio;
47
48 static DEFINE_XARRAY(vfio_device_set_xa);
49
50 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
51 {
52         unsigned long idx = (unsigned long)set_id;
53         struct vfio_device_set *new_dev_set;
54         struct vfio_device_set *dev_set;
55
56         if (WARN_ON(!set_id))
57                 return -EINVAL;
58
59         /*
60          * Atomically acquire a singleton object in the xarray for this set_id
61          */
62         xa_lock(&vfio_device_set_xa);
63         dev_set = xa_load(&vfio_device_set_xa, idx);
64         if (dev_set)
65                 goto found_get_ref;
66         xa_unlock(&vfio_device_set_xa);
67
68         new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
69         if (!new_dev_set)
70                 return -ENOMEM;
71         mutex_init(&new_dev_set->lock);
72         INIT_LIST_HEAD(&new_dev_set->device_list);
73         new_dev_set->set_id = set_id;
74
75         xa_lock(&vfio_device_set_xa);
76         dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
77                                GFP_KERNEL);
78         if (!dev_set) {
79                 dev_set = new_dev_set;
80                 goto found_get_ref;
81         }
82
83         kfree(new_dev_set);
84         if (xa_is_err(dev_set)) {
85                 xa_unlock(&vfio_device_set_xa);
86                 return xa_err(dev_set);
87         }
88
89 found_get_ref:
90         dev_set->device_count++;
91         xa_unlock(&vfio_device_set_xa);
92         mutex_lock(&dev_set->lock);
93         device->dev_set = dev_set;
94         list_add_tail(&device->dev_set_list, &dev_set->device_list);
95         mutex_unlock(&dev_set->lock);
96         return 0;
97 }
98 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
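
/*
 * Example (editorial sketch, not part of this file): a driver that wants all
 * functions sharing a reset domain to end up in the same dev_set passes the
 * same stable pointer as @set_id before registering the device.  This is
 * loosely modeled on how a PCI driver might key the set by slot or bus;
 * "vdev" is a hypothetical driver structure embedding a struct vfio_device.
 *
 *      if (pci_is_root_bus(pdev->bus))
 *              ret = vfio_assign_device_set(&vdev->vdev, vdev);
 *      else if (!pci_probe_reset_slot(pdev->slot))
 *              ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
 *      else
 *              ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
 *      if (ret)
 *              return ret;
 */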
99
100 static void vfio_release_device_set(struct vfio_device *device)
101 {
102         struct vfio_device_set *dev_set = device->dev_set;
103
104         if (!dev_set)
105                 return;
106
107         mutex_lock(&dev_set->lock);
108         list_del(&device->dev_set_list);
109         mutex_unlock(&dev_set->lock);
110
111         xa_lock(&vfio_device_set_xa);
112         if (!--dev_set->device_count) {
113                 __xa_erase(&vfio_device_set_xa,
114                            (unsigned long)dev_set->set_id);
115                 mutex_destroy(&dev_set->lock);
116                 kfree(dev_set);
117         }
118         xa_unlock(&vfio_device_set_xa);
119 }
120
121 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
122 {
123         struct vfio_device *cur;
124         unsigned int open_count = 0;
125
126         lockdep_assert_held(&dev_set->lock);
127
128         list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
129                 open_count += cur->open_count;
130         return open_count;
131 }
132 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
133
134 /*
135  * Device objects - create, release, get, put, search
136  */
137 /* Device reference always implies a group reference */
138 void vfio_device_put_registration(struct vfio_device *device)
139 {
140         if (refcount_dec_and_test(&device->refcount))
141                 complete(&device->comp);
142 }
143
144 bool vfio_device_try_get_registration(struct vfio_device *device)
145 {
146         return refcount_inc_not_zero(&device->refcount);
147 }
148
149 /*
150  * VFIO driver API
151  */
152 /* Release helper called by vfio_put_device() */
153 static void vfio_device_release(struct device *dev)
154 {
155         struct vfio_device *device =
156                         container_of(dev, struct vfio_device, device);
157
158         vfio_release_device_set(device);
159         ida_free(&vfio.device_ida, device->index);
160
161         if (device->ops->release)
162                 device->ops->release(device);
163
164         kvfree(device);
165 }
166
167 static int vfio_init_device(struct vfio_device *device, struct device *dev,
168                             const struct vfio_device_ops *ops);
169
170 /*
171  * Allocate and initialize vfio_device so it can be registered to vfio
172  * core.
173  *
174  * Drivers should use the wrapper vfio_alloc_device() for allocation.
175  * @size is the size of the structure to be allocated, including any
176  * private data used by the driver.
177  *
178  * Drivers may provide an @init callback to initialize device private data;
179  * a usage sketch follows _vfio_alloc_device() below.
180  * Use vfio_put_device() to release the structure after a successful return.
181  */
182 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
183                                        const struct vfio_device_ops *ops)
184 {
185         struct vfio_device *device;
186         int ret;
187
188         if (WARN_ON(size < sizeof(struct vfio_device)))
189                 return ERR_PTR(-EINVAL);
190
191         device = kvzalloc(size, GFP_KERNEL);
192         if (!device)
193                 return ERR_PTR(-ENOMEM);
194
195         ret = vfio_init_device(device, dev, ops);
196         if (ret)
197                 goto out_free;
198         return device;
199
200 out_free:
201         kvfree(device);
202         return ERR_PTR(ret);
203 }
204 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
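
/*
 * Example (editorial sketch, not part of this file): a hypothetical driver
 * allocating its device through the vfio_alloc_device() wrapper.  The struct
 * vfio_device must be embedded in the driver structure; my_vfio_dev, my_init
 * and my_ops are illustrative names only.
 *
 *      struct my_vfio_dev {
 *              struct vfio_device vdev;
 *              void *priv;
 *      };
 *
 *      static int my_init(struct vfio_device *vdev)
 *      {
 *              struct my_vfio_dev *my =
 *                      container_of(vdev, struct my_vfio_dev, vdev);
 *
 *              my->priv = NULL;
 *              return 0;
 *      }
 *
 *      static const struct vfio_device_ops my_ops = {
 *              .name = "my-vfio-driver",
 *              .init = my_init,
 *      };
 *
 *      my = vfio_alloc_device(my_vfio_dev, vdev, dev, &my_ops);
 *      if (IS_ERR(my))
 *              return PTR_ERR(my);
 */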
205
206 /*
207  * Initialize a vfio_device so it can be registered to vfio core.
208  */
209 static int vfio_init_device(struct vfio_device *device, struct device *dev,
210                             const struct vfio_device_ops *ops)
211 {
212         int ret;
213
214         ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
215         if (ret < 0) {
216                 dev_dbg(dev, "Error allocating device index\n");
217                 return ret;
218         }
219
220         device->index = ret;
221         init_completion(&device->comp);
222         device->dev = dev;
223         device->ops = ops;
224
225         if (ops->init) {
226                 ret = ops->init(device);
227                 if (ret)
228                         goto out_uninit;
229         }
230
231         device_initialize(&device->device);
232         device->device.release = vfio_device_release;
233         device->device.class = vfio.device_class;
234         device->device.parent = device->dev;
235         return 0;
236
237 out_uninit:
238         vfio_release_device_set(device);
239         ida_free(&vfio.device_ida, device->index);
240         return ret;
241 }
242
243 static int __vfio_register_dev(struct vfio_device *device,
244                                enum vfio_group_type type)
245 {
246         int ret;
247
248         if (WARN_ON(device->ops->bind_iommufd &&
249                     (!device->ops->unbind_iommufd ||
250                      !device->ops->attach_ioas)))
251                 return -EINVAL;
252
253         /*
254          * If the driver doesn't specify a set then the device is added to a
255          * singleton set just for itself.
256          */
257         if (!device->dev_set)
258                 vfio_assign_device_set(device, device);
259
260         ret = dev_set_name(&device->device, "vfio%d", device->index);
261         if (ret)
262                 return ret;
263
264         ret = vfio_device_set_group(device, type);
265         if (ret)
266                 return ret;
267
268         ret = device_add(&device->device);
269         if (ret)
270                 goto err_out;
271
272         /* Refcounting can't start until the driver calls register */
273         refcount_set(&device->refcount, 1);
274
275         vfio_device_group_register(device);
276
277         return 0;
278 err_out:
279         vfio_device_remove_group(device);
280         return ret;
281 }
282
283 int vfio_register_group_dev(struct vfio_device *device)
284 {
285         return __vfio_register_dev(device, VFIO_IOMMU);
286 }
287 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
288
289 /*
290  * Register a virtual device without IOMMU backing.  The user of this
291  * device must not be able to directly trigger unmediated DMA.
292  */
293 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
294 {
295         return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
296 }
297 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
298
299 /*
300  * Decrement the device reference count and wait for the device to be
301  * removed.  Open file descriptors for the device hold references that block this until they are closed. */
302 void vfio_unregister_group_dev(struct vfio_device *device)
303 {
304         unsigned int i = 0;
305         bool interrupted = false;
306         long rc;
307
308         vfio_device_put_registration(device);
309         rc = try_wait_for_completion(&device->comp);
310         while (rc <= 0) {
311                 if (device->ops->request)
312                         device->ops->request(device, i++);
313
314                 if (interrupted) {
315                         rc = wait_for_completion_timeout(&device->comp,
316                                                          HZ * 10);
317                 } else {
318                         rc = wait_for_completion_interruptible_timeout(
319                                 &device->comp, HZ * 10);
320                         if (rc < 0) {
321                                 interrupted = true;
322                                 dev_warn(device->dev,
323                                          "Device is currently in use, task"
324                                          " \"%s\" (%d) "
325                                          "blocked until device is released",
326                                          current->comm, task_pid_nr(current));
327                         }
328                 }
329         }
330
331         vfio_device_group_unregister(device);
332
333         /* Balances device_add in register path */
334         device_del(&device->device);
335
336         /* Balances vfio_device_set_group in register path */
337         vfio_device_remove_group(device);
338 }
339 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
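
/*
 * Example (editorial sketch, not part of this file): the matching teardown in
 * a hypothetical driver's remove path.  vfio_unregister_group_dev() waits for
 * users to drop their references; vfio_put_device() then releases the
 * allocation made by vfio_alloc_device().
 *
 *      static void my_remove(struct my_vfio_dev *my)
 *      {
 *              vfio_unregister_group_dev(&my->vdev);
 *              vfio_put_device(&my->vdev);
 *      }
 */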
340
341 /* true if the vfio_device has open_device() called but not close_device() */
342 static bool vfio_assert_device_open(struct vfio_device *device)
343 {
344         return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
345 }
346
347 static int vfio_device_first_open(struct vfio_device *device,
348                                   struct iommufd_ctx *iommufd, struct kvm *kvm)
349 {
350         int ret;
351
352         lockdep_assert_held(&device->dev_set->lock);
353
354         if (!try_module_get(device->dev->driver->owner))
355                 return -ENODEV;
356
357         if (iommufd)
358                 ret = vfio_iommufd_bind(device, iommufd);
359         else
360                 ret = vfio_device_group_use_iommu(device);
361         if (ret)
362                 goto err_module_put;
363
364         device->kvm = kvm;
365         if (device->ops->open_device) {
366                 ret = device->ops->open_device(device);
367                 if (ret)
368                         goto err_unuse_iommu;
369         }
370         return 0;
371
372 err_unuse_iommu:
373         device->kvm = NULL;
374         if (iommufd)
375                 vfio_iommufd_unbind(device);
376         else
377                 vfio_device_group_unuse_iommu(device);
378 err_module_put:
379         module_put(device->dev->driver->owner);
380         return ret;
381 }
382
383 static void vfio_device_last_close(struct vfio_device *device,
384                                    struct iommufd_ctx *iommufd)
385 {
386         lockdep_assert_held(&device->dev_set->lock);
387
388         if (device->ops->close_device)
389                 device->ops->close_device(device);
390         device->kvm = NULL;
391         if (iommufd)
392                 vfio_iommufd_unbind(device);
393         else
394                 vfio_device_group_unuse_iommu(device);
395         module_put(device->dev->driver->owner);
396 }
397
398 int vfio_device_open(struct vfio_device *device,
399                      struct iommufd_ctx *iommufd, struct kvm *kvm)
400 {
401         int ret = 0;
402
403         mutex_lock(&device->dev_set->lock);
404         device->open_count++;
405         if (device->open_count == 1) {
406                 ret = vfio_device_first_open(device, iommufd, kvm);
407                 if (ret)
408                         device->open_count--;
409         }
410         mutex_unlock(&device->dev_set->lock);
411
412         return ret;
413 }
414
415 void vfio_device_close(struct vfio_device *device,
416                        struct iommufd_ctx *iommufd)
417 {
418         mutex_lock(&device->dev_set->lock);
419         vfio_assert_device_open(device);
420         if (device->open_count == 1)
421                 vfio_device_last_close(device, iommufd);
422         device->open_count--;
423         mutex_unlock(&device->dev_set->lock);
424 }
425
426 /*
427  * Wrapper around pm_runtime_resume_and_get().
428  * Return error code on failure or 0 on success.
429  */
430 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
431 {
432         struct device *dev = device->dev;
433
434         if (dev->driver && dev->driver->pm) {
435                 int ret;
436
437                 ret = pm_runtime_resume_and_get(dev);
438                 if (ret) {
439                         dev_info_ratelimited(dev,
440                                 "vfio: runtime resume failed %d\n", ret);
441                         return -EIO;
442                 }
443         }
444
445         return 0;
446 }
447
448 /*
449  * Wrapper around pm_runtime_put().
450  */
451 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
452 {
453         struct device *dev = device->dev;
454
455         if (dev->driver && dev->driver->pm)
456                 pm_runtime_put(dev);
457 }
458
459 /*
460  * VFIO Device fd
461  */
462 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
463 {
464         struct vfio_device *device = filep->private_data;
465
466         vfio_device_group_close(device);
467
468         vfio_device_put_registration(device);
469
470         return 0;
471 }
472
473 /*
474  * vfio_mig_get_next_state - Compute the next step in the FSM
475  * @cur_fsm - The current state the device is in
476  * @new_fsm - The target state to reach
477  * @next_fsm - Pointer to the next step to get to new_fsm
478  *
479  * Return 0 upon success, otherwise -errno
480  * Upon success the next step in the state progression between cur_fsm and
481  * new_fsm will be set in next_fsm.
482  *
483  * This breaks down requests for combination transitions into smaller steps and
484  * returns the next step to get to new_fsm. The function may need to be called
485  * multiple times before reaching new_fsm.
486  *
487  */
488 int vfio_mig_get_next_state(struct vfio_device *device,
489                             enum vfio_device_mig_state cur_fsm,
490                             enum vfio_device_mig_state new_fsm,
491                             enum vfio_device_mig_state *next_fsm)
492 {
493         enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
494         /*
495          * The coding in this table requires the driver to implement the
496          * following FSM arcs:
497          *         RESUMING -> STOP
498          *         STOP -> RESUMING
499          *         STOP -> STOP_COPY
500          *         STOP_COPY -> STOP
501          *
502          * If P2P is supported then the driver must also implement these FSM
503          * arcs:
504          *         RUNNING -> RUNNING_P2P
505          *         RUNNING_P2P -> RUNNING
506          *         RUNNING_P2P -> STOP
507          *         STOP -> RUNNING_P2P
508          *
509          * If precopy is supported then the driver must support these additional
510          * FSM arcs:
511          *         RUNNING -> PRE_COPY
512          *         PRE_COPY -> RUNNING
513          *         PRE_COPY -> STOP_COPY
514          * However, if precopy and P2P are supported together then the driver
515          * must support these additional arcs beyond the P2P arcs above:
516          *         PRE_COPY -> RUNNING
517          *         PRE_COPY -> PRE_COPY_P2P
518          *         PRE_COPY_P2P -> PRE_COPY
519          *         PRE_COPY_P2P -> RUNNING_P2P
520          *         PRE_COPY_P2P -> STOP_COPY
521          *         RUNNING -> PRE_COPY
522          *         RUNNING_P2P -> PRE_COPY_P2P
523          *
524          * Without P2P and precopy the driver must implement:
525          *         RUNNING -> STOP
526          *         STOP -> RUNNING
527          *
528          * The coding will step through multiple states for some combination
529          * transitions; if all optional features are supported, this means the
530          * following ones:
531          *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
532          *         PRE_COPY -> RUNNING -> RUNNING_P2P
533          *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
534          *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
535          *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
536          *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
537          *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
538          *         RESUMING -> STOP -> RUNNING_P2P
539          *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
540          *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
541          *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
542          *         RESUMING -> STOP -> STOP_COPY
543          *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
544          *         RUNNING -> RUNNING_P2P -> STOP
545          *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
546          *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
547          *         RUNNING_P2P -> RUNNING -> PRE_COPY
548          *         RUNNING_P2P -> STOP -> RESUMING
549          *         RUNNING_P2P -> STOP -> STOP_COPY
550          *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
551          *         STOP -> RUNNING_P2P -> RUNNING
552          *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
553          *         STOP_COPY -> STOP -> RESUMING
554          *         STOP_COPY -> STOP -> RUNNING_P2P
555          *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
556          *
557          *  The following transitions are blocked:
558          *         STOP_COPY -> PRE_COPY
559          *         STOP_COPY -> PRE_COPY_P2P
560          */
561         static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
562                 [VFIO_DEVICE_STATE_STOP] = {
563                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
564                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
565                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
566                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
567                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
568                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
569                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
570                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
571                 },
572                 [VFIO_DEVICE_STATE_RUNNING] = {
573                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
574                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
575                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
576                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
577                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
578                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
579                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
580                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
581                 },
582                 [VFIO_DEVICE_STATE_PRE_COPY] = {
583                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
584                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
585                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
586                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
587                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
588                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
589                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
590                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
591                 },
592                 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
593                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
594                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
595                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
596                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
597                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
598                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
599                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
600                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
601                 },
602                 [VFIO_DEVICE_STATE_STOP_COPY] = {
603                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
604                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
605                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
606                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
607                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
608                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
609                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
610                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
611                 },
612                 [VFIO_DEVICE_STATE_RESUMING] = {
613                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
614                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
615                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
616                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
617                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
618                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
619                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
620                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
621                 },
622                 [VFIO_DEVICE_STATE_RUNNING_P2P] = {
623                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
624                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
625                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
626                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
627                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
628                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
629                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
630                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
631                 },
632                 [VFIO_DEVICE_STATE_ERROR] = {
633                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
634                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
635                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
636                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
637                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
638                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
639                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
640                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
641                 },
642         };
643
644         static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
645                 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
646                 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
647                 [VFIO_DEVICE_STATE_PRE_COPY] =
648                         VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
649                 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
650                                                    VFIO_MIGRATION_P2P |
651                                                    VFIO_MIGRATION_PRE_COPY,
652                 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
653                 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
654                 [VFIO_DEVICE_STATE_RUNNING_P2P] =
655                         VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
656                 [VFIO_DEVICE_STATE_ERROR] = ~0U,
657         };
658
659         if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
660                     (state_flags_table[cur_fsm] & device->migration_flags) !=
661                         state_flags_table[cur_fsm]))
662                 return -EINVAL;
663
664         if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
665            (state_flags_table[new_fsm] & device->migration_flags) !=
666                         state_flags_table[new_fsm])
667                 return -EINVAL;
668
669         /*
670          * Arcs touching optional and unsupported states are skipped over. The
671          * driver will instead see an arc from the original state to the next
672          * logical state, as per the above comment.
673          */
674         *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
675         while ((state_flags_table[*next_fsm] & device->migration_flags) !=
676                         state_flags_table[*next_fsm])
677                 *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
678
679         return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
680 }
681 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
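
/*
 * Example (editorial sketch, not part of this file): the intended calling
 * pattern from a driver's migration_set_state() implementation.  The helpers
 * my_cur_state() and my_step_one_arc() are illustrative assumptions; the
 * latter implements exactly one of the FSM arcs listed above.
 *
 *      static struct file *
 *      my_set_state(struct vfio_device *vdev,
 *                   enum vfio_device_mig_state new_state)
 *      {
 *              enum vfio_device_mig_state next_state;
 *              struct file *res = NULL;
 *              int ret;
 *
 *              while (my_cur_state(vdev) != new_state) {
 *                      ret = vfio_mig_get_next_state(vdev, my_cur_state(vdev),
 *                                                    new_state, &next_state);
 *                      if (ret)
 *                              return ERR_PTR(ret);
 *                      res = my_step_one_arc(vdev, next_state);
 *                      if (IS_ERR(res))
 *                              break;
 *              }
 *              return res;
 *      }
 */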
682
683 /*
684  * Convert the driver's struct file into an FD number and return it to userspace
685  */
686 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
687                                    struct vfio_device_feature_mig_state *mig)
688 {
689         int ret;
690         int fd;
691
692         fd = get_unused_fd_flags(O_CLOEXEC);
693         if (fd < 0) {
694                 ret = fd;
695                 goto out_fput;
696         }
697
698         mig->data_fd = fd;
699         if (copy_to_user(arg, mig, sizeof(*mig))) {
700                 ret = -EFAULT;
701                 goto out_put_unused;
702         }
703         fd_install(fd, filp);
704         return 0;
705
706 out_put_unused:
707         put_unused_fd(fd);
708 out_fput:
709         fput(filp);
710         return ret;
711 }
712
713 static int
714 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
715                                            u32 flags, void __user *arg,
716                                            size_t argsz)
717 {
718         size_t minsz =
719                 offsetofend(struct vfio_device_feature_mig_state, data_fd);
720         struct vfio_device_feature_mig_state mig;
721         struct file *filp = NULL;
722         int ret;
723
724         if (!device->mig_ops)
725                 return -ENOTTY;
726
727         ret = vfio_check_feature(flags, argsz,
728                                  VFIO_DEVICE_FEATURE_SET |
729                                  VFIO_DEVICE_FEATURE_GET,
730                                  sizeof(mig));
731         if (ret != 1)
732                 return ret;
733
734         if (copy_from_user(&mig, arg, minsz))
735                 return -EFAULT;
736
737         if (flags & VFIO_DEVICE_FEATURE_GET) {
738                 enum vfio_device_mig_state curr_state;
739
740                 ret = device->mig_ops->migration_get_state(device,
741                                                            &curr_state);
742                 if (ret)
743                         return ret;
744                 mig.device_state = curr_state;
745                 goto out_copy;
746         }
747
748         /* Handle the VFIO_DEVICE_FEATURE_SET */
749         filp = device->mig_ops->migration_set_state(device, mig.device_state);
750         if (IS_ERR(filp) || !filp)
751                 goto out_copy;
752
753         return vfio_ioct_mig_return_fd(filp, arg, &mig);
754 out_copy:
755         mig.data_fd = -1;
756         if (copy_to_user(arg, &mig, sizeof(mig)))
757                 return -EFAULT;
758         if (IS_ERR(filp))
759                 return PTR_ERR(filp);
760         return 0;
761 }
762
763 static int
764 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
765                                               u32 flags, void __user *arg,
766                                               size_t argsz)
767 {
768         struct vfio_device_feature_mig_data_size data_size = {};
769         unsigned long stop_copy_length;
770         int ret;
771
772         if (!device->mig_ops)
773                 return -ENOTTY;
774
775         ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
776                                  sizeof(data_size));
777         if (ret != 1)
778                 return ret;
779
780         ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
781         if (ret)
782                 return ret;
783
784         data_size.stop_copy_length = stop_copy_length;
785         if (copy_to_user(arg, &data_size, sizeof(data_size)))
786                 return -EFAULT;
787
788         return 0;
789 }
790
791 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
792                                                u32 flags, void __user *arg,
793                                                size_t argsz)
794 {
795         struct vfio_device_feature_migration mig = {
796                 .flags = device->migration_flags,
797         };
798         int ret;
799
800         if (!device->mig_ops)
801                 return -ENOTTY;
802
803         ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
804                                  sizeof(mig));
805         if (ret != 1)
806                 return ret;
807         if (copy_to_user(arg, &mig, sizeof(mig)))
808                 return -EFAULT;
809         return 0;
810 }
811
812 /* Ranges should fit into a single kernel page */
813 #define LOG_MAX_RANGES \
814         (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
815
816 static int
817 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
818                                         u32 flags, void __user *arg,
819                                         size_t argsz)
820 {
821         size_t minsz =
822                 offsetofend(struct vfio_device_feature_dma_logging_control,
823                             ranges);
824         struct vfio_device_feature_dma_logging_range __user *ranges;
825         struct vfio_device_feature_dma_logging_control control;
826         struct vfio_device_feature_dma_logging_range range;
827         struct rb_root_cached root = RB_ROOT_CACHED;
828         struct interval_tree_node *nodes;
829         u64 iova_end;
830         u32 nnodes;
831         int i, ret;
832
833         if (!device->log_ops)
834                 return -ENOTTY;
835
836         ret = vfio_check_feature(flags, argsz,
837                                  VFIO_DEVICE_FEATURE_SET,
838                                  sizeof(control));
839         if (ret != 1)
840                 return ret;
841
842         if (copy_from_user(&control, arg, minsz))
843                 return -EFAULT;
844
845         nnodes = control.num_ranges;
846         if (!nnodes)
847                 return -EINVAL;
848
849         if (nnodes > LOG_MAX_RANGES)
850                 return -E2BIG;
851
852         ranges = u64_to_user_ptr(control.ranges);
853         nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
854                               GFP_KERNEL);
855         if (!nodes)
856                 return -ENOMEM;
857
858         for (i = 0; i < nnodes; i++) {
859                 if (copy_from_user(&range, &ranges[i], sizeof(range))) {
860                         ret = -EFAULT;
861                         goto end;
862                 }
863                 if (!IS_ALIGNED(range.iova, control.page_size) ||
864                     !IS_ALIGNED(range.length, control.page_size)) {
865                         ret = -EINVAL;
866                         goto end;
867                 }
868
869                 if (check_add_overflow(range.iova, range.length, &iova_end) ||
870                     iova_end > ULONG_MAX) {
871                         ret = -EOVERFLOW;
872                         goto end;
873                 }
874
875                 nodes[i].start = range.iova;
876                 nodes[i].last = range.iova + range.length - 1;
877                 if (interval_tree_iter_first(&root, nodes[i].start,
878                                              nodes[i].last)) {
879                         /* Range overlapping */
880                         ret = -EINVAL;
881                         goto end;
882                 }
883                 interval_tree_insert(nodes + i, &root);
884         }
885
886         ret = device->log_ops->log_start(device, &root, nnodes,
887                                          &control.page_size);
888         if (ret)
889                 goto end;
890
891         if (copy_to_user(arg, &control, sizeof(control))) {
892                 ret = -EFAULT;
893                 device->log_ops->log_stop(device);
894         }
895
896 end:
897         kfree(nodes);
898         return ret;
899 }
900
901 static int
902 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
903                                        u32 flags, void __user *arg,
904                                        size_t argsz)
905 {
906         int ret;
907
908         if (!device->log_ops)
909                 return -ENOTTY;
910
911         ret = vfio_check_feature(flags, argsz,
912                                  VFIO_DEVICE_FEATURE_SET, 0);
913         if (ret != 1)
914                 return ret;
915
916         return device->log_ops->log_stop(device);
917 }
918
919 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
920                                           unsigned long iova, size_t length,
921                                           void *opaque)
922 {
923         struct vfio_device *device = opaque;
924
925         return device->log_ops->log_read_and_clear(device, iova, length, iter);
926 }
927
928 static int
929 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
930                                          u32 flags, void __user *arg,
931                                          size_t argsz)
932 {
933         size_t minsz =
934                 offsetofend(struct vfio_device_feature_dma_logging_report,
935                             bitmap);
936         struct vfio_device_feature_dma_logging_report report;
937         struct iova_bitmap *iter;
938         u64 iova_end;
939         int ret;
940
941         if (!device->log_ops)
942                 return -ENOTTY;
943
944         ret = vfio_check_feature(flags, argsz,
945                                  VFIO_DEVICE_FEATURE_GET,
946                                  sizeof(report));
947         if (ret != 1)
948                 return ret;
949
950         if (copy_from_user(&report, arg, minsz))
951                 return -EFAULT;
952
953         if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
954                 return -EINVAL;
955
956         if (check_add_overflow(report.iova, report.length, &iova_end) ||
957             iova_end > ULONG_MAX)
958                 return -EOVERFLOW;
959
960         iter = iova_bitmap_alloc(report.iova, report.length,
961                                  report.page_size,
962                                  u64_to_user_ptr(report.bitmap));
963         if (IS_ERR(iter))
964                 return PTR_ERR(iter);
965
966         ret = iova_bitmap_for_each(iter, device,
967                                    vfio_device_log_read_and_clear);
968
969         iova_bitmap_free(iter);
970         return ret;
971 }
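
/*
 * Example (editorial sketch, not part of this file): a driver opts into the
 * DMA logging ioctls above by providing a struct vfio_log_ops before
 * registration, typically from its @init callback.  The my_log_* callbacks
 * are illustrative assumptions.
 *
 *      static const struct vfio_log_ops my_log_ops = {
 *              .log_start = my_log_start,
 *              .log_stop = my_log_stop,
 *              .log_read_and_clear = my_log_read_and_clear,
 *      };
 *
 *      vdev->log_ops = &my_log_ops;
 */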
972
973 static int vfio_ioctl_device_feature(struct vfio_device *device,
974                                      struct vfio_device_feature __user *arg)
975 {
976         size_t minsz = offsetofend(struct vfio_device_feature, flags);
977         struct vfio_device_feature feature;
978
979         if (copy_from_user(&feature, arg, minsz))
980                 return -EFAULT;
981
982         if (feature.argsz < minsz)
983                 return -EINVAL;
984
985         /* Check unknown flags */
986         if (feature.flags &
987             ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
988               VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
989                 return -EINVAL;
990
991         /* GET & SET are mutually exclusive except with PROBE */
992         if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
993             (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
994             (feature.flags & VFIO_DEVICE_FEATURE_GET))
995                 return -EINVAL;
996
997         switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
998         case VFIO_DEVICE_FEATURE_MIGRATION:
999                 return vfio_ioctl_device_feature_migration(
1000                         device, feature.flags, arg->data,
1001                         feature.argsz - minsz);
1002         case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1003                 return vfio_ioctl_device_feature_mig_device_state(
1004                         device, feature.flags, arg->data,
1005                         feature.argsz - minsz);
1006         case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1007                 return vfio_ioctl_device_feature_logging_start(
1008                         device, feature.flags, arg->data,
1009                         feature.argsz - minsz);
1010         case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1011                 return vfio_ioctl_device_feature_logging_stop(
1012                         device, feature.flags, arg->data,
1013                         feature.argsz - minsz);
1014         case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1015                 return vfio_ioctl_device_feature_logging_report(
1016                         device, feature.flags, arg->data,
1017                         feature.argsz - minsz);
1018         case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1019                 return vfio_ioctl_device_feature_migration_data_size(
1020                         device, feature.flags, arg->data,
1021                         feature.argsz - minsz);
1022         default:
1023                 if (unlikely(!device->ops->device_feature))
1024                         return -EINVAL;
1025                 return device->ops->device_feature(device, feature.flags,
1026                                                    arg->data,
1027                                                    feature.argsz - minsz);
1028         }
1029 }
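
/*
 * Example (editorial sketch, not part of this file): how a driver's
 * ->device_feature() callback, reached through the default case above, might
 * use vfio_check_feature() for a feature the core does not handle itself.
 * my_enter_low_power() is an illustrative assumption.
 *
 *      static int my_device_feature(struct vfio_device *vdev, u32 flags,
 *                                   void __user *arg, size_t argsz)
 *      {
 *              int ret;
 *
 *              switch (flags & VFIO_DEVICE_FEATURE_MASK) {
 *              case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY:
 *                      ret = vfio_check_feature(flags, argsz,
 *                                               VFIO_DEVICE_FEATURE_SET, 0);
 *                      if (ret != 1)
 *                              return ret;
 *                      return my_enter_low_power(vdev);
 *              default:
 *                      return -ENOTTY;
 *              }
 *      }
 */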
1030
1031 static long vfio_device_fops_unl_ioctl(struct file *filep,
1032                                        unsigned int cmd, unsigned long arg)
1033 {
1034         struct vfio_device *device = filep->private_data;
1035         int ret;
1036
1037         ret = vfio_device_pm_runtime_get(device);
1038         if (ret)
1039                 return ret;
1040
1041         switch (cmd) {
1042         case VFIO_DEVICE_FEATURE:
1043                 ret = vfio_ioctl_device_feature(device, (void __user *)arg);
1044                 break;
1045
1046         default:
1047                 if (unlikely(!device->ops->ioctl))
1048                         ret = -EINVAL;
1049                 else
1050                         ret = device->ops->ioctl(device, cmd, arg);
1051                 break;
1052         }
1053
1054         vfio_device_pm_runtime_put(device);
1055         return ret;
1056 }
1057
1058 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1059                                      size_t count, loff_t *ppos)
1060 {
1061         struct vfio_device *device = filep->private_data;
1062
1063         if (unlikely(!device->ops->read))
1064                 return -EINVAL;
1065
1066         return device->ops->read(device, buf, count, ppos);
1067 }
1068
1069 static ssize_t vfio_device_fops_write(struct file *filep,
1070                                       const char __user *buf,
1071                                       size_t count, loff_t *ppos)
1072 {
1073         struct vfio_device *device = filep->private_data;
1074
1075         if (unlikely(!device->ops->write))
1076                 return -EINVAL;
1077
1078         return device->ops->write(device, buf, count, ppos);
1079 }
1080
1081 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1082 {
1083         struct vfio_device *device = filep->private_data;
1084
1085         if (unlikely(!device->ops->mmap))
1086                 return -EINVAL;
1087
1088         return device->ops->mmap(device, vma);
1089 }
1090
1091 const struct file_operations vfio_device_fops = {
1092         .owner          = THIS_MODULE,
1093         .release        = vfio_device_fops_release,
1094         .read           = vfio_device_fops_read,
1095         .write          = vfio_device_fops_write,
1096         .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1097         .compat_ioctl   = compat_ptr_ioctl,
1098         .mmap           = vfio_device_fops_mmap,
1099 };
1100
1101 /*
1102  * Sub-module support
1103  */
1104 /*
1105  * Helper for managing a buffer of info chain capabilities: allocate or
1106  * reallocate a buffer with additional @size, filling in @id and @version
1107  * of the capability.  A pointer to the new capability is returned.
1108  *
1109  * NB. The chain is based at the head of the buffer, so new entries are
1110  * added to the tail; vfio_info_cap_shift() should be called to fix up the
1111  * next offsets prior to copying to the user buffer (see the sketch below).
1112  */
1113 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1114                                                size_t size, u16 id, u16 version)
1115 {
1116         void *buf;
1117         struct vfio_info_cap_header *header, *tmp;
1118
1119         buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1120         if (!buf) {
1121                 kfree(caps->buf);
1122                 caps->buf = NULL;
1123                 caps->size = 0;
1124                 return ERR_PTR(-ENOMEM);
1125         }
1126
1127         caps->buf = buf;
1128         header = buf + caps->size;
1129
1130         /* Eventually copied to user buffer, zero */
1131         memset(header, 0, size);
1132
1133         header->id = id;
1134         header->version = version;
1135
1136         /* Add to the end of the capability chain */
1137         for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1138                 ; /* nothing */
1139
1140         tmp->next = caps->size;
1141         caps->size += size;
1142
1143         return header;
1144 }
1145 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1146
1147 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1148 {
1149         struct vfio_info_cap_header *tmp;
1150         void *buf = (void *)caps->buf;
1151
1152         for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1153                 tmp->next += offset;
1154 }
1155 EXPORT_SYMBOL(vfio_info_cap_shift);
1156
1157 int vfio_info_add_capability(struct vfio_info_cap *caps,
1158                              struct vfio_info_cap_header *cap, size_t size)
1159 {
1160         struct vfio_info_cap_header *header;
1161
1162         header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1163         if (IS_ERR(header))
1164                 return PTR_ERR(header);
1165
1166         memcpy(header + 1, cap + 1, size - sizeof(*header));
1167
1168         return 0;
1169 }
1170 EXPORT_SYMBOL(vfio_info_add_capability);
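
/*
 * Example (editorial sketch, not part of this file): typical use of the
 * capability chain helpers from a driver's REGION_INFO handler.  "info" is a
 * struct vfio_region_info already filled in by the driver; "my_cap" stands
 * for an illustrative driver-defined capability with an embedded header.
 *
 *      struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *      ret = vfio_info_add_capability(&caps, &my_cap.header, sizeof(my_cap));
 *      if (ret)
 *              return ret;
 *
 *      if (caps.size) {
 *              info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *              info.cap_offset = sizeof(info);
 *              vfio_info_cap_shift(&caps, sizeof(info));
 *              if (copy_to_user((void __user *)arg + sizeof(info),
 *                               caps.buf, caps.size))
 *                      ret = -EFAULT;
 *              kfree(caps.buf);
 *      }
 */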
1171
1172 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1173                                        int max_irq_type, size_t *data_size)
1174 {
1175         unsigned long minsz;
1176         size_t size;
1177
1178         minsz = offsetofend(struct vfio_irq_set, count);
1179
1180         if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1181             (hdr->count >= (U32_MAX - hdr->start)) ||
1182             (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1183                                 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1184                 return -EINVAL;
1185
1186         if (data_size)
1187                 *data_size = 0;
1188
1189         if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1190                 return -EINVAL;
1191
1192         switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1193         case VFIO_IRQ_SET_DATA_NONE:
1194                 size = 0;
1195                 break;
1196         case VFIO_IRQ_SET_DATA_BOOL:
1197                 size = sizeof(uint8_t);
1198                 break;
1199         case VFIO_IRQ_SET_DATA_EVENTFD:
1200                 size = sizeof(int32_t);
1201                 break;
1202         default:
1203                 return -EINVAL;
1204         }
1205
1206         if (size) {
1207                 if (hdr->argsz - minsz < hdr->count * size)
1208                         return -EINVAL;
1209
1210                 if (!data_size)
1211                         return -EINVAL;
1212
1213                 *data_size = hdr->count * size;
1214         }
1215
1216         return 0;
1217 }
1218 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
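
/*
 * Example (editorial sketch, not part of this file): a driver's
 * VFIO_DEVICE_SET_IRQS handler validating the header and pulling in the
 * variable-length payload.  my_num_irqs and MY_NUM_IRQ_TYPES are
 * illustrative assumptions.
 *
 *      minsz = offsetofend(struct vfio_irq_set, count);
 *      if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *              return -EFAULT;
 *
 *      ret = vfio_set_irqs_validate_and_prepare(&hdr, my_num_irqs,
 *                                               MY_NUM_IRQ_TYPES, &data_size);
 *      if (ret)
 *              return ret;
 *
 *      if (data_size) {
 *              data = memdup_user((void __user *)(arg + minsz), data_size);
 *              if (IS_ERR(data))
 *                      return PTR_ERR(data);
 *      }
 */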
1219
1220 /*
1221  * Pin contiguous user pages and return their associated host pages for local
1222  * domain only.
1223  * @device [in]  : device
1224  * @iova [in]    : starting IOVA of user pages to be pinned.
1225  * @npage [in]   : count of pages to be pinned.  This count should not
1226  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1227  * @prot [in]    : protection flags
1228  * @pages[out]   : array of host pages
1229  * Return error or number of pages pinned.
1230  *
1231  * A driver may only call this function if the vfio_device was created
1232  * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1233  */
1234 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1235                    int npage, int prot, struct page **pages)
1236 {
1237         /* group->container cannot change while a vfio device is open */
1238         if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1239                 return -EINVAL;
1240         if (vfio_device_has_container(device))
1241                 return vfio_device_container_pin_pages(device, iova,
1242                                                        npage, prot, pages);
1243         if (device->iommufd_access) {
1244                 int ret;
1245
1246                 if (iova > ULONG_MAX)
1247                         return -EINVAL;
1248                 /*
1249                  * VFIO ignores the sub-page offset; npage counts PAGE_SIZE chunks
1250                  * starting at the aligned IOVA. The caller is expected to recover
1251                  * the sub page offset by doing:
1252                  *     pages[0] + (iova % PAGE_SIZE)
1253                  */
1254                 ret = iommufd_access_pin_pages(
1255                         device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1256                         npage * PAGE_SIZE, pages,
1257                         (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1258                 if (ret)
1259                         return ret;
1260                 return npage;
1261         }
1262         return -EINVAL;
1263 }
1264 EXPORT_SYMBOL(vfio_pin_pages);
1265
1266 /*
1267  * Unpin contiguous host pages for local domain only.
1268  * @device [in]  : device
1269  * @iova [in]    : starting address of user pages to be unpinned.
1270  * @npage [in]   : count of pages to be unpinned.  This count should not
1271  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1272  */
1273 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1274 {
1275         if (WARN_ON(!vfio_assert_device_open(device)))
1276                 return;
1277
1278         if (vfio_device_has_container(device)) {
1279                 vfio_device_container_unpin_pages(device, iova, npage);
1280                 return;
1281         }
1282         if (device->iommufd_access) {
1283                 if (WARN_ON(iova > ULONG_MAX))
1284                         return;
1285                 iommufd_access_unpin_pages(device->iommufd_access,
1286                                            ALIGN_DOWN(iova, PAGE_SIZE),
1287                                            npage * PAGE_SIZE);
1288                 return;
1289         }
1290 }
1291 EXPORT_SYMBOL(vfio_unpin_pages);
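
/*
 * Example (editorial sketch, not part of this file): an emulated-IOMMU
 * (mdev-style) driver pinning a single page of guest memory, using it, and
 * unpinning it.  "my->vdev" is a hypothetical embedded struct vfio_device.
 *
 *      struct page *pages[1];
 *      int ret;
 *
 *      ret = vfio_pin_pages(&my->vdev, iova, 1,
 *                           IOMMU_READ | IOMMU_WRITE, pages);
 *      if (ret != 1)
 *              return ret < 0 ? ret : -EFAULT;
 *
 *      ... access pages[0], remembering the iova % PAGE_SIZE offset ...
 *
 *      vfio_unpin_pages(&my->vdev, iova, 1);
 */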
1292
1293 /*
1294  * This interface allows the CPUs to perform a form of virtual DMA on
1295  * behalf of the device.
1296  *
1297  * The CPUs copy between a kernel buffer and a range of IOVAs that map
1298  * user space memory.
1299  *
1300  * As the read/write of user space memory is conducted via the CPUs and is
1301  * not a real device DMA, it is not necessary to pin the user space memory.
1302  *
1303  * @device [in]         : VFIO device
1304  * @iova [in]           : base IOVA of a user space buffer
1305  * @data [in]           : pointer to kernel buffer
1306  * @len [in]            : kernel buffer length
1307  * @write               : indicate read or write
1308  * Return error code on failure or 0 on success.
1309  */
1310 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1311                 size_t len, bool write)
1312 {
1313         if (!data || len <= 0 || !vfio_assert_device_open(device))
1314                 return -EINVAL;
1315
1316         if (vfio_device_has_container(device))
1317                 return vfio_device_container_dma_rw(device, iova,
1318                                                     data, len, write);
1319
1320         if (device->iommufd_access) {
1321                 unsigned int flags = 0;
1322
1323                 if (iova > ULONG_MAX)
1324                         return -EINVAL;
1325
1326                 /* VFIO historically tries to auto-detect a kthread */
1327                 if (!current->mm)
1328                         flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1329                 if (write)
1330                         flags |= IOMMUFD_ACCESS_RW_WRITE;
1331                 return iommufd_access_rw(device->iommufd_access, iova, data,
1332                                          len, flags);
1333         }
1334         return -EINVAL;
1335 }
1336 EXPORT_SYMBOL(vfio_dma_rw);
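
/*
 * Example (editorial sketch, not part of this file): reading a small guest
 * structure at @iova into a kernel buffer without pinning.  struct my_desc
 * and "my->vdev" are illustrative assumptions.
 *
 *      struct my_desc desc;
 *
 *      ret = vfio_dma_rw(&my->vdev, iova, &desc, sizeof(desc), false);
 *      if (ret)
 *              return ret;
 */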
1337
1338 /*
1339  * Module/class support
1340  */
1341 static int __init vfio_init(void)
1342 {
1343         int ret;
1344
1345         ida_init(&vfio.device_ida);
1346
1347         ret = vfio_group_init();
1348         if (ret)
1349                 return ret;
1350
1351         ret = vfio_virqfd_init();
1352         if (ret)
1353                 goto err_virqfd;
1354
1355         /* /sys/class/vfio-dev/vfioX */
1356         vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
1357         if (IS_ERR(vfio.device_class)) {
1358                 ret = PTR_ERR(vfio.device_class);
1359                 goto err_dev_class;
1360         }
1361
1362         pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1363         return 0;
1364
1365 err_dev_class:
1366         vfio_virqfd_exit();
1367 err_virqfd:
1368         vfio_group_cleanup();
1369         return ret;
1370 }
1371
1372 static void __exit vfio_cleanup(void)
1373 {
1374         ida_destroy(&vfio.device_ida);
1375         class_destroy(vfio.device_class);
1376         vfio.device_class = NULL;
1377         vfio_virqfd_exit();
1378         vfio_group_cleanup();
1379         xa_destroy(&vfio_device_set_xa);
1380 }
1381
1382 module_init(vfio_init);
1383 module_exit(vfio_cleanup);
1384
1385 MODULE_VERSION(DRIVER_VERSION);
1386 MODULE_LICENSE("GPL v2");
1387 MODULE_AUTHOR(DRIVER_AUTHOR);
1388 MODULE_DESCRIPTION(DRIVER_DESC);
1389 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");