drivers/vfio/vfio.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * VFIO core
   4  *
   5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6  *     Author: Alex Williamson <alex.williamson@redhat.com>
   7  *
   8  * Derived from original vfio:
   9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10  * Author: Tom Lyon, pugs@cisco.com
  11  */
  12
  13 #include <linux/cdev.h>
  14 #include <linux/compat.h>
  15 #include <linux/device.h>
  16 #include <linux/file.h>
  17 #include <linux/anon_inodes.h>
  18 #include <linux/fs.h>
  19 #include <linux/idr.h>
  20 #include <linux/iommu.h>
  21 #include <linux/list.h>
  22 #include <linux/miscdevice.h>
  23 #include <linux/module.h>
  24 #include <linux/mutex.h>
  25 #include <linux/pci.h>
  26 #include <linux/rwsem.h>
  27 #include <linux/sched.h>
  28 #include <linux/slab.h>
  29 #include <linux/stat.h>
  30 #include <linux/string.h>
  31 #include <linux/uaccess.h>
  32 #include <linux/vfio.h>
  33 #include <linux/wait.h>
  34 #include <linux/sched/signal.h>
  35
  36 #define DRIVER_VERSION  "0.3"
  37 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  38 #define DRIVER_DESC     "VFIO - User Level meta-driver"
  39
  40 static struct vfio {
  41         struct class                    *class;
  42         struct list_head                iommu_drivers_list;
  43         struct mutex                    iommu_drivers_lock;
  44         struct list_head                group_list;
  45         struct idr                      group_idr;
  46         struct mutex                    group_lock;
  47         struct cdev                     group_cdev;
  48         dev_t                           group_devt;
  49         wait_queue_head_t               release_q;
  50 } vfio;
  51
  52 struct vfio_iommu_driver {
  53         const struct vfio_iommu_driver_ops      *ops;
  54         struct list_head                        vfio_next;
  55 };
  56
  57 struct vfio_container {
  58         struct kref                     kref;
  59         struct list_head                group_list;
  60         struct rw_semaphore             group_lock;
  61         struct vfio_iommu_driver        *iommu_driver;
  62         void                            *iommu_data;
  63         bool                            noiommu;
  64 };
  65
  66 struct vfio_unbound_dev {
  67         struct device                   *dev;
  68         struct list_head                unbound_next;
  69 };
  70
  71 struct vfio_group {
  72         struct kref                     kref;
  73         int                             minor;
  74         atomic_t                        container_users;
  75         struct iommu_group              *iommu_group;
  76         struct vfio_container           *container;
  77         struct list_head                device_list;
  78         struct mutex                    device_lock;
  79         struct device                   *dev;
  80         struct notifier_block           nb;
  81         struct list_head                vfio_next;
  82         struct list_head                container_next;
  83         struct list_head                unbound_list;
  84         struct mutex                    unbound_lock;
  85         atomic_t                        opened;
  86         wait_queue_head_t               container_q;
  87         bool                            noiommu;
  88         struct kvm                      *kvm;
  89         struct blocking_notifier_head   notifier;
  90 };
  91
  92 struct vfio_device {
  93         struct kref                     kref;
  94         struct device                   *dev;
  95         const struct vfio_device_ops    *ops;
  96         struct vfio_group               *group;
  97         struct list_head                group_next;
  98         void                            *device_data;
  99 };
 100
 101 #ifdef CONFIG_VFIO_NOIOMMU
 102 static bool noiommu __read_mostly;
 103 module_param_named(enable_unsafe_noiommu_mode,
 104                    noiommu, bool, S_IRUGO | S_IWUSR);
 105 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
 106 #endif
 107
 108 /*
 109  * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 110  * and remove functions, any use cases other than acquiring the first
 111  * reference for the purpose of calling vfio_add_group_dev() or removing
 112  * that symmetric reference after vfio_del_group_dev() should use the raw
 113  * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 114  * removes the device from the dummy group and cannot be nested.
 115  */
 116 struct iommu_group *vfio_iommu_group_get(struct device *dev)
 117 {
 118         struct iommu_group *group;
 119         int __maybe_unused ret;
 120
 121         group = iommu_group_get(dev);
 122
 123 #ifdef CONFIG_VFIO_NOIOMMU
 124         /*
 125          * With noiommu enabled, an IOMMU group will be created for a device
 126          * that doesn't already have one and doesn't have an iommu_ops on their
 127          * bus.  We set iommudata simply to be able to identify these groups
 128          * as special use and for reclamation later.
 129          */
 130         if (group || !noiommu || iommu_present(dev->bus))
 131                 return group;
 132
 133         group = iommu_group_alloc();
 134         if (IS_ERR(group))
 135                 return NULL;
 136
 137         iommu_group_set_name(group, "vfio-noiommu");
 138         iommu_group_set_iommudata(group, &noiommu, NULL);
 139         ret = iommu_group_add_device(group, dev);
 140         if (ret) {
 141                 iommu_group_put(group);
 142                 return NULL;
 143         }
 144
 145         /*
 146          * Where to taint?  At this point we've added an IOMMU group for a
 147          * device that is not backed by iommu_ops, therefore any iommu_
 148          * callback using iommu_ops can legitimately Oops.  So, while we may
 149          * be about to give a DMA capable device to a user without IOMMU
 150          * protection, which is clearly taint-worthy, let's go ahead and do
 151          * it here.
 152          */
 153         add_taint(TAINT_USER, LOCKDEP_STILL_OK);
 154         dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
 155 #endif
 156
 157         return group;
 158 }
 159 EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
 160
 161 void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
 162 {
 163 #ifdef CONFIG_VFIO_NOIOMMU
 164         if (iommu_group_get_iommudata(group) == &noiommu)
 165                 iommu_group_remove_device(dev);
 166 #endif
 167
 168         iommu_group_put(group);
 169 }
 170 EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
 171
 172 #ifdef CONFIG_VFIO_NOIOMMU
 173 static void *vfio_noiommu_open(unsigned long arg)
 174 {
 175         if (arg != VFIO_NOIOMMU_IOMMU)
 176                 return ERR_PTR(-EINVAL);
 177         if (!capable(CAP_SYS_RAWIO))
 178                 return ERR_PTR(-EPERM);
 179
 180         return NULL;
 181 }
 182
 183 static void vfio_noiommu_release(void *iommu_data)
 184 {
 185 }
 186
 187 static long vfio_noiommu_ioctl(void *iommu_data,
 188                                unsigned int cmd, unsigned long arg)
 189 {
 190         if (cmd == VFIO_CHECK_EXTENSION)
 191                 return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
 192
 193         return -ENOTTY;
 194 }
 195
 196 static int vfio_noiommu_attach_group(void *iommu_data,
 197                                      struct iommu_group *iommu_group)
 198 {
 199         return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
 200 }
 201
 202 static void vfio_noiommu_detach_group(void *iommu_data,
 203                                       struct iommu_group *iommu_group)
 204 {
 205 }
 206
 207 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
 208         .name = "vfio-noiommu",
 209         .owner = THIS_MODULE,
 210         .open = vfio_noiommu_open,
 211         .release = vfio_noiommu_release,
 212         .ioctl = vfio_noiommu_ioctl,
 213         .attach_group = vfio_noiommu_attach_group,
 214         .detach_group = vfio_noiommu_detach_group,
 215 };
 216 #endif
 217
 218
/*
 * IOMMU driver registration
 */
 222 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 223 {
 224         struct vfio_iommu_driver *driver, *tmp;
 225
 226         driver = kzalloc(sizeof(*driver), GFP_KERNEL);
 227         if (!driver)
 228                 return -ENOMEM;
 229
 230         driver->ops = ops;
 231
 232         mutex_lock(&vfio.iommu_drivers_lock);
 233
 234         /* Check for duplicates */
 235         list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
 236                 if (tmp->ops == ops) {
 237                         mutex_unlock(&vfio.iommu_drivers_lock);
 238                         kfree(driver);
 239                         return -EINVAL;
 240                 }
 241         }
 242
 243         list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
 244
 245         mutex_unlock(&vfio.iommu_drivers_lock);
 246
 247         return 0;
 248 }
 249 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
 250
 251 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 252 {
 253         struct vfio_iommu_driver *driver;
 254
 255         mutex_lock(&vfio.iommu_drivers_lock);
 256         list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
 257                 if (driver->ops == ops) {
 258                         list_del(&driver->vfio_next);
 259                         mutex_unlock(&vfio.iommu_drivers_lock);
 260                         kfree(driver);
 261                         return;
 262                 }
 263         }
 264         mutex_unlock(&vfio.iommu_drivers_lock);
 265 }
 266 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
 267
/*
 * Group minor allocation/free - both called with vfio.group_lock held
 */
 271 static int vfio_alloc_group_minor(struct vfio_group *group)
 272 {
 273         return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
 274 }
 275
 276 static void vfio_free_group_minor(int minor)
 277 {
 278         idr_remove(&vfio.group_idr, minor);
 279 }
 280
 281 static int vfio_iommu_group_notifier(struct notifier_block *nb,
 282                                      unsigned long action, void *data);
 283 static void vfio_group_get(struct vfio_group *group);
 284
/*
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * they are freed via kref.  Must support container/group/device being
 * closed in any order.
 */
 291 static void vfio_container_get(struct vfio_container *container)
 292 {
 293         kref_get(&container->kref);
 294 }
 295
 296 static void vfio_container_release(struct kref *kref)
 297 {
 298         struct vfio_container *container;
 299         container = container_of(kref, struct vfio_container, kref);
 300
 301         kfree(container);
 302 }
 303
 304 static void vfio_container_put(struct vfio_container *container)
 305 {
 306         kref_put(&container->kref, vfio_container_release);
 307 }
 308
 309 static void vfio_group_unlock_and_free(struct vfio_group *group)
 310 {
 311         mutex_unlock(&vfio.group_lock);
 312         /*
 313          * Unregister outside of lock.  A spurious callback is harmless now
 314          * that the group is no longer in vfio.group_list.
 315          */
 316         iommu_group_unregister_notifier(group->iommu_group, &group->nb);
 317         kfree(group);
 318 }
 319
/*
 * Group objects - create, release, get, put, search
 */
 323 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 324 {
 325         struct vfio_group *group, *tmp;
 326         struct device *dev;
 327         int ret, minor;
 328
 329         group = kzalloc(sizeof(*group), GFP_KERNEL);
 330         if (!group)
 331                 return ERR_PTR(-ENOMEM);
 332
 333         kref_init(&group->kref);
 334         INIT_LIST_HEAD(&group->device_list);
 335         mutex_init(&group->device_lock);
 336         INIT_LIST_HEAD(&group->unbound_list);
 337         mutex_init(&group->unbound_lock);
 338         atomic_set(&group->container_users, 0);
 339         atomic_set(&group->opened, 0);
 340         init_waitqueue_head(&group->container_q);
 341         group->iommu_group = iommu_group;
 342 #ifdef CONFIG_VFIO_NOIOMMU
 343         group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
 344 #endif
 345         BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
 346
 347         group->nb.notifier_call = vfio_iommu_group_notifier;
 348
 349         /*
 350          * blocking notifiers acquire a rwsem around registering and hold
 351          * it around callback.  Therefore, need to register outside of
 352          * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
 353          * do anything unless it can find the group in vfio.group_list, so
 354          * no harm in registering early.
 355          */
 356         ret = iommu_group_register_notifier(iommu_group, &group->nb);
 357         if (ret) {
 358                 kfree(group);
 359                 return ERR_PTR(ret);
 360         }
 361
 362         mutex_lock(&vfio.group_lock);
 363
 364         /* Did we race creating this group? */
 365         list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
 366                 if (tmp->iommu_group == iommu_group) {
 367                         vfio_group_get(tmp);
 368                         vfio_group_unlock_and_free(group);
 369                         return tmp;
 370                 }
 371         }
 372
 373         minor = vfio_alloc_group_minor(group);
 374         if (minor < 0) {
 375                 vfio_group_unlock_and_free(group);
 376                 return ERR_PTR(minor);
 377         }
 378
 379         dev = device_create(vfio.class, NULL,
 380                             MKDEV(MAJOR(vfio.group_devt), minor),
 381                             group, "%s%d", group->noiommu ? "noiommu-" : "",
 382                             iommu_group_id(iommu_group));
 383         if (IS_ERR(dev)) {
 384                 vfio_free_group_minor(minor);
 385                 vfio_group_unlock_and_free(group);
 386                 return ERR_CAST(dev);
 387         }
 388
 389         group->minor = minor;
 390         group->dev = dev;
 391
 392         list_add(&group->vfio_next, &vfio.group_list);
 393
 394         mutex_unlock(&vfio.group_lock);
 395
 396         return group;
 397 }
 398
 399 /* called with vfio.group_lock held */
 400 static void vfio_group_release(struct kref *kref)
 401 {
 402         struct vfio_group *group = container_of(kref, struct vfio_group, kref);
 403         struct vfio_unbound_dev *unbound, *tmp;
 404         struct iommu_group *iommu_group = group->iommu_group;
 405
 406         WARN_ON(!list_empty(&group->device_list));
 407         WARN_ON(group->notifier.head);
 408
 409         list_for_each_entry_safe(unbound, tmp,
 410                                  &group->unbound_list, unbound_next) {
 411                 list_del(&unbound->unbound_next);
 412                 kfree(unbound);
 413         }
 414
 415         device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 416         list_del(&group->vfio_next);
 417         vfio_free_group_minor(group->minor);
 418         vfio_group_unlock_and_free(group);
 419         iommu_group_put(iommu_group);
 420 }
 421
 422 static void vfio_group_put(struct vfio_group *group)
 423 {
 424         kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
 425 }
 426
 427 struct vfio_group_put_work {
 428         struct work_struct work;
 429         struct vfio_group *group;
 430 };
 431
 432 static void vfio_group_put_bg(struct work_struct *work)
 433 {
 434         struct vfio_group_put_work *do_work;
 435
 436         do_work = container_of(work, struct vfio_group_put_work, work);
 437
 438         vfio_group_put(do_work->group);
 439         kfree(do_work);
 440 }
 441
 442 static void vfio_group_schedule_put(struct vfio_group *group)
 443 {
 444         struct vfio_group_put_work *do_work;
 445
 446         do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
 447         if (WARN_ON(!do_work))
 448                 return;
 449
 450         INIT_WORK(&do_work->work, vfio_group_put_bg);
 451         do_work->group = group;
 452         schedule_work(&do_work->work);
 453 }
 454
 455 /* Assume group_lock or group reference is held */
 456 static void vfio_group_get(struct vfio_group *group)
 457 {
 458         kref_get(&group->kref);
 459 }
 460
 461 /*
 462  * Not really a try as we will sleep for mutex, but we need to make
 463  * sure the group pointer is valid under lock and get a reference.
 464  */
 465 static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
 466 {
 467         struct vfio_group *target = group;
 468
 469         mutex_lock(&vfio.group_lock);
 470         list_for_each_entry(group, &vfio.group_list, vfio_next) {
 471                 if (group == target) {
 472                         vfio_group_get(group);
 473                         mutex_unlock(&vfio.group_lock);
 474                         return group;
 475                 }
 476         }
 477         mutex_unlock(&vfio.group_lock);
 478
 479         return NULL;
 480 }
 481
 482 static
 483 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
 484 {
 485         struct vfio_group *group;
 486
 487         mutex_lock(&vfio.group_lock);
 488         list_for_each_entry(group, &vfio.group_list, vfio_next) {
 489                 if (group->iommu_group == iommu_group) {
 490                         vfio_group_get(group);
 491                         mutex_unlock(&vfio.group_lock);
 492                         return group;
 493                 }
 494         }
 495         mutex_unlock(&vfio.group_lock);
 496
 497         return NULL;
 498 }
 499
 500 static struct vfio_group *vfio_group_get_from_minor(int minor)
 501 {
 502         struct vfio_group *group;
 503
 504         mutex_lock(&vfio.group_lock);
 505         group = idr_find(&vfio.group_idr, minor);
 506         if (!group) {
 507                 mutex_unlock(&vfio.group_lock);
 508                 return NULL;
 509         }
 510         vfio_group_get(group);
 511         mutex_unlock(&vfio.group_lock);
 512
 513         return group;
 514 }
 515
 516 static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
 517 {
 518         struct iommu_group *iommu_group;
 519         struct vfio_group *group;
 520
 521         iommu_group = iommu_group_get(dev);
 522         if (!iommu_group)
 523                 return NULL;
 524
 525         group = vfio_group_get_from_iommu(iommu_group);
 526         iommu_group_put(iommu_group);
 527
 528         return group;
 529 }
 530
/*
 * Device objects - create, release, get, put, search
 */
 534 static
 535 struct vfio_device *vfio_group_create_device(struct vfio_group *group,
 536                                              struct device *dev,
 537                                              const struct vfio_device_ops *ops,
 538                                              void *device_data)
 539 {
 540         struct vfio_device *device;
 541
 542         device = kzalloc(sizeof(*device), GFP_KERNEL);
 543         if (!device)
 544                 return ERR_PTR(-ENOMEM);
 545
 546         kref_init(&device->kref);
 547         device->dev = dev;
 548         device->group = group;
 549         device->ops = ops;
 550         device->device_data = device_data;
 551         dev_set_drvdata(dev, device);
 552
 553         /* No need to get group_lock, caller has group reference */
 554         vfio_group_get(group);
 555
 556         mutex_lock(&group->device_lock);
 557         list_add(&device->group_next, &group->device_list);
 558         mutex_unlock(&group->device_lock);
 559
 560         return device;
 561 }
 562
 563 static void vfio_device_release(struct kref *kref)
 564 {
 565         struct vfio_device *device = container_of(kref,
 566                                                   struct vfio_device, kref);
 567         struct vfio_group *group = device->group;
 568
 569         list_del(&device->group_next);
 570         mutex_unlock(&group->device_lock);
 571
 572         dev_set_drvdata(device->dev, NULL);
 573
 574         kfree(device);
 575
 576         /* vfio_del_group_dev may be waiting for this device */
 577         wake_up(&vfio.release_q);
 578 }
 579
 580 /* Device reference always implies a group reference */
 581 void vfio_device_put(struct vfio_device *device)
 582 {
 583         struct vfio_group *group = device->group;
 584         kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
 585         vfio_group_put(group);
 586 }
 587 EXPORT_SYMBOL_GPL(vfio_device_put);
 588
 589 static void vfio_device_get(struct vfio_device *device)
 590 {
 591         vfio_group_get(device->group);
 592         kref_get(&device->kref);
 593 }
 594
 595 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
 596                                                  struct device *dev)
 597 {
 598         struct vfio_device *device;
 599
 600         mutex_lock(&group->device_lock);
 601         list_for_each_entry(device, &group->device_list, group_next) {
 602                 if (device->dev == dev) {
 603                         vfio_device_get(device);
 604                         mutex_unlock(&group->device_lock);
 605                         return device;
 606                 }
 607         }
 608         mutex_unlock(&group->device_lock);
 609         return NULL;
 610 }
 611
 612 /*
 613  * Some drivers, like pci-stub, are only used to prevent other drivers from
 614  * claiming a device and are therefore perfectly legitimate for a user owned
 615  * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 616  * of the device, but it does prevent the user from having direct access to
 617  * the device, which is useful in some circumstances.
 618  *
 619  * We also assume that we can include PCI interconnect devices, ie. bridges.
 620  * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 621  * then all of the downstream devices will be part of the same IOMMU group as
 622  * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 623  * breaks anything, it only does so for user owned devices downstream.  Note
 624  * that error notification via MSI can be affected for platforms that handle
 625  * MSI within the same IOVA space as DMA.
 626  */
 627 static const char * const vfio_driver_whitelist[] = { "pci-stub" };
 628
 629 static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
 630 {
 631         if (dev_is_pci(dev)) {
 632                 struct pci_dev *pdev = to_pci_dev(dev);
 633
 634                 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 635                         return true;
 636         }
 637
 638         return match_string(vfio_driver_whitelist,
 639                             ARRAY_SIZE(vfio_driver_whitelist),
 640                             drv->name) >= 0;
 641 }
 642
 643 /*
 644  * A vfio group is viable for use by userspace if all devices are in
 645  * one of the following states:
 646  *  - driver-less
 647  *  - bound to a vfio driver
 648  *  - bound to a whitelisted driver
 649  *  - a PCI interconnect device
 650  *
 651  * We use two methods to determine whether a device is bound to a vfio
 652  * driver.  The first is to test whether the device exists in the vfio
 653  * group.  The second is to test if the device exists on the group
 654  * unbound_list, indicating it's in the middle of transitioning from
 655  * a vfio driver to driver-less.
 656  */
 657 static int vfio_dev_viable(struct device *dev, void *data)
 658 {
 659         struct vfio_group *group = data;
 660         struct vfio_device *device;
 661         struct device_driver *drv = READ_ONCE(dev->driver);
 662         struct vfio_unbound_dev *unbound;
 663         int ret = -EINVAL;
 664
 665         mutex_lock(&group->unbound_lock);
 666         list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
 667                 if (dev == unbound->dev) {
 668                         ret = 0;
 669                         break;
 670                 }
 671         }
 672         mutex_unlock(&group->unbound_lock);
 673
 674         if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
 675                 return 0;
 676
 677         device = vfio_group_get_device(group, dev);
 678         if (device) {
 679                 vfio_device_put(device);
 680                 return 0;
 681         }
 682
 683         return ret;
 684 }
 685
/*
 * Async device support
 */
 689 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
 690 {
 691         struct vfio_device *device;
 692
 693         /* Do we already know about it?  We shouldn't */
 694         device = vfio_group_get_device(group, dev);
 695         if (WARN_ON_ONCE(device)) {
 696                 vfio_device_put(device);
 697                 return 0;
 698         }
 699
 700         /* Nothing to do for idle groups */
 701         if (!atomic_read(&group->container_users))
 702                 return 0;
 703
 704         /* TODO Prevent device auto probing */
 705         dev_WARN(dev, "Device added to live group %d!\n",
 706                  iommu_group_id(group->iommu_group));
 707
 708         return 0;
 709 }
 710
 711 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
 712 {
 713         /* We don't care what happens when the group isn't in use */
 714         if (!atomic_read(&group->container_users))
 715                 return 0;
 716
 717         return vfio_dev_viable(dev, group);
 718 }
 719
 720 static int vfio_iommu_group_notifier(struct notifier_block *nb,
 721                                      unsigned long action, void *data)
 722 {
 723         struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 724         struct device *dev = data;
 725         struct vfio_unbound_dev *unbound;
 726
 727         /*
 728          * Need to go through a group_lock lookup to get a reference or we
 729          * risk racing a group being removed.  Ignore spurious notifies.
 730          */
 731         group = vfio_group_try_get(group);
 732         if (!group)
 733                 return NOTIFY_OK;
 734
 735         switch (action) {
 736         case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
 737                 vfio_group_nb_add_dev(group, dev);
 738                 break;
 739         case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
 740                 /*
 741                  * Nothing to do here.  If the device is in use, then the
 742                  * vfio sub-driver should block the remove callback until
 743                  * it is unused.  If the device is unused or attached to a
 744                  * stub driver, then it should be released and we don't
 745                  * care that it will be going away.
 746                  */
 747                 break;
 748         case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
 749                 dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
 750                         iommu_group_id(group->iommu_group));
 751                 break;
 752         case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
 753                 dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
 754                         iommu_group_id(group->iommu_group), dev->driver->name);
 755                 BUG_ON(vfio_group_nb_verify(group, dev));
 756                 break;
 757         case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
 758                 dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
 759                         __func__, iommu_group_id(group->iommu_group),
 760                         dev->driver->name);
 761                 break;
 762         case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
 763                 dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
 764                         iommu_group_id(group->iommu_group));
 765                 /*
 766                  * XXX An unbound device in a live group is ok, but we'd
 767                  * really like to avoid the above BUG_ON by preventing other
 768                  * drivers from binding to it.  Once that occurs, we have to
 769                  * stop the system to maintain isolation.  At a minimum, we'd
 770                  * want a toggle to disable driver auto probe for this device.
 771                  */
 772
 773                 mutex_lock(&group->unbound_lock);
 774                 list_for_each_entry(unbound,
 775                                     &group->unbound_list, unbound_next) {
 776                         if (dev == unbound->dev) {
 777                                 list_del(&unbound->unbound_next);
 778                                 kfree(unbound);
 779                                 break;
 780                         }
 781                 }
 782                 mutex_unlock(&group->unbound_lock);
 783                 break;
 784         }
 785
 786         /*
 787          * If we're the last reference to the group, the group will be
 788          * released, which includes unregistering the iommu group notifier.
 789          * We hold a read-lock on that notifier list, unregistering needs
 790          * a write-lock... deadlock.  Release our reference asynchronously
 791          * to avoid that situation.
 792          */
 793         vfio_group_schedule_put(group);
 794         return NOTIFY_OK;
 795 }
 796
/*
 * VFIO driver API
 */
 800 int vfio_add_group_dev(struct device *dev,
 801                        const struct vfio_device_ops *ops, void *device_data)
 802 {
 803         struct iommu_group *iommu_group;
 804         struct vfio_group *group;
 805         struct vfio_device *device;
 806
 807         iommu_group = iommu_group_get(dev);
 808         if (!iommu_group)
 809                 return -EINVAL;
 810
 811         group = vfio_group_get_from_iommu(iommu_group);
 812         if (!group) {
 813                 group = vfio_create_group(iommu_group);
 814                 if (IS_ERR(group)) {
 815                         iommu_group_put(iommu_group);
 816                         return PTR_ERR(group);
 817                 }
 818         } else {
 819                 /*
 820                  * A found vfio_group already holds a reference to the
 821                  * iommu_group.  A created vfio_group keeps the reference.
 822                  */
 823                 iommu_group_put(iommu_group);
 824         }
 825
 826         device = vfio_group_get_device(group, dev);
 827         if (device) {
 828                 dev_WARN(dev, "Device already exists on group %d\n",
 829                          iommu_group_id(iommu_group));
 830                 vfio_device_put(device);
 831                 vfio_group_put(group);
 832                 return -EBUSY;
 833         }
 834
 835         device = vfio_group_create_device(group, dev, ops, device_data);
 836         if (IS_ERR(device)) {
 837                 vfio_group_put(group);
 838                 return PTR_ERR(device);
 839         }
 840
 841         /*
 842          * Drop all but the vfio_device reference.  The vfio_device holds
 843          * a reference to the vfio_group, which holds a reference to the
 844          * iommu_group.
 845          */
 846         vfio_group_put(group);
 847
 848         return 0;
 849 }
 850 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
 851
 852 /**
 853  * Get a reference to the vfio_device for a device.  Even if the
 854  * caller thinks they own the device, they could be racing with a
 855  * release call path, so we can't trust drvdata for the shortcut.
 856  * Go the long way around, from the iommu_group to the vfio_group
 857  * to the vfio_device.
 858  */
 859 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 860 {
 861         struct vfio_group *group;
 862         struct vfio_device *device;
 863
 864         group = vfio_group_get_from_dev(dev);
 865         if (!group)
 866                 return NULL;
 867
 868         device = vfio_group_get_device(group, dev);
 869         vfio_group_put(group);
 870
 871         return device;
 872 }
 873 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
 874
 875 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 876                                                      char *buf)
 877 {
 878         struct vfio_device *it, *device = NULL;
 879
 880         mutex_lock(&group->device_lock);
 881         list_for_each_entry(it, &group->device_list, group_next) {
 882                 if (!strcmp(dev_name(it->dev), buf)) {
 883                         device = it;
 884                         vfio_device_get(device);
 885                         break;
 886                 }
 887         }
 888         mutex_unlock(&group->device_lock);
 889
 890         return device;
 891 }
 892
/*
 * Return the device_data pointer stored in @device.
 *
 * Caller must hold a reference to the vfio_device.
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);
 901
/*
 * Remove the vfio_device bound to @dev, blocking until it is gone.
 *
 * The vfio_device is taken from @dev's drvdata and one reference to it is
 * dropped.  For as long as vfio_group_get_device() still finds the device
 * afterwards, the ->request() callback (when one is provided) is invoked
 * with an incrementing counter and the task sleeps on vfio.release_q for up
 * to HZ * 10 per iteration.  The sleep is interruptible until a signal is
 * seen pending; from then on a warning is logged once and every further
 * sleep is uninterruptible.
 *
 * If the group's device list is empty once the device is gone, also wait
 * for the group to lose its container before returning.
 *
 * Returns the device_data pointer that was stored in the vfio_device.
 */
void *vfio_del_group_dev(struct device *dev)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	/* Cached up front: device may not be dereferenced after the put. */
	void *device_data = device->device_data;
	struct vfio_unbound_dev *unbound;
	unsigned int i = 0;
	bool interrupted = false;

	/*
	 * The group exists so long as we have a device reference.  Get
	 * a group reference and use it to scan for the device going away.
	 */
	vfio_group_get(group);

	/*
	 * When the device is removed from the group, the group suddenly
	 * becomes non-viable; the device has a driver (until the unbind
	 * completes), but it's not present in the group.  This is bad news
	 * for any external users that need to re-acquire a group reference
	 * in order to match and release their existing reference.  To
	 * solve this, we track such devices on the unbound_list to bridge
	 * the gap until they're fully unbound.
	 */
	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
	if (unbound) {
		unbound->dev = dev;
		mutex_lock(&group->unbound_lock);
		list_add(&unbound->unbound_next, &group->unbound_list);
		mutex_unlock(&group->unbound_lock);
	}
	/* Allocation failure is only warned about; removal carries on. */
	WARN_ON(!unbound);

	vfio_device_put(device);

	/*
	 * If the device is still present in the group after the above
	 * 'put', then it is in use and we need to request it from the
	 * bus driver.  The driver may in turn need to request the
	 * device from the user.  We send the request on an arbitrary
	 * interval with counter to allow the driver to take escalating
	 * measures to release the device if it has the ability to do so.
	 */
	add_wait_queue(&vfio.release_q, &wait);

	do {
		/* Lookup failing means the device has left the group. */
		device = vfio_group_get_device(group, dev);
		if (!device)
			break;

		if (device->ops->request)
			device->ops->request(device_data, i++);

		/* Drop the reference taken by the lookup above. */
		vfio_device_put(device);

		if (interrupted) {
			wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
		} else {
			wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10);
			if (signal_pending(current)) {
				/* Warn once, then stop being interruptible. */
				interrupted = true;
				dev_warn(dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}

	} while (1);

	remove_wait_queue(&vfio.release_q, &wait);
	/*
	 * In order to support multiple devices per group, devices can be
	 * plucked from the group while other devices in the group are still
	 * in use.  The container persists with this group and those remaining
	 * devices still attached.  If the user creates an isolation violation
	 * by binding this device to another driver while the group is still in
	 * use, that's their fault.  However, in the case of removing the last,
	 * or potentially the only, device in the group there can be no other
	 * in-use devices in the group.  The user has done their due diligence
	 * and we should lay no claims to those devices.  In order to do that,
	 * we need to make sure the group is detached from the container.
	 * Without this stall, we're potentially racing with a user process
	 * that may attempt to immediately bind this device to another driver.
	 */
	if (list_empty(&group->device_list))
		wait_event(group->container_q, !group->container);

	vfio_group_put(group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);
1000
/*
 * VFIO base fd, /dev/vfio/vfio
 */
1004 static long vfio_ioctl_check_extension(struct vfio_container *container,
1005                                        unsigned long arg)
1006 {
1007         struct vfio_iommu_driver *driver;
1008         long ret = 0;
1009
1010         down_read(&container->group_lock);
1011
1012         driver = container->iommu_driver;
1013
1014         switch (arg) {
1015                 /* No base extensions yet */
1016         default:
1017                 /*
1018                  * If no driver is set, poll all registered drivers for
1019                  * extensions and return the first positive result.  If
1020                  * a driver is already set, further queries will be passed
1021                  * only to that driver.
1022                  */
1023                 if (!driver) {
1024                         mutex_lock(&vfio.iommu_drivers_lock);
1025                         list_for_each_entry(driver, &vfio.iommu_drivers_list,
1026                                             vfio_next) {
1027
1028 #ifdef CONFIG_VFIO_NOIOMMU
1029                                 if (!list_empty(&container->group_list) &&
1030                                     (container->noiommu !=
1031                                      (driver->ops == &vfio_noiommu_ops)))
1032                                         continue;
1033 #endif
1034
1035                                 if (!try_module_get(driver->ops->owner))
1036                                         continue;
1037
1038                                 ret = driver->ops->ioctl(NULL,
1039                                                          VFIO_CHECK_EXTENSION,
1040                                                          arg);
1041                                 module_put(driver->ops->owner);
1042                                 if (ret > 0)
1043                                         break;
1044                         }
1045                         mutex_unlock(&vfio.iommu_drivers_lock);
1046                 } else
1047                         ret = driver->ops->ioctl(container->iommu_data,
1048                                                  VFIO_CHECK_EXTENSION, arg);
1049         }
1050
1051         up_read(&container->group_lock);
1052
1053         return ret;
1054 }
1055
1056 /* hold write lock on container->group_lock */
1057 static int __vfio_container_attach_groups(struct vfio_container *container,
1058                                           struct vfio_iommu_driver *driver,
1059                                           void *data)
1060 {
1061         struct vfio_group *group;
1062         int ret = -ENODEV;
1063
1064         list_for_each_entry(group, &container->group_list, container_next) {
1065                 ret = driver->ops->attach_group(data, group->iommu_group);
1066                 if (ret)
1067                         goto unwind;
1068         }
1069
1070         return ret;
1071
1072 unwind:
1073         list_for_each_entry_continue_reverse(group, &container->group_list,
1074                                              container_next) {
1075                 driver->ops->detach_group(data, group->iommu_group);
1076         }
1077
1078         return ret;
1079 }
1080
/*
 * VFIO_SET_IOMMU handler: select and bind an iommu driver for @container.
 *
 * Fails with -EINVAL if the container has no groups or already has an
 * iommu driver.  Otherwise vfio.iommu_drivers_list is walked and the first
 * driver that reports support for @arg through VFIO_CHECK_EXTENSION, opens
 * successfully and accepts all of the container's groups is stored in
 * container->iommu_driver, with its instance in container->iommu_data.
 * The module reference taken for that driver is kept; it is dropped again
 * for every driver that is passed over.
 *
 * Returns 0 on success.  On failure returns -ENODEV if no driver got as
 * far as open(), otherwise the error of the most recent failed open() or
 * group attach.
 *
 * Holds container->group_lock for write and vfio.iommu_drivers_lock while
 * searching.
 */
static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

#ifdef CONFIG_VFIO_NOIOMMU
		/*
		 * Only noiommu containers can use vfio-noiommu and noiommu
		 * containers can only use vfio-noiommu.
		 */
		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
			continue;
#endif

		/* Hold the module while calling into the driver. */
		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			/* Remember the error, but keep trying other drivers. */
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			/* Undo open() and move on to the next driver. */
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		/* Bound: the module reference is kept with the container. */
		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}
1154
1155 static long vfio_fops_unl_ioctl(struct file *filep,
1156                                 unsigned int cmd, unsigned long arg)
1157 {
1158         struct vfio_container *container = filep->private_data;
1159         struct vfio_iommu_driver *driver;
1160         void *data;
1161         long ret = -EINVAL;
1162
1163         if (!container)
1164                 return ret;
1165
1166         switch (cmd) {
1167         case VFIO_GET_API_VERSION:
1168                 ret = VFIO_API_VERSION;
1169                 break;
1170         case VFIO_CHECK_EXTENSION:
1171                 ret = vfio_ioctl_check_extension(container, arg);
1172                 break;
1173         case VFIO_SET_IOMMU:
1174                 ret = vfio_ioctl_set_iommu(container, arg);
1175                 break;
1176         default:
1177                 driver = container->iommu_driver;
1178                 data = container->iommu_data;
1179
1180                 if (driver) /* passthrough all unrecognized ioctls */
1181                         ret = driver->ops->ioctl(data, cmd, arg);
1182         }
1183
1184         return ret;
1185 }
1186
1187 static int vfio_fops_open(struct inode *inode, struct file *filep)
1188 {
1189         struct vfio_container *container;
1190
1191         container = kzalloc(sizeof(*container), GFP_KERNEL);
1192         if (!container)
1193                 return -ENOMEM;
1194
1195         INIT_LIST_HEAD(&container->group_list);
1196         init_rwsem(&container->group_lock);
1197         kref_init(&container->kref);
1198
1199         filep->private_data = container;
1200
1201         return 0;
1202 }
1203
1204 static int vfio_fops_release(struct inode *inode, struct file *filep)
1205 {
1206         struct vfio_container *container = filep->private_data;
1207
1208         filep->private_data = NULL;
1209
1210         vfio_container_put(container);
1211
1212         return 0;
1213 }
1214
1215 /*
1216  * Once an iommu driver is set, we optionally pass read/write/mmap
1217  * on to the driver, allowing management interfaces beyond ioctl.
1218  */
1219 static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1220                               size_t count, loff_t *ppos)
1221 {
1222         struct vfio_container *container = filep->private_data;
1223         struct vfio_iommu_driver *driver;
1224         ssize_t ret = -EINVAL;
1225
1226         driver = container->iommu_driver;
1227         if (likely(driver && driver->ops->read))
1228                 ret = driver->ops->read(container->iommu_data,
1229                                         buf, count, ppos);
1230
1231         return ret;
1232 }
1233
1234 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1235                                size_t count, loff_t *ppos)
1236 {
1237         struct vfio_container *container = filep->private_data;
1238         struct vfio_iommu_driver *driver;
1239         ssize_t ret = -EINVAL;
1240
1241         driver = container->iommu_driver;
1242         if (likely(driver && driver->ops->write))
1243                 ret = driver->ops->write(container->iommu_data,
1244                                          buf, count, ppos);
1245
1246         return ret;
1247 }
1248
1249 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1250 {
1251         struct vfio_container *container = filep->private_data;
1252         struct vfio_iommu_driver *driver;
1253         int ret = -EINVAL;
1254
1255         driver = container->iommu_driver;
1256         if (likely(driver && driver->ops->mmap))
1257                 ret = driver->ops->mmap(container->iommu_data, vma);
1258
1259         return ret;
1260 }
1261
/*
 * File operations of the container file.  vfio_group_set_container()
 * recognises a container fd by comparing its f_op against this table.
 */
static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_fops_mmap,
};
1272
/*
 * VFIO Group fd, /dev/vfio/$GROUP
 */
/*
 * Detach @group from the container it is currently set to.
 *
 * With the container's group_lock held for write: detach the group from
 * the container's iommu driver if one is set, clear group->container, wake
 * waiters on group->container_q and unlink the group from the container's
 * group list.  If that leaves the container without groups, the iommu
 * driver instance is released, its module reference is dropped and the
 * container's driver/data pointers are cleared.  After unlocking,
 * vfio_container_put() is called on the container.
 *
 * group->container is dereferenced unconditionally, so it must be set.
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	/* Clear the pointer before waking those waiting for it to go NULL. */
	group->container = NULL;
	wake_up(&group->container_q);
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}
1304
1305 /*
1306  * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1307  * if there was no container to unset.  Since the ioctl is called on
1308  * the group, we know that still exists, therefore the only valid
1309  * transition here is 1->0.
1310  */
1311 static int vfio_group_unset_container(struct vfio_group *group)
1312 {
1313         int users = atomic_cmpxchg(&group->container_users, 1, 0);
1314
1315         if (!users)
1316                 return -EINVAL;
1317         if (users != 1)
1318                 return -EBUSY;
1319
1320         __vfio_group_unset_container(group);
1321
1322         return 0;
1323 }
1324
1325 /*
1326  * When removing container users, anything that removes the last user
1327  * implicitly removes the group from the container.  That is, if the
1328  * group file descriptor is closed, as well as any device file descriptors,
1329  * the group is free.
1330  */
1331 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1332 {
1333         if (0 == atomic_dec_if_positive(&group->container_users))
1334                 __vfio_group_unset_container(group);
1335 }
1336
1337 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1338 {
1339         struct fd f;
1340         struct vfio_container *container;
1341         struct vfio_iommu_driver *driver;
1342         int ret = 0;
1343
1344         if (atomic_read(&group->container_users))
1345                 return -EINVAL;
1346
1347         if (group->noiommu && !capable(CAP_SYS_RAWIO))
1348                 return -EPERM;
1349
1350         f = fdget(container_fd);
1351         if (!f.file)
1352                 return -EBADF;
1353
1354         /* Sanity check, is this really our fd? */
1355         if (f.file->f_op != &vfio_fops) {
1356                 fdput(f);
1357                 return -EINVAL;
1358         }
1359
1360         container = f.file->private_data;
1361         WARN_ON(!container); /* fget ensures we don't race vfio_release */
1362
1363         down_write(&container->group_lock);
1364
1365         /* Real groups and fake groups cannot mix */
1366         if (!list_empty(&container->group_list) &&
1367             container->noiommu != group->noiommu) {
1368                 ret = -EPERM;
1369                 goto unlock_out;
1370         }
1371
1372         driver = container->iommu_driver;
1373         if (driver) {
1374                 ret = driver->ops->attach_group(container->iommu_data,
1375                                                 group->iommu_group);
1376                 if (ret)
1377                         goto unlock_out;
1378         }
1379
1380         group->container = container;
1381         container->noiommu = group->noiommu;
1382         list_add(&group->container_next, &container->group_list);
1383
1384         /* Get a reference on the container and mark a user within the group */
1385         vfio_container_get(container);
1386         atomic_inc(&group->container_users);
1387
1388 unlock_out:
1389         up_write(&container->group_lock);
1390         fdput(f);
1391         return ret;
1392 }
1393
1394 static bool vfio_group_viable(struct vfio_group *group)
1395 {
1396         return (iommu_group_for_each_dev(group->iommu_group,
1397                                          group, vfio_dev_viable) == 0);
1398 }
1399
1400 static int vfio_group_add_container_user(struct vfio_group *group)
1401 {
1402         if (!atomic_inc_not_zero(&group->container_users))
1403                 return -EINVAL;
1404
1405         if (group->noiommu) {
1406                 atomic_dec(&group->container_users);
1407                 return -EPERM;
1408         }
1409         if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1410                 atomic_dec(&group->container_users);
1411                 return -EINVAL;
1412         }
1413
1414         return 0;
1415 }
1416
1417 static const struct file_operations vfio_device_fops;
1418
1419 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1420 {
1421         struct vfio_device *device;
1422         struct file *filep;
1423         int ret;
1424
1425         if (0 == atomic_read(&group->container_users) ||
1426             !group->container->iommu_driver || !vfio_group_viable(group))
1427                 return -EINVAL;
1428
1429         if (group->noiommu && !capable(CAP_SYS_RAWIO))
1430                 return -EPERM;
1431
1432         device = vfio_device_get_from_name(group, buf);
1433         if (!device)
1434                 return -ENODEV;
1435
1436         ret = device->ops->open(device->device_data);
1437         if (ret) {
1438                 vfio_device_put(device);
1439                 return ret;
1440         }
1441
1442         /*
1443          * We can't use anon_inode_getfd() because we need to modify
1444          * the f_mode flags directly to allow more than just ioctls
1445          */
1446         ret = get_unused_fd_flags(O_CLOEXEC);
1447         if (ret < 0) {
1448                 device->ops->release(device->device_data);
1449                 vfio_device_put(device);
1450                 return ret;
1451         }
1452
1453         filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1454                                    device, O_RDWR);
1455         if (IS_ERR(filep)) {
1456                 put_unused_fd(ret);
1457                 ret = PTR_ERR(filep);
1458                 device->ops->release(device->device_data);
1459                 vfio_device_put(device);
1460                 return ret;
1461         }
1462
1463         /*
1464          * TODO: add an anon_inode interface to do this.
1465          * Appears to be missing by lack of need rather than
1466          * explicitly prevented.  Now there's need.
1467          */
1468         filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1469
1470         atomic_inc(&group->container_users);
1471
1472         fd_install(ret, filep);
1473
1474         if (group->noiommu)
1475                 dev_warn(device->dev, "vfio-noiommu device opened by user "
1476                          "(%s:%d)\n", current->comm, task_pid_nr(current));
1477
1478         return ret;
1479 }
1480
1481 static long vfio_group_fops_unl_ioctl(struct file *filep,
1482                                       unsigned int cmd, unsigned long arg)
1483 {
1484         struct vfio_group *group = filep->private_data;
1485         long ret = -ENOTTY;
1486
1487         switch (cmd) {
1488         case VFIO_GROUP_GET_STATUS:
1489         {
1490                 struct vfio_group_status status;
1491                 unsigned long minsz;
1492
1493                 minsz = offsetofend(struct vfio_group_status, flags);
1494
1495                 if (copy_from_user(&status, (void __user *)arg, minsz))
1496                         return -EFAULT;
1497
1498                 if (status.argsz < minsz)
1499                         return -EINVAL;
1500
1501                 status.flags = 0;
1502
1503                 if (vfio_group_viable(group))
1504                         status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1505
1506                 if (group->container)
1507                         status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1508
1509                 if (copy_to_user((void __user *)arg, &status, minsz))
1510                         return -EFAULT;
1511
1512                 ret = 0;
1513                 break;
1514         }
1515         case VFIO_GROUP_SET_CONTAINER:
1516         {
1517                 int fd;
1518
1519                 if (get_user(fd, (int __user *)arg))
1520                         return -EFAULT;
1521
1522                 if (fd < 0)
1523                         return -EINVAL;
1524
1525                 ret = vfio_group_set_container(group, fd);
1526                 break;
1527         }
1528         case VFIO_GROUP_UNSET_CONTAINER:
1529                 ret = vfio_group_unset_container(group);
1530                 break;
1531         case VFIO_GROUP_GET_DEVICE_FD:
1532         {
1533                 char *buf;
1534
1535                 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1536                 if (IS_ERR(buf))
1537                         return PTR_ERR(buf);
1538
1539                 ret = vfio_group_get_device_fd(group, buf);
1540                 kfree(buf);
1541                 break;
1542         }
1543         }
1544
1545         return ret;
1546 }
1547
1548 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1549 {
1550         struct vfio_group *group;
1551         int opened;
1552
1553         group = vfio_group_get_from_minor(iminor(inode));
1554         if (!group)
1555                 return -ENODEV;
1556
1557         if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1558                 vfio_group_put(group);
1559                 return -EPERM;
1560         }
1561
1562         /* Do we need multiple instances of the group open?  Seems not. */
1563         opened = atomic_cmpxchg(&group->opened, 0, 1);
1564         if (opened) {
1565                 vfio_group_put(group);
1566                 return -EBUSY;
1567         }
1568
1569         /* Is something still in use from a previous open? */
1570         if (group->container) {
1571                 atomic_dec(&group->opened);
1572                 vfio_group_put(group);
1573                 return -EBUSY;
1574         }
1575
1576         /* Warn if previous user didn't cleanup and re-init to drop them */
1577         if (WARN_ON(group->notifier.head))
1578                 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1579
1580         filep->private_data = group;
1581
1582         return 0;
1583 }
1584
1585 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1586 {
1587         struct vfio_group *group = filep->private_data;
1588
1589         filep->private_data = NULL;
1590
1591         vfio_group_try_dissolve_container(group);
1592
1593         atomic_dec(&group->opened);
1594
1595         vfio_group_put(group);
1596
1597         return 0;
1598 }
1599
/*
 * File operations of the group file.  vfio_group_get_external_user() and
 * vfio_external_group_match_file() recognise a group file by comparing its
 * f_op against this table.
 */
static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};
1607
/*
 * VFIO Device fd
 */
1611 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1612 {
1613         struct vfio_device *device = filep->private_data;
1614
1615         device->ops->release(device->device_data);
1616
1617         vfio_group_try_dissolve_container(device->group);
1618
1619         vfio_device_put(device);
1620
1621         return 0;
1622 }
1623
1624 static long vfio_device_fops_unl_ioctl(struct file *filep,
1625                                        unsigned int cmd, unsigned long arg)
1626 {
1627         struct vfio_device *device = filep->private_data;
1628
1629         if (unlikely(!device->ops->ioctl))
1630                 return -EINVAL;
1631
1632         return device->ops->ioctl(device->device_data, cmd, arg);
1633 }
1634
1635 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1636                                      size_t count, loff_t *ppos)
1637 {
1638         struct vfio_device *device = filep->private_data;
1639
1640         if (unlikely(!device->ops->read))
1641                 return -EINVAL;
1642
1643         return device->ops->read(device->device_data, buf, count, ppos);
1644 }
1645
1646 static ssize_t vfio_device_fops_write(struct file *filep,
1647                                       const char __user *buf,
1648                                       size_t count, loff_t *ppos)
1649 {
1650         struct vfio_device *device = filep->private_data;
1651
1652         if (unlikely(!device->ops->write))
1653                 return -EINVAL;
1654
1655         return device->ops->write(device->device_data, buf, count, ppos);
1656 }
1657
1658 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1659 {
1660         struct vfio_device *device = filep->private_data;
1661
1662         if (unlikely(!device->ops->mmap))
1663                 return -EINVAL;
1664
1665         return device->ops->mmap(device->device_data, vma);
1666 }
1667
/*
 * File operations of a device file.  There is no .open: device files are
 * created by vfio_group_get_device_fd() through anon_inode_getfile() with
 * the vfio_device as private data.
 */
static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
};
1677
/*
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. Do the normal VFIO init operation:
 *      - open a new container;
 *      - attach group(s) to it;
 *      - set an IOMMU driver for the container.
 * Once an IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * to verify that:
 *      - the group is initialized;
 *      - an IOMMU is set for it.
 * If both checks pass, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from disposal before the external user
 * (e.g. KVM) exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to obtain the IOMMU group ID.
 *
 * 4. When the external user (e.g. KVM) finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */
1705 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1706 {
1707         struct vfio_group *group = filep->private_data;
1708         int ret;
1709
1710         if (filep->f_op != &vfio_group_fops)
1711                 return ERR_PTR(-EINVAL);
1712
1713         ret = vfio_group_add_container_user(group);
1714         if (ret)
1715                 return ERR_PTR(ret);
1716
1717         vfio_group_get(group);
1718
1719         return group;
1720 }
1721 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1722
void vfio_group_put_external_user(struct vfio_group *group)
{
        /*
         * Counterpart of vfio_group_get_external_user(): release the
         * container user first, then the group reference taken there.
         */
        vfio_group_try_dissolve_container(group);
        vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1729
1730 bool vfio_external_group_match_file(struct vfio_group *test_group,
1731                                     struct file *filep)
1732 {
1733         struct vfio_group *group = filep->private_data;
1734
1735         return (filep->f_op == &vfio_group_fops) && (group == test_group);
1736 }
1737 EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1738
1739 int vfio_external_user_iommu_id(struct vfio_group *group)
1740 {
1741         return iommu_group_id(group->iommu_group);
1742 }
1743 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1744
1745 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1746 {
1747         return vfio_ioctl_check_extension(group->container, arg);
1748 }
1749 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1750
1751 /**
1752  * Sub-module support
1753  */
1754 /*
1755  * Helper for managing a buffer of info chain capabilities, allocate or
1756  * reallocate a buffer with additional @size, filling in @id and @version
1757  * of the capability.  A pointer to the new capability is returned.
1758  *
1759  * NB. The chain is based at the head of the buffer, so new entries are
1760  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1761  * next offsets prior to copying to the user buffer.
1762  */
1763 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1764                                                size_t size, u16 id, u16 version)
1765 {
1766         void *buf;
1767         struct vfio_info_cap_header *header, *tmp;
1768
1769         buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1770         if (!buf) {
1771                 kfree(caps->buf);
1772                 caps->size = 0;
1773                 return ERR_PTR(-ENOMEM);
1774         }
1775
1776         caps->buf = buf;
1777         header = buf + caps->size;
1778
1779         /* Eventually copied to user buffer, zero */
1780         memset(header, 0, size);
1781
1782         header->id = id;
1783         header->version = version;
1784
1785         /* Add to the end of the capability chain */
1786         for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1787                 ; /* nothing */
1788
1789         tmp->next = caps->size;
1790         caps->size += size;
1791
1792         return header;
1793 }
1794 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1795
1796 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1797 {
1798         struct vfio_info_cap_header *tmp;
1799         void *buf = (void *)caps->buf;
1800
1801         for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1802                 tmp->next += offset;
1803 }
1804 EXPORT_SYMBOL(vfio_info_cap_shift);
1805
1806 int vfio_info_add_capability(struct vfio_info_cap *caps,
1807                              struct vfio_info_cap_header *cap, size_t size)
1808 {
1809         struct vfio_info_cap_header *header;
1810
1811         header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1812         if (IS_ERR(header))
1813                 return PTR_ERR(header);
1814
1815         memcpy(header + 1, cap + 1, size - sizeof(*header));
1816
1817         return 0;
1818 }
1819 EXPORT_SYMBOL(vfio_info_add_capability);
1820
1821 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1822                                        int max_irq_type, size_t *data_size)
1823 {
1824         unsigned long minsz;
1825         size_t size;
1826
1827         minsz = offsetofend(struct vfio_irq_set, count);
1828
1829         if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1830             (hdr->count >= (U32_MAX - hdr->start)) ||
1831             (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1832                                 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1833                 return -EINVAL;
1834
1835         if (data_size)
1836                 *data_size = 0;
1837
1838         if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1839                 return -EINVAL;
1840
1841         switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1842         case VFIO_IRQ_SET_DATA_NONE:
1843                 size = 0;
1844                 break;
1845         case VFIO_IRQ_SET_DATA_BOOL:
1846                 size = sizeof(uint8_t);
1847                 break;
1848         case VFIO_IRQ_SET_DATA_EVENTFD:
1849                 size = sizeof(int32_t);
1850                 break;
1851         default:
1852                 return -EINVAL;
1853         }
1854
1855         if (size) {
1856                 if (hdr->argsz - minsz < hdr->count * size)
1857                         return -EINVAL;
1858
1859                 if (!data_size)
1860                         return -EINVAL;
1861
1862                 *data_size = hdr->count * size;
1863         }
1864
1865         return 0;
1866 }
1867 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1868
1869 /*
1870  * Pin a set of guest PFNs and return their associated host PFNs for local
1871  * domain only.
1872  * @dev [in]     : device
1873  * @user_pfn [in]: array of user/guest PFNs to be pinned.
1874  * @npage [in]   : count of elements in user_pfn array.  This count should not
1875  *                 be greater VFIO_PIN_PAGES_MAX_ENTRIES.
1876  * @prot [in]    : protection flags
1877  * @phys_pfn[out]: array of host PFNs
1878  * Return error or number of pages pinned.
1879  */
1880 int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1881                    int prot, unsigned long *phys_pfn)
1882 {
1883         struct vfio_container *container;
1884         struct vfio_group *group;
1885         struct vfio_iommu_driver *driver;
1886         int ret;
1887
1888         if (!dev || !user_pfn || !phys_pfn || !npage)
1889                 return -EINVAL;
1890
1891         if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1892                 return -E2BIG;
1893
1894         group = vfio_group_get_from_dev(dev);
1895         if (!group)
1896                 return -ENODEV;
1897
1898         ret = vfio_group_add_container_user(group);
1899         if (ret)
1900                 goto err_pin_pages;
1901
1902         container = group->container;
1903         driver = container->iommu_driver;
1904         if (likely(driver && driver->ops->pin_pages))
1905                 ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
1906                                              npage, prot, phys_pfn);
1907         else
1908                 ret = -ENOTTY;
1909
1910         vfio_group_try_dissolve_container(group);
1911
1912 err_pin_pages:
1913         vfio_group_put(group);
1914         return ret;
1915 }
1916 EXPORT_SYMBOL(vfio_pin_pages);
1917
1918 /*
1919  * Unpin set of host PFNs for local domain only.
1920  * @dev [in]     : device
1921  * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1922  *                 PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1923  * @npage [in]   : count of elements in user_pfn array.  This count should not
1924  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1925  * Return error or number of pages unpinned.
1926  */
1927 int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1928 {
1929         struct vfio_container *container;
1930         struct vfio_group *group;
1931         struct vfio_iommu_driver *driver;
1932         int ret;
1933
1934         if (!dev || !user_pfn || !npage)
1935                 return -EINVAL;
1936
1937         if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1938                 return -E2BIG;
1939
1940         group = vfio_group_get_from_dev(dev);
1941         if (!group)
1942                 return -ENODEV;
1943
1944         ret = vfio_group_add_container_user(group);
1945         if (ret)
1946                 goto err_unpin_pages;
1947
1948         container = group->container;
1949         driver = container->iommu_driver;
1950         if (likely(driver && driver->ops->unpin_pages))
1951                 ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
1952                                                npage);
1953         else
1954                 ret = -ENOTTY;
1955
1956         vfio_group_try_dissolve_container(group);
1957
1958 err_unpin_pages:
1959         vfio_group_put(group);
1960         return ret;
1961 }
1962 EXPORT_SYMBOL(vfio_unpin_pages);
1963
1964 static int vfio_register_iommu_notifier(struct vfio_group *group,
1965                                         unsigned long *events,
1966                                         struct notifier_block *nb)
1967 {
1968         struct vfio_container *container;
1969         struct vfio_iommu_driver *driver;
1970         int ret;
1971
1972         ret = vfio_group_add_container_user(group);
1973         if (ret)
1974                 return -EINVAL;
1975
1976         container = group->container;
1977         driver = container->iommu_driver;
1978         if (likely(driver && driver->ops->register_notifier))
1979                 ret = driver->ops->register_notifier(container->iommu_data,
1980                                                      events, nb);
1981         else
1982                 ret = -ENOTTY;
1983
1984         vfio_group_try_dissolve_container(group);
1985
1986         return ret;
1987 }
1988
1989 static int vfio_unregister_iommu_notifier(struct vfio_group *group,
1990                                           struct notifier_block *nb)
1991 {
1992         struct vfio_container *container;
1993         struct vfio_iommu_driver *driver;
1994         int ret;
1995
1996         ret = vfio_group_add_container_user(group);
1997         if (ret)
1998                 return -EINVAL;
1999
2000         container = group->container;
2001         driver = container->iommu_driver;
2002         if (likely(driver && driver->ops->unregister_notifier))
2003                 ret = driver->ops->unregister_notifier(container->iommu_data,
2004                                                        nb);
2005         else
2006                 ret = -ENOTTY;
2007
2008         vfio_group_try_dissolve_container(group);
2009
2010         return ret;
2011 }
2012
2013 void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2014 {
2015         group->kvm = kvm;
2016         blocking_notifier_call_chain(&group->notifier,
2017                                 VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2018 }
2019 EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2020
2021 static int vfio_register_group_notifier(struct vfio_group *group,
2022                                         unsigned long *events,
2023                                         struct notifier_block *nb)
2024 {
2025         int ret;
2026         bool set_kvm = false;
2027
2028         if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2029                 set_kvm = true;
2030
2031         /* clear known events */
2032         *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2033
2034         /* refuse to continue if still events remaining */
2035         if (*events)
2036                 return -EINVAL;
2037
2038         ret = vfio_group_add_container_user(group);
2039         if (ret)
2040                 return -EINVAL;
2041
2042         ret = blocking_notifier_chain_register(&group->notifier, nb);
2043
2044         /*
2045          * The attaching of kvm and vfio_group might already happen, so
2046          * here we replay once upon registration.
2047          */
2048         if (!ret && set_kvm && group->kvm)
2049                 blocking_notifier_call_chain(&group->notifier,
2050                                         VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2051
2052         vfio_group_try_dissolve_container(group);
2053
2054         return ret;
2055 }
2056
2057 static int vfio_unregister_group_notifier(struct vfio_group *group,
2058                                          struct notifier_block *nb)
2059 {
2060         int ret;
2061
2062         ret = vfio_group_add_container_user(group);
2063         if (ret)
2064                 return -EINVAL;
2065
2066         ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2067
2068         vfio_group_try_dissolve_container(group);
2069
2070         return ret;
2071 }
2072
2073 int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2074                            unsigned long *events, struct notifier_block *nb)
2075 {
2076         struct vfio_group *group;
2077         int ret;
2078
2079         if (!dev || !nb || !events || (*events == 0))
2080                 return -EINVAL;
2081
2082         group = vfio_group_get_from_dev(dev);
2083         if (!group)
2084                 return -ENODEV;
2085
2086         switch (type) {
2087         case VFIO_IOMMU_NOTIFY:
2088                 ret = vfio_register_iommu_notifier(group, events, nb);
2089                 break;
2090         case VFIO_GROUP_NOTIFY:
2091                 ret = vfio_register_group_notifier(group, events, nb);
2092                 break;
2093         default:
2094                 ret = -EINVAL;
2095         }
2096
2097         vfio_group_put(group);
2098         return ret;
2099 }
2100 EXPORT_SYMBOL(vfio_register_notifier);
2101
2102 int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2103                              struct notifier_block *nb)
2104 {
2105         struct vfio_group *group;
2106         int ret;
2107
2108         if (!dev || !nb)
2109                 return -EINVAL;
2110
2111         group = vfio_group_get_from_dev(dev);
2112         if (!group)
2113                 return -ENODEV;
2114
2115         switch (type) {
2116         case VFIO_IOMMU_NOTIFY:
2117                 ret = vfio_unregister_iommu_notifier(group, nb);
2118                 break;
2119         case VFIO_GROUP_NOTIFY:
2120                 ret = vfio_unregister_group_notifier(group, nb);
2121                 break;
2122         default:
2123                 ret = -EINVAL;
2124         }
2125
2126         vfio_group_put(group);
2127         return ret;
2128 }
2129 EXPORT_SYMBOL(vfio_unregister_notifier);
2130
2131 /**
2132  * Module/class support
2133  */
2134 static char *vfio_devnode(struct device *dev, umode_t *mode)
2135 {
2136         return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2137 }
2138
2139 static struct miscdevice vfio_dev = {
2140         .minor = VFIO_MINOR,
2141         .name = "vfio",
2142         .fops = &vfio_fops,
2143         .nodename = "vfio/vfio",
2144         .mode = S_IRUGO | S_IWUGO,
2145 };
2146
2147 static int __init vfio_init(void)
2148 {
2149         int ret;
2150
2151         idr_init(&vfio.group_idr);
2152         mutex_init(&vfio.group_lock);
2153         mutex_init(&vfio.iommu_drivers_lock);
2154         INIT_LIST_HEAD(&vfio.group_list);
2155         INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2156         init_waitqueue_head(&vfio.release_q);
2157
2158         ret = misc_register(&vfio_dev);
2159         if (ret) {
2160                 pr_err("vfio: misc device register failed\n");
2161                 return ret;
2162         }
2163
2164         /* /dev/vfio/$GROUP */
2165         vfio.class = class_create(THIS_MODULE, "vfio");
2166         if (IS_ERR(vfio.class)) {
2167                 ret = PTR_ERR(vfio.class);
2168                 goto err_class;
2169         }
2170
2171         vfio.class->devnode = vfio_devnode;
2172
2173         ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2174         if (ret)
2175                 goto err_alloc_chrdev;
2176
2177         cdev_init(&vfio.group_cdev, &vfio_group_fops);
2178         ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
2179         if (ret)
2180                 goto err_cdev_add;
2181
2182         pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2183
2184 #ifdef CONFIG_VFIO_NOIOMMU
2185         vfio_register_iommu_driver(&vfio_noiommu_ops);
2186 #endif
2187         return 0;
2188
2189 err_cdev_add:
2190         unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2191 err_alloc_chrdev:
2192         class_destroy(vfio.class);
2193         vfio.class = NULL;
2194 err_class:
2195         misc_deregister(&vfio_dev);
2196         return ret;
2197 }
2198
2199 static void __exit vfio_cleanup(void)
2200 {
2201         WARN_ON(!list_empty(&vfio.group_list));
2202
2203 #ifdef CONFIG_VFIO_NOIOMMU
2204         vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2205 #endif
2206         idr_destroy(&vfio.group_idr);
2207         cdev_del(&vfio.group_cdev);
2208         unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2209         class_destroy(vfio.class);
2210         vfio.class = NULL;
2211         misc_deregister(&vfio_dev);
2212 }
2213
2214 module_init(vfio_init);
2215 module_exit(vfio_cleanup);
2216
2217 MODULE_VERSION(DRIVER_VERSION);
2218 MODULE_LICENSE("GPL v2");
2219 MODULE_AUTHOR(DRIVER_AUTHOR);
2220 MODULE_DESCRIPTION(DRIVER_DESC);
2221 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2222 MODULE_ALIAS("devname:vfio/vfio");
2223 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");