2 * KVMGT - the implementation of Intel mediated pass-through framework for KVM
4 * Copyright(c) 2014-2016 Intel Corporation. All rights reserved.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 * Kevin Tian <kevin.tian@intel.com>
27 * Jike Song <jike.song@intel.com>
28 * Xiaoguang Chen <xiaoguang.chen@intel.com>
31 #include <linux/init.h>
32 #include <linux/device.h>
34 #include <linux/mmu_context.h>
35 #include <linux/sched/mm.h>
36 #include <linux/types.h>
37 #include <linux/list.h>
38 #include <linux/rbtree.h>
39 #include <linux/spinlock.h>
40 #include <linux/eventfd.h>
41 #include <linux/uuid.h>
42 #include <linux/kvm_host.h>
43 #include <linux/vfio.h>
44 #include <linux/mdev.h>
45 #include <linux/debugfs.h>
47 #include <linux/nospec.h>
52 static const struct intel_gvt_ops *intel_gvt_ops;
54 /* helper macros copied from vfio-pci */
55 #define VFIO_PCI_OFFSET_SHIFT 40
56 #define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT)
57 #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
58 #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
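/*
 * A file offset passed to read/write/ioctl thus encodes the VFIO region index
 * in bits 63:40 and the byte offset within that region in bits 39:0, e.g.
 * VFIO_PCI_INDEX_TO_OFFSET(index) + 0x100 addresses byte 0x100 of region 'index'.
 */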
60 #define EDID_BLOB_OFFSET (PAGE_SIZE/2)
62 #define OPREGION_SIGNATURE "IntelGraphicsMem"
65 struct intel_vgpu_regops {
66 size_t (*rw)(struct intel_vgpu *vgpu, char *buf,
67 size_t count, loff_t *ppos, bool iswrite);
68 void (*release)(struct intel_vgpu *vgpu,
69 struct vfio_region *region);
77 const struct intel_vgpu_regops *ops;
81 struct vfio_edid_region {
82 struct vfio_region_gfx_edid vfio_edid_regs;
88 struct hlist_node hnode;
91 struct kvmgt_guest_info {
93 struct intel_vgpu *vgpu;
94 struct kvm_page_track_notifier_node track_node;
95 #define NR_BKT (1 << 18)
96 struct hlist_head ptable[NR_BKT];
98 struct dentry *debugfs_cache_entries;
102 struct intel_vgpu *vgpu;
103 struct rb_node gfn_node;
104 struct rb_node dma_addr_node;
112 struct intel_vgpu *vgpu;
113 struct mdev_device *mdev;
114 struct vfio_region *region;
116 struct eventfd_ctx *intx_trigger;
117 struct eventfd_ctx *msi_trigger;
120 * Two caches are used to avoid mapping duplicated pages (e.g.
121 * scratch pages). This helps to reduce DMA setup overhead.
123 struct rb_root gfn_cache;
124 struct rb_root dma_addr_cache;
125 unsigned long nr_cache_entries;
126 struct mutex cache_lock;
128 struct notifier_block iommu_notifier;
129 struct notifier_block group_notifier;
131 struct work_struct release_work;
133 struct vfio_device *vfio_device;
136 static inline struct kvmgt_vdev *kvmgt_vdev(struct intel_vgpu *vgpu)
138 return intel_vgpu_vdev(vgpu);
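/* vgpu->handle holds a pointer to the vgpu's kvmgt_guest_info once the guest is initialized; values not exceeding 0xff are treated as invalid. */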
141 static inline bool handle_valid(unsigned long handle)
143 return !!(handle & ~0xff);
146 static int kvmgt_guest_init(struct mdev_device *mdev);
147 static void intel_vgpu_release_work(struct work_struct *work);
148 static bool kvmgt_guest_exit(struct kvmgt_guest_info *info);
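/* Unpin the guest pages backing [gfn, gfn + size), one page at a time. */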
150 static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
153 struct drm_i915_private *i915 = vgpu->gvt->dev_priv;
158 total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
160 for (npage = 0; npage < total_pages; npage++) {
161 unsigned long cur_gfn = gfn + npage;
163 ret = vfio_unpin_pages(mdev_dev(kvmgt_vdev(vgpu)->mdev), &cur_gfn, 1);
164 drm_WARN_ON(&i915->drm, ret != 1);
168 /* Pin a normal or compound guest page for dma. */
169 static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
170 unsigned long size, struct page **page)
172 unsigned long base_pfn = 0;
177 total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
179 * We pin the pages one by one to avoid allocating a big array
180 * on the stack to hold pfns.
182 for (npage = 0; npage < total_pages; npage++) {
183 unsigned long cur_gfn = gfn + npage;
186 ret = vfio_pin_pages(mdev_dev(kvmgt_vdev(vgpu)->mdev), &cur_gfn, 1,
187 IOMMU_READ | IOMMU_WRITE, &pfn);
189 gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx, ret %d\n",
194 if (!pfn_valid(pfn)) {
195 gvt_vgpu_err("pfn 0x%lx is not mem backed\n", pfn);
203 else if (base_pfn + npage != pfn) {
204 gvt_vgpu_err("The pages are not contiguous\n");
211 *page = pfn_to_page(base_pfn);
214 gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
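/* Pin the guest page(s) for @gfn and create a bidirectional DMA mapping of @size bytes. */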
218 static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
219 dma_addr_t *dma_addr, unsigned long size)
221 struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
222 struct page *page = NULL;
225 ret = gvt_pin_guest_page(vgpu, gfn, size, &page);
229 /* Set up the DMA mapping. */
230 *dma_addr = dma_map_page(dev, page, 0, size, PCI_DMA_BIDIRECTIONAL);
231 if (dma_mapping_error(dev, *dma_addr)) {
232 gvt_vgpu_err("DMA mapping failed for pfn 0x%lx, ret %d\n",
233 page_to_pfn(page), ret);
234 gvt_unpin_guest_page(vgpu, gfn, size);
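/* Tear down the DMA mapping for @dma_addr and unpin the backing guest pages. */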
241 static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
242 dma_addr_t dma_addr, unsigned long size)
244 struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
246 dma_unmap_page(dev, dma_addr, size, PCI_DMA_BIDIRECTIONAL);
247 gvt_unpin_guest_page(vgpu, gfn, size);
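/* Look up a cached gfn <-> dma_addr mapping; one rb-tree is keyed by dma_addr, the other by gfn. */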
250 static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
253 struct rb_node *node = kvmgt_vdev(vgpu)->dma_addr_cache.rb_node;
257 itr = rb_entry(node, struct gvt_dma, dma_addr_node);
259 if (dma_addr < itr->dma_addr)
260 node = node->rb_left;
261 else if (dma_addr > itr->dma_addr)
262 node = node->rb_right;
269 static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
271 struct rb_node *node = kvmgt_vdev(vgpu)->gfn_cache.rb_node;
275 itr = rb_entry(node, struct gvt_dma, gfn_node);
278 node = node->rb_left;
279 else if (gfn > itr->gfn)
280 node = node->rb_right;
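/* Insert a new gfn <-> dma_addr mapping into both rb-tree caches. */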
287 static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
288 dma_addr_t dma_addr, unsigned long size)
290 struct gvt_dma *new, *itr;
291 struct rb_node **link, *parent = NULL;
292 struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
294 new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
300 new->dma_addr = dma_addr;
302 kref_init(&new->ref);
304 /* gfn_cache maps gfn to struct gvt_dma. */
305 link = &vdev->gfn_cache.rb_node;
308 itr = rb_entry(parent, struct gvt_dma, gfn_node);
311 link = &parent->rb_left;
313 link = &parent->rb_right;
315 rb_link_node(&new->gfn_node, parent, link);
316 rb_insert_color(&new->gfn_node, &vdev->gfn_cache);
318 /* dma_addr_cache maps dma addr to struct gvt_dma. */
320 link = &vdev->dma_addr_cache.rb_node;
323 itr = rb_entry(parent, struct gvt_dma, dma_addr_node);
325 if (dma_addr < itr->dma_addr)
326 link = &parent->rb_left;
328 link = &parent->rb_right;
330 rb_link_node(&new->dma_addr_node, parent, link);
331 rb_insert_color(&new->dma_addr_node, &vdev->dma_addr_cache);
333 vdev->nr_cache_entries++;
337 static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
338 struct gvt_dma *entry)
340 struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
342 rb_erase(&entry->gfn_node, &vdev->gfn_cache);
343 rb_erase(&entry->dma_addr_node, &vdev->dma_addr_cache);
345 vdev->nr_cache_entries--;
348 static void gvt_cache_destroy(struct intel_vgpu *vgpu)
351 struct rb_node *node = NULL;
352 struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
355 mutex_lock(&vdev->cache_lock);
356 node = rb_first(&vdev->gfn_cache);
358 mutex_unlock(&vdev->cache_lock);
361 dma = rb_entry(node, struct gvt_dma, gfn_node);
362 gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr, dma->size);
363 __gvt_cache_remove_entry(vgpu, dma);
364 mutex_unlock(&vdev->cache_lock);
368 static void gvt_cache_init(struct intel_vgpu *vgpu)
370 struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
372 vdev->gfn_cache = RB_ROOT;
373 vdev->dma_addr_cache = RB_ROOT;
374 vdev->nr_cache_entries = 0;
375 mutex_init(&vdev->cache_lock);
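/* The protect table tracks write-protected guest page frames (gfns), hashed by gfn. */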
378 static void kvmgt_protect_table_init(struct kvmgt_guest_info *info)
380 hash_init(info->ptable);
383 static void kvmgt_protect_table_destroy(struct kvmgt_guest_info *info)
385 struct kvmgt_pgfn *p;
386 struct hlist_node *tmp;
389 hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
395 static struct kvmgt_pgfn *
396 __kvmgt_protect_table_find(struct kvmgt_guest_info *info, gfn_t gfn)
398 struct kvmgt_pgfn *p, *res = NULL;
400 hash_for_each_possible(info->ptable, p, hnode, gfn) {
410 static bool kvmgt_gfn_is_write_protected(struct kvmgt_guest_info *info,
413 struct kvmgt_pgfn *p;
415 p = __kvmgt_protect_table_find(info, gfn);
419 static void kvmgt_protect_table_add(struct kvmgt_guest_info *info, gfn_t gfn)
421 struct kvmgt_pgfn *p;
423 if (kvmgt_gfn_is_write_protected(info, gfn))
426 p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
427 if (WARN(!p, "gfn: 0x%llx\n", gfn))
431 hash_add(info->ptable, &p->hnode, gfn);
434 static void kvmgt_protect_table_del(struct kvmgt_guest_info *info,
437 struct kvmgt_pgfn *p;
439 p = __kvmgt_protect_table_find(info, gfn);
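/* Read-only access handler for the vgpu's virtual OpRegion VFIO region; writes are rejected. */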
446 static size_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
447 size_t count, loff_t *ppos, bool iswrite)
449 struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
450 unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
451 VFIO_PCI_NUM_REGIONS;
452 void *base = vdev->region[i].data;
453 loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
456 if (pos >= vdev->region[i].size || iswrite) {
457 gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
460 count = min(count, (size_t)(vdev->region[i].size - pos));
461 memcpy(buf, base + pos, count);
466 static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
467 struct vfio_region *region)
471 static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
472 .rw = intel_vgpu_reg_rw_opregion,
473 .release = intel_vgpu_reg_release_opregion,
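/* Handle accesses to the vfio_region_gfx_edid control registers; only link_state and edid_size are writable. */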
476 static int handle_edid_regs(struct intel_vgpu *vgpu,
477 struct vfio_edid_region *region, char *buf,
478 size_t count, u16 offset, bool is_write)
480 struct vfio_region_gfx_edid *regs = &region->vfio_edid_regs;
483 if (offset + count > sizeof(*regs))
490 data = *((unsigned int *)buf);
492 case offsetof(struct vfio_region_gfx_edid, link_state):
493 if (data == VFIO_DEVICE_GFX_LINK_STATE_UP) {
494 if (!drm_edid_block_valid(
495 (u8 *)region->edid_blob,
499 gvt_vgpu_err("invalid EDID blob\n");
502 intel_gvt_ops->emulate_hotplug(vgpu, true);
503 } else if (data == VFIO_DEVICE_GFX_LINK_STATE_DOWN)
504 intel_gvt_ops->emulate_hotplug(vgpu, false);
506 gvt_vgpu_err("invalid EDID link state %d\n",
510 regs->link_state = data;
512 case offsetof(struct vfio_region_gfx_edid, edid_size):
513 if (data > regs->edid_max_size) {
514 gvt_vgpu_err("EDID size is bigger than %d!\n",
515 regs->edid_max_size);
518 regs->edid_size = data;
522 gvt_vgpu_err("write read-only EDID region at offset %d\n",
527 memcpy(buf, (char *)regs + offset, count);
533 static int handle_edid_blob(struct vfio_edid_region *region, char *buf,
534 size_t count, u16 offset, bool is_write)
536 if (offset + count > region->vfio_edid_regs.edid_size)
540 memcpy(region->edid_blob + offset, buf, count);
542 memcpy(buf, region->edid_blob + offset, count);
547 static size_t intel_vgpu_reg_rw_edid(struct intel_vgpu *vgpu, char *buf,
548 size_t count, loff_t *ppos, bool iswrite)
551 unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
552 VFIO_PCI_NUM_REGIONS;
553 struct vfio_edid_region *region =
554 (struct vfio_edid_region *)kvmgt_vdev(vgpu)->region[i].data;
555 loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
557 if (pos < region->vfio_edid_regs.edid_offset) {
558 ret = handle_edid_regs(vgpu, region, buf, count, pos, iswrite);
560 pos -= EDID_BLOB_OFFSET;
561 ret = handle_edid_blob(region, buf, count, pos, iswrite);
565 gvt_vgpu_err("failed to access EDID region\n");
570 static void intel_vgpu_reg_release_edid(struct intel_vgpu *vgpu,
571 struct vfio_region *region)
576 static const struct intel_vgpu_regops intel_vgpu_regops_edid = {
577 .rw = intel_vgpu_reg_rw_edid,
578 .release = intel_vgpu_reg_release_edid,
581 static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
582 unsigned int type, unsigned int subtype,
583 const struct intel_vgpu_regops *ops,
584 size_t size, u32 flags, void *data)
586 struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
587 struct vfio_region *region;
589 region = krealloc(vdev->region,
590 (vdev->num_regions + 1) * sizeof(*region),
595 vdev->region = region;
596 vdev->region[vdev->num_regions].type = type;
597 vdev->region[vdev->num_regions].subtype = subtype;
598 vdev->region[vdev->num_regions].ops = ops;
599 vdev->region[vdev->num_regions].size = size;
600 vdev->region[vdev->num_regions].flags = flags;
601 vdev->region[vdev->num_regions].data = data;
606 static int kvmgt_get_vfio_device(void *p_vgpu)
608 struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
609 struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
611 vdev->vfio_device = vfio_device_get_from_dev(
612 mdev_dev(vdev->mdev));
613 if (!vdev->vfio_device) {
614 gvt_vgpu_err("failed to get vfio device\n");
621 static int kvmgt_set_opregion(void *p_vgpu)
623 struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
627 /* Each vgpu has its own opregion, although VFIO would create another
628 * one later. This one is used to expose the opregion to VFIO, while the
629 * other one, created by VFIO later, is the one the guest actually uses.
631 base = vgpu_opregion(vgpu)->va;
635 if (memcmp(base, OPREGION_SIGNATURE, 16)) {
640 ret = intel_vgpu_register_reg(vgpu,
641 PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
642 VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
643 &intel_vgpu_regops_opregion, OPREGION_SIZE,
644 VFIO_REGION_INFO_FLAG_READ, base);
649 static int kvmgt_set_edid(void *p_vgpu, int port_num)
651 struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
652 struct intel_vgpu_port *port = intel_vgpu_port(vgpu, port_num);
653 struct vfio_edid_region *base;
656 base = kzalloc(sizeof(*base), GFP_KERNEL);
660 /* TODO: Add multi-port and EDID extension block support */
661 base->vfio_edid_regs.edid_offset = EDID_BLOB_OFFSET;
662 base->vfio_edid_regs.edid_max_size = EDID_SIZE;
663 base->vfio_edid_regs.edid_size = EDID_SIZE;
664 base->vfio_edid_regs.max_xres = vgpu_edid_xres(port->id);
665 base->vfio_edid_regs.max_yres = vgpu_edid_yres(port->id);
666 base->edid_blob = port->edid->edid_block;
668 ret = intel_vgpu_register_reg(vgpu,
669 VFIO_REGION_TYPE_GFX,
670 VFIO_REGION_SUBTYPE_GFX_EDID,
671 &intel_vgpu_regops_edid, EDID_SIZE,
672 VFIO_REGION_INFO_FLAG_READ |
673 VFIO_REGION_INFO_FLAG_WRITE |
674 VFIO_REGION_INFO_FLAG_CAPS, base);
679 static void kvmgt_put_vfio_device(void *vgpu)
681 struct kvmgt_vdev *vdev = kvmgt_vdev((struct intel_vgpu *)vgpu);
683 if (WARN_ON(!vdev->vfio_device))
686 vfio_device_put(vdev->vfio_device);
689 static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev)
691 struct intel_vgpu *vgpu = NULL;
692 struct intel_vgpu_type *type;
697 pdev = mdev_parent_dev(mdev);
698 gvt = kdev_to_i915(pdev)->gvt;
700 type = intel_gvt_ops->gvt_find_vgpu_type(gvt, kobject_name(kobj));
702 gvt_vgpu_err("failed to find type %s to create\n",
708 vgpu = intel_gvt_ops->vgpu_create(gvt, type);
709 if (IS_ERR_OR_NULL(vgpu)) {
710 ret = vgpu == NULL ? -EFAULT : PTR_ERR(vgpu);
711 gvt_err("failed to create intel vgpu: %d\n", ret);
715 INIT_WORK(&kvmgt_vdev(vgpu)->release_work, intel_vgpu_release_work);
717 kvmgt_vdev(vgpu)->mdev = mdev;
718 mdev_set_drvdata(mdev, vgpu);
720 gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
721 dev_name(mdev_dev(mdev)));
728 static int intel_vgpu_remove(struct mdev_device *mdev)
730 struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
732 if (handle_valid(vgpu->handle))
735 intel_gvt_ops->vgpu_destroy(vgpu);
739 static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
740 unsigned long action, void *data)
742 struct kvmgt_vdev *vdev = container_of(nb,
745 struct intel_vgpu *vgpu = vdev->vgpu;
747 if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
748 struct vfio_iommu_type1_dma_unmap *unmap = data;
749 struct gvt_dma *entry;
750 unsigned long iov_pfn, end_iov_pfn;
752 iov_pfn = unmap->iova >> PAGE_SHIFT;
753 end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;
755 mutex_lock(&vdev->cache_lock);
756 for (; iov_pfn < end_iov_pfn; iov_pfn++) {
757 entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
761 gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
763 __gvt_cache_remove_entry(vgpu, entry);
765 mutex_unlock(&vdev->cache_lock);
771 static int intel_vgpu_group_notifier(struct notifier_block *nb,
772 unsigned long action, void *data)
774 struct kvmgt_vdev *vdev = container_of(nb,
778 /* the only action we care about */
779 if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
783 schedule_work(&vdev->release_work);
789 static int intel_vgpu_open(struct mdev_device *mdev)
791 struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
792 struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
793 unsigned long events;
796 vdev->iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
797 vdev->group_notifier.notifier_call = intel_vgpu_group_notifier;
799 events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
800 ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &events,
801 &vdev->iommu_notifier);
803 gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
808 events = VFIO_GROUP_NOTIFY_SET_KVM;
809 ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, &events,
810 &vdev->group_notifier);
812 gvt_vgpu_err("vfio_register_notifier for group failed: %d\n",
817 /* Take a module reference, as the mdev core doesn't take
818 * a reference on behalf of the vendor driver.
820 if (!try_module_get(THIS_MODULE))
823 ret = kvmgt_guest_init(mdev);
827 intel_gvt_ops->vgpu_activate(vgpu);
829 atomic_set(&vdev->released, 0);
833 vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
834 &vdev->group_notifier);
837 vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
838 &vdev->iommu_notifier);
843 static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
845 struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
846 struct eventfd_ctx *trigger;
848 trigger = vdev->msi_trigger;
850 eventfd_ctx_put(trigger);
851 vdev->msi_trigger = NULL;
855 static void __intel_vgpu_release(struct intel_vgpu *vgpu)
857 struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
858 struct drm_i915_private *i915 = vgpu->gvt->dev_priv;
859 struct kvmgt_guest_info *info;
862 if (!handle_valid(vgpu->handle))
865 if (atomic_cmpxchg(&vdev->released, 0, 1))
868 intel_gvt_ops->vgpu_release(vgpu);
870 ret = vfio_unregister_notifier(mdev_dev(vdev->mdev), VFIO_IOMMU_NOTIFY,
871 &vdev->iommu_notifier);
872 drm_WARN(&i915->drm, ret,
873 "vfio_unregister_notifier for iommu failed: %d\n", ret);
875 ret = vfio_unregister_notifier(mdev_dev(vdev->mdev), VFIO_GROUP_NOTIFY,
876 &vdev->group_notifier);
877 drm_WARN(&i915->drm, ret,
878 "vfio_unregister_notifier for group failed: %d\n", ret);
880 /* drop the module reference taken at open */
881 module_put(THIS_MODULE);
883 info = (struct kvmgt_guest_info *)vgpu->handle;
884 kvmgt_guest_exit(info);
886 intel_vgpu_release_msi_eventfd_ctx(vgpu);
892 static void intel_vgpu_release(struct mdev_device *mdev)
894 struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
896 __intel_vgpu_release(vgpu);
899 static void intel_vgpu_release_work(struct work_struct *work)
901 struct kvmgt_vdev *vdev = container_of(work, struct kvmgt_vdev,
904 __intel_vgpu_release(vdev->vgpu);
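/* Decode the guest-programmed base address from the virtual BAR registers in config space. */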
907 static u64 intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
909 u32 start_lo, start_hi;
912 start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
913 PCI_BASE_ADDRESS_MEM_MASK;
914 mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
915 PCI_BASE_ADDRESS_MEM_TYPE_MASK;
918 case PCI_BASE_ADDRESS_MEM_TYPE_64:
919 start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
922 case PCI_BASE_ADDRESS_MEM_TYPE_32:
923 case PCI_BASE_ADDRESS_MEM_TYPE_1M:
924 /* 1M mem BAR treated as 32-bit BAR */
926 /* mem unknown type treated as 32-bit BAR */
931 return ((u64)start_hi << 32) | start_lo;
934 static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, u64 off,
935 void *buf, unsigned int count, bool is_write)
937 u64 bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
941 ret = intel_gvt_ops->emulate_mmio_write(vgpu,
942 bar_start + off, buf, count);
944 ret = intel_gvt_ops->emulate_mmio_read(vgpu,
945 bar_start + off, buf, count);
949 static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, u64 off)
951 return off >= vgpu_aperture_offset(vgpu) &&
952 off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
955 static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, u64 off,
956 void *buf, unsigned long count, bool is_write)
958 void __iomem *aperture_va;
960 if (!intel_vgpu_in_aperture(vgpu, off) ||
961 !intel_vgpu_in_aperture(vgpu, off + count)) {
962 gvt_vgpu_err("Invalid aperture offset %llu\n", off);
966 aperture_va = io_mapping_map_wc(&vgpu->gvt->dev_priv->ggtt.iomap,
967 ALIGN_DOWN(off, PAGE_SIZE),
968 count + offset_in_page(off));
973 memcpy_toio(aperture_va + offset_in_page(off), buf, count);
975 memcpy_fromio(buf, aperture_va + offset_in_page(off), count);
977 io_mapping_unmap(aperture_va);
982 static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf,
983 size_t count, loff_t *ppos, bool is_write)
985 struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
986 struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
987 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
988 u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
992 if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) {
993 gvt_vgpu_err("invalid index: %u\n", index);
998 case VFIO_PCI_CONFIG_REGION_INDEX:
1000 ret = intel_gvt_ops->emulate_cfg_write(vgpu, pos,
1003 ret = intel_gvt_ops->emulate_cfg_read(vgpu, pos,
1006 case VFIO_PCI_BAR0_REGION_INDEX:
1007 ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
1008 buf, count, is_write);
1010 case VFIO_PCI_BAR2_REGION_INDEX:
1011 ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
1013 case VFIO_PCI_BAR1_REGION_INDEX:
1014 case VFIO_PCI_BAR3_REGION_INDEX:
1015 case VFIO_PCI_BAR4_REGION_INDEX:
1016 case VFIO_PCI_BAR5_REGION_INDEX:
1017 case VFIO_PCI_VGA_REGION_INDEX:
1018 case VFIO_PCI_ROM_REGION_INDEX:
1021 if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
1024 index -= VFIO_PCI_NUM_REGIONS;
1025 return vdev->region[index].ops->rw(vgpu, buf, count,
1029 return ret == 0 ? count : ret;
1032 static bool gtt_entry(struct mdev_device *mdev, loff_t *ppos)
1034 struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1035 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
1036 struct intel_gvt *gvt = vgpu->gvt;
1039 /* Only allow MMIO GGTT entry access */
1040 if (index != PCI_BASE_ADDRESS_0)
1043 offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
1044 intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);
1046 return (offset >= gvt->device_info.gtt_start_offset &&
1047 offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ?
1051 static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
1052 size_t count, loff_t *ppos)
1054 unsigned int done = 0;
1060 /* Only 8-byte GGTT entry reads are supported */
1061 if (count >= 8 && !(*ppos % 8) &&
1062 gtt_entry(mdev, ppos)) {
1065 ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1070 if (copy_to_user(buf, &val, sizeof(val)))
1074 } else if (count >= 4 && !(*ppos % 4)) {
1077 ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1082 if (copy_to_user(buf, &val, sizeof(val)))
1086 } else if (count >= 2 && !(*ppos % 2)) {
1089 ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1094 if (copy_to_user(buf, &val, sizeof(val)))
1101 ret = intel_vgpu_rw(mdev, &val, sizeof(val), ppos,
1106 if (copy_to_user(buf, &val, sizeof(val)))
1124 static ssize_t intel_vgpu_write(struct mdev_device *mdev,
1125 const char __user *buf,
1126 size_t count, loff_t *ppos)
1128 unsigned int done = 0;
1134 /* Only 8-byte GGTT entry writes are supported */
1135 if (count >= 8 && !(*ppos % 8) &&
1136 gtt_entry(mdev, ppos)) {
1139 if (copy_from_user(&val, buf, sizeof(val)))
1142 ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1148 } else if (count >= 4 && !(*ppos % 4)) {
1151 if (copy_from_user(&val, buf, sizeof(val)))
1154 ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1160 } else if (count >= 2 && !(*ppos % 2)) {
1163 if (copy_from_user(&val, buf, sizeof(val)))
1166 ret = intel_vgpu_rw(mdev, (char *)&val,
1167 sizeof(val), ppos, true);
1175 if (copy_from_user(&val, buf, sizeof(val)))
1178 ret = intel_vgpu_rw(mdev, &val, sizeof(val),
1197 static int intel_vgpu_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
1201 unsigned long req_size, pgoff, req_start;
1203 struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1205 index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
1206 if (index >= VFIO_PCI_ROM_REGION_INDEX)
1209 if (vma->vm_end < vma->vm_start)
1211 if ((vma->vm_flags & VM_SHARED) == 0)
1213 if (index != VFIO_PCI_BAR2_REGION_INDEX)
1216 pg_prot = vma->vm_page_prot;
1217 virtaddr = vma->vm_start;
1218 req_size = vma->vm_end - vma->vm_start;
1219 pgoff = vma->vm_pgoff &
1220 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
1221 req_start = pgoff << PAGE_SHIFT;
1223 if (!intel_vgpu_in_aperture(vgpu, req_start))
1225 if (req_start + req_size >
1226 vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu))
1229 pgoff = (gvt_aperture_pa_base(vgpu->gvt) >> PAGE_SHIFT) + pgoff;
1231 return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
1234 static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
1236 if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
1242 static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
1243 unsigned int index, unsigned int start,
1244 unsigned int count, u32 flags,
1250 static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
1251 unsigned int index, unsigned int start,
1252 unsigned int count, u32 flags, void *data)
1257 static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
1258 unsigned int index, unsigned int start, unsigned int count,
1259 u32 flags, void *data)
1264 static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
1265 unsigned int index, unsigned int start, unsigned int count,
1266 u32 flags, void *data)
1268 struct eventfd_ctx *trigger;
1270 if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
1271 int fd = *(int *)data;
1273 trigger = eventfd_ctx_fdget(fd);
1274 if (IS_ERR(trigger)) {
1275 gvt_vgpu_err("eventfd_ctx_fdget failed\n");
1276 return PTR_ERR(trigger);
1278 kvmgt_vdev(vgpu)->msi_trigger = trigger;
1279 } else if ((flags & VFIO_IRQ_SET_DATA_NONE) && !count)
1280 intel_vgpu_release_msi_eventfd_ctx(vgpu);
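/* Dispatch a VFIO_DEVICE_SET_IRQS request to the matching INTx/MSI mask, unmask or trigger handler. */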
1285 static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags,
1286 unsigned int index, unsigned int start, unsigned int count,
1289 int (*func)(struct intel_vgpu *vgpu, unsigned int index,
1290 unsigned int start, unsigned int count, u32 flags,
1294 case VFIO_PCI_INTX_IRQ_INDEX:
1295 switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1296 case VFIO_IRQ_SET_ACTION_MASK:
1297 func = intel_vgpu_set_intx_mask;
1299 case VFIO_IRQ_SET_ACTION_UNMASK:
1300 func = intel_vgpu_set_intx_unmask;
1302 case VFIO_IRQ_SET_ACTION_TRIGGER:
1303 func = intel_vgpu_set_intx_trigger;
1307 case VFIO_PCI_MSI_IRQ_INDEX:
1308 switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1309 case VFIO_IRQ_SET_ACTION_MASK:
1310 case VFIO_IRQ_SET_ACTION_UNMASK:
1311 /* XXX Need masking support exported */
1313 case VFIO_IRQ_SET_ACTION_TRIGGER:
1314 func = intel_vgpu_set_msi_trigger;
1323 return func(vgpu, index, start, count, flags, data);
1326 static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
1329 struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1330 struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
1331 unsigned long minsz;
1333 gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);
1335 if (cmd == VFIO_DEVICE_GET_INFO) {
1336 struct vfio_device_info info;
1338 minsz = offsetofend(struct vfio_device_info, num_irqs);
1340 if (copy_from_user(&info, (void __user *)arg, minsz))
1343 if (info.argsz < minsz)
1346 info.flags = VFIO_DEVICE_FLAGS_PCI;
1347 info.flags |= VFIO_DEVICE_FLAGS_RESET;
1348 info.num_regions = VFIO_PCI_NUM_REGIONS +
1350 info.num_irqs = VFIO_PCI_NUM_IRQS;
1352 return copy_to_user((void __user *)arg, &info, minsz) ?
1355 } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
1356 struct vfio_region_info info;
1357 struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
1360 struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
1364 minsz = offsetofend(struct vfio_region_info, offset);
1366 if (copy_from_user(&info, (void __user *)arg, minsz))
1369 if (info.argsz < minsz)
1372 switch (info.index) {
1373 case VFIO_PCI_CONFIG_REGION_INDEX:
1374 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1375 info.size = vgpu->gvt->device_info.cfg_space_size;
1376 info.flags = VFIO_REGION_INFO_FLAG_READ |
1377 VFIO_REGION_INFO_FLAG_WRITE;
1379 case VFIO_PCI_BAR0_REGION_INDEX:
1380 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1381 info.size = vgpu->cfg_space.bar[info.index].size;
1387 info.flags = VFIO_REGION_INFO_FLAG_READ |
1388 VFIO_REGION_INFO_FLAG_WRITE;
1390 case VFIO_PCI_BAR1_REGION_INDEX:
1391 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1395 case VFIO_PCI_BAR2_REGION_INDEX:
1396 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1397 info.flags = VFIO_REGION_INFO_FLAG_CAPS |
1398 VFIO_REGION_INFO_FLAG_MMAP |
1399 VFIO_REGION_INFO_FLAG_READ |
1400 VFIO_REGION_INFO_FLAG_WRITE;
1401 info.size = gvt_aperture_sz(vgpu->gvt);
1403 sparse = kzalloc(struct_size(sparse, areas, nr_areas),
1408 sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1409 sparse->header.version = 1;
1410 sparse->nr_areas = nr_areas;
1411 cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1412 sparse->areas[0].offset =
1413 PAGE_ALIGN(vgpu_aperture_offset(vgpu));
1414 sparse->areas[0].size = vgpu_aperture_sz(vgpu);
1417 case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
1418 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1422 gvt_dbg_core("get region info bar:%d\n", info.index);
1425 case VFIO_PCI_ROM_REGION_INDEX:
1426 case VFIO_PCI_VGA_REGION_INDEX:
1427 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1431 gvt_dbg_core("get region info index:%d\n", info.index);
1435 struct vfio_region_info_cap_type cap_type = {
1436 .header.id = VFIO_REGION_INFO_CAP_TYPE,
1437 .header.version = 1 };
1439 if (info.index >= VFIO_PCI_NUM_REGIONS +
1443 array_index_nospec(info.index,
1444 VFIO_PCI_NUM_REGIONS +
1447 i = info.index - VFIO_PCI_NUM_REGIONS;
1450 VFIO_PCI_INDEX_TO_OFFSET(info.index);
1451 info.size = vdev->region[i].size;
1452 info.flags = vdev->region[i].flags;
1454 cap_type.type = vdev->region[i].type;
1455 cap_type.subtype = vdev->region[i].subtype;
1457 ret = vfio_info_add_capability(&caps,
1465 if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
1466 switch (cap_type_id) {
1467 case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1468 ret = vfio_info_add_capability(&caps,
1470 struct_size(sparse, areas,
1484 info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
1485 if (info.argsz < sizeof(info) + caps.size) {
1486 info.argsz = sizeof(info) + caps.size;
1487 info.cap_offset = 0;
1489 vfio_info_cap_shift(&caps, sizeof(info));
1490 if (copy_to_user((void __user *)arg +
1491 sizeof(info), caps.buf,
1497 info.cap_offset = sizeof(info);
1504 return copy_to_user((void __user *)arg, &info, minsz) ?
1506 } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
1507 struct vfio_irq_info info;
1509 minsz = offsetofend(struct vfio_irq_info, count);
1511 if (copy_from_user(&info, (void __user *)arg, minsz))
1514 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
1517 switch (info.index) {
1518 case VFIO_PCI_INTX_IRQ_INDEX:
1519 case VFIO_PCI_MSI_IRQ_INDEX:
1525 info.flags = VFIO_IRQ_INFO_EVENTFD;
1527 info.count = intel_vgpu_get_irq_count(vgpu, info.index);
1529 if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
1530 info.flags |= (VFIO_IRQ_INFO_MASKABLE |
1531 VFIO_IRQ_INFO_AUTOMASKED);
1533 info.flags |= VFIO_IRQ_INFO_NORESIZE;
1535 return copy_to_user((void __user *)arg, &info, minsz) ?
1537 } else if (cmd == VFIO_DEVICE_SET_IRQS) {
1538 struct vfio_irq_set hdr;
1541 size_t data_size = 0;
1543 minsz = offsetofend(struct vfio_irq_set, count);
1545 if (copy_from_user(&hdr, (void __user *)arg, minsz))
1548 if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
1549 int max = intel_vgpu_get_irq_count(vgpu, hdr.index);
1551 ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
1552 VFIO_PCI_NUM_IRQS, &data_size);
1554 gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
1558 data = memdup_user((void __user *)(arg + minsz),
1561 return PTR_ERR(data);
1565 ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
1566 hdr.start, hdr.count, data);
1570 } else if (cmd == VFIO_DEVICE_RESET) {
1571 intel_gvt_ops->vgpu_reset(vgpu);
1573 } else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
1574 struct vfio_device_gfx_plane_info dmabuf;
1577 minsz = offsetofend(struct vfio_device_gfx_plane_info,
1579 if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
1581 if (dmabuf.argsz < minsz)
1584 ret = intel_gvt_ops->vgpu_query_plane(vgpu, &dmabuf);
1588 return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
1590 } else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
1594 if (get_user(dmabuf_id, (__u32 __user *)arg))
1597 dmabuf_fd = intel_gvt_ops->vgpu_get_dmabuf(vgpu, dmabuf_id);
1606 vgpu_id_show(struct device *dev, struct device_attribute *attr,
1609 struct mdev_device *mdev = mdev_from_dev(dev);
1612 struct intel_vgpu *vgpu = (struct intel_vgpu *)
1613 mdev_get_drvdata(mdev);
1614 return sprintf(buf, "%d\n", vgpu->id);
1616 return sprintf(buf, "\n");
1619 static DEVICE_ATTR_RO(vgpu_id);
1621 static struct attribute *intel_vgpu_attrs[] = {
1622 &dev_attr_vgpu_id.attr,
1626 static const struct attribute_group intel_vgpu_group = {
1627 .name = "intel_vgpu",
1628 .attrs = intel_vgpu_attrs,
1631 static const struct attribute_group *intel_vgpu_groups[] = {
1636 static struct mdev_parent_ops intel_vgpu_ops = {
1637 .mdev_attr_groups = intel_vgpu_groups,
1638 .create = intel_vgpu_create,
1639 .remove = intel_vgpu_remove,
1641 .open = intel_vgpu_open,
1642 .release = intel_vgpu_release,
1644 .read = intel_vgpu_read,
1645 .write = intel_vgpu_write,
1646 .mmap = intel_vgpu_mmap,
1647 .ioctl = intel_vgpu_ioctl,
1650 static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
1652 struct attribute_group **kvm_vgpu_type_groups;
1654 intel_gvt_ops = ops;
1655 if (!intel_gvt_ops->get_gvt_attrs(&kvm_vgpu_type_groups))
1657 intel_vgpu_ops.supported_type_groups = kvm_vgpu_type_groups;
1659 return mdev_register_device(dev, &intel_vgpu_ops);
1662 static void kvmgt_host_exit(struct device *dev)
1664 mdev_unregister_device(dev);
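/* Start KVM write-protection tracking for @gfn of the guest identified by @handle. */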
1667 static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
1669 struct kvmgt_guest_info *info;
1671 struct kvm_memory_slot *slot;
1674 if (!handle_valid(handle))
1677 info = (struct kvmgt_guest_info *)handle;
1680 idx = srcu_read_lock(&kvm->srcu);
1681 slot = gfn_to_memslot(kvm, gfn);
1683 srcu_read_unlock(&kvm->srcu, idx);
1687 spin_lock(&kvm->mmu_lock);
1689 if (kvmgt_gfn_is_write_protected(info, gfn))
1692 kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1693 kvmgt_protect_table_add(info, gfn);
1696 spin_unlock(&kvm->mmu_lock);
1697 srcu_read_unlock(&kvm->srcu, idx);
1701 static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
1703 struct kvmgt_guest_info *info;
1705 struct kvm_memory_slot *slot;
1708 if (!handle_valid(handle))
1711 info = (struct kvmgt_guest_info *)handle;
1714 idx = srcu_read_lock(&kvm->srcu);
1715 slot = gfn_to_memslot(kvm, gfn);
1717 srcu_read_unlock(&kvm->srcu, idx);
1721 spin_lock(&kvm->mmu_lock);
1723 if (!kvmgt_gfn_is_write_protected(info, gfn))
1726 kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1727 kvmgt_protect_table_del(info, gfn);
1730 spin_unlock(&kvm->mmu_lock);
1731 srcu_read_unlock(&kvm->srcu, idx);
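/* KVM page-track notifier: forward writes to tracked gfns to GVT's write-protect handler. */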
1735 static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1736 const u8 *val, int len,
1737 struct kvm_page_track_notifier_node *node)
1739 struct kvmgt_guest_info *info = container_of(node,
1740 struct kvmgt_guest_info, track_node);
1742 if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
1743 intel_gvt_ops->write_protect_handler(info->vgpu, gpa,
1747 static void kvmgt_page_track_flush_slot(struct kvm *kvm,
1748 struct kvm_memory_slot *slot,
1749 struct kvm_page_track_notifier_node *node)
1753 struct kvmgt_guest_info *info = container_of(node,
1754 struct kvmgt_guest_info, track_node);
1756 spin_lock(&kvm->mmu_lock);
1757 for (i = 0; i < slot->npages; i++) {
1758 gfn = slot->base_gfn + i;
1759 if (kvmgt_gfn_is_write_protected(info, gfn)) {
1760 kvm_slot_page_track_remove_page(kvm, slot, gfn,
1761 KVM_PAGE_TRACK_WRITE);
1762 kvmgt_protect_table_del(info, gfn);
1765 spin_unlock(&kvm->mmu_lock);
1768 static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
1770 struct intel_vgpu *itr;
1771 struct kvmgt_guest_info *info;
1775 mutex_lock(&vgpu->gvt->lock);
1776 for_each_active_vgpu(vgpu->gvt, itr, id) {
1777 if (!handle_valid(itr->handle))
1780 info = (struct kvmgt_guest_info *)itr->handle;
1781 if (kvm && kvm == info->kvm) {
1787 mutex_unlock(&vgpu->gvt->lock);
1791 static int kvmgt_guest_init(struct mdev_device *mdev)
1793 struct kvmgt_guest_info *info;
1794 struct intel_vgpu *vgpu;
1795 struct kvmgt_vdev *vdev;
1798 vgpu = mdev_get_drvdata(mdev);
1799 if (handle_valid(vgpu->handle))
1802 vdev = kvmgt_vdev(vgpu);
1804 if (!kvm || kvm->mm != current->mm) {
1805 gvt_vgpu_err("KVM is required to use Intel vGPU\n");
1809 if (__kvmgt_vgpu_exist(vgpu, kvm))
1812 info = vzalloc(sizeof(struct kvmgt_guest_info));
1816 vgpu->handle = (unsigned long)info;
1819 kvm_get_kvm(info->kvm);
1821 kvmgt_protect_table_init(info);
1822 gvt_cache_init(vgpu);
1824 info->track_node.track_write = kvmgt_page_track_write;
1825 info->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
1826 kvm_page_track_register_notifier(kvm, &info->track_node);
1828 info->debugfs_cache_entries = debugfs_create_ulong(
1829 "kvmgt_nr_cache_entries",
1830 0444, vgpu->debugfs,
1831 &vdev->nr_cache_entries);
1835 static bool kvmgt_guest_exit(struct kvmgt_guest_info *info)
1837 debugfs_remove(info->debugfs_cache_entries);
1839 kvm_page_track_unregister_notifier(info->kvm, &info->track_node);
1840 kvm_put_kvm(info->kvm);
1841 kvmgt_protect_table_destroy(info);
1842 gvt_cache_destroy(info->vgpu);
1848 static int kvmgt_attach_vgpu(void *p_vgpu, unsigned long *handle)
1850 struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
1852 vgpu->vdev = kzalloc(sizeof(struct kvmgt_vdev), GFP_KERNEL);
1857 kvmgt_vdev(vgpu)->vgpu = vgpu;
1862 static void kvmgt_detach_vgpu(void *p_vgpu)
1865 struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
1866 struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
1871 for (i = 0; i < vdev->num_regions; i++)
1872 if (vdev->region[i].ops->release)
1873 vdev->region[i].ops->release(vgpu,
1875 vdev->num_regions = 0;
1876 kfree(vdev->region);
1877 vdev->region = NULL;
1882 static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data)
1884 struct kvmgt_guest_info *info;
1885 struct intel_vgpu *vgpu;
1886 struct kvmgt_vdev *vdev;
1888 if (!handle_valid(handle))
1891 info = (struct kvmgt_guest_info *)handle;
1893 vdev = kvmgt_vdev(vgpu);
1896 * When the guest powers off, msi_trigger is set to NULL, but the vgpu's
1897 * config space and MMIO registers are not restored to their defaults
1898 * during guest poweroff. If this vgpu is reused by the next VM, its pipes
1899 * may still be enabled, so once the vgpu becomes active it can receive
1900 * vblank interrupt requests. However, msi_trigger stays NULL until MSI is
1901 * enabled by the guest, so if msi_trigger is NULL we still return success
1902 * and simply do not inject the interrupt into the guest.
1904 if (vdev->msi_trigger == NULL)
1907 if (eventfd_signal(vdev->msi_trigger, 1) == 1)
1913 static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn)
1915 struct kvmgt_guest_info *info;
1918 if (!handle_valid(handle))
1919 return INTEL_GVT_INVALID_ADDR;
1921 info = (struct kvmgt_guest_info *)handle;
1923 pfn = gfn_to_pfn(info->kvm, gfn);
1924 if (is_error_noslot_pfn(pfn))
1925 return INTEL_GVT_INVALID_ADDR;
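/* Map a guest page range for DMA, reusing and ref-counting an existing cache entry when the gfn and size match. */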
1930 static int kvmgt_dma_map_guest_page(unsigned long handle, unsigned long gfn,
1931 unsigned long size, dma_addr_t *dma_addr)
1933 struct intel_vgpu *vgpu;
1934 struct kvmgt_vdev *vdev;
1935 struct gvt_dma *entry;
1938 if (!handle_valid(handle))
1941 vgpu = ((struct kvmgt_guest_info *)handle)->vgpu;
1942 vdev = kvmgt_vdev(vgpu);
1944 mutex_lock(&vdev->cache_lock);
1946 entry = __gvt_cache_find_gfn(vgpu, gfn);
1948 ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
1952 ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
1955 } else if (entry->size != size) {
1956 /* the same gfn with different size: unmap and re-map */
1957 gvt_dma_unmap_page(vgpu, gfn, entry->dma_addr, entry->size);
1958 __gvt_cache_remove_entry(vgpu, entry);
1960 ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
1964 ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
1968 kref_get(&entry->ref);
1969 *dma_addr = entry->dma_addr;
1972 mutex_unlock(&vdev->cache_lock);
1976 gvt_dma_unmap_page(vgpu, gfn, *dma_addr, size);
1978 mutex_unlock(&vdev->cache_lock);
1982 static int kvmgt_dma_pin_guest_page(unsigned long handle, dma_addr_t dma_addr)
1984 struct kvmgt_guest_info *info;
1985 struct kvmgt_vdev *vdev;
1986 struct gvt_dma *entry;
1989 if (!handle_valid(handle))
1992 info = (struct kvmgt_guest_info *)handle;
1993 vdev = kvmgt_vdev(info->vgpu);
1995 mutex_lock(&vdev->cache_lock);
1996 entry = __gvt_cache_find_dma_addr(info->vgpu, dma_addr);
1998 kref_get(&entry->ref);
2001 mutex_unlock(&vdev->cache_lock);
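/* kref release callback: unmap the DMA mapping and drop the cache entry once the last reference is gone. */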
2006 static void __gvt_dma_release(struct kref *ref)
2008 struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);
2010 gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr,
2012 __gvt_cache_remove_entry(entry->vgpu, entry);
2015 static void kvmgt_dma_unmap_guest_page(unsigned long handle, dma_addr_t dma_addr)
2017 struct intel_vgpu *vgpu;
2018 struct kvmgt_vdev *vdev;
2019 struct gvt_dma *entry;
2021 if (!handle_valid(handle))
2024 vgpu = ((struct kvmgt_guest_info *)handle)->vgpu;
2025 vdev = kvmgt_vdev(vgpu);
2027 mutex_lock(&vdev->cache_lock);
2028 entry = __gvt_cache_find_dma_addr(vgpu, dma_addr);
2030 kref_put(&entry->ref, __gvt_dma_release);
2031 mutex_unlock(&vdev->cache_lock);
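/* Read or write guest physical memory through KVM, taking an mm reference when called from a kthread. */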
2034 static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
2035 void *buf, unsigned long len, bool write)
2037 struct kvmgt_guest_info *info;
2040 bool kthread = current->mm == NULL;
2042 if (!handle_valid(handle))
2045 info = (struct kvmgt_guest_info *)handle;
2049 if (!mmget_not_zero(kvm->mm))
2054 idx = srcu_read_lock(&kvm->srcu);
2055 ret = write ? kvm_write_guest(kvm, gpa, buf, len) :
2056 kvm_read_guest(kvm, gpa, buf, len);
2057 srcu_read_unlock(&kvm->srcu, idx);
2067 static int kvmgt_read_gpa(unsigned long handle, unsigned long gpa,
2068 void *buf, unsigned long len)
2070 return kvmgt_rw_gpa(handle, gpa, buf, len, false);
2073 static int kvmgt_write_gpa(unsigned long handle, unsigned long gpa,
2074 void *buf, unsigned long len)
2076 return kvmgt_rw_gpa(handle, gpa, buf, len, true);
2079 static unsigned long kvmgt_virt_to_pfn(void *addr)
2081 return PFN_DOWN(__pa(addr));
2084 static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn)
2086 struct kvmgt_guest_info *info;
2091 if (!handle_valid(handle))
2094 info = (struct kvmgt_guest_info *)handle;
2097 idx = srcu_read_lock(&kvm->srcu);
2098 ret = kvm_is_visible_gfn(kvm, gfn);
2099 srcu_read_unlock(&kvm->srcu, idx);
2104 static struct intel_gvt_mpt kvmgt_mpt = {
2105 .type = INTEL_GVT_HYPERVISOR_KVM,
2106 .host_init = kvmgt_host_init,
2107 .host_exit = kvmgt_host_exit,
2108 .attach_vgpu = kvmgt_attach_vgpu,
2109 .detach_vgpu = kvmgt_detach_vgpu,
2110 .inject_msi = kvmgt_inject_msi,
2111 .from_virt_to_mfn = kvmgt_virt_to_pfn,
2112 .enable_page_track = kvmgt_page_track_add,
2113 .disable_page_track = kvmgt_page_track_remove,
2114 .read_gpa = kvmgt_read_gpa,
2115 .write_gpa = kvmgt_write_gpa,
2116 .gfn_to_mfn = kvmgt_gfn_to_pfn,
2117 .dma_map_guest_page = kvmgt_dma_map_guest_page,
2118 .dma_unmap_guest_page = kvmgt_dma_unmap_guest_page,
2119 .dma_pin_guest_page = kvmgt_dma_pin_guest_page,
2120 .set_opregion = kvmgt_set_opregion,
2121 .set_edid = kvmgt_set_edid,
2122 .get_vfio_device = kvmgt_get_vfio_device,
2123 .put_vfio_device = kvmgt_put_vfio_device,
2124 .is_valid_gfn = kvmgt_is_valid_gfn,
2127 static int __init kvmgt_init(void)
2129 if (intel_gvt_register_hypervisor(&kvmgt_mpt) < 0)
2134 static void __exit kvmgt_exit(void)
2136 intel_gvt_unregister_hypervisor();
2139 module_init(kvmgt_init);
2140 module_exit(kvmgt_exit);
2142 MODULE_LICENSE("GPL and additional rights");
2143 MODULE_AUTHOR("Intel Corporation");