// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2015 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/intel-iommu.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/intel-svm.h>
#include <linux/rculist.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
#include <linux/ioasid.h>

#include <asm/fpu/api.h>
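/*
 * Driver-private PASID table helpers (intel_pasid_setup_*,
 * intel_pasid_tear_down_entry) used below; assumed to come from the local
 * header, as in the rest of the intel driver.
 */
#include "pasid.h"

/* Allocation order of the page request queue; assumed to be 0 (one 4KiB page) here. */
#define PRQ_ORDER	0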
static irqreturn_t prq_event_thread(int irq, void *d);
static void intel_svm_drain_prq(struct device *dev, u32 pasid);
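/*
 * Allocate the page request queue for @iommu, wire up its threaded interrupt
 * handler and program the PRQ head/tail/address registers.
 */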
int intel_svm_enable_prq(struct intel_iommu *iommu)
        pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
                pr_warn("IOMMU: %s: Failed to allocate page request queue\n",

        iommu->prq = page_address(pages);

        irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
                pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
                free_pages((unsigned long)iommu->prq, PRQ_ORDER);

        snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);

        ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
                                   iommu->prq_name, iommu);
                pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",

        dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
        dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
        dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);

        init_completion(&iommu->prq_complete);
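/* Undo intel_svm_enable_prq(): quiesce the hardware queue, then free the IRQ and the queue pages. */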
int intel_svm_finish_prq(struct intel_iommu *iommu)
        dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
        dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
        dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);

        free_irq(iommu->pr_irq, iommu);
        dmar_free_hwirq(iommu->pr_irq);

        free_pages((unsigned long)iommu->prq, PRQ_ORDER);
static inline bool intel_svm_capable(struct intel_iommu *iommu)
        return iommu->flags & VTD_FLAG_SVM_CAPABLE;

void intel_svm_check(struct intel_iommu *iommu)
        if (!pasid_supported(iommu))

        if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
            !cap_fl1gp_support(iommu->cap)) {
                pr_err("%s SVM disabled, incompatible 1GB page capability\n",

        if (cpu_feature_enabled(X86_FEATURE_LA57) &&
            !cap_5lp_support(iommu->cap)) {
                pr_err("%s SVM disabled, incompatible paging mode\n",

        iommu->flags |= VTD_FLAG_SVM_CAPABLE;
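/*
 * Issue a PASID-based IOTLB invalidation for @svm on @sdev, followed by a
 * device-TLB (ATS) invalidation when the device has one enabled.
 */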
static void __flush_svm_range_dev(struct intel_svm *svm,
                                  struct intel_svm_dev *sdev,
                                  unsigned long address,
                                  unsigned long pages, int ih)
                desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
                        QI_EIOTLB_DID(sdev->did) |
                        QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |

                int mask = ilog2(__roundup_pow_of_two(pages));

                desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
                        QI_EIOTLB_DID(sdev->did) |
                        QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
                desc.qw1 = QI_EIOTLB_ADDR(address) |

        qi_submit_sync(sdev->iommu, &desc, 1, 0);

        if (sdev->dev_iotlb) {
                desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) |
                        QI_DEV_EIOTLB_SID(sdev->sid) |
                        QI_DEV_EIOTLB_QDEP(sdev->qdep) |

                        desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) |
                } else if (pages > 1) {
                        /* The least significant zero bit indicates the size. So,
                         * for example, an "address" value of 0x12345f000 will
                         * flush from 0x123440000 to 0x12347ffff (256KiB). */
                        unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
                        unsigned long mask = __rounddown_pow_of_two(address ^ last);

                        desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) |
                                        (mask - 1)) | QI_DEV_EIOTLB_SIZE;

                        desc.qw1 = QI_DEV_EIOTLB_ADDR(address);

                qi_submit_sync(sdev->iommu, &desc, 1, 0);
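/*
 * Flush @pages worth of mappings starting at @address: carve the range into
 * power-of-two sized, naturally aligned chunks so that each call to
 * __flush_svm_range_dev() covers one aligned block.
 */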
static void intel_flush_svm_range_dev(struct intel_svm *svm,
                                      struct intel_svm_dev *sdev,
                                      unsigned long address,
                                      unsigned long pages, int ih)
        unsigned long shift = ilog2(__roundup_pow_of_two(pages));
        unsigned long align = (1ULL << (VTD_PAGE_SHIFT + shift));
        unsigned long start = ALIGN_DOWN(address, align);
        unsigned long end = ALIGN(address + (pages << VTD_PAGE_SHIFT), align);

        while (start < end) {
                __flush_svm_range_dev(svm, sdev, start, align >> VTD_PAGE_SHIFT, ih);

static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
                                  unsigned long pages, int ih)
        struct intel_svm_dev *sdev;

        list_for_each_entry_rcu(sdev, &svm->devs, list)
                intel_flush_svm_range_dev(svm, sdev, address, pages, ih);

/* Pages have been freed at this point */
static void intel_invalidate_range(struct mmu_notifier *mn,
                                   struct mm_struct *mm,
                                   unsigned long start, unsigned long end)
        struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

        intel_flush_svm_range(svm, start,
                              (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
        struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
        struct intel_svm_dev *sdev;

        /* This might end up being called from exit_mmap(), *before* the page
         * tables are cleared. And __mmu_notifier_release() will delete us from
         * the list of notifiers so that our invalidate_range() callback doesn't
         * get called when the page tables are cleared. So we need to protect
         * against hardware accessing those page tables.
         *
         * We do it by clearing the entry in the PASID table and then flushing
         * the IOTLB and the PASID table caches. This might upset hardware;
         * perhaps we'll want to point the PASID to a dummy PGD (like the zero
         * page) so that we end up taking a fault that the hardware really
         * *has* to handle gracefully without affecting other processes.
         */
        list_for_each_entry_rcu(sdev, &svm->devs, list)
                intel_pasid_tear_down_entry(sdev->iommu, sdev->dev,

static const struct mmu_notifier_ops intel_mmuops = {
        .release = intel_mm_release,
        .invalidate_range = intel_invalidate_range,
};

static DEFINE_MUTEX(pasid_mutex);
static LIST_HEAD(global_svm_list);
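/* Iterate over the devices bound to @svm, skipping entries that do not match @d. */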
#define for_each_svm_dev(sdev, svm, d)                  \
        list_for_each_entry((sdev), &(svm)->devs, list) \
                if ((d) != (sdev)->dev) {} else
static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
                             struct intel_svm **rsvm,
                             struct intel_svm_dev **rsdev)
        struct intel_svm_dev *d, *sdev = NULL;
        struct intel_svm *svm;

        /* The caller should hold the pasid_mutex lock */
        if (WARN_ON(!mutex_is_locked(&pasid_mutex)))

        if (pasid == INVALID_IOASID || pasid >= PASID_MAX)

        svm = ioasid_find(NULL, pasid, NULL);

        /*
         * If we found an svm for the PASID, there must be at least one device
         * bound to it.
         */
        if (WARN_ON(list_empty(&svm->devs)))

        list_for_each_entry_rcu(d, &svm->devs, list) {
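/*
 * Bind a guest PASID to @dev: program a nested translation entry whose
 * first level is the guest page table supplied by the caller (data->gpgd)
 * and whose second level comes from the host @domain.
 */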
int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
                          struct iommu_gpasid_bind_data *data)
        struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
        struct intel_svm_dev *sdev = NULL;
        struct dmar_domain *dmar_domain;
        struct device_domain_info *info;
        struct intel_svm *svm = NULL;
        unsigned long iflags;

        if (WARN_ON(!iommu) || !data)

        if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD)

        /* IOMMU core ensures argsz is more than the start of the union */
        if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd))

        /* Make sure no undefined flags are used in vendor data */
        if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1))

        if (!dev_is_pci(dev))

        /* VT-d supports devices with full 20 bit PASIDs only */
        if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)

        /*
         * We only check the host PASID range; we have no knowledge to check
         * the guest PASID range.
         */
        if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)

        info = get_domain_info(dev);

        dmar_domain = to_dmar_domain(domain);

        mutex_lock(&pasid_mutex);
        ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);

                /*
                 * Do not allow multiple bindings of the same device-PASID since
                 * there is only one set of second-level page tables per PASID.
                 * We may revisit this once sharing a PGD across domains is
                 * supported.
                 */
                dev_warn_ratelimited(dev, "Already bound with PASID %u\n",

                /* We come here when the PASID has never been bound to a device. */
                svm = kzalloc(sizeof(*svm), GFP_KERNEL);

                /* REVISIT: the upper layer/VFIO can track the host process that
                 * binds the PASID. ioasid_set = mm might be sufficient for VFIO
                 * to check PASID VMM ownership. We can drop the following line
                 * once the VFIO and IOASID set check is in place.
                 */
                svm->mm = get_task_mm(current);
                svm->pasid = data->hpasid;
                if (data->flags & IOMMU_SVA_GPASID_VAL) {
                        svm->gpasid = data->gpasid;
                        svm->flags |= SVM_FLAG_GUEST_PASID;

                ioasid_set_data(data->hpasid, svm);
                INIT_LIST_HEAD_RCU(&svm->devs);

        sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);

        sdev->sid = PCI_DEVID(info->bus, info->devfn);

        /* Only count users if device has aux domains */
        if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))

        /* Set up device context entry for PASID if not enabled already */
        ret = intel_iommu_enable_pasid(iommu, sdev->dev);
                dev_err_ratelimited(dev, "Failed to enable PASID capability\n");

        /*
         * The PASID table is per device for better security. Therefore, for
         * each bind of a new device even with an existing PASID, we need to
         * call the nested mode setup function here.
         */
        spin_lock_irqsave(&iommu->lock, iflags);
        ret = intel_pasid_setup_nested(iommu, dev,
                                       (pgd_t *)(uintptr_t)data->gpgd,
                                       data->hpasid, &data->vendor.vtd, dmar_domain,
        spin_unlock_irqrestore(&iommu->lock, iflags);
                dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
                /*
                 * The PASID entry should be in a cleared state if nested mode
                 * set-up failed, so we only need to clear the IOASID tracking
                 * data such that the free call will succeed.
                 */

        svm->flags |= SVM_FLAG_GUEST_MODE;

        init_rcu_head(&sdev->rcu);
        list_add_rcu(&sdev->list, &svm->devs);

        if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
                ioasid_set_data(data->hpasid, NULL);

        mutex_unlock(&pasid_mutex);
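/*
 * Undo intel_svm_bind_gpasid(): tear down the nested PASID entry for @dev,
 * drain pending page requests, and drop the svm bookkeeping once the last
 * bound device is gone.
 */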
int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
        struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
        struct intel_svm_dev *sdev;
        struct intel_svm *svm;

        mutex_lock(&pasid_mutex);
        ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);

                if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))

                list_del_rcu(&sdev->list);
                intel_pasid_tear_down_entry(iommu, dev,
                intel_svm_drain_prq(dev, svm->pasid);
                kfree_rcu(sdev, rcu);

                if (list_empty(&svm->devs)) {
                        /*
                         * We do not free the IOASID here because the IOMMU
                         * driver did not allocate it. Unlike native SVM, the
                         * IOASID for guest use was allocated prior to the
                         * bind call. In any case, if the free call comes
                         * before the unbind, the IOMMU driver will get
                         * notified and perform the cleanup.
                         */
                        ioasid_set_data(pasid, NULL);

        mutex_unlock(&pasid_mutex);
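/*
 * Runs via on_each_cpu_mask() on every CPU that may be running a task of the
 * target mm, so each of those CPUs can refresh its per-task PASID state.
 */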
static void _load_pasid(void *unused)

static void load_pasid(struct mm_struct *mm, u32 pasid)
        mutex_lock(&mm->context.lock);

        /* Synchronize with READ_ONCE in update_pasid(). */
        smp_store_release(&mm->pasid, pasid);

        /* Update PASID MSR on all CPUs running the mm's tasks. */
        on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);

        mutex_unlock(&mm->context.lock);
/* Caller must hold pasid_mutex, mm reference */
static int
intel_svm_bind_mm(struct device *dev, unsigned int flags,
                  struct svm_dev_ops *ops,
                  struct mm_struct *mm, struct intel_svm_dev **sd)
        struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
        struct device_domain_info *info;
        struct intel_svm_dev *sdev;
        struct intel_svm *svm = NULL;
        unsigned long iflags;

        if (!iommu || dmar_disabled)

        if (!intel_svm_capable(iommu))

        if (dev_is_pci(dev)) {
                pasid_max = pci_max_pasids(to_pci_dev(dev));

        /* A supervisor PASID bind should have mm == NULL */
        if (flags & SVM_FLAG_SUPERVISOR_MODE) {
                if (!ecap_srs(iommu->ecap) || mm) {
                        pr_err("Supervisor PASID with user provided mm.\n");

        if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
                list_for_each_entry(t, &global_svm_list, list) {
                        if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))

                        if (svm->pasid >= pasid_max) {
                                         "Limited PASID width. Cannot use existing PASID %d\n",

                        /* Find the matching device in the svm list */
                        for_each_svm_dev(sdev, svm, dev) {
                                if (sdev->ops != ops) {

        sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);

        ret = intel_iommu_enable_pasid(iommu, dev);

        info = get_domain_info(dev);
        sdev->did = FLPT_DEFAULT_DID;
        sdev->sid = PCI_DEVID(info->bus, info->devfn);
        if (info->ats_enabled) {
                sdev->qdep = info->ats_qdep;
                if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)

        /* Finish the setup now that we know we're keeping it */
        init_rcu_head(&sdev->rcu);

        svm = kzalloc(sizeof(*svm), GFP_KERNEL);

                if (pasid_max > intel_pasid_max_id)
                        pasid_max = intel_pasid_max_id;

                /* Do not use PASID 0; it is reserved for RID-to-PASID */
                svm->pasid = ioasid_alloc(NULL, PASID_MIN,
                if (svm->pasid == INVALID_IOASID) {

                svm->notifier.ops = &intel_mmuops;
                INIT_LIST_HEAD_RCU(&svm->devs);
                INIT_LIST_HEAD(&svm->list);

                        ret = mmu_notifier_register(&svm->notifier, mm);
                                ioasid_put(svm->pasid);

                spin_lock_irqsave(&iommu->lock, iflags);
                ret = intel_pasid_setup_first_level(iommu, dev,
                                mm ? mm->pgd : init_mm.pgd,
                                svm->pasid, FLPT_DEFAULT_DID,
                                (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
                                (cpu_feature_enabled(X86_FEATURE_LA57) ?
                                 PASID_FLAG_FL5LP : 0));
                spin_unlock_irqrestore(&iommu->lock, iflags);
                        mmu_notifier_unregister(&svm->notifier, mm);
                        ioasid_put(svm->pasid);

                list_add_tail(&svm->list, &global_svm_list);

                        /* The newly allocated pasid is loaded into the mm. */
                        load_pasid(mm, svm->pasid);

                /*
                 * Binding a new device to an existing PASID; we need to set up
                 * the PASID entry.
                 */
                spin_lock_irqsave(&iommu->lock, iflags);
                ret = intel_pasid_setup_first_level(iommu, dev,
                                mm ? mm->pgd : init_mm.pgd,
                                svm->pasid, FLPT_DEFAULT_DID,
                                (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
                                (cpu_feature_enabled(X86_FEATURE_LA57) ?
                                 PASID_FLAG_FL5LP : 0));
                spin_unlock_irqrestore(&iommu->lock, iflags);

        list_add_rcu(&sdev->list, &svm->devs);

        sdev->pasid = svm->pasid;
/* Caller must hold pasid_mutex */
static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
        struct intel_svm_dev *sdev;
        struct intel_iommu *iommu;
        struct intel_svm *svm;

        iommu = device_to_iommu(dev, NULL, NULL);

        ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);

                list_del_rcu(&sdev->list);
                /* Flush the PASID cache and IOTLB for this device.
                 * Note that we do depend on the hardware *not* using
                 * the PASID any more. Just as we depend on other
                 * devices never using PASIDs that they have no right
                 * to use. We have a *shared* PASID table, because it's
                 * large and has to be physically contiguous. So it's
                 * hard to be as defensive as we might like. */
                intel_pasid_tear_down_entry(iommu, dev,
                intel_svm_drain_prq(dev, svm->pasid);
                kfree_rcu(sdev, rcu);

                if (list_empty(&svm->devs)) {
                        ioasid_put(svm->pasid);
                                mmu_notifier_unregister(&svm->notifier, svm->mm);
                                /* Clear mm's pasid. */
                                load_pasid(svm->mm, PASID_DISABLED);

                        list_del(&svm->list);
                        /* We mandate that no page faults may be outstanding
                         * for the PASID when intel_svm_unbind_mm() is called.
                         * If that is not obeyed, subtle errors will happen.
                         * Let's make them less subtle... */
                        memset(svm, 0x6b, sizeof(*svm));

/* Page request queue descriptor */
struct page_req_dsc {
        u64 priv_data_present:1;

#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)
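/* Check whether the access requested by a page request is permitted by the VMA. */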
static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
        unsigned long requested = 0;

                requested |= VM_EXEC;

                requested |= VM_READ;

                requested |= VM_WRITE;

        return (requested & ~vma->vm_flags) != 0;
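/* A canonical address sign-extends bit __VIRTUAL_MASK_SHIFT into all of the higher bits. */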
static bool is_canonical_address(u64 addr)
        int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
        long saddr = (long) addr;

        return (((saddr << shift) >> shift) == saddr);
/**
 * intel_svm_drain_prq - Drain page requests and responses for a pasid
 * @dev: target device
 * @pasid: pasid for draining
 *
 * Drain all pending page requests and responses related to @pasid in both
 * software and hardware. This is supposed to be called after the device
 * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
 * and DevTLB have been invalidated.
 *
 * It waits until all pending page requests for @pasid in the page fault
 * queue are completed by the prq handling thread. Then it follows the steps
 * described in VT-d spec CH7.10 to drain all page requests and page
 * responses pending in the hardware.
 */
static void intel_svm_drain_prq(struct device *dev, u32 pasid)
        struct device_domain_info *info;
        struct dmar_domain *domain;
        struct intel_iommu *iommu;
        struct qi_desc desc[3];
        struct pci_dev *pdev;

        info = get_domain_info(dev);
        if (WARN_ON(!info || !dev_is_pci(dev)))

        if (!info->pri_enabled)

        domain = info->domain;
        pdev = to_pci_dev(dev);
        sid = PCI_DEVID(info->bus, info->devfn);
        did = domain->iommu_did[iommu->seq_id];
        qdep = pci_ats_queue_depth(pdev);

        /*
         * Check and wait until all pending page requests in the queue are
         * handled by the prq handling thread.
         */
        reinit_completion(&iommu->prq_complete);
        tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
        head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
        while (head != tail) {
                struct page_req_dsc *req;

                req = &iommu->prq[head / sizeof(*req)];
                if (!req->pasid_present || req->pasid != pasid) {
                        head = (head + sizeof(*req)) & PRQ_RING_MASK;

                wait_for_completion(&iommu->prq_complete);

        /*
         * Perform steps described in VT-d spec CH7.10 to drain page
         * requests and responses in hardware.
         */
        memset(desc, 0, sizeof(desc));
        desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
        desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
                      QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
        desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
                      QI_DEV_EIOTLB_SID(sid) |
                      QI_DEV_EIOTLB_QDEP(qdep) |
                      QI_DEV_IOTLB_PFSID(info->pfsid);

        reinit_completion(&iommu->prq_complete);
        qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
        if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
                wait_for_completion(&iommu->prq_complete);
static int prq_to_iommu_prot(struct page_req_dsc *req)
                prot |= IOMMU_FAULT_PERM_READ;
                prot |= IOMMU_FAULT_PERM_WRITE;
                prot |= IOMMU_FAULT_PERM_EXEC;
                prot |= IOMMU_FAULT_PERM_PRIV;
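/*
 * Convert a page request descriptor into a generic iommu_fault_event and
 * hand it to the IOMMU fault reporting framework.
 */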
static int
intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
        struct iommu_fault_event event;

        if (!dev || !dev_is_pci(dev))

        /* Fill in event data for device specific processing */
        memset(&event, 0, sizeof(struct iommu_fault_event));
        event.fault.type = IOMMU_FAULT_PAGE_REQ;
        event.fault.prm.addr = desc->addr;
        event.fault.prm.pasid = desc->pasid;
        event.fault.prm.grpid = desc->prg_index;
        event.fault.prm.perm = prq_to_iommu_prot(desc);

                event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
        if (desc->pasid_present) {
                event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
                event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
        if (desc->priv_data_present) {
                /*
                 * Set the last-page-in-group bit if private data is present:
                 * a page response is then required, just as it is for LPIG.
                 * iommu_report_device_fault() doesn't understand this vendor
                 * specific requirement, thus we set last_page as a workaround.
                 */
                event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
                event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
                memcpy(event.fault.prm.private_data, desc->priv_data,
                       sizeof(desc->priv_data));

        return iommu_report_device_fault(dev, &event);
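/*
 * Threaded interrupt handler for the page request queue: walk the ring,
 * resolve each request either by reporting it to the registered fault
 * consumer (guest mode) or by calling handle_mm_fault(), and post the
 * required page group responses.
 */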
static irqreturn_t prq_event_thread(int irq, void *d)
        struct intel_svm_dev *sdev = NULL;
        struct intel_iommu *iommu = d;
        struct intel_svm *svm = NULL;
        int head, tail, handled = 0;

        /* Clear PPR bit before reading head/tail registers, to
         * ensure that we get a new interrupt if needed. */
        writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);

        tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
        head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
        while (head != tail) {
                struct vm_area_struct *vma;
                struct page_req_dsc *req;

                req = &iommu->prq[head / sizeof(*req)];

                result = QI_RESP_FAILURE;
                address = (u64)req->addr << VTD_PAGE_SHIFT;
                if (!req->pasid_present) {
                        pr_err("%s: Page request without PASID: %08llx %08llx\n",
                               iommu->name, ((unsigned long long *)req)[0],
                               ((unsigned long long *)req)[1]);

                if (!svm || svm->pasid != req->pasid) {
                        svm = ioasid_find(NULL, req->pasid, NULL);
                        /* It *can't* go away, because the driver is not permitted
                         * to unbind the mm while any page faults are outstanding.
                         * So we only need RCU to protect the internal idr code. */
                        if (IS_ERR_OR_NULL(svm)) {
                                pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
                                       iommu->name, req->pasid, ((unsigned long long *)req)[0],
                                       ((unsigned long long *)req)[1]);

                if (!sdev || sdev->sid != req->rid) {
                        struct intel_svm_dev *t;

                        list_for_each_entry_rcu(t, &svm->devs, list) {
                                if (t->sid == req->rid) {
                result = QI_RESP_INVALID;
                /* Since we're using init_mm.pgd directly, we should never take
                 * any faults on kernel addresses. */

                /* If address is not canonical, return invalid response */
                if (!is_canonical_address(address))

                /*
                 * If the prq is to be handled outside the iommu driver by the
                 * receiver of the fault notification, skip the page response here.
                 */
                if (svm->flags & SVM_FLAG_GUEST_MODE) {
                        if (sdev && !intel_svm_prq_report(sdev->dev, req))

                /* If the mm is already defunct, don't handle faults. */
                if (!mmget_not_zero(svm->mm))

                mmap_read_lock(svm->mm);
                vma = find_extend_vma(svm->mm, address);
                if (!vma || address < vma->vm_start)

                if (access_error(vma, req))

                ret = handle_mm_fault(vma, address,
                                      req->wr_req ? FAULT_FLAG_WRITE : 0,
                if (ret & VM_FAULT_ERROR)

                result = QI_RESP_SUCCESS;

                mmap_read_unlock(svm->mm);

                if (sdev && sdev->ops && sdev->ops->fault_cb) {
                        int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
                                   (req->exe_req << 1) | (req->pm_req);
                        sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
                                            req->priv_data, rwxp, result);
                /* We get here in the error case where the PASID lookup failed,
                   and these can be NULL. Do not use them below this point! */

                if (req->lpig || req->priv_data_present) {
                        /*
                         * Per VT-d spec. v3.0 ch7.7, system software must
                         * respond with page group response if private data
                         * is present (PDP) or last page in group (LPIG) bit
                         * is set. This is an additional VT-d feature beyond
                         * the PCI ATS spec.
                         */
                        resp.qw0 = QI_PGRP_PASID(req->pasid) |
                                QI_PGRP_DID(req->rid) |
                                QI_PGRP_PASID_P(req->pasid_present) |
                                QI_PGRP_PDP(req->priv_data_present) |
                                QI_PGRP_RESP_CODE(result) |
                        resp.qw1 = QI_PGRP_IDX(req->prg_index) |
                                QI_PGRP_LPIG(req->lpig);

                        if (req->priv_data_present)
                                memcpy(&resp.qw2, req->priv_data,
                                       sizeof(req->priv_data));

                        qi_submit_sync(iommu, &resp, 1, 0);

                head = (head + sizeof(*req)) & PRQ_RING_MASK;

        dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);

        /*
         * Clear the page request overflow bit and wake up all threads that
         * are waiting for the completion of this handling.
         */
        if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO)
                writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);

        if (!completion_done(&iommu->prq_complete))
                complete(&iommu->prq_complete);

        return IRQ_RETVAL(handled);
#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)

struct iommu_sva *
intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
        struct iommu_sva *sva = ERR_PTR(-EINVAL);
        struct intel_svm_dev *sdev = NULL;
        unsigned int flags = 0;

        /*
         * TODO: Consolidate with generic iommu-sva bind after it is merged.
         * It will require shared SVM data structures, i.e. combine io_mm
         * and intel_svm etc.
         */
                flags = *(unsigned int *)drvdata;
        mutex_lock(&pasid_mutex);
        ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);

        WARN(!sdev, "SVM bind succeeded with no sdev!\n");

        mutex_unlock(&pasid_mutex);
void intel_svm_unbind(struct iommu_sva *sva)
        struct intel_svm_dev *sdev;

        mutex_lock(&pasid_mutex);
        sdev = to_intel_svm_dev(sva);
        intel_svm_unbind_mm(sdev->dev, sdev->pasid);
        mutex_unlock(&pasid_mutex);

u32 intel_svm_get_pasid(struct iommu_sva *sva)
        struct intel_svm_dev *sdev;

        mutex_lock(&pasid_mutex);
        sdev = to_intel_svm_dev(sva);
        pasid = sdev->pasid;
        mutex_unlock(&pasid_mutex);
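/*
 * Typical consumer flow (a sketch, not part of this file): a device driver
 * reaches the helpers above through the generic SVA API, e.g.
 *
 *	struct iommu_sva *handle = iommu_sva_bind_device(dev, current->mm, NULL);
 *	u32 pasid = iommu_sva_get_pasid(handle);
 *	... program the device to tag its DMA with the pasid, run the workload ...
 *	iommu_sva_unbind_device(handle);
 *
 * Error handling (IS_ERR(handle), invalid-PASID checks) is omitted above.
 */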
int intel_svm_page_response(struct device *dev,
                            struct iommu_fault_event *evt,
                            struct iommu_page_response *msg)
        struct iommu_fault_page_request *prm;
        struct intel_svm_dev *sdev = NULL;
        struct intel_svm *svm = NULL;
        struct intel_iommu *iommu;
        bool private_present;

        if (!dev || !dev_is_pci(dev))

        iommu = device_to_iommu(dev, &bus, &devfn);

        mutex_lock(&pasid_mutex);

        prm = &evt->fault.prm;
        sid = PCI_DEVID(bus, devfn);
        pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
        private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
        last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;

        if (!pasid_present) {

        if (prm->pasid == 0 || prm->pasid >= PASID_MAX) {

        ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);

        /*
         * For responses from userspace, we need to make sure that the
         * pasid has been bound to its mm.
         */
        if (svm->flags & SVM_FLAG_GUEST_MODE) {
                struct mm_struct *mm;

                mm = get_task_mm(current);

                if (mm != svm->mm) {

        /*
         * Per VT-d spec. v3.0 ch7.7, system software must respond
         * with page group response if private data is present (PDP)
         * or last page in group (LPIG) bit is set. This is an
         * additional VT-d requirement beyond PCI ATS spec.
         */
        if (last_page || private_present) {
                struct qi_desc desc;

                desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
                                QI_PGRP_PASID_P(pasid_present) |
                                QI_PGRP_PDP(private_present) |
                                QI_PGRP_RESP_CODE(msg->code) |
                desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);

                if (private_present)
                        memcpy(&desc.qw2, prm->private_data,
                               sizeof(prm->private_data));

                qi_submit_sync(iommu, &desc, 1, 0);

        mutex_unlock(&pasid_mutex);