1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dma-iommu.h>
19 #include <linux/dmi.h>
20 #include <linux/intel-iommu.h>
21 #include <linux/intel-svm.h>
22 #include <linux/memory.h>
23 #include <linux/pci.h>
24 #include <linux/pci-ats.h>
25 #include <linux/spinlock.h>
26 #include <linux/syscore_ops.h>
27 #include <linux/tboot.h>
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
32 #include "cap_audit.h"
34 #define ROOT_SIZE VTD_PAGE_SIZE
35 #define CONTEXT_SIZE VTD_PAGE_SIZE
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 #define IOAPIC_RANGE_START (0xfee00000)
43 #define IOAPIC_RANGE_END (0xfeefffff)
44 #define IOVA_START_ADDR (0x1000)
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
51 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
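/*
 * Worked example of the limits above, assuming VTD_PAGE_SHIFT == 12:
 * with gaw == 48, __DOMAIN_MAX_PFN(48) == 2^36 - 1 and
 * DOMAIN_MAX_ADDR(48) == (2^36 - 1) << 12 == 2^48 - 4096.  On a 64-bit
 * kernel DOMAIN_MAX_PFN() never actually clamps; the min_t() against
 * (unsigned long)-1 only matters when unsigned long is 32 bits wide.
 */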
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN (1)
63 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
65 /* page table handling */
66 #define LEVEL_STRIDE (9)
67 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
69 static inline int agaw_to_level(int agaw)
74 static inline int agaw_to_width(int agaw)
76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
79 static inline int width_to_agaw(int width)
81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
84 static inline unsigned int level_to_offset_bits(int level)
86 return (level - 1) * LEVEL_STRIDE;
89 static inline int pfn_level_offset(u64 pfn, int level)
91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
94 static inline u64 level_mask(int level)
96 return -1ULL << level_to_offset_bits(level);
99 static inline u64 level_size(int level)
101 return 1ULL << level_to_offset_bits(level);
104 static inline u64 align_to_level(u64 pfn, int level)
106 return (pfn + level_size(level) - 1) & level_mask(level);
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
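/*
 * Rough correspondence implied by the helpers above (9 bits per level,
 * 4KiB leaf pages): agaw 1 -> 39-bit width, 3-level table; agaw 2 ->
 * 48-bit width, 4-level table; agaw 3 -> 57-bit width, 5-level table.
 * E.g. level_to_offset_bits(2) == 9, so level_size(2) == 512 pages and
 * lvl_to_nr_pages(2) == 512 (a 2MiB region with 4KiB pages).
 */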
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115 are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
122 return mm_to_dma_pfn(page_to_pfn(pg));
124 static inline unsigned long virt_to_dma_pfn(void *p)
126 return page_to_dma_pfn(virt_to_page(p));
129 /* global iommu list, set NULL for ignored DMAR units */
130 static struct intel_iommu **g_iommus;
132 static void __init check_tylersburg_isoch(void);
133 static int rwbf_quirk;
134 static inline struct device_domain_info *
135 dmar_search_domain_by_dev_info(int segment, int bus, int devfn);
138 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
139 * (used when the kernel is launched with TXT)
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
148 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
156 return re->lo & VTD_PAGE_MASK;
160 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
168 return re->hi & VTD_PAGE_MASK;
171 static inline void context_clear_pasid_enable(struct context_entry *context)
173 context->lo &= ~(1ULL << 11);
176 static inline bool context_pasid_enabled(struct context_entry *context)
178 return !!(context->lo & (1ULL << 11));
181 static inline void context_set_copied(struct context_entry *context)
183 context->hi |= (1ull << 3);
186 static inline bool context_copied(struct context_entry *context)
188 return !!(context->hi & (1ULL << 3));
191 static inline bool __context_present(struct context_entry *context)
193 return (context->lo & 1);
196 bool context_present(struct context_entry *context)
198 return context_pasid_enabled(context) ?
199 __context_present(context) :
200 __context_present(context) && !context_copied(context);
203 static inline void context_set_present(struct context_entry *context)
208 static inline void context_set_fault_enable(struct context_entry *context)
210 context->lo &= (((u64)-1) << 2) | 1;
213 static inline void context_set_translation_type(struct context_entry *context,
216 context->lo &= (((u64)-1) << 4) | 3;
217 context->lo |= (value & 3) << 2;
220 static inline void context_set_address_root(struct context_entry *context,
223 context->lo &= ~VTD_PAGE_MASK;
224 context->lo |= value & VTD_PAGE_MASK;
227 static inline void context_set_address_width(struct context_entry *context,
230 context->hi |= value & 7;
233 static inline void context_set_domain_id(struct context_entry *context,
236 context->hi |= (value & ((1 << 16) - 1)) << 8;
239 static inline int context_domain_id(struct context_entry *c)
241 return((c->hi >> 8) & 0xffff);
244 static inline void context_clear_entry(struct context_entry *context)
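/*
 * For reference, the legacy (non-scalable) context entry bits that the
 * helpers above manipulate:
 *   lo: bit 0      - present
 *       bits 3:2   - translation type
 *       bits 63:12 - second-level page table pointer (ASR)
 *   hi: bits 2:0   - address width (AGAW)
 *       bits 23:8  - domain id
 * Bit 11 of lo (PASID enable) and bit 3 of hi (copied) are consulted by
 * context_present() to handle entries inherited from a previous kernel
 * during kdump.
 */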
251 * This domain is a static identity mapping domain.
252 * 1. This domain creates a static 1:1 mapping to all usable memory.
253 * 2. It maps to each iommu if successful.
254 * 3. Each iommu maps to this domain if successful.
256 static struct dmar_domain *si_domain;
257 static int hw_pass_through = 1;
259 #define for_each_domain_iommu(idx, domain) \
260 for (idx = 0; idx < g_num_of_iommus; idx++) \
261 if (domain->iommu_refcnt[idx])
263 struct dmar_rmrr_unit {
264 struct list_head list; /* list of rmrr units */
265 struct acpi_dmar_header *hdr; /* ACPI header */
266 u64 base_address; /* reserved base address*/
267 u64 end_address; /* reserved end address */
268 struct dmar_dev_scope *devices; /* target devices */
269 int devices_cnt; /* target device count */
272 struct dmar_atsr_unit {
273 struct list_head list; /* list of ATSR units */
274 struct acpi_dmar_header *hdr; /* ACPI header */
275 struct dmar_dev_scope *devices; /* target devices */
276 int devices_cnt; /* target device count */
277 u8 include_all:1; /* include all ports */
280 struct dmar_satc_unit {
281 struct list_head list; /* list of SATC units */
282 struct acpi_dmar_header *hdr; /* ACPI header */
283 struct dmar_dev_scope *devices; /* target devices */
284 struct intel_iommu *iommu; /* the corresponding iommu */
285 int devices_cnt; /* target device count */
286 u8 atc_required:1; /* ATS is required */
289 static LIST_HEAD(dmar_atsr_units);
290 static LIST_HEAD(dmar_rmrr_units);
291 static LIST_HEAD(dmar_satc_units);
293 #define for_each_rmrr_units(rmrr) \
294 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
296 /* bitmap for indexing intel_iommus */
297 static int g_num_of_iommus;
299 static void domain_remove_dev_info(struct dmar_domain *domain);
300 static void dmar_remove_one_dev_info(struct device *dev);
301 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
303 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
304 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
306 int intel_iommu_enabled = 0;
307 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
309 static int dmar_map_gfx = 1;
310 static int intel_iommu_superpage = 1;
311 static int iommu_identity_mapping;
312 static int iommu_skip_te_disable;
314 #define IDENTMAP_GFX 2
315 #define IDENTMAP_AZALIA 4
317 int intel_iommu_gfx_mapped;
318 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
320 DEFINE_SPINLOCK(device_domain_lock);
321 static LIST_HEAD(device_domain_list);
324 * Iterate over elements in device_domain_list and call the specified
325 * callback @fn against each element.
327 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
328 void *data), void *data)
332 struct device_domain_info *info;
334 spin_lock_irqsave(&device_domain_lock, flags);
335 list_for_each_entry(info, &device_domain_list, global) {
336 ret = fn(info, data);
338 spin_unlock_irqrestore(&device_domain_lock, flags);
342 spin_unlock_irqrestore(&device_domain_lock, flags);
347 const struct iommu_ops intel_iommu_ops;
349 static bool translation_pre_enabled(struct intel_iommu *iommu)
351 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
354 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
356 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
359 static void init_translation_status(struct intel_iommu *iommu)
363 gsts = readl(iommu->reg + DMAR_GSTS_REG);
364 if (gsts & DMA_GSTS_TES)
365 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
368 static int __init intel_iommu_setup(char *str)
374 if (!strncmp(str, "on", 2)) {
376 pr_info("IOMMU enabled\n");
377 } else if (!strncmp(str, "off", 3)) {
379 no_platform_optin = 1;
380 pr_info("IOMMU disabled\n");
381 } else if (!strncmp(str, "igfx_off", 8)) {
383 pr_info("Disable GFX device mapping\n");
384 } else if (!strncmp(str, "forcedac", 8)) {
385 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
386 iommu_dma_forcedac = true;
387 } else if (!strncmp(str, "strict", 6)) {
388 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
389 iommu_set_dma_strict();
390 } else if (!strncmp(str, "sp_off", 6)) {
391 pr_info("Disable supported super page\n");
392 intel_iommu_superpage = 0;
393 } else if (!strncmp(str, "sm_on", 5)) {
394 pr_info("Enable scalable mode if hardware supports\n");
396 } else if (!strncmp(str, "sm_off", 6)) {
397 pr_info("Scalable mode is disallowed\n");
399 } else if (!strncmp(str, "tboot_noforce", 13)) {
400 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
401 intel_iommu_tboot_noforce = 1;
403 pr_notice("Unknown option - '%s'\n", str);
406 str += strcspn(str, ",");
413 __setup("intel_iommu=", intel_iommu_setup);
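/*
 * Example kernel command lines accepted by the parser above (options may
 * be comma-separated):
 *   intel_iommu=on          - force-enable the IOMMU
 *   intel_iommu=off         - disable it and opt out of platform opt-in
 *   intel_iommu=on,sm_on    - enable, and use scalable mode if supported
 *   intel_iommu=on,igfx_off - enable, but leave the GFX device unmapped
 */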
415 void *alloc_pgtable_page(int node)
420 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
422 vaddr = page_address(page);
426 void free_pgtable_page(void *vaddr)
428 free_page((unsigned long)vaddr);
431 static inline int domain_type_is_si(struct dmar_domain *domain)
433 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
436 static inline bool domain_use_first_level(struct dmar_domain *domain)
438 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
441 static inline int domain_pfn_supported(struct dmar_domain *domain,
444 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
446 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
449 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
454 sagaw = cap_sagaw(iommu->cap);
455 for (agaw = width_to_agaw(max_gaw);
457 if (test_bit(agaw, &sagaw))
465 * Calculate max SAGAW for each iommu.
467 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
469 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
473 * Calculate the agaw for each iommu.
474 * "SAGAW" may be different across iommus; use a default agaw, and
475 * fall back to a smaller supported agaw for iommus that don't support the default one.
477 int iommu_calculate_agaw(struct intel_iommu *iommu)
479 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
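/*
 * Example: DEFAULT_DOMAIN_ADDRESS_WIDTH is 57, i.e. width_to_agaw(57) == 3
 * (5-level).  If an IOMMU's SAGAW field only advertises 4-level support
 * (bit 2 set), __iommu_calculate_agaw() walks down from agaw 3 and returns
 * agaw 2, giving the domain a 48-bit address width.
 */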
482 /* This function only returns a single iommu in a domain */
483 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
487 /* si_domain and vm domain should not get here. */
488 if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
491 for_each_domain_iommu(iommu_id, domain)
494 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
497 return g_iommus[iommu_id];
500 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
502 return sm_supported(iommu) ?
503 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
506 static void domain_update_iommu_coherency(struct dmar_domain *domain)
508 struct dmar_drhd_unit *drhd;
509 struct intel_iommu *iommu;
513 domain->iommu_coherency = true;
515 for_each_domain_iommu(i, domain) {
517 if (!iommu_paging_structure_coherency(g_iommus[i])) {
518 domain->iommu_coherency = false;
525 /* No hardware attached; use lowest common denominator */
527 for_each_active_iommu(iommu, drhd) {
528 if (!iommu_paging_structure_coherency(iommu)) {
529 domain->iommu_coherency = false;
536 static bool domain_update_iommu_snooping(struct intel_iommu *skip)
538 struct dmar_drhd_unit *drhd;
539 struct intel_iommu *iommu;
543 for_each_active_iommu(iommu, drhd) {
546 * If the hardware is operating in scalable mode,
547 * snooping control is always supported since we
548 * always set the PASID-table-entry.PGSNP bit if the domain
549 * is managed outside (UNMANAGED).
551 if (!sm_supported(iommu) &&
552 !ecap_sc_support(iommu->ecap)) {
563 static int domain_update_iommu_superpage(struct dmar_domain *domain,
564 struct intel_iommu *skip)
566 struct dmar_drhd_unit *drhd;
567 struct intel_iommu *iommu;
570 if (!intel_iommu_superpage)
573 /* set iommu_superpage to the smallest common denominator */
575 for_each_active_iommu(iommu, drhd) {
577 if (domain && domain_use_first_level(domain)) {
578 if (!cap_fl1gp_support(iommu->cap))
581 mask &= cap_super_page_val(iommu->cap);
593 static int domain_update_device_node(struct dmar_domain *domain)
595 struct device_domain_info *info;
596 int nid = NUMA_NO_NODE;
598 assert_spin_locked(&device_domain_lock);
600 if (list_empty(&domain->devices))
603 list_for_each_entry(info, &domain->devices, link) {
608 * There may be multiple device NUMA nodes, as devices within
609 * the same domain may sit behind different IOMMUs. There is no
610 * perfect answer in such a situation, so we use a first-come,
611 * first-served policy.
613 nid = dev_to_node(info->dev);
614 if (nid != NUMA_NO_NODE)
621 static void domain_update_iotlb(struct dmar_domain *domain);
623 /* Return the super pagesize bitmap if supported. */
624 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
626 unsigned long bitmap = 0;
629 * A 1-level super page supports a page size of 2MiB; a 2-level super
630 * page supports page sizes of both 2MiB and 1GiB.
632 if (domain->iommu_superpage == 1)
634 else if (domain->iommu_superpage == 2)
635 bitmap |= SZ_2M | SZ_1G;
640 /* Some capabilities may be different across iommus */
641 static void domain_update_iommu_cap(struct dmar_domain *domain)
643 domain_update_iommu_coherency(domain);
644 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
645 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
648 * If RHSA is missing, we should default to the device numa domain as well.
651 if (domain->nid == NUMA_NO_NODE)
652 domain->nid = domain_update_device_node(domain);
655 * First-level translation restricts the input-address to a
656 * canonical address (i.e., address bits 63:N have the same
657 * value as address bit [N-1], where N is 48 with 4-level
658 * paging and 57 with 5-level paging). Hence, skip bit [N-1].
661 if (domain_use_first_level(domain))
662 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
664 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
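/*
 * E.g. with gaw == 48 and first-level translation the usable IOVA space
 * is the lower canonical half only, so aperture_end becomes
 * __DOMAIN_MAX_ADDR(47) == 2^47 - 1 instead of 2^48 - 1.
 */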
666 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
667 domain_update_iotlb(domain);
670 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
673 struct root_entry *root = &iommu->root_entry[bus];
674 struct context_entry *context;
678 if (sm_supported(iommu)) {
686 context = phys_to_virt(*entry & VTD_PAGE_MASK);
688 unsigned long phy_addr;
692 context = alloc_pgtable_page(iommu->node);
696 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
697 phy_addr = virt_to_phys((void *)context);
698 *entry = phy_addr | 1;
699 __iommu_flush_cache(iommu, entry, sizeof(*entry));
701 return &context[devfn];
705 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
706 * sub-hierarchy of a candidate PCI-PCI bridge
707 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
708 * @bridge: the candidate PCI-PCI bridge
710 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
713 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
715 struct pci_dev *pdev, *pbridge;
717 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
720 pdev = to_pci_dev(dev);
721 pbridge = to_pci_dev(bridge);
723 if (pbridge->subordinate &&
724 pbridge->subordinate->number <= pdev->bus->number &&
725 pbridge->subordinate->busn_res.end >= pdev->bus->number)
731 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
733 struct dmar_drhd_unit *drhd;
737 /* We know that this device on this chipset has its own IOMMU.
738 * If we find it under a different IOMMU, then the BIOS is lying
739 * to us. Hope that the IOMMU for this device is actually
740 * disabled, and it needs no translation...
742 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
745 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
750 /* we know that this iommu should be at offset 0xa000 from vtbar */
751 drhd = dmar_find_matched_drhd_unit(pdev);
752 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
753 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
754 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
761 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
763 if (!iommu || iommu->drhd->ignored)
766 if (dev_is_pci(dev)) {
767 struct pci_dev *pdev = to_pci_dev(dev);
769 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
770 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
771 quirk_ioat_snb_local_iommu(pdev))
778 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
780 struct dmar_drhd_unit *drhd = NULL;
781 struct pci_dev *pdev = NULL;
782 struct intel_iommu *iommu;
790 if (dev_is_pci(dev)) {
791 struct pci_dev *pf_pdev;
793 pdev = pci_real_dma_dev(to_pci_dev(dev));
795 /* VFs aren't listed in scope tables; we need to look up
796 * the PF instead to find the IOMMU. */
797 pf_pdev = pci_physfn(pdev);
799 segment = pci_domain_nr(pdev->bus);
800 } else if (has_acpi_companion(dev))
801 dev = &ACPI_COMPANION(dev)->dev;
804 for_each_iommu(iommu, drhd) {
805 if (pdev && segment != drhd->segment)
808 for_each_active_dev_scope(drhd->devices,
809 drhd->devices_cnt, i, tmp) {
811 /* For a VF use its original BDF# not that of the PF
812 * which we used for the IOMMU lookup. Strictly speaking
813 * we could do this for all PCI devices; we only need to
814 * get the BDF# from the scope table for ACPI matches. */
815 if (pdev && pdev->is_virtfn)
819 *bus = drhd->devices[i].bus;
820 *devfn = drhd->devices[i].devfn;
825 if (is_downstream_to_pci_bridge(dev, tmp))
829 if (pdev && drhd->include_all) {
832 *bus = pdev->bus->number;
833 *devfn = pdev->devfn;
840 if (iommu_is_dummy(iommu, dev))
848 static void domain_flush_cache(struct dmar_domain *domain,
849 void *addr, int size)
851 if (!domain->iommu_coherency)
852 clflush_cache_range(addr, size);
855 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
857 struct context_entry *context;
861 spin_lock_irqsave(&iommu->lock, flags);
862 context = iommu_context_addr(iommu, bus, devfn, 0);
864 ret = context_present(context);
865 spin_unlock_irqrestore(&iommu->lock, flags);
869 static void free_context_table(struct intel_iommu *iommu)
873 struct context_entry *context;
875 spin_lock_irqsave(&iommu->lock, flags);
876 if (!iommu->root_entry) {
879 for (i = 0; i < ROOT_ENTRY_NR; i++) {
880 context = iommu_context_addr(iommu, i, 0, 0);
882 free_pgtable_page(context);
884 if (!sm_supported(iommu))
887 context = iommu_context_addr(iommu, i, 0x80, 0);
889 free_pgtable_page(context);
892 free_pgtable_page(iommu->root_entry);
893 iommu->root_entry = NULL;
895 spin_unlock_irqrestore(&iommu->lock, flags);
898 #ifdef CONFIG_DMAR_DEBUG
899 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
901 struct device_domain_info *info;
902 struct dma_pte *parent, *pte;
903 struct dmar_domain *domain;
906 info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
907 if (!info || !info->domain) {
908 pr_info("device [%02x:%02x.%d] not probed\n",
909 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
913 domain = info->domain;
914 level = agaw_to_level(domain->agaw);
915 parent = domain->pgd;
917 pr_info("no page table setup\n");
922 offset = pfn_level_offset(pfn, level);
923 pte = &parent[offset];
924 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
925 pr_info("PTE not present at level %d\n", level);
929 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
934 parent = phys_to_virt(dma_pte_addr(pte));
939 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
940 unsigned long long addr, u32 pasid)
942 struct pasid_dir_entry *dir, *pde;
943 struct pasid_entry *entries, *pte;
944 struct context_entry *ctx_entry;
945 struct root_entry *rt_entry;
946 u8 devfn = source_id & 0xff;
947 u8 bus = source_id >> 8;
948 int i, dir_index, index;
950 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
952 /* root entry dump */
953 rt_entry = &iommu->root_entry[bus];
955 pr_info("root table entry is not present\n");
959 if (sm_supported(iommu))
960 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
961 rt_entry->hi, rt_entry->lo);
963 pr_info("root entry: 0x%016llx", rt_entry->lo);
965 /* context entry dump */
966 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
968 pr_info("context table entry is not present\n");
972 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
973 ctx_entry->hi, ctx_entry->lo);
975 /* legacy mode does not require PASID entries */
976 if (!sm_supported(iommu))
979 /* get the pointer to pasid directory entry */
980 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
982 pr_info("pasid directory entry is not present\n");
985 /* For request-without-pasid, get the pasid from context entry */
986 if (intel_iommu_sm && pasid == INVALID_IOASID)
987 pasid = PASID_RID2PASID;
989 dir_index = pasid >> PASID_PDE_SHIFT;
990 pde = &dir[dir_index];
991 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
993 /* get the pointer to the pasid table entry */
994 entries = get_pasid_table_from_pde(pde);
996 pr_info("pasid table entry is not present\n");
999 index = pasid & PASID_PTE_MASK;
1000 pte = &entries[index];
1001 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
1002 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
1005 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
1009 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1010 unsigned long pfn, int *target_level)
1012 struct dma_pte *parent, *pte;
1013 int level = agaw_to_level(domain->agaw);
1016 BUG_ON(!domain->pgd);
1018 if (!domain_pfn_supported(domain, pfn))
1019 /* Address beyond IOMMU's addressing capabilities. */
1022 parent = domain->pgd;
1027 offset = pfn_level_offset(pfn, level);
1028 pte = &parent[offset];
1029 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1031 if (level == *target_level)
1034 if (!dma_pte_present(pte)) {
1037 tmp_page = alloc_pgtable_page(domain->nid);
1042 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1043 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1044 if (domain_use_first_level(domain)) {
1045 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1046 if (iommu_is_dma_domain(&domain->domain))
1047 pteval |= DMA_FL_PTE_ACCESS;
1049 if (cmpxchg64(&pte->val, 0ULL, pteval))
1050 /* Someone else set it while we were thinking; use theirs. */
1051 free_pgtable_page(tmp_page);
1053 domain_flush_cache(domain, pte, sizeof(*pte));
1058 parent = phys_to_virt(dma_pte_addr(pte));
1063 *target_level = level;
1068 /* return the address's pte at a specific level */
1069 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1071 int level, int *large_page)
1073 struct dma_pte *parent, *pte;
1074 int total = agaw_to_level(domain->agaw);
1077 parent = domain->pgd;
1078 while (level <= total) {
1079 offset = pfn_level_offset(pfn, total);
1080 pte = &parent[offset];
1084 if (!dma_pte_present(pte)) {
1085 *large_page = total;
1089 if (dma_pte_superpage(pte)) {
1090 *large_page = total;
1094 parent = phys_to_virt(dma_pte_addr(pte));
1100 /* clear last level pte; a tlb flush should follow */
1101 static void dma_pte_clear_range(struct dmar_domain *domain,
1102 unsigned long start_pfn,
1103 unsigned long last_pfn)
1105 unsigned int large_page;
1106 struct dma_pte *first_pte, *pte;
1108 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1109 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1110 BUG_ON(start_pfn > last_pfn);
1112 /* we don't need lock here; nobody else touches the iova range */
1115 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1117 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1122 start_pfn += lvl_to_nr_pages(large_page);
1124 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1126 domain_flush_cache(domain, first_pte,
1127 (void *)pte - (void *)first_pte);
1129 } while (start_pfn && start_pfn <= last_pfn);
1132 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1133 int retain_level, struct dma_pte *pte,
1134 unsigned long pfn, unsigned long start_pfn,
1135 unsigned long last_pfn)
1137 pfn = max(start_pfn, pfn);
1138 pte = &pte[pfn_level_offset(pfn, level)];
1141 unsigned long level_pfn;
1142 struct dma_pte *level_pte;
1144 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1147 level_pfn = pfn & level_mask(level);
1148 level_pte = phys_to_virt(dma_pte_addr(pte));
1151 dma_pte_free_level(domain, level - 1, retain_level,
1152 level_pte, level_pfn, start_pfn,
1157 * Free the page table if we're below the level we want to
1158 * retain and the range covers the entire table.
1160 if (level < retain_level && !(start_pfn > level_pfn ||
1161 last_pfn < level_pfn + level_size(level) - 1)) {
1163 domain_flush_cache(domain, pte, sizeof(*pte));
1164 free_pgtable_page(level_pte);
1167 pfn += level_size(level);
1168 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1172 * clear last level (leaf) ptes and free page table pages below the
1173 * level we wish to keep intact.
1175 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1176 unsigned long start_pfn,
1177 unsigned long last_pfn,
1180 dma_pte_clear_range(domain, start_pfn, last_pfn);
1182 /* We don't need lock here; nobody else touches the iova range */
1183 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1184 domain->pgd, 0, start_pfn, last_pfn);
1187 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1188 free_pgtable_page(domain->pgd);
1193 /* When a page at a given level is being unlinked from its parent, we don't
1194 need to *modify* it at all. All we need to do is make a list of all the
1195 pages which can be freed just as soon as we've flushed the IOTLB and we
1196 know the hardware page-walk will no longer touch them.
1197 The 'pte' argument is the *parent* PTE, pointing to the page that is to be freed. */
1199 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1200 int level, struct dma_pte *pte,
1201 struct list_head *freelist)
1205 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1206 list_add_tail(&pg->lru, freelist);
1211 pte = page_address(pg);
1213 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1214 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1216 } while (!first_pte_in_page(pte));
1219 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1220 struct dma_pte *pte, unsigned long pfn,
1221 unsigned long start_pfn, unsigned long last_pfn,
1222 struct list_head *freelist)
1224 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1226 pfn = max(start_pfn, pfn);
1227 pte = &pte[pfn_level_offset(pfn, level)];
1230 unsigned long level_pfn = pfn & level_mask(level);
1232 if (!dma_pte_present(pte))
1235 /* If range covers entire pagetable, free it */
1236 if (start_pfn <= level_pfn &&
1237 last_pfn >= level_pfn + level_size(level) - 1) {
1238 /* These subordinate page tables are going away entirely. Don't
1239 bother to clear them; we're just going to *free* them. */
1240 if (level > 1 && !dma_pte_superpage(pte))
1241 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1247 } else if (level > 1) {
1248 /* Recurse down into a level that isn't *entirely* obsolete */
1249 dma_pte_clear_level(domain, level - 1,
1250 phys_to_virt(dma_pte_addr(pte)),
1251 level_pfn, start_pfn, last_pfn,
1255 pfn = level_pfn + level_size(level);
1256 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1259 domain_flush_cache(domain, first_pte,
1260 (void *)++last_pte - (void *)first_pte);
1263 /* We can't just free the pages because the IOMMU may still be walking
1264 the page tables, and may have cached the intermediate levels. The
1265 pages can only be freed after the IOTLB flush has been done. */
1266 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1267 unsigned long last_pfn, struct list_head *freelist)
1269 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1270 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1271 BUG_ON(start_pfn > last_pfn);
1273 /* we don't need lock here; nobody else touches the iova range */
1274 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1275 domain->pgd, 0, start_pfn, last_pfn, freelist);
1278 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1279 struct page *pgd_page = virt_to_page(domain->pgd);
1280 list_add_tail(&pgd_page->lru, freelist);
1285 /* iommu handling */
1286 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1288 struct root_entry *root;
1289 unsigned long flags;
1291 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1293 pr_err("Allocating root entry for %s failed\n",
1298 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1300 spin_lock_irqsave(&iommu->lock, flags);
1301 iommu->root_entry = root;
1302 spin_unlock_irqrestore(&iommu->lock, flags);
1307 static void iommu_set_root_entry(struct intel_iommu *iommu)
1313 addr = virt_to_phys(iommu->root_entry);
1314 if (sm_supported(iommu))
1315 addr |= DMA_RTADDR_SMT;
1317 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1318 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1320 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1322 /* Make sure hardware completes it */
1323 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1324 readl, (sts & DMA_GSTS_RTPS), sts);
1326 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1328 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1329 if (sm_supported(iommu))
1330 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1331 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1334 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1339 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1342 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1343 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1345 /* Make sure hardware completes it */
1346 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1347 readl, (!(val & DMA_GSTS_WBFS)), val);
1349 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1352 /* return value determines whether we need a write buffer flush */
1353 static void __iommu_flush_context(struct intel_iommu *iommu,
1354 u16 did, u16 source_id, u8 function_mask,
1361 case DMA_CCMD_GLOBAL_INVL:
1362 val = DMA_CCMD_GLOBAL_INVL;
1364 case DMA_CCMD_DOMAIN_INVL:
1365 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1367 case DMA_CCMD_DEVICE_INVL:
1368 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1369 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1374 val |= DMA_CCMD_ICC;
1376 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1377 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1379 /* Make sure hardware completes it */
1380 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1381 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1383 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1386 /* return value determines whether we need a write buffer flush */
1387 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1388 u64 addr, unsigned int size_order, u64 type)
1390 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1391 u64 val = 0, val_iva = 0;
1395 case DMA_TLB_GLOBAL_FLUSH:
1396 /* global flush doesn't need to set IVA_REG */
1397 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1399 case DMA_TLB_DSI_FLUSH:
1400 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1402 case DMA_TLB_PSI_FLUSH:
1403 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1404 /* IH bit is passed in as part of address */
1405 val_iva = size_order | addr;
1410 /* Note: set drain read/write */
1413 * This is probably only here to be extra safe. It looks like we can
1414 * ignore it without any impact.
1416 if (cap_read_drain(iommu->cap))
1417 val |= DMA_TLB_READ_DRAIN;
1419 if (cap_write_drain(iommu->cap))
1420 val |= DMA_TLB_WRITE_DRAIN;
1422 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1423 /* Note: Only uses first TLB reg currently */
1425 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1426 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1428 /* Make sure hardware completes it */
1429 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1430 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1432 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1434 /* check IOTLB invalidation granularity */
1435 if (DMA_TLB_IAIG(val) == 0)
1436 pr_err("Flush IOTLB failed\n");
1437 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1438 pr_debug("TLB flush request %Lx, actual %Lx\n",
1439 (unsigned long long)DMA_TLB_IIRG(type),
1440 (unsigned long long)DMA_TLB_IAIG(val));
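/*
 * For page-selective invalidation the size_order written to the IVA
 * register encodes 2^size_order 4KiB pages, e.g. size_order 0 flushes a
 * single 4KiB page, 9 flushes a 2MiB region and 18 a 1GiB region
 * (the address must be aligned accordingly).
 */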
1443 static struct device_domain_info *
1444 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1447 struct device_domain_info *info;
1449 assert_spin_locked(&device_domain_lock);
1454 list_for_each_entry(info, &domain->devices, link)
1455 if (info->iommu == iommu && info->bus == bus &&
1456 info->devfn == devfn) {
1457 if (info->ats_supported && info->dev)
1465 static void domain_update_iotlb(struct dmar_domain *domain)
1467 struct device_domain_info *info;
1468 bool has_iotlb_device = false;
1470 assert_spin_locked(&device_domain_lock);
1472 list_for_each_entry(info, &domain->devices, link)
1473 if (info->ats_enabled) {
1474 has_iotlb_device = true;
1478 domain->has_iotlb_device = has_iotlb_device;
1481 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1483 struct pci_dev *pdev;
1485 assert_spin_locked(&device_domain_lock);
1487 if (!info || !dev_is_pci(info->dev))
1490 pdev = to_pci_dev(info->dev);
1491 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1492 * a PFSID to the invalidation desc of a VF so that IOMMU HW can gauge
1493 * queue depth at the PF level. If DIT is not supported, PFSID is treated
1494 * as reserved and should be set to 0.
1496 if (!ecap_dit(info->iommu->ecap))
1499 struct pci_dev *pf_pdev;
1501 /* pci_physfn() returns pdev itself if the device is not a VF */
1502 pf_pdev = pci_physfn(pdev);
1503 info->pfsid = pci_dev_id(pf_pdev);
1506 #ifdef CONFIG_INTEL_IOMMU_SVM
1507 /* The PCIe spec, in its wisdom, declares that the behaviour of
1508 the device if you enable PASID support after ATS support is
1509 undefined. So always enable PASID support on devices which
1510 have it, even if we can't yet know if we're ever going to use it. */
1512 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1513 info->pasid_enabled = 1;
1515 if (info->pri_supported &&
1516 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1517 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1518 info->pri_enabled = 1;
1520 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1521 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1522 info->ats_enabled = 1;
1523 domain_update_iotlb(info->domain);
1524 info->ats_qdep = pci_ats_queue_depth(pdev);
1528 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1530 struct pci_dev *pdev;
1532 assert_spin_locked(&device_domain_lock);
1534 if (!dev_is_pci(info->dev))
1537 pdev = to_pci_dev(info->dev);
1539 if (info->ats_enabled) {
1540 pci_disable_ats(pdev);
1541 info->ats_enabled = 0;
1542 domain_update_iotlb(info->domain);
1544 #ifdef CONFIG_INTEL_IOMMU_SVM
1545 if (info->pri_enabled) {
1546 pci_disable_pri(pdev);
1547 info->pri_enabled = 0;
1549 if (info->pasid_enabled) {
1550 pci_disable_pasid(pdev);
1551 info->pasid_enabled = 0;
1556 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1557 u64 addr, unsigned int mask)
1561 if (!info || !info->ats_enabled)
1564 sid = info->bus << 8 | info->devfn;
1565 qdep = info->ats_qdep;
1566 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1570 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1571 u64 addr, unsigned mask)
1573 unsigned long flags;
1574 struct device_domain_info *info;
1576 if (!domain->has_iotlb_device)
1579 spin_lock_irqsave(&device_domain_lock, flags);
1580 list_for_each_entry(info, &domain->devices, link)
1581 __iommu_flush_dev_iotlb(info, addr, mask);
1583 spin_unlock_irqrestore(&device_domain_lock, flags);
1586 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1587 struct dmar_domain *domain,
1588 unsigned long pfn, unsigned int pages,
1591 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1592 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1593 u16 did = domain->iommu_did[iommu->seq_id];
1600 if (domain_use_first_level(domain)) {
1601 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1604 * Fall back to domain-selective flush if there is no PSI support or
1605 * the size is too big. PSI requires the page count to be a power of 2,
1606 * and the base address to be naturally aligned to the size.
1608 if (!cap_pgsel_inv(iommu->cap) ||
1609 mask > cap_max_amask_val(iommu->cap))
1610 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1613 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1618 * In caching mode, changes of pages from non-present to present require
1619 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1621 if (!cap_caching_mode(iommu->cap) || !map)
1622 iommu_flush_dev_iotlb(domain, addr, mask);
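/*
 * E.g. in iommu_flush_iotlb_psi() a request to flush 5 pages is rounded
 * up: mask == ilog2(8) == 3, so the PSI covers 8 pages starting at addr
 * aligned down to a 32KiB boundary (hardware ignores the low mask bits
 * of the address).
 */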
1625 /* Notification for newly created mappings */
1626 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1627 struct dmar_domain *domain,
1628 unsigned long pfn, unsigned int pages)
1631 * It's a non-present to present mapping. Only flush if caching mode and second level.
1634 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1635 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1637 iommu_flush_write_buffer(iommu);
1640 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1642 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1645 for_each_domain_iommu(idx, dmar_domain) {
1646 struct intel_iommu *iommu = g_iommus[idx];
1647 u16 did = dmar_domain->iommu_did[iommu->seq_id];
1649 if (domain_use_first_level(dmar_domain))
1650 qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1652 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1655 if (!cap_caching_mode(iommu->cap))
1656 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1660 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1663 unsigned long flags;
1665 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1668 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1669 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1670 pmen &= ~DMA_PMEN_EPM;
1671 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1673 /* wait for the protected region status bit to clear */
1674 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1675 readl, !(pmen & DMA_PMEN_PRS), pmen);
1677 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1680 static void iommu_enable_translation(struct intel_iommu *iommu)
1683 unsigned long flags;
1685 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1686 iommu->gcmd |= DMA_GCMD_TE;
1687 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1689 /* Make sure hardware completes it */
1690 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1691 readl, (sts & DMA_GSTS_TES), sts);
1693 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1696 static void iommu_disable_translation(struct intel_iommu *iommu)
1701 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1702 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1705 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1706 iommu->gcmd &= ~DMA_GCMD_TE;
1707 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1709 /* Make sure hardware completes it */
1710 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1711 readl, (!(sts & DMA_GSTS_TES)), sts);
1713 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1716 static int iommu_init_domains(struct intel_iommu *iommu)
1720 ndomains = cap_ndoms(iommu->cap);
1721 pr_debug("%s: Number of Domains supported <%d>\n",
1722 iommu->name, ndomains);
1724 spin_lock_init(&iommu->lock);
1726 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1727 if (!iommu->domain_ids)
1731 * If Caching mode is set, then invalid translations are tagged
1732 * with domain-id 0, hence we need to pre-allocate it. We also
1733 * use domain-id 0 as a marker for non-allocated domain-id, so
1734 * make sure it is not used for a real domain.
1736 set_bit(0, iommu->domain_ids);
1739 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1740 * entry for first-level or pass-through translation modes should
1741 * be programmed with a domain id different from those used for
1742 * second-level or nested translation. We reserve a domain id for this purpose.
1745 if (sm_supported(iommu))
1746 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1751 static void disable_dmar_iommu(struct intel_iommu *iommu)
1753 struct device_domain_info *info, *tmp;
1754 unsigned long flags;
1756 if (!iommu->domain_ids)
1759 spin_lock_irqsave(&device_domain_lock, flags);
1760 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1761 if (info->iommu != iommu)
1764 if (!info->dev || !info->domain)
1767 __dmar_remove_one_dev_info(info);
1769 spin_unlock_irqrestore(&device_domain_lock, flags);
1771 if (iommu->gcmd & DMA_GCMD_TE)
1772 iommu_disable_translation(iommu);
1775 static void free_dmar_iommu(struct intel_iommu *iommu)
1777 if (iommu->domain_ids) {
1778 bitmap_free(iommu->domain_ids);
1779 iommu->domain_ids = NULL;
1782 g_iommus[iommu->seq_id] = NULL;
1784 /* free context mapping */
1785 free_context_table(iommu);
1787 #ifdef CONFIG_INTEL_IOMMU_SVM
1788 if (pasid_supported(iommu)) {
1789 if (ecap_prs(iommu->ecap))
1790 intel_svm_finish_prq(iommu);
1792 if (vccap_pasid(iommu->vccap))
1793 ioasid_unregister_allocator(&iommu->pasid_allocator);
1799 * Check and return whether first level is used by default for DMA translation.
1802 static bool first_level_by_default(unsigned int type)
1804 /* Only SL is available in legacy mode */
1805 if (!scalable_mode_support())
1808 /* Only one level (either FL or SL) is available; just use it */
1809 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1810 return intel_cap_flts_sanity();
1812 /* Both levels are available, decide it based on domain type */
1813 return type != IOMMU_DOMAIN_UNMANAGED;
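/*
 * Summary of the policy above: legacy mode always uses second-level (SL);
 * if only one of FL/SL is usable, that one is chosen; when both are
 * usable, DMA and identity domains get first-level (FL) while UNMANAGED
 * domains stay on second-level.
 */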
1816 static struct dmar_domain *alloc_domain(unsigned int type)
1818 struct dmar_domain *domain;
1820 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1824 domain->nid = NUMA_NO_NODE;
1825 if (first_level_by_default(type))
1826 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1827 domain->has_iotlb_device = false;
1828 INIT_LIST_HEAD(&domain->devices);
1833 /* Must be called with iommu->lock */
1834 static int domain_attach_iommu(struct dmar_domain *domain,
1835 struct intel_iommu *iommu)
1837 unsigned long ndomains;
1840 assert_spin_locked(&device_domain_lock);
1841 assert_spin_locked(&iommu->lock);
1843 domain->iommu_refcnt[iommu->seq_id] += 1;
1844 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1845 ndomains = cap_ndoms(iommu->cap);
1846 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1848 if (num >= ndomains) {
1849 pr_err("%s: No free domain ids\n", iommu->name);
1850 domain->iommu_refcnt[iommu->seq_id] -= 1;
1854 set_bit(num, iommu->domain_ids);
1855 domain->iommu_did[iommu->seq_id] = num;
1856 domain->nid = iommu->node;
1857 domain_update_iommu_cap(domain);
1863 static void domain_detach_iommu(struct dmar_domain *domain,
1864 struct intel_iommu *iommu)
1868 assert_spin_locked(&device_domain_lock);
1869 assert_spin_locked(&iommu->lock);
1871 domain->iommu_refcnt[iommu->seq_id] -= 1;
1872 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1873 num = domain->iommu_did[iommu->seq_id];
1874 clear_bit(num, iommu->domain_ids);
1875 domain_update_iommu_cap(domain);
1876 domain->iommu_did[iommu->seq_id] = 0;
1880 static inline int guestwidth_to_adjustwidth(int gaw)
1883 int r = (gaw - 12) % 9;
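/*
 * The remainder r is used to round gaw up to the next 12 + 9*n boundary:
 * e.g. 39, 48 and 57 map to themselves, while gaw == 50 (r == 2) is
 * adjusted up to 57.
 */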
1894 static void domain_exit(struct dmar_domain *domain)
1897 /* Remove associated devices and clear attached or cached domains */
1898 domain_remove_dev_info(domain);
1901 LIST_HEAD(freelist);
1903 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1904 put_pages_list(&freelist);
1911 * Get the PASID directory size for a scalable mode context entry.
1912 * A value of X in the PDTS field of a scalable mode context entry
1913 * indicates a PASID directory with 2^(X + 7) entries.
1915 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1917 unsigned long pds, max_pde;
1919 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1920 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
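/*
 * E.g. with table->max_pasid == 1 << 20 the directory needs
 * max_pde == 1 << 14 entries, so the PDTS value programmed into the
 * context entry is 7 (2^(7 + 7) == 2^14), assuming PASID_PDE_SHIFT == 6.
 */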
1928 * Set the RID_PASID field of a scalable mode context entry. The
1929 * IOMMU hardware will use the PASID value set in this field for
1930 * translation of DMA requests without PASID.
1933 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1935 context->hi |= pasid & ((1 << 20) - 1);
1939 * Set the DTE (Device-TLB Enable) field of a scalable mode context entry.
1942 static inline void context_set_sm_dte(struct context_entry *context)
1944 context->lo |= (1 << 2);
1948 * Set the PRE (Page Request Enable) field of a scalable mode context entry.
1951 static inline void context_set_sm_pre(struct context_entry *context)
1953 context->lo |= (1 << 4);
1956 /* Convert value to context PASID directory size field coding. */
1957 #define context_pdts(pds) (((pds) & 0x7) << 9)
1959 static int domain_context_mapping_one(struct dmar_domain *domain,
1960 struct intel_iommu *iommu,
1961 struct pasid_table *table,
1964 u16 did = domain->iommu_did[iommu->seq_id];
1965 int translation = CONTEXT_TT_MULTI_LEVEL;
1966 struct device_domain_info *info = NULL;
1967 struct context_entry *context;
1968 unsigned long flags;
1973 if (hw_pass_through && domain_type_is_si(domain))
1974 translation = CONTEXT_TT_PASS_THROUGH;
1976 pr_debug("Set context mapping for %02x:%02x.%d\n",
1977 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1979 BUG_ON(!domain->pgd);
1981 spin_lock_irqsave(&device_domain_lock, flags);
1982 spin_lock(&iommu->lock);
1985 context = iommu_context_addr(iommu, bus, devfn, 1);
1990 if (context_present(context))
1994 * For kdump cases, old valid entries may be cached due to the
1995 * in-flight DMA and copied pgtable, but there is no unmapping
1996 * behaviour for them, so we need an explicit cache flush for
1997 * the newly-mapped device. For kdump, at this point, the device
1998 * is supposed to have finished reset at its driver probe stage, so no
1999 * in-flight DMA will exist, and we don't need to worry about it anymore.
2002 if (context_copied(context)) {
2003 u16 did_old = context_domain_id(context);
2005 if (did_old < cap_ndoms(iommu->cap)) {
2006 iommu->flush.flush_context(iommu, did_old,
2007 (((u16)bus) << 8) | devfn,
2008 DMA_CCMD_MASK_NOBIT,
2009 DMA_CCMD_DEVICE_INVL);
2010 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2015 context_clear_entry(context);
2017 if (sm_supported(iommu)) {
2022 /* Setup the PASID DIR pointer: */
2023 pds = context_get_sm_pds(table);
2024 context->lo = (u64)virt_to_phys(table->table) |
2027 /* Setup the RID_PASID field: */
2028 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2031 * Setup the Device-TLB enable bit and Page request enable bit:
2034 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2035 if (info && info->ats_supported)
2036 context_set_sm_dte(context);
2037 if (info && info->pri_supported)
2038 context_set_sm_pre(context);
2040 struct dma_pte *pgd = domain->pgd;
2043 context_set_domain_id(context, did);
2045 if (translation != CONTEXT_TT_PASS_THROUGH) {
2047 * Skip top levels of page tables for an iommu which has
2048 * a smaller agaw than the default. Unnecessary for PT mode.
2050 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2052 pgd = phys_to_virt(dma_pte_addr(pgd));
2053 if (!dma_pte_present(pgd))
2057 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2058 if (info && info->ats_supported)
2059 translation = CONTEXT_TT_DEV_IOTLB;
2061 translation = CONTEXT_TT_MULTI_LEVEL;
2063 context_set_address_root(context, virt_to_phys(pgd));
2064 context_set_address_width(context, agaw);
2067 * In pass through mode, AW must be programmed to
2068 * indicate the largest AGAW value supported by
2069 * hardware. And ASR is ignored by hardware.
2071 context_set_address_width(context, iommu->msagaw);
2074 context_set_translation_type(context, translation);
2077 context_set_fault_enable(context);
2078 context_set_present(context);
2079 if (!ecap_coherent(iommu->ecap))
2080 clflush_cache_range(context, sizeof(*context));
2083 * It's a non-present to present mapping. If hardware doesn't cache
2084 * non-present entries we only need to flush the write-buffer. If it
2085 * _does_ cache non-present entries, then it does so in the special
2086 * domain #0, which we have to flush:
2088 if (cap_caching_mode(iommu->cap)) {
2089 iommu->flush.flush_context(iommu, 0,
2090 (((u16)bus) << 8) | devfn,
2091 DMA_CCMD_MASK_NOBIT,
2092 DMA_CCMD_DEVICE_INVL);
2093 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2095 iommu_flush_write_buffer(iommu);
2097 iommu_enable_dev_iotlb(info);
2102 spin_unlock(&iommu->lock);
2103 spin_unlock_irqrestore(&device_domain_lock, flags);
2108 struct domain_context_mapping_data {
2109 struct dmar_domain *domain;
2110 struct intel_iommu *iommu;
2111 struct pasid_table *table;
2114 static int domain_context_mapping_cb(struct pci_dev *pdev,
2115 u16 alias, void *opaque)
2117 struct domain_context_mapping_data *data = opaque;
2119 return domain_context_mapping_one(data->domain, data->iommu,
2120 data->table, PCI_BUS_NUM(alias),
2125 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2127 struct domain_context_mapping_data data;
2128 struct pasid_table *table;
2129 struct intel_iommu *iommu;
2132 iommu = device_to_iommu(dev, &bus, &devfn);
2136 table = intel_pasid_get_table(dev);
2138 if (!dev_is_pci(dev))
2139 return domain_context_mapping_one(domain, iommu, table,
2142 data.domain = domain;
2146 return pci_for_each_dma_alias(to_pci_dev(dev),
2147 &domain_context_mapping_cb, &data);
2150 static int domain_context_mapped_cb(struct pci_dev *pdev,
2151 u16 alias, void *opaque)
2153 struct intel_iommu *iommu = opaque;
2155 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2158 static int domain_context_mapped(struct device *dev)
2160 struct intel_iommu *iommu;
2163 iommu = device_to_iommu(dev, &bus, &devfn);
2167 if (!dev_is_pci(dev))
2168 return device_context_mapped(iommu, bus, devfn);
2170 return !pci_for_each_dma_alias(to_pci_dev(dev),
2171 domain_context_mapped_cb, iommu);
2174 /* Returns the number of VTD pages, but aligned to the MM page size */
2175 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2178 host_addr &= ~PAGE_MASK;
2179 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
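/*
 * E.g. (assuming 4KiB MM pages) host_addr offset 0x234 within its page
 * and size 0x1000 gives PAGE_ALIGN(0x1234) == 0x2000, i.e. 2 VT-d pages,
 * even though the length itself is only one page.
 */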
2182 /* Return largest possible superpage level for a given mapping */
2183 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2184 unsigned long iov_pfn,
2185 unsigned long phy_pfn,
2186 unsigned long pages)
2188 int support, level = 1;
2189 unsigned long pfnmerge;
2191 support = domain->iommu_superpage;
2193 /* To use a large page, the virtual *and* physical addresses
2194 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2195 of them will mean we have to use smaller pages. So just
2196 merge them and check both at once. */
2197 pfnmerge = iov_pfn | phy_pfn;
2199 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2200 pages >>= VTD_STRIDE_SHIFT;
2203 pfnmerge >>= VTD_STRIDE_SHIFT;
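/*
 * E.g. if both iov_pfn and phy_pfn are 512-page (2MiB) aligned, at least
 * 512 pages are being mapped and the IOMMU advertises 2MiB superpages,
 * the loop above ends up returning level 2; otherwise it falls back to
 * level 1 (4KiB PTEs).
 */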
2211 * Ensure that old small page tables are removed to make room for superpage(s).
2212 * We're going to add new large pages, so make sure we don't remove their parent
2213 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2215 static void switch_to_super_page(struct dmar_domain *domain,
2216 unsigned long start_pfn,
2217 unsigned long end_pfn, int level)
2219 unsigned long lvl_pages = lvl_to_nr_pages(level);
2220 struct dma_pte *pte = NULL;
2223 while (start_pfn <= end_pfn) {
2225 pte = pfn_to_dma_pte(domain, start_pfn, &level);
2227 if (dma_pte_present(pte)) {
2228 dma_pte_free_pagetable(domain, start_pfn,
2229 start_pfn + lvl_pages - 1,
2232 for_each_domain_iommu(i, domain)
2233 iommu_flush_iotlb_psi(g_iommus[i], domain,
2234 start_pfn, lvl_pages,
2239 start_pfn += lvl_pages;
2240 if (first_pte_in_page(pte))
2246 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2247 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2249 struct dma_pte *first_pte = NULL, *pte = NULL;
2250 unsigned int largepage_lvl = 0;
2251 unsigned long lvl_pages = 0;
2255 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2257 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2260 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2261 attr |= DMA_FL_PTE_PRESENT;
2262 if (domain_use_first_level(domain)) {
2263 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2264 if (prot & DMA_PTE_WRITE)
2265 attr |= DMA_FL_PTE_DIRTY;
2268 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2270 while (nr_pages > 0) {
2274 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2275 phys_pfn, nr_pages);
2277 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2282 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2284 /* It is a large page */
2285 if (largepage_lvl > 1) {
2286 unsigned long end_pfn;
2287 unsigned long pages_to_remove;
2289 pteval |= DMA_PTE_LARGE_PAGE;
2290 pages_to_remove = min_t(unsigned long, nr_pages,
2291 nr_pte_to_next_page(pte) * lvl_pages);
2292 end_pfn = iov_pfn + pages_to_remove - 1;
2293 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2295 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2299 /* We don't need lock here, nobody else
2300 * touches the iova range
2302 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2304 static int dumps = 5;
2305 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2306 iov_pfn, tmp, (unsigned long long)pteval);
2309 debug_dma_dump_mappings(NULL);
2314 nr_pages -= lvl_pages;
2315 iov_pfn += lvl_pages;
2316 phys_pfn += lvl_pages;
2317 pteval += lvl_pages * VTD_PAGE_SIZE;
2319 /* If the next PTE would be the first in a new page, then we
2320 * need to flush the cache on the entries we've just written.
2321 * And then we'll need to recalculate 'pte', so clear it and
2322 * let it get set again in the if (!pte) block above.
2324 * If we're done (!nr_pages) we need to flush the cache too.
2326 * Also if we've been setting superpages, we may need to
2327 * recalculate 'pte' and switch back to smaller pages for the
2328 * end of the mapping, if the trailing size is not enough to
2329 * use another superpage (i.e. nr_pages < lvl_pages).
2332 if (!nr_pages || first_pte_in_page(pte) ||
2333 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2334 domain_flush_cache(domain, first_pte,
2335 (void *)pte - (void *)first_pte);
2343 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2345 struct intel_iommu *iommu = info->iommu;
2346 struct context_entry *context;
2347 unsigned long flags;
2353 spin_lock_irqsave(&iommu->lock, flags);
2354 context = iommu_context_addr(iommu, bus, devfn, 0);
2356 spin_unlock_irqrestore(&iommu->lock, flags);
2360 if (sm_supported(iommu)) {
2361 if (hw_pass_through && domain_type_is_si(info->domain))
2362 did_old = FLPT_DEFAULT_DID;
2364 did_old = info->domain->iommu_did[iommu->seq_id];
2366 did_old = context_domain_id(context);
2369 context_clear_entry(context);
2370 __iommu_flush_cache(iommu, context, sizeof(*context));
2371 spin_unlock_irqrestore(&iommu->lock, flags);
2372 iommu->flush.flush_context(iommu,
2374 (((u16)bus) << 8) | devfn,
2375 DMA_CCMD_MASK_NOBIT,
2376 DMA_CCMD_DEVICE_INVL);
2378 if (sm_supported(iommu))
2379 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2381 iommu->flush.flush_iotlb(iommu,
2387 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
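/*
 * For reference only, a sketch rather than a driver helper: the source-id
 * used in the context-cache flush above is the PCI requester ID, bus number
 * in the high byte and devfn in the low byte (the same packing PCI_DEVID()
 * produces).
 */
static inline u16 example_source_id(u8 bus, u8 devfn)
{
	return ((u16)bus << 8) | devfn;	/* e.g. 00:02.0 -> 0x0010 */
}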
2390 static void domain_remove_dev_info(struct dmar_domain *domain)
2392 struct device_domain_info *info, *tmp;
2393 unsigned long flags;
2395 spin_lock_irqsave(&device_domain_lock, flags);
2396 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2397 __dmar_remove_one_dev_info(info);
2398 spin_unlock_irqrestore(&device_domain_lock, flags);
2401 static inline struct device_domain_info *
2402 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2404 struct device_domain_info *info;
2406 list_for_each_entry(info, &device_domain_list, global)
2407 if (info->segment == segment && info->bus == bus &&
2408 info->devfn == devfn)
2414 static int domain_setup_first_level(struct intel_iommu *iommu,
2415 struct dmar_domain *domain,
2419 struct dma_pte *pgd = domain->pgd;
2424 * Skip top levels of the page tables for an IOMMU whose
2425 * AGAW is smaller than the default. Unnecessary for PT mode.
2427 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2428 pgd = phys_to_virt(dma_pte_addr(pgd));
2429 if (!dma_pte_present(pgd))
2433 level = agaw_to_level(agaw);
2434 if (level != 4 && level != 5)
2437 if (pasid != PASID_RID2PASID)
2438 flags |= PASID_FLAG_SUPERVISOR_MODE;
2440 flags |= PASID_FLAG_FL5LP;
2442 if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2443 flags |= PASID_FLAG_PAGE_SNOOP;
2445 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2446 domain->iommu_did[iommu->seq_id],
2450 static bool dev_is_real_dma_subdevice(struct device *dev)
2452 return dev && dev_is_pci(dev) &&
2453 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2456 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2459 struct dmar_domain *domain)
2461 struct device_domain_info *info = dev_iommu_priv_get(dev);
2462 unsigned long flags;
2465 spin_lock_irqsave(&device_domain_lock, flags);
2466 info->domain = domain;
2467 spin_lock(&iommu->lock);
2468 ret = domain_attach_iommu(domain, iommu);
2469 spin_unlock(&iommu->lock);
2471 spin_unlock_irqrestore(&device_domain_lock, flags);
2474 list_add(&info->link, &domain->devices);
2475 spin_unlock_irqrestore(&device_domain_lock, flags);
2477 /* PASID table is mandatory for a PCI device in scalable mode. */
2478 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2479 ret = intel_pasid_alloc_table(dev);
2481 dev_err(dev, "PASID table allocation failed\n");
2482 dmar_remove_one_dev_info(dev);
2486 /* Setup the PASID entry for requests without PASID: */
2487 spin_lock_irqsave(&iommu->lock, flags);
2488 if (hw_pass_through && domain_type_is_si(domain))
2489 ret = intel_pasid_setup_pass_through(iommu, domain,
2490 dev, PASID_RID2PASID);
2491 else if (domain_use_first_level(domain))
2492 ret = domain_setup_first_level(iommu, domain, dev,
2495 ret = intel_pasid_setup_second_level(iommu, domain,
2496 dev, PASID_RID2PASID);
2497 spin_unlock_irqrestore(&iommu->lock, flags);
2499 dev_err(dev, "Setup RID2PASID failed\n");
2500 dmar_remove_one_dev_info(dev);
2505 if (dev && domain_context_mapping(domain, dev)) {
2506 dev_err(dev, "Domain context map failed\n");
2507 dmar_remove_one_dev_info(dev);
2514 static int iommu_domain_identity_map(struct dmar_domain *domain,
2515 unsigned long first_vpfn,
2516 unsigned long last_vpfn)
2519 * RMRR range might have overlap with physical memory range; clear it first.
2522 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2524 return __domain_mapping(domain, first_vpfn,
2525 first_vpfn, last_vpfn - first_vpfn + 1,
2526 DMA_PTE_READ|DMA_PTE_WRITE);
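	/*
	 * Note: an identity map means IOVA == physical address, which is
	 * why the same PFN is passed above as both the IOVA start and the
	 * physical start.
	 */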
2529 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2531 static int __init si_domain_init(int hw)
2533 struct dmar_rmrr_unit *rmrr;
2537 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2541 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2542 domain_exit(si_domain);
2549 for_each_online_node(nid) {
2550 unsigned long start_pfn, end_pfn;
2553 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2554 ret = iommu_domain_identity_map(si_domain,
2555 mm_to_dma_pfn(start_pfn),
2556 mm_to_dma_pfn(end_pfn));
2563 * Identity map the RMRRs so that devices with RMRRs could also use
2566 for_each_rmrr_units(rmrr) {
2567 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2569 unsigned long long start = rmrr->base_address;
2570 unsigned long long end = rmrr->end_address;
2572 if (WARN_ON(end < start ||
2573 end >> agaw_to_width(si_domain->agaw)))
2576 ret = iommu_domain_identity_map(si_domain,
2577 mm_to_dma_pfn(start >> PAGE_SHIFT),
2578 mm_to_dma_pfn(end >> PAGE_SHIFT));
2587 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2589 struct dmar_domain *ndomain;
2590 struct intel_iommu *iommu;
2593 iommu = device_to_iommu(dev, &bus, &devfn);
2597 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2598 if (ndomain != domain)
2604 static bool device_has_rmrr(struct device *dev)
2606 struct dmar_rmrr_unit *rmrr;
2611 for_each_rmrr_units(rmrr) {
2613 * Return TRUE if this RMRR contains the device that is passed in.
2616 for_each_active_dev_scope(rmrr->devices,
2617 rmrr->devices_cnt, i, tmp)
2619 is_downstream_to_pci_bridge(dev, tmp)) {
2629 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2630 * is relaxable (ie. is allowed to be not enforced under some conditions)
2631 * @dev: device handle
2633 * We assume that PCI USB devices with RMRRs have them largely
2634 * for historical reasons and that the RMRR space is not actively used post
2635 * boot. This exclusion may change if vendors begin to abuse it.
2637 * The same exception is made for graphics devices, with the requirement that
2638 * any use of the RMRR regions will be torn down before assigning the device to a guest.
2641 * Return: true if the RMRR is relaxable, false otherwise
2643 static bool device_rmrr_is_relaxable(struct device *dev)
2645 struct pci_dev *pdev;
2647 if (!dev_is_pci(dev))
2650 pdev = to_pci_dev(dev);
2651 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2658 * There are a couple cases where we need to restrict the functionality of
2659 * devices associated with RMRRs. The first is when evaluating a device for
2660 * identity mapping because problems exist when devices are moved in and out
2661 * of domains and their respective RMRR information is lost. This means that
2662 * a device with associated RMRRs will never be in a "passthrough" domain.
2663 * The second is use of the device through the IOMMU API. This interface
2664 * expects to have full control of the IOVA space for the device. We cannot
2665 * satisfy both the requirement that RMRR access is maintained and have an
2666 * unencumbered IOVA space. We also have no ability to quiesce the device's
2667 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2668 * We therefore prevent devices associated with an RMRR from participating in
2669 * the IOMMU API, which eliminates them from device assignment.
2671 * In both cases, devices which have relaxable RMRRs are not concerned by this
2672 * restriction. See device_rmrr_is_relaxable comment.
2674 static bool device_is_rmrr_locked(struct device *dev)
2676 if (!device_has_rmrr(dev))
2679 if (device_rmrr_is_relaxable(dev))
2686 * Return the required default domain type for a specific device.
2688 * @dev: the device being queried
2692 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2693 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2694 * - 0: both identity and dynamic domains work for this device
2696 static int device_def_domain_type(struct device *dev)
2698 if (dev_is_pci(dev)) {
2699 struct pci_dev *pdev = to_pci_dev(dev);
2701 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2702 return IOMMU_DOMAIN_IDENTITY;
2704 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2705 return IOMMU_DOMAIN_IDENTITY;
2711 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2714 * Start from a sane IOMMU hardware state.
2715 * If queued invalidation was already initialized by us
2716 * (for example, while enabling interrupt remapping) then
2717 * things are already rolling from a sane state.
2721 * Clear any previous faults.
2723 dmar_fault(-1, iommu);
2725 * Disable queued invalidation if supported and already enabled
2726 * before OS handover.
2728 dmar_disable_qi(iommu);
2731 if (dmar_enable_qi(iommu)) {
2733 * Queued Invalidate not enabled, use Register Based Invalidate
2735 iommu->flush.flush_context = __iommu_flush_context;
2736 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2737 pr_info("%s: Using Register based invalidation\n",
2740 iommu->flush.flush_context = qi_flush_context;
2741 iommu->flush.flush_iotlb = qi_flush_iotlb;
2742 pr_info("%s: Using Queued invalidation\n", iommu->name);
2746 static int copy_context_table(struct intel_iommu *iommu,
2747 struct root_entry *old_re,
2748 struct context_entry **tbl,
2751 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2752 struct context_entry *new_ce = NULL, ce;
2753 struct context_entry *old_ce = NULL;
2754 struct root_entry re;
2755 phys_addr_t old_ce_phys;
2757 tbl_idx = ext ? bus * 2 : bus;
2758 memcpy(&re, old_re, sizeof(re));
2760 for (devfn = 0; devfn < 256; devfn++) {
2761 /* First calculate the correct index */
2762 idx = (ext ? devfn * 2 : devfn) % 256;
2765 /* First save what we may have and clean up */
2767 tbl[tbl_idx] = new_ce;
2768 __iommu_flush_cache(iommu, new_ce,
2778 old_ce_phys = root_entry_lctp(&re);
2780 old_ce_phys = root_entry_uctp(&re);
2783 if (ext && devfn == 0) {
2784 /* No LCTP, try UCTP */
2793 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2798 new_ce = alloc_pgtable_page(iommu->node);
2805 /* Now copy the context entry */
2806 memcpy(&ce, old_ce + idx, sizeof(ce));
2808 if (!__context_present(&ce))
2811 did = context_domain_id(&ce);
2812 if (did >= 0 && did < cap_ndoms(iommu->cap))
2813 set_bit(did, iommu->domain_ids);
2816 * We need a marker for copied context entries. This
2817 * marker needs to work for the old format as well as
2818 * for extended context entries.
2820 * Bit 67 of the context entry is used. In the old
2821 * format this bit is available to software, in the
2822 * extended format it is the PGE bit, but PGE is ignored
2823 * by HW if PASIDs are disabled (and thus still available to software).
2826 * So disable PASIDs first and then mark the entry
2827 * copied. This means that we don't copy PASID
2828 * translations from the old kernel, but this is fine as
2829 * faults there are not fatal.
2831 context_clear_pasid_enable(&ce);
2832 context_set_copied(&ce);
2837 tbl[tbl_idx + pos] = new_ce;
2839 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
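/*
 * A minimal sketch of the marker described above, assuming bit 67 of the
 * 128-bit context entry corresponds to bit 3 of the upper 64-bit word.
 * This is an illustration only, not the driver's context_set_copied()
 * helper:
 */
static inline void example_mark_copied(struct context_entry *ce)
{
	ce->hi |= 1ULL << (67 - 64);	/* bit 67 == bit 3 of ce->hi */
}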
2848 static int copy_translation_tables(struct intel_iommu *iommu)
2850 struct context_entry **ctxt_tbls;
2851 struct root_entry *old_rt;
2852 phys_addr_t old_rt_phys;
2853 int ctxt_table_entries;
2854 unsigned long flags;
2859 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2860 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
2861 new_ext = !!ecap_ecs(iommu->ecap);
2864 * The RTT bit can only be changed when translation is disabled,
2865 * but disabling translation would open a window for data
2866 * corruption. So bail out and don't copy anything if we would
2867 * have to change the bit.
2872 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2876 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2880 /* This is too big for the stack - allocate it from slab */
2881 ctxt_table_entries = ext ? 512 : 256;
2883 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2887 for (bus = 0; bus < 256; bus++) {
2888 ret = copy_context_table(iommu, &old_rt[bus],
2889 ctxt_tbls, bus, ext);
2891 pr_err("%s: Failed to copy context table for bus %d\n",
2897 spin_lock_irqsave(&iommu->lock, flags);
2899 /* Context tables are copied, now write them to the root_entry table */
2900 for (bus = 0; bus < 256; bus++) {
2901 int idx = ext ? bus * 2 : bus;
2904 if (ctxt_tbls[idx]) {
2905 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2906 iommu->root_entry[bus].lo = val;
2909 if (!ext || !ctxt_tbls[idx + 1])
2912 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2913 iommu->root_entry[bus].hi = val;
2916 spin_unlock_irqrestore(&iommu->lock, flags);
2920 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
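	/*
	 * Each root-entry word written above is the physical address of a
	 * copied context table with bit 0 set as the Present flag; the
	 * upper word is only used when the extended (ECS) format is in
	 * effect.
	 */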
2930 #ifdef CONFIG_INTEL_IOMMU_SVM
2931 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2933 struct intel_iommu *iommu = data;
2937 return INVALID_IOASID;
2939 * The VT-d virtual command interface always uses the full 20-bit
2940 * PASID range. The host can partition the guest PASID range based on
2941 * its policies, but that is out of the guest's control.
2943 if (min < PASID_MIN || max > intel_pasid_max_id)
2944 return INVALID_IOASID;
2946 if (vcmd_alloc_pasid(iommu, &ioasid))
2947 return INVALID_IOASID;
2952 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2954 struct intel_iommu *iommu = data;
2959 * The sanity check of the IOASID owner is done at an upper layer, e.g. VFIO.
2960 * We can only free the PASID when all the devices are unbound.
2962 if (ioasid_find(NULL, ioasid, NULL)) {
2963 pr_alert("Cannot free active IOASID %d\n", ioasid);
2966 vcmd_free_pasid(iommu, ioasid);
2969 static void register_pasid_allocator(struct intel_iommu *iommu)
2972 * If we are running in the host, there is no need for a custom
2973 * allocator, since PASIDs are allocated system-wide by the host.
2975 if (!cap_caching_mode(iommu->cap))
2978 if (!sm_supported(iommu)) {
2979 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2984 * Register a custom PASID allocator if we are running in a guest;
2985 * guest PASIDs must be obtained via the virtual command interface.
2986 * There can be multiple vIOMMUs in each guest, but only one allocator
2987 * is active. All vIOMMU allocators will eventually call the same host allocator.
2990 if (!vccap_pasid(iommu->vccap))
2993 pr_info("Register custom PASID allocator\n");
2994 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2995 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2996 iommu->pasid_allocator.pdata = (void *)iommu;
2997 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2998 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3000 * Disable scalable mode on this IOMMU if there
3001 * is no custom allocator. Mixing SM-capable vIOMMUs
3002 * and non-SM vIOMMUs is not supported.
3009 static int __init init_dmars(void)
3011 struct dmar_drhd_unit *drhd;
3012 struct intel_iommu *iommu;
3018 * initialize and program root entry to not present
3021 for_each_drhd_unit(drhd) {
3023 * No lock needed as this is only incremented in the single-
3024 * threaded kernel __init code path; all other accesses are read-only.
3027 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3031 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3034 /* Preallocate enough resources for IOMMU hot-addition */
3035 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3036 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3038 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3045 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3049 for_each_iommu(iommu, drhd) {
3050 if (drhd->ignored) {
3051 iommu_disable_translation(iommu);
3057 * Find the max PASID size supported by every IOMMU in the system;
3058 * the system-wide PASID table must be no bigger than the smallest supported size.
3058 * than the smallest supported.
3060 if (pasid_supported(iommu)) {
3061 u32 temp = 2 << ecap_pss(iommu->ecap);
3063 intel_pasid_max_id = min_t(u32, temp,
3064 intel_pasid_max_id);
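			/*
			 * Example (illustrative): an ecap PSS value of 19
			 * means 20 PASID bits, so temp = 2 << 19 = 1M
			 * PASIDs; intel_pasid_max_id keeps the smallest
			 * such value seen across the IOMMUs.
			 */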
3067 g_iommus[iommu->seq_id] = iommu;
3069 intel_iommu_init_qi(iommu);
3071 ret = iommu_init_domains(iommu);
3075 init_translation_status(iommu);
3077 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3078 iommu_disable_translation(iommu);
3079 clear_translation_pre_enabled(iommu);
3080 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3086 * we could share the same root & context tables
3087 * among all IOMMUs. Needs to be split later.
3089 ret = iommu_alloc_root_entry(iommu);
3093 if (translation_pre_enabled(iommu)) {
3094 pr_info("Translation already enabled - trying to copy translation structures\n");
3096 ret = copy_translation_tables(iommu);
3099 * We found the IOMMU with translation
3100 * enabled - but failed to copy over the
3101 * old root-entry table. Try to proceed
3102 * by disabling translation now and
3103 * allocating a clean root-entry table.
3104 * This might cause DMAR faults, but
3105 * probably the dump will still succeed.
3107 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3109 iommu_disable_translation(iommu);
3110 clear_translation_pre_enabled(iommu);
3112 pr_info("Copied translation tables from previous kernel for %s\n",
3117 if (!ecap_pass_through(iommu->ecap))
3118 hw_pass_through = 0;
3119 intel_svm_check(iommu);
3123 * Now that qi is enabled on all iommus, set the root entry and flush
3124 * caches. This is required on some Intel X58 chipsets, otherwise the
3125 * flush_context function will loop forever and the boot hangs.
3127 for_each_active_iommu(iommu, drhd) {
3128 iommu_flush_write_buffer(iommu);
3129 #ifdef CONFIG_INTEL_IOMMU_SVM
3130 register_pasid_allocator(iommu);
3132 iommu_set_root_entry(iommu);
3135 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3140 iommu_identity_mapping |= IDENTMAP_GFX;
3142 check_tylersburg_isoch();
3144 ret = si_domain_init(hw_pass_through);
3151 * global invalidate context cache
3152 * global invalidate iotlb
3153 * enable translation
3155 for_each_iommu(iommu, drhd) {
3156 if (drhd->ignored) {
3158 * we always have to disable PMRs or DMA may fail on this device.
3162 iommu_disable_protect_mem_regions(iommu);
3166 iommu_flush_write_buffer(iommu);
3168 #ifdef CONFIG_INTEL_IOMMU_SVM
3169 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3171 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3172 * could cause a lock race condition, so drop the lock first.
3174 up_write(&dmar_global_lock);
3175 ret = intel_svm_enable_prq(iommu);
3176 down_write(&dmar_global_lock);
3181 ret = dmar_set_interrupt(iommu);
3189 for_each_active_iommu(iommu, drhd) {
3190 disable_dmar_iommu(iommu);
3191 free_dmar_iommu(iommu);
3200 static void __init init_no_remapping_devices(void)
3202 struct dmar_drhd_unit *drhd;
3206 for_each_drhd_unit(drhd) {
3207 if (!drhd->include_all) {
3208 for_each_active_dev_scope(drhd->devices,
3209 drhd->devices_cnt, i, dev)
3211 /* ignore DMAR unit if no devices exist */
3212 if (i == drhd->devices_cnt)
3217 for_each_active_drhd_unit(drhd) {
3218 if (drhd->include_all)
3221 for_each_active_dev_scope(drhd->devices,
3222 drhd->devices_cnt, i, dev)
3223 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3225 if (i < drhd->devices_cnt)
3228 /* This IOMMU has *only* gfx devices. Either bypass it or
3229 set the gfx_mapped flag, as appropriate */
3230 drhd->gfx_dedicated = 1;
3236 #ifdef CONFIG_SUSPEND
3237 static int init_iommu_hw(void)
3239 struct dmar_drhd_unit *drhd;
3240 struct intel_iommu *iommu = NULL;
3242 for_each_active_iommu(iommu, drhd)
3244 dmar_reenable_qi(iommu);
3246 for_each_iommu(iommu, drhd) {
3247 if (drhd->ignored) {
3249 * we always have to disable PMRs or DMA may fail on this device.
3253 iommu_disable_protect_mem_regions(iommu);
3257 iommu_flush_write_buffer(iommu);
3258 iommu_set_root_entry(iommu);
3259 iommu_enable_translation(iommu);
3260 iommu_disable_protect_mem_regions(iommu);
3266 static void iommu_flush_all(void)
3268 struct dmar_drhd_unit *drhd;
3269 struct intel_iommu *iommu;
3271 for_each_active_iommu(iommu, drhd) {
3272 iommu->flush.flush_context(iommu, 0, 0, 0,
3273 DMA_CCMD_GLOBAL_INVL);
3274 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3275 DMA_TLB_GLOBAL_FLUSH);
3279 static int iommu_suspend(void)
3281 struct dmar_drhd_unit *drhd;
3282 struct intel_iommu *iommu = NULL;
3285 for_each_active_iommu(iommu, drhd) {
3286 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3288 if (!iommu->iommu_state)
3294 for_each_active_iommu(iommu, drhd) {
3295 iommu_disable_translation(iommu);
3297 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3299 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3300 readl(iommu->reg + DMAR_FECTL_REG);
3301 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3302 readl(iommu->reg + DMAR_FEDATA_REG);
3303 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3304 readl(iommu->reg + DMAR_FEADDR_REG);
3305 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3306 readl(iommu->reg + DMAR_FEUADDR_REG);
3308 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3313 for_each_active_iommu(iommu, drhd)
3314 kfree(iommu->iommu_state);
3319 static void iommu_resume(void)
3321 struct dmar_drhd_unit *drhd;
3322 struct intel_iommu *iommu = NULL;
3325 if (init_iommu_hw()) {
3327 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3329 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3333 for_each_active_iommu(iommu, drhd) {
3335 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3337 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3338 iommu->reg + DMAR_FECTL_REG);
3339 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3340 iommu->reg + DMAR_FEDATA_REG);
3341 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3342 iommu->reg + DMAR_FEADDR_REG);
3343 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3344 iommu->reg + DMAR_FEUADDR_REG);
3346 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3349 for_each_active_iommu(iommu, drhd)
3350 kfree(iommu->iommu_state);
3353 static struct syscore_ops iommu_syscore_ops = {
3354 .resume = iommu_resume,
3355 .suspend = iommu_suspend,
3358 static void __init init_iommu_pm_ops(void)
3360 register_syscore_ops(&iommu_syscore_ops);
3364 static inline void init_iommu_pm_ops(void) {}
3365 #endif /* CONFIG_PM */
3367 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3369 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3370 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3371 rmrr->end_address <= rmrr->base_address ||
3372 arch_rmrr_sanity_check(rmrr))
3378 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3380 struct acpi_dmar_reserved_memory *rmrr;
3381 struct dmar_rmrr_unit *rmrru;
3383 rmrr = (struct acpi_dmar_reserved_memory *)header;
3384 if (rmrr_sanity_check(rmrr)) {
3386 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3387 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3388 rmrr->base_address, rmrr->end_address,
3389 dmi_get_system_info(DMI_BIOS_VENDOR),
3390 dmi_get_system_info(DMI_BIOS_VERSION),
3391 dmi_get_system_info(DMI_PRODUCT_VERSION));
3392 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3395 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3399 rmrru->hdr = header;
3401 rmrru->base_address = rmrr->base_address;
3402 rmrru->end_address = rmrr->end_address;
3404 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3405 ((void *)rmrr) + rmrr->header.length,
3406 &rmrru->devices_cnt);
3407 if (rmrru->devices_cnt && rmrru->devices == NULL)
3410 list_add(&rmrru->list, &dmar_rmrr_units);
3419 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3421 struct dmar_atsr_unit *atsru;
3422 struct acpi_dmar_atsr *tmp;
3424 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3426 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3427 if (atsr->segment != tmp->segment)
3429 if (atsr->header.length != tmp->header.length)
3431 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3438 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3440 struct acpi_dmar_atsr *atsr;
3441 struct dmar_atsr_unit *atsru;
3443 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3446 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3447 atsru = dmar_find_atsr(atsr);
3451 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3456 * If memory is allocated from slab by ACPI _DSM method, we need to
3457 * copy the memory content because the memory buffer will be freed on exit.
3460 atsru->hdr = (void *)(atsru + 1);
3461 memcpy(atsru->hdr, hdr, hdr->length);
3462 atsru->include_all = atsr->flags & 0x1;
3463 if (!atsru->include_all) {
3464 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3465 (void *)atsr + atsr->header.length,
3466 &atsru->devices_cnt);
3467 if (atsru->devices_cnt && atsru->devices == NULL) {
3473 list_add_rcu(&atsru->list, &dmar_atsr_units);
3478 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3480 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3484 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3486 struct acpi_dmar_atsr *atsr;
3487 struct dmar_atsr_unit *atsru;
3489 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3490 atsru = dmar_find_atsr(atsr);
3492 list_del_rcu(&atsru->list);
3494 intel_iommu_free_atsr(atsru);
3500 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3504 struct acpi_dmar_atsr *atsr;
3505 struct dmar_atsr_unit *atsru;
3507 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3508 atsru = dmar_find_atsr(atsr);
3512 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3513 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3521 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3523 struct dmar_satc_unit *satcu;
3524 struct acpi_dmar_satc *tmp;
3526 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3528 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3529 if (satc->segment != tmp->segment)
3531 if (satc->header.length != tmp->header.length)
3533 if (memcmp(satc, tmp, satc->header.length) == 0)
3540 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3542 struct acpi_dmar_satc *satc;
3543 struct dmar_satc_unit *satcu;
3545 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3548 satc = container_of(hdr, struct acpi_dmar_satc, header);
3549 satcu = dmar_find_satc(satc);
3553 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3557 satcu->hdr = (void *)(satcu + 1);
3558 memcpy(satcu->hdr, hdr, hdr->length);
3559 satcu->atc_required = satc->flags & 0x1;
3560 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3561 (void *)satc + satc->header.length,
3562 &satcu->devices_cnt);
3563 if (satcu->devices_cnt && !satcu->devices) {
3567 list_add_rcu(&satcu->list, &dmar_satc_units);
3572 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3575 struct intel_iommu *iommu = dmaru->iommu;
3577 if (g_iommus[iommu->seq_id])
3580 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3584 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3585 pr_warn("%s: Doesn't support hardware pass through.\n",
3589 if (!ecap_sc_support(iommu->ecap) &&
3590 domain_update_iommu_snooping(iommu)) {
3591 pr_warn("%s: Doesn't support snooping.\n",
3595 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3596 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3597 pr_warn("%s: Doesn't support large page.\n",
3603 * Disable translation if already enabled prior to OS handover.
3605 if (iommu->gcmd & DMA_GCMD_TE)
3606 iommu_disable_translation(iommu);
3608 g_iommus[iommu->seq_id] = iommu;
3609 ret = iommu_init_domains(iommu);
3611 ret = iommu_alloc_root_entry(iommu);
3615 intel_svm_check(iommu);
3617 if (dmaru->ignored) {
3619 * we always have to disable PMRs or DMA may fail on this device
3622 iommu_disable_protect_mem_regions(iommu);
3626 intel_iommu_init_qi(iommu);
3627 iommu_flush_write_buffer(iommu);
3629 #ifdef CONFIG_INTEL_IOMMU_SVM
3630 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3631 ret = intel_svm_enable_prq(iommu);
3636 ret = dmar_set_interrupt(iommu);
3640 iommu_set_root_entry(iommu);
3641 iommu_enable_translation(iommu);
3643 iommu_disable_protect_mem_regions(iommu);
3647 disable_dmar_iommu(iommu);
3649 free_dmar_iommu(iommu);
3653 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3656 struct intel_iommu *iommu = dmaru->iommu;
3658 if (!intel_iommu_enabled)
3664 ret = intel_iommu_add(dmaru);
3666 disable_dmar_iommu(iommu);
3667 free_dmar_iommu(iommu);
3673 static void intel_iommu_free_dmars(void)
3675 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3676 struct dmar_atsr_unit *atsru, *atsr_n;
3677 struct dmar_satc_unit *satcu, *satc_n;
3679 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3680 list_del(&rmrru->list);
3681 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3685 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3686 list_del(&atsru->list);
3687 intel_iommu_free_atsr(atsru);
3689 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3690 list_del(&satcu->list);
3691 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3696 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3698 struct dmar_satc_unit *satcu;
3699 struct acpi_dmar_satc *satc;
3703 dev = pci_physfn(dev);
3706 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3707 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3708 if (satc->segment != pci_domain_nr(dev->bus))
3710 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3711 if (to_pci_dev(tmp) == dev)
3720 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3723 struct pci_bus *bus;
3724 struct pci_dev *bridge = NULL;
3726 struct acpi_dmar_atsr *atsr;
3727 struct dmar_atsr_unit *atsru;
3728 struct dmar_satc_unit *satcu;
3730 dev = pci_physfn(dev);
3731 satcu = dmar_find_matched_satc_unit(dev);
3734 * This device supports ATS as it is in the SATC table.
3735 * When the IOMMU is in legacy mode, ATS is enabled
3736 * automatically by the HW for devices that require
3737 * ATS, hence the OS should not enable ATS on this device,
3738 * to avoid duplicated TLB invalidations.
3740 return !(satcu->atc_required && !sm_supported(iommu));
3742 for (bus = dev->bus; bus; bus = bus->parent) {
3744 /* If it's an integrated device, allow ATS */
3747 /* Connected via non-PCIe: no ATS */
3748 if (!pci_is_pcie(bridge) ||
3749 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3751 /* If we found the root port, look it up in the ATSR */
3752 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3757 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3758 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3759 if (atsr->segment != pci_domain_nr(dev->bus))
3762 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3763 if (tmp == &bridge->dev)
3766 if (atsru->include_all)
3776 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3779 struct dmar_rmrr_unit *rmrru;
3780 struct dmar_atsr_unit *atsru;
3781 struct dmar_satc_unit *satcu;
3782 struct acpi_dmar_atsr *atsr;
3783 struct acpi_dmar_reserved_memory *rmrr;
3784 struct acpi_dmar_satc *satc;
3786 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3789 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3790 rmrr = container_of(rmrru->hdr,
3791 struct acpi_dmar_reserved_memory, header);
3792 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3793 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3794 ((void *)rmrr) + rmrr->header.length,
3795 rmrr->segment, rmrru->devices,
3796 rmrru->devices_cnt);
3799 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3800 dmar_remove_dev_scope(info, rmrr->segment,
3801 rmrru->devices, rmrru->devices_cnt);
3805 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3806 if (atsru->include_all)
3809 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3810 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3811 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3812 (void *)atsr + atsr->header.length,
3813 atsr->segment, atsru->devices,
3814 atsru->devices_cnt);
3819 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3820 if (dmar_remove_dev_scope(info, atsr->segment,
3821 atsru->devices, atsru->devices_cnt))
3825 list_for_each_entry(satcu, &dmar_satc_units, list) {
3826 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3827 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3828 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3829 (void *)satc + satc->header.length,
3830 satc->segment, satcu->devices,
3831 satcu->devices_cnt);
3836 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3837 if (dmar_remove_dev_scope(info, satc->segment,
3838 satcu->devices, satcu->devices_cnt))
3846 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3847 unsigned long val, void *v)
3849 struct memory_notify *mhp = v;
3850 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3851 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3855 case MEM_GOING_ONLINE:
3856 if (iommu_domain_identity_map(si_domain,
3857 start_vpfn, last_vpfn)) {
3858 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3859 start_vpfn, last_vpfn);
3865 case MEM_CANCEL_ONLINE:
3867 struct dmar_drhd_unit *drhd;
3868 struct intel_iommu *iommu;
3869 LIST_HEAD(freelist);
3871 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3874 for_each_active_iommu(iommu, drhd)
3875 iommu_flush_iotlb_psi(iommu, si_domain,
3876 start_vpfn, mhp->nr_pages,
3877 list_empty(&freelist), 0);
3879 put_pages_list(&freelist);
3887 static struct notifier_block intel_iommu_memory_nb = {
3888 .notifier_call = intel_iommu_memory_notifier,
3892 static void intel_disable_iommus(void)
3894 struct intel_iommu *iommu = NULL;
3895 struct dmar_drhd_unit *drhd;
3897 for_each_iommu(iommu, drhd)
3898 iommu_disable_translation(iommu);
3901 void intel_iommu_shutdown(void)
3903 struct dmar_drhd_unit *drhd;
3904 struct intel_iommu *iommu = NULL;
3906 if (no_iommu || dmar_disabled)
3909 down_write(&dmar_global_lock);
3911 /* Disable PMRs explicitly here. */
3912 for_each_iommu(iommu, drhd)
3913 iommu_disable_protect_mem_regions(iommu);
3915 /* Make sure the IOMMUs are switched off */
3916 intel_disable_iommus();
3918 up_write(&dmar_global_lock);
3921 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3923 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3925 return container_of(iommu_dev, struct intel_iommu, iommu);
3928 static ssize_t version_show(struct device *dev,
3929 struct device_attribute *attr, char *buf)
3931 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3932 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3933 return sprintf(buf, "%d:%d\n",
3934 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3936 static DEVICE_ATTR_RO(version);
3938 static ssize_t address_show(struct device *dev,
3939 struct device_attribute *attr, char *buf)
3941 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3942 return sprintf(buf, "%llx\n", iommu->reg_phys);
3944 static DEVICE_ATTR_RO(address);
3946 static ssize_t cap_show(struct device *dev,
3947 struct device_attribute *attr, char *buf)
3949 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3950 return sprintf(buf, "%llx\n", iommu->cap);
3952 static DEVICE_ATTR_RO(cap);
3954 static ssize_t ecap_show(struct device *dev,
3955 struct device_attribute *attr, char *buf)
3957 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3958 return sprintf(buf, "%llx\n", iommu->ecap);
3960 static DEVICE_ATTR_RO(ecap);
3962 static ssize_t domains_supported_show(struct device *dev,
3963 struct device_attribute *attr, char *buf)
3965 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3966 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3968 static DEVICE_ATTR_RO(domains_supported);
3970 static ssize_t domains_used_show(struct device *dev,
3971 struct device_attribute *attr, char *buf)
3973 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3974 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3975 cap_ndoms(iommu->cap)));
3977 static DEVICE_ATTR_RO(domains_used);
3979 static struct attribute *intel_iommu_attrs[] = {
3980 &dev_attr_version.attr,
3981 &dev_attr_address.attr,
3983 &dev_attr_ecap.attr,
3984 &dev_attr_domains_supported.attr,
3985 &dev_attr_domains_used.attr,
3989 static struct attribute_group intel_iommu_group = {
3990 .name = "intel-iommu",
3991 .attrs = intel_iommu_attrs,
3994 const struct attribute_group *intel_iommu_groups[] = {
3999 static inline bool has_external_pci(void)
4001 struct pci_dev *pdev = NULL;
4003 for_each_pci_dev(pdev)
4004 if (pdev->external_facing)
4010 static int __init platform_optin_force_iommu(void)
4012 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4015 if (no_iommu || dmar_disabled)
4016 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4019 * If Intel-IOMMU is disabled by default, we will apply identity
4020 * map for all devices except those marked as being untrusted.
4023 iommu_set_default_passthrough(false);
4031 static int __init probe_acpi_namespace_devices(void)
4033 struct dmar_drhd_unit *drhd;
4034 /* To avoid a -Wunused-but-set-variable warning. */
4035 struct intel_iommu *iommu __maybe_unused;
4039 for_each_active_iommu(iommu, drhd) {
4040 for_each_active_dev_scope(drhd->devices,
4041 drhd->devices_cnt, i, dev) {
4042 struct acpi_device_physical_node *pn;
4043 struct iommu_group *group;
4044 struct acpi_device *adev;
4046 if (dev->bus != &acpi_bus_type)
4049 adev = to_acpi_device(dev);
4050 mutex_lock(&adev->physical_node_lock);
4051 list_for_each_entry(pn,
4052 &adev->physical_node_list, node) {
4053 group = iommu_group_get(pn->dev);
4055 iommu_group_put(group);
4059 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4060 ret = iommu_probe_device(pn->dev);
4064 mutex_unlock(&adev->physical_node_lock);
4074 int __init intel_iommu_init(void)
4077 struct dmar_drhd_unit *drhd;
4078 struct intel_iommu *iommu;
4081 * Intel IOMMU is required for a TXT/tboot launch or platform
4082 * opt in, so enforce that.
4084 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4085 platform_optin_force_iommu();
4087 down_write(&dmar_global_lock);
4088 if (dmar_table_init()) {
4090 panic("tboot: Failed to initialize DMAR table\n");
4094 if (dmar_dev_scope_init() < 0) {
4096 panic("tboot: Failed to initialize DMAR device scope\n");
4100 up_write(&dmar_global_lock);
4103 * The bus notifier takes the dmar_global_lock, so lockdep will
4104 * complain later when we register it under the lock.
4106 dmar_register_bus_notifier();
4108 down_write(&dmar_global_lock);
4111 intel_iommu_debugfs_init();
4113 if (no_iommu || dmar_disabled) {
4115 * We exit the function here to ensure IOMMU's remapping and
4116 * mempool aren't set up, which means that the IOMMU's PMRs
4117 * won't be disabled via the call to init_dmars(). So disable
4118 * them explicitly here. The PMRs were set up by tboot prior to
4119 * calling SENTER, but the kernel is expected to reset/tear them down.
4122 if (intel_iommu_tboot_noforce) {
4123 for_each_iommu(iommu, drhd)
4124 iommu_disable_protect_mem_regions(iommu);
4128 * Make sure the IOMMUs are switched off, even when we
4129 * boot into a kexec kernel and the previous kernel left them enabled.
4132 intel_disable_iommus();
4136 if (list_empty(&dmar_rmrr_units))
4137 pr_info("No RMRR found\n");
4139 if (list_empty(&dmar_atsr_units))
4140 pr_info("No ATSR found\n");
4142 if (list_empty(&dmar_satc_units))
4143 pr_info("No SATC found\n");
4146 intel_iommu_gfx_mapped = 1;
4148 init_no_remapping_devices();
4153 panic("tboot: Failed to initialize DMARs\n");
4154 pr_err("Initialization failed\n");
4157 up_write(&dmar_global_lock);
4159 init_iommu_pm_ops();
4161 down_read(&dmar_global_lock);
4162 for_each_active_iommu(iommu, drhd) {
4164 * The flush queue implementation does not perform
4165 * page-selective invalidations that are required for efficient
4166 * TLB flushes in virtual environments. The benefit of batching
4167 * is likely to be much lower than the overhead of synchronizing
4168 * the virtual and physical IOMMU page-tables.
4170 if (cap_caching_mode(iommu->cap)) {
4171 pr_info_once("IOMMU batching disallowed due to virtualization\n");
4172 iommu_set_dma_strict();
4174 iommu_device_sysfs_add(&iommu->iommu, NULL,
4177 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4179 up_read(&dmar_global_lock);
4181 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4182 if (si_domain && !hw_pass_through)
4183 register_memory_notifier(&intel_iommu_memory_nb);
4185 down_read(&dmar_global_lock);
4186 if (probe_acpi_namespace_devices())
4187 pr_warn("ACPI name space devices didn't probe correctly\n");
4189 /* Finally, we enable the DMA remapping hardware. */
4190 for_each_iommu(iommu, drhd) {
4191 if (!drhd->ignored && !translation_pre_enabled(iommu))
4192 iommu_enable_translation(iommu);
4194 iommu_disable_protect_mem_regions(iommu);
4196 up_read(&dmar_global_lock);
4198 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4200 intel_iommu_enabled = 1;
4205 intel_iommu_free_dmars();
4206 up_write(&dmar_global_lock);
4210 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4212 struct device_domain_info *info = opaque;
4214 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4219 * NB - intel-iommu lacks any sort of reference counting for the users of
4220 * dependent devices. If multiple endpoints have intersecting dependent
4221 * devices, unbinding the driver from any one of them will possibly leave
4222 * the others unable to operate.
4224 static void domain_context_clear(struct device_domain_info *info)
4226 if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4229 pci_for_each_dma_alias(to_pci_dev(info->dev),
4230 &domain_context_clear_one_cb, info);
4233 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4235 struct dmar_domain *domain;
4236 struct intel_iommu *iommu;
4237 unsigned long flags;
4239 assert_spin_locked(&device_domain_lock);
4244 iommu = info->iommu;
4245 domain = info->domain;
4247 if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4248 if (dev_is_pci(info->dev) && sm_supported(iommu))
4249 intel_pasid_tear_down_entry(iommu, info->dev,
4250 PASID_RID2PASID, false);
4252 iommu_disable_dev_iotlb(info);
4253 domain_context_clear(info);
4254 intel_pasid_free_table(info->dev);
4257 list_del(&info->link);
4259 spin_lock_irqsave(&iommu->lock, flags);
4260 domain_detach_iommu(domain, iommu);
4261 spin_unlock_irqrestore(&iommu->lock, flags);
4264 static void dmar_remove_one_dev_info(struct device *dev)
4266 struct device_domain_info *info;
4267 unsigned long flags;
4269 spin_lock_irqsave(&device_domain_lock, flags);
4270 info = dev_iommu_priv_get(dev);
4272 __dmar_remove_one_dev_info(info);
4273 spin_unlock_irqrestore(&device_domain_lock, flags);
4276 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4280 /* calculate AGAW */
4281 domain->gaw = guest_width;
4282 adjust_width = guestwidth_to_adjustwidth(guest_width);
4283 domain->agaw = width_to_agaw(adjust_width);
4285 domain->iommu_coherency = false;
4286 domain->iommu_snooping = false;
4287 domain->iommu_superpage = 0;
4288 domain->max_addr = 0;
4290 /* always allocate the top pgd */
4291 domain->pgd = alloc_pgtable_page(domain->nid);
4294 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4298 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4300 struct dmar_domain *dmar_domain;
4301 struct iommu_domain *domain;
4304 case IOMMU_DOMAIN_DMA:
4305 case IOMMU_DOMAIN_DMA_FQ:
4306 case IOMMU_DOMAIN_UNMANAGED:
4307 dmar_domain = alloc_domain(type);
4309 pr_err("Can't allocate dmar_domain\n");
4312 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4313 pr_err("Domain initialization failed\n");
4314 domain_exit(dmar_domain);
4318 domain = &dmar_domain->domain;
4319 domain->geometry.aperture_start = 0;
4320 domain->geometry.aperture_end =
4321 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4322 domain->geometry.force_aperture = true;
4325 case IOMMU_DOMAIN_IDENTITY:
4326 return &si_domain->domain;
4334 static void intel_iommu_domain_free(struct iommu_domain *domain)
4336 if (domain != &si_domain->domain)
4337 domain_exit(to_dmar_domain(domain));
4340 static int prepare_domain_attach_device(struct iommu_domain *domain,
4343 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4344 struct intel_iommu *iommu;
4347 iommu = device_to_iommu(dev, NULL, NULL);
4351 /* check if this iommu agaw is sufficient for max mapped address */
4352 addr_width = agaw_to_width(iommu->agaw);
4353 if (addr_width > cap_mgaw(iommu->cap))
4354 addr_width = cap_mgaw(iommu->cap);
4356 if (dmar_domain->max_addr > (1LL << addr_width)) {
4357 dev_err(dev, "%s: iommu width (%d) is not "
4358 "sufficient for the mapped address (%llx)\n",
4359 __func__, addr_width, dmar_domain->max_addr);
4362 dmar_domain->gaw = addr_width;
4365 * Knock out extra levels of page tables if necessary
4367 while (iommu->agaw < dmar_domain->agaw) {
4368 struct dma_pte *pte;
4370 pte = dmar_domain->pgd;
4371 if (dma_pte_present(pte)) {
4372 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4373 free_pgtable_page(pte);
4375 dmar_domain->agaw--;
4381 static int intel_iommu_attach_device(struct iommu_domain *domain,
4386 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4387 device_is_rmrr_locked(dev)) {
4388 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4392 /* normally dev is not mapped */
4393 if (unlikely(domain_context_mapped(dev))) {
4394 struct device_domain_info *info = dev_iommu_priv_get(dev);
4397 dmar_remove_one_dev_info(dev);
4400 ret = prepare_domain_attach_device(domain, dev);
4404 return domain_add_dev_info(to_dmar_domain(domain), dev);
4407 static void intel_iommu_detach_device(struct iommu_domain *domain,
4410 dmar_remove_one_dev_info(dev);
4413 static int intel_iommu_map(struct iommu_domain *domain,
4414 unsigned long iova, phys_addr_t hpa,
4415 size_t size, int iommu_prot, gfp_t gfp)
4417 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4421 if (iommu_prot & IOMMU_READ)
4422 prot |= DMA_PTE_READ;
4423 if (iommu_prot & IOMMU_WRITE)
4424 prot |= DMA_PTE_WRITE;
4425 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4426 prot |= DMA_PTE_SNP;
4428 max_addr = iova + size;
4429 if (dmar_domain->max_addr < max_addr) {
4432 /* check if minimum agaw is sufficient for mapped address */
4433 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4434 if (end < max_addr) {
4435 pr_err("%s: iommu width (%d) is not "
4436 "sufficient for the mapped address (%llx)\n",
4437 __func__, dmar_domain->gaw, max_addr);
4440 dmar_domain->max_addr = max_addr;
4442 /* Round up size to next multiple of PAGE_SIZE, if it and
4443 the low bits of hpa would take us onto the next page */
4444 size = aligned_nrpages(hpa, size);
4445 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4446 hpa >> VTD_PAGE_SHIFT, size, prot);
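/*
 * Worked example (illustrative): with hpa = 0x1234 and size = 0x2000 the
 * mapping touches three 4KiB pages (0x1000-0x3fff), so the rounded-up page
 * count handed to __domain_mapping() above is 3, not 2.
 */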
4449 static int intel_iommu_map_pages(struct iommu_domain *domain,
4450 unsigned long iova, phys_addr_t paddr,
4451 size_t pgsize, size_t pgcount,
4452 int prot, gfp_t gfp, size_t *mapped)
4454 unsigned long pgshift = __ffs(pgsize);
4455 size_t size = pgcount << pgshift;
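	/*
	 * Example: pgsize = SZ_2M gives pgshift = 21, so pgcount = 4 asks
	 * for 8MiB (4 << 21 bytes) to be mapped in one call, provided iova
	 * and paddr are 2MiB aligned.
	 */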
4458 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4461 if (!IS_ALIGNED(iova | paddr, pgsize))
4464 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4471 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4472 unsigned long iova, size_t size,
4473 struct iommu_iotlb_gather *gather)
4475 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4476 unsigned long start_pfn, last_pfn;
4479 /* Cope with horrid API which requires us to unmap more than the
4480 size argument if it happens to be a large-page mapping. */
4481 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4483 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4484 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4486 start_pfn = iova >> VTD_PAGE_SHIFT;
4487 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4489 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4491 if (dmar_domain->max_addr == iova + size)
4492 dmar_domain->max_addr = iova;
4494 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4499 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4501 size_t pgsize, size_t pgcount,
4502 struct iommu_iotlb_gather *gather)
4504 unsigned long pgshift = __ffs(pgsize);
4505 size_t size = pgcount << pgshift;
4507 return intel_iommu_unmap(domain, iova, size, gather);
4510 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4511 struct iommu_iotlb_gather *gather)
4513 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4514 unsigned long iova_pfn = IOVA_PFN(gather->start);
4515 size_t size = gather->end - gather->start;
4516 unsigned long start_pfn;
4517 unsigned long nrpages;
4520 nrpages = aligned_nrpages(gather->start, size);
4521 start_pfn = mm_to_dma_pfn(iova_pfn);
4523 for_each_domain_iommu(iommu_id, dmar_domain)
4524 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
4526 list_empty(&gather->freelist), 0);
4528 put_pages_list(&gather->freelist);
4531 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4534 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4535 struct dma_pte *pte;
4539 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4540 if (pte && dma_pte_present(pte))
4541 phys = dma_pte_addr(pte) +
4542 (iova & (BIT_MASK(level_to_offset_bits(level) +
4543 VTD_PAGE_SHIFT) - 1));
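	/*
	 * Example: for a 2MiB superpage PTE found at level 2,
	 * level_to_offset_bits() is 9, so the low 9 + 12 = 21 bits of the
	 * IOVA are kept as the offset into the mapped region.
	 */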
4548 static bool intel_iommu_capable(enum iommu_cap cap)
4550 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4551 return domain_update_iommu_snooping(NULL);
4552 if (cap == IOMMU_CAP_INTR_REMAP)
4553 return irq_remapping_enabled == 1;
4558 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4560 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4561 struct device_domain_info *info;
4562 struct intel_iommu *iommu;
4563 unsigned long flags;
4566 iommu = device_to_iommu(dev, &bus, &devfn);
4568 return ERR_PTR(-ENODEV);
4570 info = kzalloc(sizeof(*info), GFP_KERNEL);
4572 return ERR_PTR(-ENOMEM);
4574 if (dev_is_real_dma_subdevice(dev)) {
4575 info->bus = pdev->bus->number;
4576 info->devfn = pdev->devfn;
4577 info->segment = pci_domain_nr(pdev->bus);
4580 info->devfn = devfn;
4581 info->segment = iommu->segment;
4585 info->iommu = iommu;
4586 if (dev_is_pci(dev)) {
4587 if (ecap_dev_iotlb_support(iommu->ecap) &&
4588 pci_ats_supported(pdev) &&
4589 dmar_ats_supported(pdev, iommu))
4590 info->ats_supported = 1;
4592 if (sm_supported(iommu)) {
4593 if (pasid_supported(iommu)) {
4594 int features = pci_pasid_features(pdev);
4597 info->pasid_supported = features | 1;
4600 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4601 pci_pri_supported(pdev))
4602 info->pri_supported = 1;
4606 spin_lock_irqsave(&device_domain_lock, flags);
4607 list_add(&info->global, &device_domain_list);
4608 dev_iommu_priv_set(dev, info);
4609 spin_unlock_irqrestore(&device_domain_lock, flags);
4611 return &iommu->iommu;
4614 static void intel_iommu_release_device(struct device *dev)
4616 struct device_domain_info *info = dev_iommu_priv_get(dev);
4617 unsigned long flags;
4619 dmar_remove_one_dev_info(dev);
4621 spin_lock_irqsave(&device_domain_lock, flags);
4622 dev_iommu_priv_set(dev, NULL);
4623 list_del(&info->global);
4624 spin_unlock_irqrestore(&device_domain_lock, flags);
4627 set_dma_ops(dev, NULL);
4630 static void intel_iommu_probe_finalize(struct device *dev)
4632 set_dma_ops(dev, NULL);
4633 iommu_setup_dma_ops(dev, 0, U64_MAX);
4636 static void intel_iommu_get_resv_regions(struct device *device,
4637 struct list_head *head)
4639 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4640 struct iommu_resv_region *reg;
4641 struct dmar_rmrr_unit *rmrr;
4642 struct device *i_dev;
4645 down_read(&dmar_global_lock);
4646 for_each_rmrr_units(rmrr) {
4647 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4649 struct iommu_resv_region *resv;
4650 enum iommu_resv_type type;
4653 if (i_dev != device &&
4654 !is_downstream_to_pci_bridge(device, i_dev))
4657 length = rmrr->end_address - rmrr->base_address + 1;
4659 type = device_rmrr_is_relaxable(device) ?
4660 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4662 resv = iommu_alloc_resv_region(rmrr->base_address,
4663 length, prot, type);
4667 list_add_tail(&resv->list, head);
4670 up_read(&dmar_global_lock);
4672 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4673 if (dev_is_pci(device)) {
4674 struct pci_dev *pdev = to_pci_dev(device);
4676 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4677 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4678 IOMMU_RESV_DIRECT_RELAXABLE);
4680 list_add_tail(&reg->list, head);
4683 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4685 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4686 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4690 list_add_tail(&reg->list, head);
4693 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
4695 struct device_domain_info *info = dev_iommu_priv_get(dev);
4696 struct context_entry *context;
4697 struct dmar_domain *domain;
4698 unsigned long flags;
4702 domain = info->domain;
4706 spin_lock_irqsave(&device_domain_lock, flags);
4707 spin_lock(&iommu->lock);
4710 if (!info->pasid_supported)
4713 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
4714 if (WARN_ON(!context))
4717 ctx_lo = context[0].lo;
4719 if (!(ctx_lo & CONTEXT_PASIDE)) {
4720 ctx_lo |= CONTEXT_PASIDE;
4721 context[0].lo = ctx_lo;
4723 iommu->flush.flush_context(iommu,
4724 domain->iommu_did[iommu->seq_id],
4725 PCI_DEVID(info->bus, info->devfn),
4726 DMA_CCMD_MASK_NOBIT,
4727 DMA_CCMD_DEVICE_INVL);
4730 /* Enable PASID support in the device, if it wasn't already */
4731 if (!info->pasid_enabled)
4732 iommu_enable_dev_iotlb(info);
4737 spin_unlock(&iommu->lock);
4738 spin_unlock_irqrestore(&device_domain_lock, flags);
4743 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4745 if (dev_is_pci(dev))
4746 return pci_device_group(dev);
4747 return generic_device_group(dev);
4750 static int intel_iommu_enable_sva(struct device *dev)
4752 struct device_domain_info *info = dev_iommu_priv_get(dev);
4753 struct intel_iommu *iommu;
4756 if (!info || dmar_disabled)
4759 iommu = info->iommu;
4763 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4766 if (intel_iommu_enable_pasid(iommu, dev))
4769 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4772 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4774 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4779 static int intel_iommu_disable_sva(struct device *dev)
4781 struct device_domain_info *info = dev_iommu_priv_get(dev);
4782 struct intel_iommu *iommu = info->iommu;
4785 ret = iommu_unregister_device_fault_handler(dev);
4787 ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4792 static int intel_iommu_enable_iopf(struct device *dev)
4794 struct device_domain_info *info = dev_iommu_priv_get(dev);
4796 if (info && info->pri_supported)
4803 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4806 case IOMMU_DEV_FEAT_IOPF:
4807 return intel_iommu_enable_iopf(dev);
4809 case IOMMU_DEV_FEAT_SVA:
4810 return intel_iommu_enable_sva(dev);
4818 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4821 case IOMMU_DEV_FEAT_IOPF:
4824 case IOMMU_DEV_FEAT_SVA:
4825 return intel_iommu_disable_sva(dev);
4832 static bool intel_iommu_is_attach_deferred(struct device *dev)
4834 struct device_domain_info *info = dev_iommu_priv_get(dev);
4836 return translation_pre_enabled(info->iommu) && !info->domain;
4840 * Check that the device does not live on an external facing PCI port that is
4841 * marked as untrusted. Such devices should not be able to apply quirks and
4842 * thus not be able to bypass the IOMMU restrictions.
4844 static bool risky_device(struct pci_dev *pdev)
4846 if (pdev->untrusted) {
4848 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4849 pdev->vendor, pdev->device);
4850 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
				       unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long pages = aligned_nrpages(iova, size);
	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
	struct intel_iommu *iommu;
	int iommu_id;

	for_each_domain_iommu(iommu_id, dmar_domain) {
		iommu = g_iommus[iommu_id];
		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
	}
}
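
/* Intel VT-d callbacks for the generic IOMMU API. */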
const struct iommu_ops intel_iommu_ops = {
	.capable = intel_iommu_capable,
	.domain_alloc = intel_iommu_domain_alloc,
	.probe_device = intel_iommu_probe_device,
	.probe_finalize = intel_iommu_probe_finalize,
	.release_device = intel_iommu_release_device,
	.get_resv_regions = intel_iommu_get_resv_regions,
	.put_resv_regions = generic_iommu_put_resv_regions,
	.device_group = intel_iommu_device_group,
	.dev_enable_feat = intel_iommu_dev_enable_feat,
	.dev_disable_feat = intel_iommu_dev_disable_feat,
	.is_attach_deferred = intel_iommu_is_attach_deferred,
	.def_domain_type = device_def_domain_type,
	.pgsize_bitmap = SZ_4K,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.sva_bind = intel_svm_bind,
	.sva_unbind = intel_svm_unbind,
	.sva_get_pasid = intel_svm_get_pasid,
	.page_response = intel_svm_page_response,
#endif
	.default_domain_ops = &(const struct iommu_domain_ops) {
		.attach_dev = intel_iommu_attach_device,
		.detach_dev = intel_iommu_detach_device,
		.map_pages = intel_iommu_map_pages,
		.unmap_pages = intel_iommu_unmap_pages,
		.iotlb_sync_map = intel_iommu_iotlb_sync_map,
		.flush_iotlb_all = intel_flush_iotlb_all,
		.iotlb_sync = intel_iommu_tlb_sync,
		.iova_to_phys = intel_iommu_iova_to_phys,
		.free = intel_iommu_domain_free,
	}
};

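/*
 * The quirks below clear dmar_map_gfx for integrated graphics devices whose
 * DMA remapping support is known to be broken, so their DMA is not remapped.
 */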
static void quirk_iommu_igfx(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

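/*
 * Graphics memory controller (GGC) config-space fields consulted by the
 * Calpella/Ironlake quirk below to see whether the BIOS allocated a shadow
 * GTT for VT-d.
 */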
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (risky_device(dev))
		return;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		iommu_set_dma_strict();
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

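/*
 * On these integrated graphics devices, skip clearing the translation-enable
 * bit when the IOMMU is being disabled (see iommu_skip_te_disable).
 */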
static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
	unsigned short ver;

	if (!IS_GFX_DEVICE(dev))
		return;

	ver = (dev->device >> 8) & 0xff;
	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
	    ver != 0x9a && ver != 0xa7)
		return;

	if (risky_device(dev))
		return;

	pci_info(dev, "Skip IOMMU disabling for graphics\n");
	iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}
	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}