// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "../irq_remapping.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
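/* 8086:3a3e is Intel's HD Audio (Azalia) controller. */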
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
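/* A 57-bit guest address width corresponds to 5-level page tables (agaw 3). */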
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
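/* Each page-table level translates LEVEL_STRIDE (9) bits of the DMA PFN, i.e. 512 entries per table. */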
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES (~0xFFFUL)
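/*
 * AGAW (adjusted guest address width) encodes the page-table depth:
 * width = 30 + agaw * LEVEL_STRIDE, so agaw 1/2/3 correspond to
 * 39/48/57-bit address widths, i.e. 3/4/5-level page tables.
 */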
106 static inline int agaw_to_level(int agaw)
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(u64 pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline u64 level_mask(int level)
133 return -1ULL << level_to_offset_bits(level);
136 static inline u64 level_size(int level)
138 return 1ULL << level_to_offset_bits(level);
141 static inline u64 align_to_level(u64 pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
/*
 * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
 * (used when the kernel is launched with TXT).
 */
static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
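/* i.e. 256 root entries in the 4KiB root table (root_entry is 16 bytes), one per PCI bus number. */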
/* Take a root_entry and return the Lower Context Table Pointer (LCTP). */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 return re->lo & VTD_PAGE_MASK;
/* Take a root_entry and return the Upper Context Table Pointer (UCTP). */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
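/* Iterate over the iommus a domain is currently attached to (nonzero per-iommu refcount). */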
299 #define for_each_domain_iommu(idx, domain) \
300 for (idx = 0; idx < g_num_of_iommus; idx++) \
301 if (domain->iommu_refcnt[idx])
303 struct dmar_rmrr_unit {
304 struct list_head list; /* list of rmrr units */
305 struct acpi_dmar_header *hdr; /* ACPI header */
306 u64 base_address; /* reserved base address*/
307 u64 end_address; /* reserved end address */
308 struct dmar_dev_scope *devices; /* target devices */
309 int devices_cnt; /* target device count */
312 struct dmar_atsr_unit {
313 struct list_head list; /* list of ATSR units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 struct dmar_dev_scope *devices; /* target devices */
316 int devices_cnt; /* target device count */
317 u8 include_all:1; /* include all ports */
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
323 #define for_each_rmrr_units(rmrr) \
324 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
326 /* bitmap for indexing intel_iommus */
327 static int g_num_of_iommus;
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
361 #define IDENTMAP_GFX 2
362 #define IDENTMAP_AZALIA 4
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
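/* Sentinel stored in dev_iommu_priv to mark a device whose domain attachment is deferred until first use. */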
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
370 struct device_domain_info *info;
375 info = dev_iommu_priv_get(dev);
376 if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
382 DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
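/* Bounce buffering is applied only to untrusted PCI devices (typically those behind an external-facing port). */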
385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
386 to_pci_dev(d)->untrusted)
389 * Iterate over elements in device_domain_list and call the specified
390 * callback @fn against each element.
392 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393 void *data), void *data)
397 struct device_domain_info *info;
399 spin_lock_irqsave(&device_domain_lock, flags);
400 list_for_each_entry(info, &device_domain_list, global) {
401 ret = fn(info, data);
403 spin_unlock_irqrestore(&device_domain_lock, flags);
407 spin_unlock_irqrestore(&device_domain_lock, flags);
412 const struct iommu_ops intel_iommu_ops;
414 static bool translation_pre_enabled(struct intel_iommu *iommu)
416 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
419 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
421 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
424 static void init_translation_status(struct intel_iommu *iommu)
428 gsts = readl(iommu->reg + DMAR_GSTS_REG);
429 if (gsts & DMA_GSTS_TES)
430 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
433 static int __init intel_iommu_setup(char *str)
438 if (!strncmp(str, "on", 2)) {
440 pr_info("IOMMU enabled\n");
441 } else if (!strncmp(str, "off", 3)) {
443 no_platform_optin = 1;
444 pr_info("IOMMU disabled\n");
445 } else if (!strncmp(str, "igfx_off", 8)) {
447 pr_info("Disable GFX device mapping\n");
448 } else if (!strncmp(str, "forcedac", 8)) {
449 pr_info("Forcing DAC for PCI devices\n");
451 } else if (!strncmp(str, "strict", 6)) {
452 pr_info("Disable batched IOTLB flush\n");
453 intel_iommu_strict = 1;
454 } else if (!strncmp(str, "sp_off", 6)) {
455 pr_info("Disable supported super page\n");
456 intel_iommu_superpage = 0;
457 } else if (!strncmp(str, "sm_on", 5)) {
458 pr_info("Intel-IOMMU: scalable mode supported\n");
460 } else if (!strncmp(str, "tboot_noforce", 13)) {
461 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462 intel_iommu_tboot_noforce = 1;
463 } else if (!strncmp(str, "nobounce", 8)) {
464 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
468 str += strcspn(str, ",");
474 __setup("intel_iommu=", intel_iommu_setup);
476 static struct kmem_cache *iommu_domain_cache;
477 static struct kmem_cache *iommu_devinfo_cache;
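/*
 * Per-iommu domain pointers are kept in a two-level array indexed by
 * domain id: iommu->domains[did >> 8][did & 0xff].
 */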
479 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
481 struct dmar_domain **domains;
484 domains = iommu->domains[idx];
488 return domains[did & 0xff];
491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492 struct dmar_domain *domain)
494 struct dmar_domain **domains;
497 if (!iommu->domains[idx]) {
498 size_t size = 256 * sizeof(struct dmar_domain *);
499 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
502 domains = iommu->domains[idx];
503 if (WARN_ON(!domains))
506 domains[did & 0xff] = domain;
509 void *alloc_pgtable_page(int node)
514 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
516 vaddr = page_address(page);
520 void free_pgtable_page(void *vaddr)
522 free_page((unsigned long)vaddr);
525 static inline void *alloc_domain_mem(void)
527 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
530 static void free_domain_mem(void *vaddr)
532 kmem_cache_free(iommu_domain_cache, vaddr);
535 static inline void * alloc_devinfo_mem(void)
537 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
540 static inline void free_devinfo_mem(void *vaddr)
542 kmem_cache_free(iommu_devinfo_cache, vaddr);
545 static inline int domain_type_is_si(struct dmar_domain *domain)
547 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
550 static inline bool domain_use_first_level(struct dmar_domain *domain)
552 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
558 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
560 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
568 sagaw = cap_sagaw(iommu->cap);
569 for (agaw = width_to_agaw(max_gaw);
571 if (test_bit(agaw, &sagaw))
/* Calculate max SAGAW for each iommu. */
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
583 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
/*
 * Calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, so use a default agaw and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default agaw.
 */
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
593 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
/* This function only returns a single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
601 /* si_domain and vm domain should not get here. */
602 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
605 for_each_domain_iommu(iommu_id, domain)
608 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
611 return g_iommus[iommu_id];
614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
616 return sm_supported(iommu) ?
617 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
622 struct dmar_drhd_unit *drhd;
623 struct intel_iommu *iommu;
627 domain->iommu_coherency = 1;
629 for_each_domain_iommu(i, domain) {
631 if (!iommu_paging_structure_coherency(g_iommus[i])) {
632 domain->iommu_coherency = 0;
639 /* No hardware attached; use lowest common denominator */
641 for_each_active_iommu(iommu, drhd) {
642 if (!iommu_paging_structure_coherency(iommu)) {
643 domain->iommu_coherency = 0;
650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
652 struct dmar_drhd_unit *drhd;
653 struct intel_iommu *iommu;
657 for_each_active_iommu(iommu, drhd) {
659 if (!ecap_sc_support(iommu->ecap)) {
670 static int domain_update_iommu_superpage(struct dmar_domain *domain,
671 struct intel_iommu *skip)
673 struct dmar_drhd_unit *drhd;
674 struct intel_iommu *iommu;
677 if (!intel_iommu_superpage) {
681 /* set iommu_superpage to the smallest common denominator */
683 for_each_active_iommu(iommu, drhd) {
685 if (domain && domain_use_first_level(domain)) {
686 if (!cap_fl1gp_support(iommu->cap))
689 mask &= cap_super_page_val(iommu->cap);
701 static int domain_update_device_node(struct dmar_domain *domain)
703 struct device_domain_info *info;
704 int nid = NUMA_NO_NODE;
706 assert_spin_locked(&device_domain_lock);
708 if (list_empty(&domain->devices))
711 list_for_each_entry(info, &domain->devices, link) {
/*
 * There could possibly be multiple device NUMA nodes, as devices
 * within the same domain may sit behind different IOMMUs. There is
 * no perfect answer in such a situation, so we select a first-come,
 * first-served policy.
 */
721 nid = dev_to_node(info->dev);
722 if (nid != NUMA_NO_NODE)
729 /* Some capabilities may be different across iommus */
730 static void domain_update_iommu_cap(struct dmar_domain *domain)
732 domain_update_iommu_coherency(domain);
733 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
734 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
/*
 * If RHSA is missing, we should default to the device NUMA domain
 * as well.
 */
740 if (domain->nid == NUMA_NO_NODE)
741 domain->nid = domain_update_device_node(domain);
744 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
747 struct root_entry *root = &iommu->root_entry[bus];
748 struct context_entry *context;
752 if (sm_supported(iommu)) {
760 context = phys_to_virt(*entry & VTD_PAGE_MASK);
762 unsigned long phy_addr;
766 context = alloc_pgtable_page(iommu->node);
770 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
771 phy_addr = virt_to_phys((void *)context);
772 *entry = phy_addr | 1;
773 __iommu_flush_cache(iommu, entry, sizeof(*entry));
775 return &context[devfn];
778 static bool attach_deferred(struct device *dev)
780 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
784 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
785 * sub-hierarchy of a candidate PCI-PCI bridge
786 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
787 * @bridge: the candidate PCI-PCI bridge
789 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
792 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
794 struct pci_dev *pdev, *pbridge;
796 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
799 pdev = to_pci_dev(dev);
800 pbridge = to_pci_dev(bridge);
802 if (pbridge->subordinate &&
803 pbridge->subordinate->number <= pdev->bus->number &&
804 pbridge->subordinate->busn_res.end >= pdev->bus->number)
810 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
812 struct dmar_drhd_unit *drhd;
816 /* We know that this device on this chipset has its own IOMMU.
817 * If we find it under a different IOMMU, then the BIOS is lying
818 * to us. Hope that the IOMMU for this device is actually
819 * disabled, and it needs no translation...
821 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
824 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
/* we know that this iommu should be at offset 0xa000 from vtbar */
830 drhd = dmar_find_matched_drhd_unit(pdev);
831 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
832 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
833 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
840 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
842 if (!iommu || iommu->drhd->ignored)
845 if (dev_is_pci(dev)) {
846 struct pci_dev *pdev = to_pci_dev(dev);
848 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
849 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
850 quirk_ioat_snb_local_iommu(pdev))
857 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
859 struct dmar_drhd_unit *drhd = NULL;
860 struct pci_dev *pdev = NULL;
861 struct intel_iommu *iommu;
869 if (dev_is_pci(dev)) {
870 struct pci_dev *pf_pdev;
872 pdev = pci_real_dma_dev(to_pci_dev(dev));
874 /* VFs aren't listed in scope tables; we need to look up
875 * the PF instead to find the IOMMU. */
876 pf_pdev = pci_physfn(pdev);
878 segment = pci_domain_nr(pdev->bus);
879 } else if (has_acpi_companion(dev))
880 dev = &ACPI_COMPANION(dev)->dev;
883 for_each_iommu(iommu, drhd) {
884 if (pdev && segment != drhd->segment)
887 for_each_active_dev_scope(drhd->devices,
888 drhd->devices_cnt, i, tmp) {
890 /* For a VF use its original BDF# not that of the PF
891 * which we used for the IOMMU lookup. Strictly speaking
892 * we could do this for all PCI devices; we only need to
893 * get the BDF# from the scope table for ACPI matches. */
894 if (pdev && pdev->is_virtfn)
898 *bus = drhd->devices[i].bus;
899 *devfn = drhd->devices[i].devfn;
904 if (is_downstream_to_pci_bridge(dev, tmp))
908 if (pdev && drhd->include_all) {
911 *bus = pdev->bus->number;
912 *devfn = pdev->devfn;
919 if (iommu_is_dummy(iommu, dev))
927 static void domain_flush_cache(struct dmar_domain *domain,
928 void *addr, int size)
930 if (!domain->iommu_coherency)
931 clflush_cache_range(addr, size);
934 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
936 struct context_entry *context;
940 spin_lock_irqsave(&iommu->lock, flags);
941 context = iommu_context_addr(iommu, bus, devfn, 0);
943 ret = context_present(context);
944 spin_unlock_irqrestore(&iommu->lock, flags);
948 static void free_context_table(struct intel_iommu *iommu)
952 struct context_entry *context;
954 spin_lock_irqsave(&iommu->lock, flags);
955 if (!iommu->root_entry) {
958 for (i = 0; i < ROOT_ENTRY_NR; i++) {
959 context = iommu_context_addr(iommu, i, 0, 0);
961 free_pgtable_page(context);
963 if (!sm_supported(iommu))
966 context = iommu_context_addr(iommu, i, 0x80, 0);
968 free_pgtable_page(context);
971 free_pgtable_page(iommu->root_entry);
972 iommu->root_entry = NULL;
974 spin_unlock_irqrestore(&iommu->lock, flags);
977 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
978 unsigned long pfn, int *target_level)
980 struct dma_pte *parent, *pte;
981 int level = agaw_to_level(domain->agaw);
984 BUG_ON(!domain->pgd);
986 if (!domain_pfn_supported(domain, pfn))
987 /* Address beyond IOMMU's addressing capabilities. */
990 parent = domain->pgd;
995 offset = pfn_level_offset(pfn, level);
996 pte = &parent[offset];
997 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
999 if (level == *target_level)
1002 if (!dma_pte_present(pte)) {
1005 tmp_page = alloc_pgtable_page(domain->nid);
1010 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1011 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1012 if (domain_use_first_level(domain))
1013 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1014 if (cmpxchg64(&pte->val, 0ULL, pteval))
1015 /* Someone else set it while we were thinking; use theirs. */
1016 free_pgtable_page(tmp_page);
1018 domain_flush_cache(domain, pte, sizeof(*pte));
1023 parent = phys_to_virt(dma_pte_addr(pte));
1028 *target_level = level;
1033 /* return address's pte at specific level */
1034 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1036 int level, int *large_page)
1038 struct dma_pte *parent, *pte;
1039 int total = agaw_to_level(domain->agaw);
1042 parent = domain->pgd;
1043 while (level <= total) {
1044 offset = pfn_level_offset(pfn, total);
1045 pte = &parent[offset];
1049 if (!dma_pte_present(pte)) {
1050 *large_page = total;
1054 if (dma_pte_superpage(pte)) {
1055 *large_page = total;
1059 parent = phys_to_virt(dma_pte_addr(pte));
/* clear last level pte; a tlb flush should follow */
1066 static void dma_pte_clear_range(struct dmar_domain *domain,
1067 unsigned long start_pfn,
1068 unsigned long last_pfn)
1070 unsigned int large_page;
1071 struct dma_pte *first_pte, *pte;
1073 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1074 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1075 BUG_ON(start_pfn > last_pfn);
1077 /* we don't need lock here; nobody else touches the iova range */
1080 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1082 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1087 start_pfn += lvl_to_nr_pages(large_page);
1089 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1091 domain_flush_cache(domain, first_pte,
1092 (void *)pte - (void *)first_pte);
1094 } while (start_pfn && start_pfn <= last_pfn);
1097 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1098 int retain_level, struct dma_pte *pte,
1099 unsigned long pfn, unsigned long start_pfn,
1100 unsigned long last_pfn)
1102 pfn = max(start_pfn, pfn);
1103 pte = &pte[pfn_level_offset(pfn, level)];
1106 unsigned long level_pfn;
1107 struct dma_pte *level_pte;
1109 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1112 level_pfn = pfn & level_mask(level);
1113 level_pte = phys_to_virt(dma_pte_addr(pte));
1116 dma_pte_free_level(domain, level - 1, retain_level,
1117 level_pte, level_pfn, start_pfn,
1122 * Free the page table if we're below the level we want to
1123 * retain and the range covers the entire table.
1125 if (level < retain_level && !(start_pfn > level_pfn ||
1126 last_pfn < level_pfn + level_size(level) - 1)) {
1128 domain_flush_cache(domain, pte, sizeof(*pte));
1129 free_pgtable_page(level_pte);
1132 pfn += level_size(level);
1133 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1137 * clear last level (leaf) ptes and free page table pages below the
1138 * level we wish to keep intact.
1140 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1141 unsigned long start_pfn,
1142 unsigned long last_pfn,
1145 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1146 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1147 BUG_ON(start_pfn > last_pfn);
1149 dma_pte_clear_range(domain, start_pfn, last_pfn);
1151 /* We don't need lock here; nobody else touches the iova range */
1152 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1153 domain->pgd, 0, start_pfn, last_pfn);
1156 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1157 free_pgtable_page(domain->pgd);
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
1168 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1169 int level, struct dma_pte *pte,
1170 struct page *freelist)
1174 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1175 pg->freelist = freelist;
1181 pte = page_address(pg);
1183 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1184 freelist = dma_pte_list_pagetables(domain, level - 1,
1187 } while (!first_pte_in_page(pte));
1192 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1193 struct dma_pte *pte, unsigned long pfn,
1194 unsigned long start_pfn,
1195 unsigned long last_pfn,
1196 struct page *freelist)
1198 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1200 pfn = max(start_pfn, pfn);
1201 pte = &pte[pfn_level_offset(pfn, level)];
1204 unsigned long level_pfn;
1206 if (!dma_pte_present(pte))
1209 level_pfn = pfn & level_mask(level);
1211 /* If range covers entire pagetable, free it */
1212 if (start_pfn <= level_pfn &&
1213 last_pfn >= level_pfn + level_size(level) - 1) {
/* These subordinate page tables are going away entirely. Don't
   bother to clear them; we're just going to *free* them. */
1216 if (level > 1 && !dma_pte_superpage(pte))
1217 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1223 } else if (level > 1) {
1224 /* Recurse down into a level that isn't *entirely* obsolete */
1225 freelist = dma_pte_clear_level(domain, level - 1,
1226 phys_to_virt(dma_pte_addr(pte)),
1227 level_pfn, start_pfn, last_pfn,
1231 pfn += level_size(level);
1232 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1235 domain_flush_cache(domain, first_pte,
1236 (void *)++last_pte - (void *)first_pte);
1241 /* We can't just free the pages because the IOMMU may still be walking
1242 the page tables, and may have cached the intermediate levels. The
1243 pages can only be freed after the IOTLB flush has been done. */
1244 static struct page *domain_unmap(struct dmar_domain *domain,
1245 unsigned long start_pfn,
1246 unsigned long last_pfn,
1247 struct page *freelist)
1249 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1250 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1251 BUG_ON(start_pfn > last_pfn);
1253 /* we don't need lock here; nobody else touches the iova range */
1254 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1255 domain->pgd, 0, start_pfn, last_pfn,
1259 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1260 struct page *pgd_page = virt_to_page(domain->pgd);
1261 pgd_page->freelist = freelist;
1262 freelist = pgd_page;
1270 static void dma_free_pagelist(struct page *freelist)
1274 while ((pg = freelist)) {
1275 freelist = pg->freelist;
1276 free_pgtable_page(page_address(pg));
1280 static void iova_entry_free(unsigned long data)
1282 struct page *freelist = (struct page *)data;
1284 dma_free_pagelist(freelist);
1287 /* iommu handling */
1288 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1290 struct root_entry *root;
1291 unsigned long flags;
1293 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1295 pr_err("Allocating root entry for %s failed\n",
1300 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1302 spin_lock_irqsave(&iommu->lock, flags);
1303 iommu->root_entry = root;
1304 spin_unlock_irqrestore(&iommu->lock, flags);
1309 static void iommu_set_root_entry(struct intel_iommu *iommu)
1315 addr = virt_to_phys(iommu->root_entry);
1316 if (sm_supported(iommu))
1317 addr |= DMA_RTADDR_SMT;
1319 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1320 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1322 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1325 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1326 readl, (sts & DMA_GSTS_RTPS), sts);
1328 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1331 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1336 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1339 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1343 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1344 readl, (!(val & DMA_GSTS_WBFS)), val);
1346 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines if we need a write buffer flush */
1350 static void __iommu_flush_context(struct intel_iommu *iommu,
1351 u16 did, u16 source_id, u8 function_mask,
1358 case DMA_CCMD_GLOBAL_INVL:
1359 val = DMA_CCMD_GLOBAL_INVL;
1361 case DMA_CCMD_DOMAIN_INVL:
1362 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1364 case DMA_CCMD_DEVICE_INVL:
1365 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1366 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1371 val |= DMA_CCMD_ICC;
1373 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1374 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
/* Make sure hardware completes it */
1377 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1378 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1380 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines if we need a write buffer flush */
1384 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1385 u64 addr, unsigned int size_order, u64 type)
1387 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1388 u64 val = 0, val_iva = 0;
1392 case DMA_TLB_GLOBAL_FLUSH:
/* global flush doesn't need to set IVA_REG */
1394 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1396 case DMA_TLB_DSI_FLUSH:
1397 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1399 case DMA_TLB_PSI_FLUSH:
1400 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1401 /* IH bit is passed in as part of address */
1402 val_iva = size_order | addr;
1407 /* Note: set drain read/write */
1410 * This is probably to be super secure.. Looks like we can
1411 * ignore it without any impact.
1413 if (cap_read_drain(iommu->cap))
1414 val |= DMA_TLB_READ_DRAIN;
1416 if (cap_write_drain(iommu->cap))
1417 val |= DMA_TLB_WRITE_DRAIN;
1419 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1420 /* Note: Only uses first TLB reg currently */
1422 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1423 dmar_writeq(iommu->reg + tlb_offset + 8, val);
/* Make sure hardware completes it */
1426 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1427 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1429 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1431 /* check IOTLB invalidation granularity */
1432 if (DMA_TLB_IAIG(val) == 0)
1433 pr_err("Flush IOTLB failed\n");
1434 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1435 pr_debug("TLB flush request %Lx, actual %Lx\n",
1436 (unsigned long long)DMA_TLB_IIRG(type),
1437 (unsigned long long)DMA_TLB_IAIG(val));
1440 static struct device_domain_info *
1441 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1444 struct device_domain_info *info;
1446 assert_spin_locked(&device_domain_lock);
1451 list_for_each_entry(info, &domain->devices, link)
1452 if (info->iommu == iommu && info->bus == bus &&
1453 info->devfn == devfn) {
1454 if (info->ats_supported && info->dev)
1462 static void domain_update_iotlb(struct dmar_domain *domain)
1464 struct device_domain_info *info;
1465 bool has_iotlb_device = false;
1467 assert_spin_locked(&device_domain_lock);
1469 list_for_each_entry(info, &domain->devices, link) {
1470 struct pci_dev *pdev;
1472 if (!info->dev || !dev_is_pci(info->dev))
1475 pdev = to_pci_dev(info->dev);
1476 if (pdev->ats_enabled) {
1477 has_iotlb_device = true;
1482 domain->has_iotlb_device = has_iotlb_device;
1485 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1487 struct pci_dev *pdev;
1489 assert_spin_locked(&device_domain_lock);
1491 if (!info || !dev_is_pci(info->dev))
1494 pdev = to_pci_dev(info->dev);
1495 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1496 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1497 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1498 * reserved, which should be set to 0.
1500 if (!ecap_dit(info->iommu->ecap))
1503 struct pci_dev *pf_pdev;
1505 /* pdev will be returned if device is not a vf */
1506 pf_pdev = pci_physfn(pdev);
1507 info->pfsid = pci_dev_id(pf_pdev);
1510 #ifdef CONFIG_INTEL_IOMMU_SVM
/* The PCIe spec, in its wisdom, declares that the behaviour of
   the device if you enable PASID support after ATS support is
   undefined. So always enable PASID support on devices which
   have it, even if we can't yet know if we're ever going to
   use it. */
1516 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1517 info->pasid_enabled = 1;
1519 if (info->pri_supported &&
1520 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1521 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1522 info->pri_enabled = 1;
1524 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1525 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1526 info->ats_enabled = 1;
1527 domain_update_iotlb(info->domain);
1528 info->ats_qdep = pci_ats_queue_depth(pdev);
1532 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1534 struct pci_dev *pdev;
1536 assert_spin_locked(&device_domain_lock);
1538 if (!dev_is_pci(info->dev))
1541 pdev = to_pci_dev(info->dev);
1543 if (info->ats_enabled) {
1544 pci_disable_ats(pdev);
1545 info->ats_enabled = 0;
1546 domain_update_iotlb(info->domain);
1548 #ifdef CONFIG_INTEL_IOMMU_SVM
1549 if (info->pri_enabled) {
1550 pci_disable_pri(pdev);
1551 info->pri_enabled = 0;
1553 if (info->pasid_enabled) {
1554 pci_disable_pasid(pdev);
1555 info->pasid_enabled = 0;
1560 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1561 u64 addr, unsigned mask)
1564 unsigned long flags;
1565 struct device_domain_info *info;
1567 if (!domain->has_iotlb_device)
1570 spin_lock_irqsave(&device_domain_lock, flags);
1571 list_for_each_entry(info, &domain->devices, link) {
1572 if (!info->ats_enabled)
1575 sid = info->bus << 8 | info->devfn;
1576 qdep = info->ats_qdep;
1577 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1580 spin_unlock_irqrestore(&device_domain_lock, flags);
1583 static void domain_flush_piotlb(struct intel_iommu *iommu,
1584 struct dmar_domain *domain,
1585 u64 addr, unsigned long npages, bool ih)
1587 u16 did = domain->iommu_did[iommu->seq_id];
1589 if (domain->default_pasid)
1590 qi_flush_piotlb(iommu, did, domain->default_pasid,
1593 if (!list_empty(&domain->devices))
1594 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1597 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1598 struct dmar_domain *domain,
1599 unsigned long pfn, unsigned int pages,
1602 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
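/* PSI can only invalidate 2^mask naturally aligned pages at a time, hence the power-of-two round-up above. */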
1603 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1604 u16 did = domain->iommu_did[iommu->seq_id];
1611 if (domain_use_first_level(domain)) {
1612 domain_flush_piotlb(iommu, domain, addr, pages, ih);
/*
 * Fall back to domain-selective flush if there is no PSI support or
 * the size is too big. PSI requires the page size to be 2 ^ x, and
 * the base address to be naturally aligned to the size.
 */
1619 if (!cap_pgsel_inv(iommu->cap) ||
1620 mask > cap_max_amask_val(iommu->cap))
1621 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1624 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1629 * In caching mode, changes of pages from non-present to present require
1630 * flush. However, device IOTLB doesn't need to be flushed in this case.
1632 if (!cap_caching_mode(iommu->cap) || !map)
1633 iommu_flush_dev_iotlb(domain, addr, mask);
1636 /* Notification for newly created mappings */
1637 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1638 struct dmar_domain *domain,
1639 unsigned long pfn, unsigned int pages)
/*
 * It's a non-present to present mapping. Only flush if caching mode
 * and second level.
 */
1645 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1646 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1648 iommu_flush_write_buffer(iommu);
1651 static void iommu_flush_iova(struct iova_domain *iovad)
1653 struct dmar_domain *domain;
1656 domain = container_of(iovad, struct dmar_domain, iovad);
1658 for_each_domain_iommu(idx, domain) {
1659 struct intel_iommu *iommu = g_iommus[idx];
1660 u16 did = domain->iommu_did[iommu->seq_id];
1662 if (domain_use_first_level(domain))
1663 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1665 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1668 if (!cap_caching_mode(iommu->cap))
1669 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1670 0, MAX_AGAW_PFN_WIDTH);
1674 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1677 unsigned long flags;
1679 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1682 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1683 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1684 pmen &= ~DMA_PMEN_EPM;
1685 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1687 /* wait for the protected region status bit to clear */
1688 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1689 readl, !(pmen & DMA_PMEN_PRS), pmen);
1691 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1694 static void iommu_enable_translation(struct intel_iommu *iommu)
1697 unsigned long flags;
1699 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1700 iommu->gcmd |= DMA_GCMD_TE;
1701 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1704 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1705 readl, (sts & DMA_GSTS_TES), sts);
1707 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1710 static void iommu_disable_translation(struct intel_iommu *iommu)
1715 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1716 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1719 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1720 iommu->gcmd &= ~DMA_GCMD_TE;
1721 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1724 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1725 readl, (!(sts & DMA_GSTS_TES)), sts);
1727 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1730 static int iommu_init_domains(struct intel_iommu *iommu)
1732 u32 ndomains, nlongs;
1735 ndomains = cap_ndoms(iommu->cap);
1736 pr_debug("%s: Number of Domains supported <%d>\n",
1737 iommu->name, ndomains);
1738 nlongs = BITS_TO_LONGS(ndomains);
1740 spin_lock_init(&iommu->lock);
1742 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1743 if (!iommu->domain_ids) {
1744 pr_err("%s: Allocating domain id array failed\n",
1749 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1750 iommu->domains = kzalloc(size, GFP_KERNEL);
1752 if (iommu->domains) {
1753 size = 256 * sizeof(struct dmar_domain *);
1754 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1757 if (!iommu->domains || !iommu->domains[0]) {
1758 pr_err("%s: Allocating domain array failed\n",
1760 kfree(iommu->domain_ids);
1761 kfree(iommu->domains);
1762 iommu->domain_ids = NULL;
1763 iommu->domains = NULL;
1768 * If Caching mode is set, then invalid translations are tagged
1769 * with domain-id 0, hence we need to pre-allocate it. We also
1770 * use domain-id 0 as a marker for non-allocated domain-id, so
1771 * make sure it is not used for a real domain.
1773 set_bit(0, iommu->domain_ids);
/*
 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
 * entry for first-level or pass-through translation modes should
 * be programmed with a domain id different from those used for
 * second-level or nested translation. We reserve a domain id for
 * this purpose.
 */
1782 if (sm_supported(iommu))
1783 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1788 static void disable_dmar_iommu(struct intel_iommu *iommu)
1790 struct device_domain_info *info, *tmp;
1791 unsigned long flags;
1793 if (!iommu->domains || !iommu->domain_ids)
1796 spin_lock_irqsave(&device_domain_lock, flags);
1797 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1798 if (info->iommu != iommu)
1801 if (!info->dev || !info->domain)
1804 __dmar_remove_one_dev_info(info);
1806 spin_unlock_irqrestore(&device_domain_lock, flags);
1808 if (iommu->gcmd & DMA_GCMD_TE)
1809 iommu_disable_translation(iommu);
1812 static void free_dmar_iommu(struct intel_iommu *iommu)
1814 if ((iommu->domains) && (iommu->domain_ids)) {
1815 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1818 for (i = 0; i < elems; i++)
1819 kfree(iommu->domains[i]);
1820 kfree(iommu->domains);
1821 kfree(iommu->domain_ids);
1822 iommu->domains = NULL;
1823 iommu->domain_ids = NULL;
1826 g_iommus[iommu->seq_id] = NULL;
1828 /* free context mapping */
1829 free_context_table(iommu);
1831 #ifdef CONFIG_INTEL_IOMMU_SVM
1832 if (pasid_supported(iommu)) {
1833 if (ecap_prs(iommu->ecap))
1834 intel_svm_finish_prq(iommu);
1836 if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1837 ioasid_unregister_allocator(&iommu->pasid_allocator);
/*
 * Check and return whether first level is used by default for
 * DMA translation.
 */
1846 static bool first_level_by_default(void)
1848 struct dmar_drhd_unit *drhd;
1849 struct intel_iommu *iommu;
1850 static int first_level_support = -1;
1852 if (likely(first_level_support != -1))
1853 return first_level_support;
1855 first_level_support = 1;
1858 for_each_active_iommu(iommu, drhd) {
1859 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1860 first_level_support = 0;
1866 return first_level_support;
1869 static struct dmar_domain *alloc_domain(int flags)
1871 struct dmar_domain *domain;
1873 domain = alloc_domain_mem();
1877 memset(domain, 0, sizeof(*domain));
1878 domain->nid = NUMA_NO_NODE;
1879 domain->flags = flags;
1880 if (first_level_by_default())
1881 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1882 domain->has_iotlb_device = false;
1883 INIT_LIST_HEAD(&domain->devices);
1888 /* Must be called with iommu->lock */
1889 static int domain_attach_iommu(struct dmar_domain *domain,
1890 struct intel_iommu *iommu)
1892 unsigned long ndomains;
1895 assert_spin_locked(&device_domain_lock);
1896 assert_spin_locked(&iommu->lock);
1898 domain->iommu_refcnt[iommu->seq_id] += 1;
1899 domain->iommu_count += 1;
1900 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1901 ndomains = cap_ndoms(iommu->cap);
1902 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1904 if (num >= ndomains) {
1905 pr_err("%s: No free domain ids\n", iommu->name);
1906 domain->iommu_refcnt[iommu->seq_id] -= 1;
1907 domain->iommu_count -= 1;
1911 set_bit(num, iommu->domain_ids);
1912 set_iommu_domain(iommu, num, domain);
1914 domain->iommu_did[iommu->seq_id] = num;
1915 domain->nid = iommu->node;
1917 domain_update_iommu_cap(domain);
1923 static int domain_detach_iommu(struct dmar_domain *domain,
1924 struct intel_iommu *iommu)
1928 assert_spin_locked(&device_domain_lock);
1929 assert_spin_locked(&iommu->lock);
1931 domain->iommu_refcnt[iommu->seq_id] -= 1;
1932 count = --domain->iommu_count;
1933 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1934 num = domain->iommu_did[iommu->seq_id];
1935 clear_bit(num, iommu->domain_ids);
1936 set_iommu_domain(iommu, num, NULL);
1938 domain_update_iommu_cap(domain);
1939 domain->iommu_did[iommu->seq_id] = 0;
1945 static struct iova_domain reserved_iova_list;
1946 static struct lock_class_key reserved_rbtree_key;
1948 static int dmar_init_reserved_ranges(void)
1950 struct pci_dev *pdev = NULL;
1954 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1956 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1957 &reserved_rbtree_key);
1959 /* IOAPIC ranges shouldn't be accessed by DMA */
1960 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1961 IOVA_PFN(IOAPIC_RANGE_END));
1963 pr_err("Reserve IOAPIC range failed\n");
1967 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1968 for_each_pci_dev(pdev) {
1971 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1972 r = &pdev->resource[i];
1973 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1975 iova = reserve_iova(&reserved_iova_list,
1979 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1987 static inline int guestwidth_to_adjustwidth(int gaw)
1990 int r = (gaw - 12) % 9;
2001 static void domain_exit(struct dmar_domain *domain)
2004 /* Remove associated devices and clear attached or cached domains */
2005 domain_remove_dev_info(domain);
2008 if (domain->domain.type == IOMMU_DOMAIN_DMA)
2009 put_iova_domain(&domain->iovad);
2012 struct page *freelist;
2014 freelist = domain_unmap(domain, 0,
2015 DOMAIN_MAX_PFN(domain->gaw), NULL);
2016 dma_free_pagelist(freelist);
2019 free_domain_mem(domain);
2023 * Get the PASID directory size for scalable mode context entry.
2024 * Value of X in the PDTS field of a scalable mode context entry
2025 * indicates PASID directory with 2^(X + 7) entries.
2027 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2031 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2032 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2040 * Set the RID_PASID field of a scalable mode context entry. The
2041 * IOMMU hardware will use the PASID value set in this field for
2042 * DMA translations of DMA requests without PASID.
2045 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2047 context->hi |= pasid & ((1 << 20) - 1);
/*
 * Set the DTE(Device-TLB Enable) field of a scalable mode context
 * entry.
 */
2054 static inline void context_set_sm_dte(struct context_entry *context)
2056 context->lo |= (1 << 2);
/*
 * Set the PRE(Page Request Enable) field of a scalable mode context
 * entry.
 */
2063 static inline void context_set_sm_pre(struct context_entry *context)
2065 context->lo |= (1 << 4);
2068 /* Convert value to context PASID directory size field coding. */
2069 #define context_pdts(pds) (((pds) & 0x7) << 9)
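/* e.g. pds = 0 encodes a 128-entry (2^7) PASID directory; see context_get_sm_pds(). */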
2071 static int domain_context_mapping_one(struct dmar_domain *domain,
2072 struct intel_iommu *iommu,
2073 struct pasid_table *table,
2076 u16 did = domain->iommu_did[iommu->seq_id];
2077 int translation = CONTEXT_TT_MULTI_LEVEL;
2078 struct device_domain_info *info = NULL;
2079 struct context_entry *context;
2080 unsigned long flags;
2085 if (hw_pass_through && domain_type_is_si(domain))
2086 translation = CONTEXT_TT_PASS_THROUGH;
2088 pr_debug("Set context mapping for %02x:%02x.%d\n",
2089 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2091 BUG_ON(!domain->pgd);
2093 spin_lock_irqsave(&device_domain_lock, flags);
2094 spin_lock(&iommu->lock);
2097 context = iommu_context_addr(iommu, bus, devfn, 1);
2102 if (context_present(context))
/*
 * For kdump cases, old valid entries may be cached due to the
 * in-flight DMA and copied pgtable, but there is no unmapping
 * behaviour for them, thus we need an explicit cache flush for
 * the newly-mapped device. For kdump, at this point, the device
 * is supposed to finish reset at its driver probe stage, so no
 * in-flight DMA will exist, and we don't need to worry anymore
 * hereafter.
 */
2114 if (context_copied(context)) {
2115 u16 did_old = context_domain_id(context);
2117 if (did_old < cap_ndoms(iommu->cap)) {
2118 iommu->flush.flush_context(iommu, did_old,
2119 (((u16)bus) << 8) | devfn,
2120 DMA_CCMD_MASK_NOBIT,
2121 DMA_CCMD_DEVICE_INVL);
2122 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2127 context_clear_entry(context);
2129 if (sm_supported(iommu)) {
2134 /* Setup the PASID DIR pointer: */
2135 pds = context_get_sm_pds(table);
2136 context->lo = (u64)virt_to_phys(table->table) |
2139 /* Setup the RID_PASID field: */
2140 context_set_sm_rid2pasid(context, PASID_RID2PASID);
/*
 * Setup the Device-TLB enable bit and Page request
 * Enable bit:
 */
2146 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2147 if (info && info->ats_supported)
2148 context_set_sm_dte(context);
2149 if (info && info->pri_supported)
2150 context_set_sm_pre(context);
2152 struct dma_pte *pgd = domain->pgd;
2155 context_set_domain_id(context, did);
2157 if (translation != CONTEXT_TT_PASS_THROUGH) {
2159 * Skip top levels of page tables for iommu which has
2160 * less agaw than default. Unnecessary for PT mode.
2162 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2164 pgd = phys_to_virt(dma_pte_addr(pgd));
2165 if (!dma_pte_present(pgd))
2169 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2170 if (info && info->ats_supported)
2171 translation = CONTEXT_TT_DEV_IOTLB;
2173 translation = CONTEXT_TT_MULTI_LEVEL;
2175 context_set_address_root(context, virt_to_phys(pgd));
2176 context_set_address_width(context, agaw);
2179 * In pass through mode, AW must be programmed to
2180 * indicate the largest AGAW value supported by
2181 * hardware. And ASR is ignored by hardware.
2183 context_set_address_width(context, iommu->msagaw);
2186 context_set_translation_type(context, translation);
2189 context_set_fault_enable(context);
2190 context_set_present(context);
2191 if (!ecap_coherent(iommu->ecap))
2192 clflush_cache_range(context, sizeof(*context));
/*
 * It's a non-present to present mapping. If the hardware doesn't cache
 * non-present entries we only need to flush the write-buffer. If the
 * hardware _does_ cache non-present entries, then it does so in the
 * special domain #0, which we have to flush:
 */
2200 if (cap_caching_mode(iommu->cap)) {
2201 iommu->flush.flush_context(iommu, 0,
2202 (((u16)bus) << 8) | devfn,
2203 DMA_CCMD_MASK_NOBIT,
2204 DMA_CCMD_DEVICE_INVL);
2205 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2207 iommu_flush_write_buffer(iommu);
2209 iommu_enable_dev_iotlb(info);
2214 spin_unlock(&iommu->lock);
2215 spin_unlock_irqrestore(&device_domain_lock, flags);
2220 struct domain_context_mapping_data {
2221 struct dmar_domain *domain;
2222 struct intel_iommu *iommu;
2223 struct pasid_table *table;
2226 static int domain_context_mapping_cb(struct pci_dev *pdev,
2227 u16 alias, void *opaque)
2229 struct domain_context_mapping_data *data = opaque;
2231 return domain_context_mapping_one(data->domain, data->iommu,
2232 data->table, PCI_BUS_NUM(alias),
2237 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2239 struct domain_context_mapping_data data;
2240 struct pasid_table *table;
2241 struct intel_iommu *iommu;
2244 iommu = device_to_iommu(dev, &bus, &devfn);
2248 table = intel_pasid_get_table(dev);
2250 if (!dev_is_pci(dev))
2251 return domain_context_mapping_one(domain, iommu, table,
2254 data.domain = domain;
2258 return pci_for_each_dma_alias(to_pci_dev(dev),
2259 &domain_context_mapping_cb, &data);
2262 static int domain_context_mapped_cb(struct pci_dev *pdev,
2263 u16 alias, void *opaque)
2265 struct intel_iommu *iommu = opaque;
2267 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2270 static int domain_context_mapped(struct device *dev)
2272 struct intel_iommu *iommu;
2275 iommu = device_to_iommu(dev, &bus, &devfn);
2279 if (!dev_is_pci(dev))
2280 return device_context_mapped(iommu, bus, devfn);
2282 return !pci_for_each_dma_alias(to_pci_dev(dev),
2283 domain_context_mapped_cb, iommu);
2286 /* Returns a number of VTD pages, but aligned to MM page size */
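/* e.g. host_addr = 0x1234, size = 0x100 covers one 4KiB MM page, so this returns 1 VT-d page. */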
2287 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2290 host_addr &= ~PAGE_MASK;
2291 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2294 /* Return largest possible superpage level for a given mapping */
2295 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2296 unsigned long iov_pfn,
2297 unsigned long phy_pfn,
2298 unsigned long pages)
2300 int support, level = 1;
2301 unsigned long pfnmerge;
2303 support = domain->iommu_superpage;
2305 /* To use a large page, the virtual *and* physical addresses
2306 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2307 of them will mean we have to use smaller pages. So just
2308 merge them and check both at once. */
2309 pfnmerge = iov_pfn | phy_pfn;
2311 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2312 pages >>= VTD_STRIDE_SHIFT;
2315 pfnmerge >>= VTD_STRIDE_SHIFT;
2322 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2323 struct scatterlist *sg, unsigned long phys_pfn,
2324 unsigned long nr_pages, int prot)
2326 struct dma_pte *first_pte = NULL, *pte = NULL;
2328 unsigned long sg_res = 0;
2329 unsigned int largepage_lvl = 0;
2330 unsigned long lvl_pages = 0;
2333 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2335 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2338 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2339 if (domain_use_first_level(domain))
2340 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2344 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2347 while (nr_pages > 0) {
2351 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2353 sg_res = aligned_nrpages(sg->offset, sg->length);
2354 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2355 sg->dma_length = sg->length;
2356 pteval = (sg_phys(sg) - pgoff) | attr;
2357 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2361 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2363 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
/* It is a large page */
2367 if (largepage_lvl > 1) {
2368 unsigned long nr_superpages, end_pfn;
2370 pteval |= DMA_PTE_LARGE_PAGE;
2371 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2373 nr_superpages = sg_res / lvl_pages;
2374 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2377 * Ensure that old small page tables are
2378 * removed to make room for superpage(s).
2379 * We're adding new large pages, so make sure
2380 * we don't remove their parent tables.
2382 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2385 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2389 /* We don't need lock here, nobody else
2390 * touches the iova range
2392 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2394 static int dumps = 5;
2395 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2396 iov_pfn, tmp, (unsigned long long)pteval);
2399 debug_dma_dump_mappings(NULL);
2404 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2406 BUG_ON(nr_pages < lvl_pages);
2407 BUG_ON(sg_res < lvl_pages);
2409 nr_pages -= lvl_pages;
2410 iov_pfn += lvl_pages;
2411 phys_pfn += lvl_pages;
2412 pteval += lvl_pages * VTD_PAGE_SIZE;
2413 sg_res -= lvl_pages;
2415 /* If the next PTE would be the first in a new page, then we
2416 need to flush the cache on the entries we've just written.
2417 And then we'll need to recalculate 'pte', so clear it and
2418 let it get set again in the if (!pte) block above.
2420 If we're done (!nr_pages) we need to flush the cache too.
2422 Also if we've been setting superpages, we may need to
2423 recalculate 'pte' and switch back to smaller pages for the
2424 end of the mapping, if the trailing size is not enough to
2425 use another superpage (i.e. sg_res < lvl_pages). */
2427 if (!nr_pages || first_pte_in_page(pte) ||
2428 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2429 domain_flush_cache(domain, first_pte,
2430 (void *)pte - (void *)first_pte);
2434 if (!sg_res && nr_pages)
2440 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2441 struct scatterlist *sg, unsigned long phys_pfn,
2442 unsigned long nr_pages, int prot)
2445 struct intel_iommu *iommu;
2447 /* Do the real mapping first */
2448 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2452 for_each_domain_iommu(iommu_id, domain) {
2453 iommu = g_iommus[iommu_id];
2454 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
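/*
 * The two wrappers below feed domain_mapping() either a scatterlist (with
 * the phys_pfn argument unused) or a single physically contiguous pfn range
 * (with sg == NULL); __domain_mapping() picks the physical address source
 * based on whether 'sg' is set.
 */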
2460 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2461 struct scatterlist *sg, unsigned long nr_pages,
2464 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2467 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2468 unsigned long phys_pfn, unsigned long nr_pages,
2471 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
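/*
 * domain_context_clear_one() below saves the old domain-id before clearing
 * the context entry, because both the context-cache entry for this
 * (bus, devfn) source-id and the IOTLB entries tagged with that domain-id
 * must be invalidated once the entry is gone.
 */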
2474 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2476 unsigned long flags;
2477 struct context_entry *context;
2483 spin_lock_irqsave(&iommu->lock, flags);
2484 context = iommu_context_addr(iommu, bus, devfn, 0);
2486 spin_unlock_irqrestore(&iommu->lock, flags);
2489 did_old = context_domain_id(context);
2490 context_clear_entry(context);
2491 __iommu_flush_cache(iommu, context, sizeof(*context));
2492 spin_unlock_irqrestore(&iommu->lock, flags);
2493 iommu->flush.flush_context(iommu,
2495 (((u16)bus) << 8) | devfn,
2496 DMA_CCMD_MASK_NOBIT,
2497 DMA_CCMD_DEVICE_INVL);
2498 iommu->flush.flush_iotlb(iommu,
2505 static inline void unlink_domain_info(struct device_domain_info *info)
2507 assert_spin_locked(&device_domain_lock);
2508 list_del(&info->link);
2509 list_del(&info->global);
2511 dev_iommu_priv_set(info->dev, NULL);
2514 static void domain_remove_dev_info(struct dmar_domain *domain)
2516 struct device_domain_info *info, *tmp;
2517 unsigned long flags;
2519 spin_lock_irqsave(&device_domain_lock, flags);
2520 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2521 __dmar_remove_one_dev_info(info);
2522 spin_unlock_irqrestore(&device_domain_lock, flags);
2525 struct dmar_domain *find_domain(struct device *dev)
2527 struct device_domain_info *info;
2529 if (unlikely(!dev || !dev->iommu))
2532 if (unlikely(attach_deferred(dev)))
2535 /* No lock here, assumes no domain exit in normal case */
2536 info = get_domain_info(dev);
2538 return info->domain;
2543 static void do_deferred_attach(struct device *dev)
2545 struct iommu_domain *domain;
2547 dev_iommu_priv_set(dev, NULL);
2548 domain = iommu_get_domain_for_dev(dev);
2550 intel_iommu_attach_device(domain, dev);
2553 static inline struct device_domain_info *
2554 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2556 struct device_domain_info *info;
2558 list_for_each_entry(info, &device_domain_list, global)
2559 if (info->segment == segment && info->bus == bus &&
2560 info->devfn == devfn)
2566 static int domain_setup_first_level(struct intel_iommu *iommu,
2567 struct dmar_domain *domain,
2571 int flags = PASID_FLAG_SUPERVISOR_MODE;
2572 struct dma_pte *pgd = domain->pgd;
2576 * Skip top levels of page tables for an iommu which has
2577 * less agaw than the default. Unnecessary for PT mode.
2579 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2580 pgd = phys_to_virt(dma_pte_addr(pgd));
2581 if (!dma_pte_present(pgd))
2585 level = agaw_to_level(agaw);
2586 if (level != 4 && level != 5)
2589 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2591 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2592 domain->iommu_did[iommu->seq_id],
2596 static bool dev_is_real_dma_subdevice(struct device *dev)
2598 return dev && dev_is_pci(dev) &&
2599 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2602 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2605 struct dmar_domain *domain)
2607 struct dmar_domain *found = NULL;
2608 struct device_domain_info *info;
2609 unsigned long flags;
2612 info = alloc_devinfo_mem();
2616 if (!dev_is_real_dma_subdevice(dev)) {
2618 info->devfn = devfn;
2619 info->segment = iommu->segment;
2621 struct pci_dev *pdev = to_pci_dev(dev);
2623 info->bus = pdev->bus->number;
2624 info->devfn = pdev->devfn;
2625 info->segment = pci_domain_nr(pdev->bus);
2628 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2629 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2632 info->domain = domain;
2633 info->iommu = iommu;
2634 info->pasid_table = NULL;
2635 info->auxd_enabled = 0;
2636 INIT_LIST_HEAD(&info->auxiliary_domains);
2638 if (dev && dev_is_pci(dev)) {
2639 struct pci_dev *pdev = to_pci_dev(info->dev);
2641 if (ecap_dev_iotlb_support(iommu->ecap) &&
2642 pci_ats_supported(pdev) &&
2643 dmar_find_matched_atsr_unit(pdev))
2644 info->ats_supported = 1;
2646 if (sm_supported(iommu)) {
2647 if (pasid_supported(iommu)) {
2648 int features = pci_pasid_features(pdev);
2650 info->pasid_supported = features | 1;
2653 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2654 pci_pri_supported(pdev))
2655 info->pri_supported = 1;
2659 spin_lock_irqsave(&device_domain_lock, flags);
2661 found = find_domain(dev);
2664 struct device_domain_info *info2;
2665 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2668 found = info2->domain;
2674 spin_unlock_irqrestore(&device_domain_lock, flags);
2675 free_devinfo_mem(info);
2676 /* Caller must free the original domain */
2680 spin_lock(&iommu->lock);
2681 ret = domain_attach_iommu(domain, iommu);
2682 spin_unlock(&iommu->lock);
2685 spin_unlock_irqrestore(&device_domain_lock, flags);
2686 free_devinfo_mem(info);
2690 list_add(&info->link, &domain->devices);
2691 list_add(&info->global, &device_domain_list);
2693 dev_iommu_priv_set(dev, info);
2694 spin_unlock_irqrestore(&device_domain_lock, flags);
2696 /* PASID table is mandatory for a PCI device in scalable mode. */
2697 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2698 ret = intel_pasid_alloc_table(dev);
2700 dev_err(dev, "PASID table allocation failed\n");
2701 dmar_remove_one_dev_info(dev);
2705 /* Setup the PASID entry for requests without PASID: */
2706 spin_lock_irqsave(&iommu->lock, flags);
2707 if (hw_pass_through && domain_type_is_si(domain))
2708 ret = intel_pasid_setup_pass_through(iommu, domain,
2709 dev, PASID_RID2PASID);
2710 else if (domain_use_first_level(domain))
2711 ret = domain_setup_first_level(iommu, domain, dev,
2714 ret = intel_pasid_setup_second_level(iommu, domain,
2715 dev, PASID_RID2PASID);
2716 spin_unlock_irqrestore(&iommu->lock, flags);
2718 dev_err(dev, "Setup RID2PASID failed\n");
2719 dmar_remove_one_dev_info(dev);
2724 if (dev && domain_context_mapping(domain, dev)) {
2725 dev_err(dev, "Domain context map failed\n");
2726 dmar_remove_one_dev_info(dev);
2733 static int iommu_domain_identity_map(struct dmar_domain *domain,
2734 unsigned long first_vpfn,
2735 unsigned long last_vpfn)
2738 * RMRR range might have overlap with physical memory range,
2741 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2743 return __domain_mapping(domain, first_vpfn, NULL,
2744 first_vpfn, last_vpfn - first_vpfn + 1,
2745 DMA_PTE_READ|DMA_PTE_WRITE);
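/*
 * Note that the call above passes first_vpfn as both the IOVA and the
 * physical pfn, i.e. the range is mapped 1:1. Any PTEs left over from an
 * earlier, overlapping mapping (RMRRs may overlap physical memory that was
 * already identity mapped) are cleared first.
 */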
2748 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2750 static int __init si_domain_init(int hw)
2752 struct dmar_rmrr_unit *rmrr;
2756 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2760 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2761 domain_exit(si_domain);
2768 for_each_online_node(nid) {
2769 unsigned long start_pfn, end_pfn;
2772 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2773 ret = iommu_domain_identity_map(si_domain,
2774 mm_to_dma_pfn(start_pfn),
2775 mm_to_dma_pfn(end_pfn));
2782 * Identity map the RMRRs so that devices with RMRRs could also use
2785 for_each_rmrr_units(rmrr) {
2786 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2788 unsigned long long start = rmrr->base_address;
2789 unsigned long long end = rmrr->end_address;
2791 if (WARN_ON(end < start ||
2792 end >> agaw_to_width(si_domain->agaw)))
2795 ret = iommu_domain_identity_map(si_domain,
2796 mm_to_dma_pfn(start >> PAGE_SHIFT),
2797 mm_to_dma_pfn(end >> PAGE_SHIFT));
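/*
 * si_domain is the single shared identity-map domain: every usable chunk
 * of physical memory and every RMRR is mapped 1:1 into it, so devices that
 * are to be identity mapped in software (when hardware pass-through is
 * unavailable or not used) are simply attached to this one shared domain.
 */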
2806 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2808 struct dmar_domain *ndomain;
2809 struct intel_iommu *iommu;
2812 iommu = device_to_iommu(dev, &bus, &devfn);
2816 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2817 if (ndomain != domain)
2823 static bool device_has_rmrr(struct device *dev)
2825 struct dmar_rmrr_unit *rmrr;
2830 for_each_rmrr_units(rmrr) {
2832 * Return TRUE if this RMRR contains the device that
2835 for_each_active_dev_scope(rmrr->devices,
2836 rmrr->devices_cnt, i, tmp)
2838 is_downstream_to_pci_bridge(dev, tmp)) {
2848 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2849 * is relaxable (i.e. is allowed to be not enforced under some conditions)
2850 * @dev: device handle
2852 * We assume that PCI USB devices with RMRRs have them largely
2853 * for historical reasons and that the RMRR space is not actively used post
2854 * boot. This exclusion may change if vendors begin to abuse it.
2856 * The same exception is made for graphics devices, with the requirement that
2857 * any use of the RMRR regions will be torn down before assigning the device
2860 * Return: true if the RMRR is relaxable, false otherwise
2862 static bool device_rmrr_is_relaxable(struct device *dev)
2864 struct pci_dev *pdev;
2866 if (!dev_is_pci(dev))
2869 pdev = to_pci_dev(dev);
2870 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2877 * There are a couple cases where we need to restrict the functionality of
2878 * devices associated with RMRRs. The first is when evaluating a device for
2879 * identity mapping because problems exist when devices are moved in and out
2880 * of domains and their respective RMRR information is lost. This means that
2881 * a device with associated RMRRs will never be in a "passthrough" domain.
2882 * The second is use of the device through the IOMMU API. This interface
2883 * expects to have full control of the IOVA space for the device. We cannot
2884 * satisfy both the requirement that RMRR access is maintained and have an
2885 * unencumbered IOVA space. We also have no ability to quiesce the device's
2886 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2887 * We therefore prevent devices associated with an RMRR from participating in
2888 * the IOMMU API, which eliminates them from device assignment.
2890 * In both cases, devices which have relaxable RMRRs are not concerned by this
2891 * restriction. See device_rmrr_is_relaxable comment.
2893 static bool device_is_rmrr_locked(struct device *dev)
2895 if (!device_has_rmrr(dev))
2898 if (device_rmrr_is_relaxable(dev))
2905 * Return the required default domain type for a specific device.
2907 * @dev: the device in query
2908 * @startup: true if this is during early boot
2911 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2912 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2913 * - 0: both identity and dynamic domains work for this device
2915 static int device_def_domain_type(struct device *dev)
2917 if (dev_is_pci(dev)) {
2918 struct pci_dev *pdev = to_pci_dev(dev);
2921 * Prevent any device marked as untrusted from getting
2922 * placed into the static identity mapping domain.
2924 if (pdev->untrusted)
2925 return IOMMU_DOMAIN_DMA;
2927 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2928 return IOMMU_DOMAIN_IDENTITY;
2930 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2931 return IOMMU_DOMAIN_IDENTITY;
2937 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2940 * Start from a sane iommu hardware state.
2941 * If queued invalidation was already initialized by us
2942 * (for example, while enabling interrupt-remapping) then
2943 * things are already rolling from a sane state.
2947 * Clear any previous faults.
2949 dmar_fault(-1, iommu);
2951 * Disable queued invalidation if supported and already enabled
2952 * before OS handover.
2954 dmar_disable_qi(iommu);
2957 if (dmar_enable_qi(iommu)) {
2959 * Queued invalidation not enabled, use register-based invalidation
2961 iommu->flush.flush_context = __iommu_flush_context;
2962 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2963 pr_info("%s: Using Register based invalidation\n",
2966 iommu->flush.flush_context = qi_flush_context;
2967 iommu->flush.flush_iotlb = qi_flush_iotlb;
2968 pr_info("%s: Using Queued invalidation\n", iommu->name);
2972 static int copy_context_table(struct intel_iommu *iommu,
2973 struct root_entry *old_re,
2974 struct context_entry **tbl,
2977 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2978 struct context_entry *new_ce = NULL, ce;
2979 struct context_entry *old_ce = NULL;
2980 struct root_entry re;
2981 phys_addr_t old_ce_phys;
2983 tbl_idx = ext ? bus * 2 : bus;
2984 memcpy(&re, old_re, sizeof(re));
2986 for (devfn = 0; devfn < 256; devfn++) {
2987 /* First calculate the correct index */
2988 idx = (ext ? devfn * 2 : devfn) % 256;
2991 /* First save what we may have and clean up */
2993 tbl[tbl_idx] = new_ce;
2994 __iommu_flush_cache(iommu, new_ce,
3004 old_ce_phys = root_entry_lctp(&re);
3006 old_ce_phys = root_entry_uctp(&re);
3009 if (ext && devfn == 0) {
3010 /* No LCTP, try UCTP */
3019 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3024 new_ce = alloc_pgtable_page(iommu->node);
3031 /* Now copy the context entry */
3032 memcpy(&ce, old_ce + idx, sizeof(ce));
3034 if (!__context_present(&ce))
3037 did = context_domain_id(&ce);
3038 if (did >= 0 && did < cap_ndoms(iommu->cap))
3039 set_bit(did, iommu->domain_ids);
3042 * We need a marker for copied context entries. This
3043 * marker needs to work for the old format as well as
3044 * for extended context entries.
3046 * Bit 67 of the context entry is used. In the old
3047 * format this bit is available to software, in the
3048 * extended format it is the PGE bit, but PGE is ignored
3049 * by HW if PASIDs are disabled (and thus still available).
3052 * So disable PASIDs first and then mark the entry
3053 * copied. This means that we don't copy PASID
3054 * translations from the old kernel, but this is fine as
3055 * faults there are not fatal.
3057 context_clear_pasid_enable(&ce);
3058 context_set_copied(&ce);
3063 tbl[tbl_idx + pos] = new_ce;
3065 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
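/*
 * Extended context entries are twice the size of legacy ones, so with the
 * extended root/context format each bus needs two context-table pages;
 * that is why both the table index and the devfn index above are doubled
 * when 'ext' is set.
 */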
3074 static int copy_translation_tables(struct intel_iommu *iommu)
3076 struct context_entry **ctxt_tbls;
3077 struct root_entry *old_rt;
3078 phys_addr_t old_rt_phys;
3079 int ctxt_table_entries;
3080 unsigned long flags;
3085 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3086 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3087 new_ext = !!ecap_ecs(iommu->ecap);
3090 * The RTT bit can only be changed when translation is disabled,
3091 * but disabling translation means to open a window for data
3092 * corruption. So bail out and don't copy anything if we would
3093 * have to change the bit.
3098 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3102 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3106 /* This is too big for the stack - allocate it from slab */
3107 ctxt_table_entries = ext ? 512 : 256;
3109 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3113 for (bus = 0; bus < 256; bus++) {
3114 ret = copy_context_table(iommu, &old_rt[bus],
3115 ctxt_tbls, bus, ext);
3117 pr_err("%s: Failed to copy context table for bus %d\n",
3123 spin_lock_irqsave(&iommu->lock, flags);
3125 /* Context tables are copied, now write them to the root_entry table */
3126 for (bus = 0; bus < 256; bus++) {
3127 int idx = ext ? bus * 2 : bus;
3130 if (ctxt_tbls[idx]) {
3131 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3132 iommu->root_entry[bus].lo = val;
3135 if (!ext || !ctxt_tbls[idx + 1])
3138 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3139 iommu->root_entry[bus].hi = val;
3142 spin_unlock_irqrestore(&iommu->lock, flags);
3146 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
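/*
 * The low bit OR'ed into the root-entry values above is the present bit;
 * with the extended format 'lo' points at the context table for devfn
 * 0-127 and 'hi' at the one for devfn 128-255, otherwise only 'lo' is
 * used.
 */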
3156 #ifdef CONFIG_INTEL_IOMMU_SVM
3157 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3159 struct intel_iommu *iommu = data;
3163 return INVALID_IOASID;
3165 * VT-d virtual command interface always uses the full 20 bit
3166 * PASID range. Host can partition guest PASID range based on
3167 * policies but it is out of guest's control.
3169 if (min < PASID_MIN || max > intel_pasid_max_id)
3170 return INVALID_IOASID;
3172 if (vcmd_alloc_pasid(iommu, &ioasid))
3173 return INVALID_IOASID;
3178 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3180 struct intel_iommu *iommu = data;
3185 * Sanity checking of the ioasid owner is done at the upper layer, e.g. VFIO.
3186 * We can only free the PASID when all the devices are unbound.
3188 if (ioasid_find(NULL, ioasid, NULL)) {
3189 pr_alert("Cannot free active IOASID %d\n", ioasid);
3192 vcmd_free_pasid(iommu, ioasid);
3195 static void register_pasid_allocator(struct intel_iommu *iommu)
3198 * If we are running in the host, there is no need for a custom allocator
3199 * because PASIDs are allocated from the host system-wide.
3201 if (!cap_caching_mode(iommu->cap))
3204 if (!sm_supported(iommu)) {
3205 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3210 * Register a custom PASID allocator if we are running in a guest,
3211 * guest PASID must be obtained via virtual command interface.
3212 * There can be multiple vIOMMUs in each guest but only one allocator
3213 * is active. All vIOMMU allocators will eventually be calling the same
3216 if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3219 pr_info("Register custom PASID allocator\n");
3220 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3221 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3222 iommu->pasid_allocator.pdata = (void *)iommu;
3223 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3224 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3226 * Disable scalable mode on this IOMMU if there
3227 * is no custom allocator. Mixing SM capable vIOMMU
3228 * and non-SM vIOMMUs is not supported.
3235 static int __init init_dmars(void)
3237 struct dmar_drhd_unit *drhd;
3238 struct intel_iommu *iommu;
3244 * initialize and program root entry to not present
3247 for_each_drhd_unit(drhd) {
3249 * lock not needed as this is only incremented in the single
3250 * threaded kernel __init code path; all other accesses are read-only.
3253 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3257 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3260 /* Preallocate enough resources for IOMMU hot-addition */
3261 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3262 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3264 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3267 pr_err("Allocating global iommu array failed\n");
3272 for_each_iommu(iommu, drhd) {
3273 if (drhd->ignored) {
3274 iommu_disable_translation(iommu);
3279 * Find the max pasid size of all IOMMU's in the system.
3280 * We need to ensure the system pasid table is no bigger
3281 * than the smallest supported.
3283 if (pasid_supported(iommu)) {
3284 u32 temp = 2 << ecap_pss(iommu->ecap);
3286 intel_pasid_max_id = min_t(u32, temp,
3287 intel_pasid_max_id);
3290 g_iommus[iommu->seq_id] = iommu;
3292 intel_iommu_init_qi(iommu);
3294 ret = iommu_init_domains(iommu);
3298 init_translation_status(iommu);
3300 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3301 iommu_disable_translation(iommu);
3302 clear_translation_pre_enabled(iommu);
3303 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3309 * we could share the same root & context tables
3310 * among all IOMMU's. Need to split it later.
3312 ret = iommu_alloc_root_entry(iommu);
3316 if (translation_pre_enabled(iommu)) {
3317 pr_info("Translation already enabled - trying to copy translation structures\n");
3319 ret = copy_translation_tables(iommu);
3322 * We found the IOMMU with translation
3323 * enabled - but failed to copy over the
3324 * old root-entry table. Try to proceed
3325 * by disabling translation now and
3326 * allocating a clean root-entry table.
3327 * This might cause DMAR faults, but
3328 * probably the dump will still succeed.
3330 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3332 iommu_disable_translation(iommu);
3333 clear_translation_pre_enabled(iommu);
3335 pr_info("Copied translation tables from previous kernel for %s\n",
3340 if (!ecap_pass_through(iommu->ecap))
3341 hw_pass_through = 0;
3342 intel_svm_check(iommu);
3346 * Now that qi is enabled on all iommus, set the root entry and flush
3347 * caches. This is required on some Intel X58 chipsets, otherwise the
3348 * flush_context function will loop forever and the boot hangs.
3350 for_each_active_iommu(iommu, drhd) {
3351 iommu_flush_write_buffer(iommu);
3352 #ifdef CONFIG_INTEL_IOMMU_SVM
3353 register_pasid_allocator(iommu);
3355 iommu_set_root_entry(iommu);
3356 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3357 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3360 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3365 iommu_identity_mapping |= IDENTMAP_GFX;
3367 check_tylersburg_isoch();
3369 ret = si_domain_init(hw_pass_through);
3376 * global invalidate context cache
3377 * global invalidate iotlb
3378 * enable translation
3380 for_each_iommu(iommu, drhd) {
3381 if (drhd->ignored) {
3383 * we always have to disable PMRs or DMA may fail on
3387 iommu_disable_protect_mem_regions(iommu);
3391 iommu_flush_write_buffer(iommu);
3393 #ifdef CONFIG_INTEL_IOMMU_SVM
3394 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3396 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3397 * could cause a lock race condition.
3399 up_write(&dmar_global_lock);
3400 ret = intel_svm_enable_prq(iommu);
3401 down_write(&dmar_global_lock);
3406 ret = dmar_set_interrupt(iommu);
3414 for_each_active_iommu(iommu, drhd) {
3415 disable_dmar_iommu(iommu);
3416 free_dmar_iommu(iommu);
3425 /* This takes a number of _MM_ pages, not VTD pages */
3426 static unsigned long intel_alloc_iova(struct device *dev,
3427 struct dmar_domain *domain,
3428 unsigned long nrpages, uint64_t dma_mask)
3430 unsigned long iova_pfn;
3433 * Restrict dma_mask to the width that the iommu can handle.
3434 * First-level translation restricts the input-address to a
3435 * canonical address (i.e., address bits 63:N have the same
3436 * value as address bit [N-1], where N is 48-bits with 4-level
3437 * paging and 57-bits with 5-level paging). Hence, skip bit
3440 if (domain_use_first_level(domain))
3441 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3444 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3447 /* Ensure we reserve the whole size-aligned region */
3448 nrpages = __roundup_pow_of_two(nrpages);
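/*
 * Example for the first-level clamp above: with 4-level paging the domain
 * gaw is 48, so the mask is limited to DOMAIN_MAX_ADDR(47) and bit 47 of
 * any IOVA stays clear, keeping the address canonical.
 */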
3450 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3452 * First try to allocate an io virtual address in
3453 * DMA_BIT_MASK(32) and if that fails then try allocating
3456 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3457 IOVA_PFN(DMA_BIT_MASK(32)), false);
3461 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3462 IOVA_PFN(dma_mask), true);
3463 if (unlikely(!iova_pfn)) {
3464 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3472 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3473 size_t size, int dir, u64 dma_mask)
3475 struct dmar_domain *domain;
3476 phys_addr_t start_paddr;
3477 unsigned long iova_pfn;
3480 struct intel_iommu *iommu;
3481 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3483 BUG_ON(dir == DMA_NONE);
3485 if (unlikely(attach_deferred(dev)))
3486 do_deferred_attach(dev);
3488 domain = find_domain(dev);
3490 return DMA_MAPPING_ERROR;
3492 iommu = domain_get_iommu(domain);
3493 size = aligned_nrpages(paddr, size);
3495 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3500 * Check if DMAR supports zero-length reads on write only
3503 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3504 !cap_zlr(iommu->cap))
3505 prot |= DMA_PTE_READ;
3506 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3507 prot |= DMA_PTE_WRITE;
3509 * paddr - (paddr + size) might be a partial page, we should map the whole
3510 * page. Note: if two parts of one page are mapped separately, we
3511 * might have two guest addresses mapping to the same host paddr, but this
3512 * is not a big problem.
3514 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3515 mm_to_dma_pfn(paddr_pfn), size, prot);
3519 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3520 start_paddr += paddr & ~PAGE_MASK;
3522 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3528 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3529 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3530 size, (unsigned long long)paddr, dir);
3531 return DMA_MAPPING_ERROR;
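/*
 * intel_map_page() and intel_map_resource() below are thin wrappers that
 * differ only in how the physical address is obtained (struct page vs. raw
 * phys_addr_t); all IOVA allocation and PTE setup happens in
 * __intel_map_single() above.
 */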
3534 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3535 unsigned long offset, size_t size,
3536 enum dma_data_direction dir,
3537 unsigned long attrs)
3539 return __intel_map_single(dev, page_to_phys(page) + offset,
3540 size, dir, *dev->dma_mask);
3543 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3544 size_t size, enum dma_data_direction dir,
3545 unsigned long attrs)
3547 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3550 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3552 struct dmar_domain *domain;
3553 unsigned long start_pfn, last_pfn;
3554 unsigned long nrpages;
3555 unsigned long iova_pfn;
3556 struct intel_iommu *iommu;
3557 struct page *freelist;
3558 struct pci_dev *pdev = NULL;
3560 domain = find_domain(dev);
3563 iommu = domain_get_iommu(domain);
3565 iova_pfn = IOVA_PFN(dev_addr);
3567 nrpages = aligned_nrpages(dev_addr, size);
3568 start_pfn = mm_to_dma_pfn(iova_pfn);
3569 last_pfn = start_pfn + nrpages - 1;
3571 if (dev_is_pci(dev))
3572 pdev = to_pci_dev(dev);
3574 freelist = domain_unmap(domain, start_pfn, last_pfn, NULL);
3575 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3576 !has_iova_flush_queue(&domain->iovad)) {
3577 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3578 nrpages, !freelist, 0);
3580 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3581 dma_free_pagelist(freelist);
3583 queue_iova(&domain->iovad, iova_pfn, nrpages,
3584 (unsigned long)freelist);
3586 * queue up the release of the unmap to save the 1/6th of the
3587 * cpu used up by the iotlb flush operation...
3591 trace_unmap_single(dev, dev_addr, size);
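/*
 * In strict mode, for an untrusted device, or when no flush queue exists,
 * the IOTLB is invalidated synchronously above and the page-table pages
 * are freed immediately; otherwise the range is queued on the IOVA flush
 * queue and both the invalidation and the freelist release are batched.
 */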
3594 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3595 size_t size, enum dma_data_direction dir,
3596 unsigned long attrs)
3598 intel_unmap(dev, dev_addr, size);
3601 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3602 size_t size, enum dma_data_direction dir, unsigned long attrs)
3604 intel_unmap(dev, dev_addr, size);
3607 static void *intel_alloc_coherent(struct device *dev, size_t size,
3608 dma_addr_t *dma_handle, gfp_t flags,
3609 unsigned long attrs)
3611 struct page *page = NULL;
3614 if (unlikely(attach_deferred(dev)))
3615 do_deferred_attach(dev);
3617 size = PAGE_ALIGN(size);
3618 order = get_order(size);
3620 if (gfpflags_allow_blocking(flags)) {
3621 unsigned int count = size >> PAGE_SHIFT;
3623 page = dma_alloc_from_contiguous(dev, count, order,
3624 flags & __GFP_NOWARN);
3628 page = alloc_pages(flags, order);
3631 memset(page_address(page), 0, size);
3633 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3635 dev->coherent_dma_mask);
3636 if (*dma_handle != DMA_MAPPING_ERROR)
3637 return page_address(page);
3638 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3639 __free_pages(page, order);
3644 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3645 dma_addr_t dma_handle, unsigned long attrs)
3648 struct page *page = virt_to_page(vaddr);
3650 size = PAGE_ALIGN(size);
3651 order = get_order(size);
3653 intel_unmap(dev, dma_handle, size);
3654 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3655 __free_pages(page, order);
3658 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3659 int nelems, enum dma_data_direction dir,
3660 unsigned long attrs)
3662 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3663 unsigned long nrpages = 0;
3664 struct scatterlist *sg;
3667 for_each_sg(sglist, sg, nelems, i) {
3668 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3671 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3673 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3676 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3677 enum dma_data_direction dir, unsigned long attrs)
3680 struct dmar_domain *domain;
3683 unsigned long iova_pfn;
3685 struct scatterlist *sg;
3686 unsigned long start_vpfn;
3687 struct intel_iommu *iommu;
3689 BUG_ON(dir == DMA_NONE);
3691 if (unlikely(attach_deferred(dev)))
3692 do_deferred_attach(dev);
3694 domain = find_domain(dev);
3698 iommu = domain_get_iommu(domain);
3700 for_each_sg(sglist, sg, nelems, i)
3701 size += aligned_nrpages(sg->offset, sg->length);
3703 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3706 sglist->dma_length = 0;
3711 * Check if DMAR supports zero-length reads on write only
3714 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3715 !cap_zlr(iommu->cap))
3716 prot |= DMA_PTE_READ;
3717 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3718 prot |= DMA_PTE_WRITE;
3720 start_vpfn = mm_to_dma_pfn(iova_pfn);
3722 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3723 if (unlikely(ret)) {
3724 dma_pte_free_pagetable(domain, start_vpfn,
3725 start_vpfn + size - 1,
3726 agaw_to_level(domain->agaw) + 1);
3727 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3731 for_each_sg(sglist, sg, nelems, i)
3732 trace_map_sg(dev, i + 1, nelems, sg);
3737 static u64 intel_get_required_mask(struct device *dev)
3739 return DMA_BIT_MASK(32);
3742 static const struct dma_map_ops intel_dma_ops = {
3743 .alloc = intel_alloc_coherent,
3744 .free = intel_free_coherent,
3745 .map_sg = intel_map_sg,
3746 .unmap_sg = intel_unmap_sg,
3747 .map_page = intel_map_page,
3748 .unmap_page = intel_unmap_page,
3749 .map_resource = intel_map_resource,
3750 .unmap_resource = intel_unmap_resource,
3751 .dma_supported = dma_direct_supported,
3752 .mmap = dma_common_mmap,
3753 .get_sgtable = dma_common_get_sgtable,
3754 .alloc_pages = dma_common_alloc_pages,
3755 .free_pages = dma_common_free_pages,
3756 .get_required_mask = intel_get_required_mask,
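/*
 * The bounce_* helpers below mirror the intel_dma_ops callbacks but route
 * buffers that are not VTD_PAGE aligned through swiotlb bounce pages, so
 * that an untrusted device (e.g. one behind an external-facing port)
 * cannot read or corrupt unrelated kernel memory that happens to share a
 * page with the mapped buffer.
 */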
3760 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3761 enum dma_data_direction dir, enum dma_sync_target target)
3763 struct dmar_domain *domain;
3764 phys_addr_t tlb_addr;
3766 domain = find_domain(dev);
3767 if (WARN_ON(!domain))
3770 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3771 if (is_swiotlb_buffer(tlb_addr))
3772 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3776 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3777 enum dma_data_direction dir, unsigned long attrs,
3780 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3781 struct dmar_domain *domain;
3782 struct intel_iommu *iommu;
3783 unsigned long iova_pfn;
3784 unsigned long nrpages;
3785 phys_addr_t tlb_addr;
3789 if (unlikely(attach_deferred(dev)))
3790 do_deferred_attach(dev);
3792 domain = find_domain(dev);
3794 if (WARN_ON(dir == DMA_NONE || !domain))
3795 return DMA_MAPPING_ERROR;
3797 iommu = domain_get_iommu(domain);
3798 if (WARN_ON(!iommu))
3799 return DMA_MAPPING_ERROR;
3801 nrpages = aligned_nrpages(0, size);
3802 iova_pfn = intel_alloc_iova(dev, domain,
3803 dma_to_mm_pfn(nrpages), dma_mask);
3805 return DMA_MAPPING_ERROR;
3808 * Check if DMAR supports zero-length reads on write only
3811 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3812 !cap_zlr(iommu->cap))
3813 prot |= DMA_PTE_READ;
3814 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3815 prot |= DMA_PTE_WRITE;
3818 * If both the physical buffer start address and size are
3819 * page aligned, we don't need to use a bounce page.
3821 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3822 tlb_addr = swiotlb_tbl_map_single(dev, paddr, size,
3823 aligned_size, dir, attrs);
3824 if (tlb_addr == DMA_MAPPING_ERROR) {
3827 /* Cleanup the padding area. */
3828 void *padding_start = phys_to_virt(tlb_addr);
3829 size_t padding_size = aligned_size;
3831 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3832 (dir == DMA_TO_DEVICE ||
3833 dir == DMA_BIDIRECTIONAL)) {
3834 padding_start += size;
3835 padding_size -= size;
3838 memset(padding_start, 0, padding_size);
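/*
 * Zeroing the unused part of the bounce slot keeps stale kernel data in
 * the padding from being exposed to the device, which is the point of
 * bouncing unaligned buffers for untrusted devices in the first place.
 */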
3844 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3845 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3849 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3851 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3854 if (is_swiotlb_buffer(tlb_addr))
3855 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3856 aligned_size, dir, attrs);
3858 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3859 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3860 size, (unsigned long long)paddr, dir);
3862 return DMA_MAPPING_ERROR;
3866 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3867 enum dma_data_direction dir, unsigned long attrs)
3869 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3870 struct dmar_domain *domain;
3871 phys_addr_t tlb_addr;
3873 domain = find_domain(dev);
3874 if (WARN_ON(!domain))
3877 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3878 if (WARN_ON(!tlb_addr))
3881 intel_unmap(dev, dev_addr, size);
3882 if (is_swiotlb_buffer(tlb_addr))
3883 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3884 aligned_size, dir, attrs);
3886 trace_bounce_unmap_single(dev, dev_addr, size);
3890 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3891 size_t size, enum dma_data_direction dir, unsigned long attrs)
3893 return bounce_map_single(dev, page_to_phys(page) + offset,
3894 size, dir, attrs, *dev->dma_mask);
3898 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3899 enum dma_data_direction dir, unsigned long attrs)
3901 return bounce_map_single(dev, phys_addr, size,
3902 dir, attrs, *dev->dma_mask);
3906 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3907 enum dma_data_direction dir, unsigned long attrs)
3909 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3913 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3914 enum dma_data_direction dir, unsigned long attrs)
3916 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3920 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3921 enum dma_data_direction dir, unsigned long attrs)
3923 struct scatterlist *sg;
3926 for_each_sg(sglist, sg, nelems, i)
3927 bounce_unmap_page(dev, sg->dma_address,
3928 sg_dma_len(sg), dir, attrs);
3932 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3933 enum dma_data_direction dir, unsigned long attrs)
3936 struct scatterlist *sg;
3938 for_each_sg(sglist, sg, nelems, i) {
3939 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3940 sg->offset, sg->length,
3942 if (sg->dma_address == DMA_MAPPING_ERROR)
3944 sg_dma_len(sg) = sg->length;
3947 for_each_sg(sglist, sg, nelems, i)
3948 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3953 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3958 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3959 size_t size, enum dma_data_direction dir)
3961 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3965 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3966 size_t size, enum dma_data_direction dir)
3968 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3972 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3973 int nelems, enum dma_data_direction dir)
3975 struct scatterlist *sg;
3978 for_each_sg(sglist, sg, nelems, i)
3979 bounce_sync_single(dev, sg_dma_address(sg),
3980 sg_dma_len(sg), dir, SYNC_FOR_CPU);
3984 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3985 int nelems, enum dma_data_direction dir)
3987 struct scatterlist *sg;
3990 for_each_sg(sglist, sg, nelems, i)
3991 bounce_sync_single(dev, sg_dma_address(sg),
3992 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3995 static const struct dma_map_ops bounce_dma_ops = {
3996 .alloc = intel_alloc_coherent,
3997 .free = intel_free_coherent,
3998 .map_sg = bounce_map_sg,
3999 .unmap_sg = bounce_unmap_sg,
4000 .map_page = bounce_map_page,
4001 .unmap_page = bounce_unmap_page,
4002 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4003 .sync_single_for_device = bounce_sync_single_for_device,
4004 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4005 .sync_sg_for_device = bounce_sync_sg_for_device,
4006 .map_resource = bounce_map_resource,
4007 .unmap_resource = bounce_unmap_resource,
4008 .alloc_pages = dma_common_alloc_pages,
4009 .free_pages = dma_common_free_pages,
4010 .dma_supported = dma_direct_supported,
4013 static inline int iommu_domain_cache_init(void)
4017 iommu_domain_cache = kmem_cache_create("iommu_domain",
4018 sizeof(struct dmar_domain),
4023 if (!iommu_domain_cache) {
4024 pr_err("Couldn't create iommu_domain cache\n");
4031 static inline int iommu_devinfo_cache_init(void)
4035 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4036 sizeof(struct device_domain_info),
4040 if (!iommu_devinfo_cache) {
4041 pr_err("Couldn't create devinfo cache\n");
4048 static int __init iommu_init_mempool(void)
4051 ret = iova_cache_get();
4055 ret = iommu_domain_cache_init();
4059 ret = iommu_devinfo_cache_init();
4063 kmem_cache_destroy(iommu_domain_cache);
4070 static void __init iommu_exit_mempool(void)
4072 kmem_cache_destroy(iommu_devinfo_cache);
4073 kmem_cache_destroy(iommu_domain_cache);
4077 static void __init init_no_remapping_devices(void)
4079 struct dmar_drhd_unit *drhd;
4083 for_each_drhd_unit(drhd) {
4084 if (!drhd->include_all) {
4085 for_each_active_dev_scope(drhd->devices,
4086 drhd->devices_cnt, i, dev)
4088 /* ignore DMAR unit if no devices exist */
4089 if (i == drhd->devices_cnt)
4094 for_each_active_drhd_unit(drhd) {
4095 if (drhd->include_all)
4098 for_each_active_dev_scope(drhd->devices,
4099 drhd->devices_cnt, i, dev)
4100 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4102 if (i < drhd->devices_cnt)
4105 /* This IOMMU has *only* gfx devices. Either bypass it or
4106 set the gfx_mapped flag, as appropriate */
4107 drhd->gfx_dedicated = 1;
4113 #ifdef CONFIG_SUSPEND
4114 static int init_iommu_hw(void)
4116 struct dmar_drhd_unit *drhd;
4117 struct intel_iommu *iommu = NULL;
4119 for_each_active_iommu(iommu, drhd)
4121 dmar_reenable_qi(iommu);
4123 for_each_iommu(iommu, drhd) {
4124 if (drhd->ignored) {
4126 * we always have to disable PMRs or DMA may fail on
4130 iommu_disable_protect_mem_regions(iommu);
4134 iommu_flush_write_buffer(iommu);
4136 iommu_set_root_entry(iommu);
4138 iommu->flush.flush_context(iommu, 0, 0, 0,
4139 DMA_CCMD_GLOBAL_INVL);
4140 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4141 iommu_enable_translation(iommu);
4142 iommu_disable_protect_mem_regions(iommu);
4148 static void iommu_flush_all(void)
4150 struct dmar_drhd_unit *drhd;
4151 struct intel_iommu *iommu;
4153 for_each_active_iommu(iommu, drhd) {
4154 iommu->flush.flush_context(iommu, 0, 0, 0,
4155 DMA_CCMD_GLOBAL_INVL);
4156 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4157 DMA_TLB_GLOBAL_FLUSH);
4161 static int iommu_suspend(void)
4163 struct dmar_drhd_unit *drhd;
4164 struct intel_iommu *iommu = NULL;
4167 for_each_active_iommu(iommu, drhd) {
4168 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4170 if (!iommu->iommu_state)
4176 for_each_active_iommu(iommu, drhd) {
4177 iommu_disable_translation(iommu);
4179 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4181 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4182 readl(iommu->reg + DMAR_FECTL_REG);
4183 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4184 readl(iommu->reg + DMAR_FEDATA_REG);
4185 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4186 readl(iommu->reg + DMAR_FEADDR_REG);
4187 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4188 readl(iommu->reg + DMAR_FEUADDR_REG);
4190 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4195 for_each_active_iommu(iommu, drhd)
4196 kfree(iommu->iommu_state);
4201 static void iommu_resume(void)
4203 struct dmar_drhd_unit *drhd;
4204 struct intel_iommu *iommu = NULL;
4207 if (init_iommu_hw()) {
4209 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4211 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4215 for_each_active_iommu(iommu, drhd) {
4217 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4219 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4220 iommu->reg + DMAR_FECTL_REG);
4221 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4222 iommu->reg + DMAR_FEDATA_REG);
4223 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4224 iommu->reg + DMAR_FEADDR_REG);
4225 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4226 iommu->reg + DMAR_FEUADDR_REG);
4228 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4231 for_each_active_iommu(iommu, drhd)
4232 kfree(iommu->iommu_state);
4235 static struct syscore_ops iommu_syscore_ops = {
4236 .resume = iommu_resume,
4237 .suspend = iommu_suspend,
4240 static void __init init_iommu_pm_ops(void)
4242 register_syscore_ops(&iommu_syscore_ops);
4246 static inline void init_iommu_pm_ops(void) {}
4247 #endif /* CONFIG_PM */
4249 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4251 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4252 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4253 rmrr->end_address <= rmrr->base_address ||
4254 arch_rmrr_sanity_check(rmrr))
4260 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4262 struct acpi_dmar_reserved_memory *rmrr;
4263 struct dmar_rmrr_unit *rmrru;
4265 rmrr = (struct acpi_dmar_reserved_memory *)header;
4266 if (rmrr_sanity_check(rmrr)) {
4268 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4269 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4270 rmrr->base_address, rmrr->end_address,
4271 dmi_get_system_info(DMI_BIOS_VENDOR),
4272 dmi_get_system_info(DMI_BIOS_VERSION),
4273 dmi_get_system_info(DMI_PRODUCT_VERSION));
4274 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
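/*
 * Note that a failed sanity check only warns and taints the kernel; the
 * RMRR is still recorded below, since dropping it entirely could break a
 * device that genuinely depends on the reserved region.
 */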
4277 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4281 rmrru->hdr = header;
4283 rmrru->base_address = rmrr->base_address;
4284 rmrru->end_address = rmrr->end_address;
4286 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4287 ((void *)rmrr) + rmrr->header.length,
4288 &rmrru->devices_cnt);
4289 if (rmrru->devices_cnt && rmrru->devices == NULL)
4292 list_add(&rmrru->list, &dmar_rmrr_units);
4301 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4303 struct dmar_atsr_unit *atsru;
4304 struct acpi_dmar_atsr *tmp;
4306 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4308 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4309 if (atsr->segment != tmp->segment)
4311 if (atsr->header.length != tmp->header.length)
4313 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4320 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4322 struct acpi_dmar_atsr *atsr;
4323 struct dmar_atsr_unit *atsru;
4325 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4328 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4329 atsru = dmar_find_atsr(atsr);
4333 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4338 * If memory is allocated from slab by ACPI _DSM method, we need to
4339 * copy the memory content because the memory buffer will be freed
4342 atsru->hdr = (void *)(atsru + 1);
4343 memcpy(atsru->hdr, hdr, hdr->length);
4344 atsru->include_all = atsr->flags & 0x1;
4345 if (!atsru->include_all) {
4346 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4347 (void *)atsr + atsr->header.length,
4348 &atsru->devices_cnt);
4349 if (atsru->devices_cnt && atsru->devices == NULL) {
4355 list_add_rcu(&atsru->list, &dmar_atsr_units);
4360 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4362 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4366 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4368 struct acpi_dmar_atsr *atsr;
4369 struct dmar_atsr_unit *atsru;
4371 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4372 atsru = dmar_find_atsr(atsr);
4374 list_del_rcu(&atsru->list);
4376 intel_iommu_free_atsr(atsru);
4382 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4386 struct acpi_dmar_atsr *atsr;
4387 struct dmar_atsr_unit *atsru;
4389 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4390 atsru = dmar_find_atsr(atsr);
4394 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4395 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4403 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4406 struct intel_iommu *iommu = dmaru->iommu;
4408 if (g_iommus[iommu->seq_id])
4411 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4412 pr_warn("%s: Doesn't support hardware pass through.\n",
4416 if (!ecap_sc_support(iommu->ecap) &&
4417 domain_update_iommu_snooping(iommu)) {
4418 pr_warn("%s: Doesn't support snooping.\n",
4422 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4423 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4424 pr_warn("%s: Doesn't support large page.\n",
4430 * Disable translation if already enabled prior to OS handover.
4432 if (iommu->gcmd & DMA_GCMD_TE)
4433 iommu_disable_translation(iommu);
4435 g_iommus[iommu->seq_id] = iommu;
4436 ret = iommu_init_domains(iommu);
4438 ret = iommu_alloc_root_entry(iommu);
4442 intel_svm_check(iommu);
4444 if (dmaru->ignored) {
4446 * we always have to disable PMRs or DMA may fail on this device
4449 iommu_disable_protect_mem_regions(iommu);
4453 intel_iommu_init_qi(iommu);
4454 iommu_flush_write_buffer(iommu);
4456 #ifdef CONFIG_INTEL_IOMMU_SVM
4457 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4458 ret = intel_svm_enable_prq(iommu);
4463 ret = dmar_set_interrupt(iommu);
4467 iommu_set_root_entry(iommu);
4468 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4469 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4470 iommu_enable_translation(iommu);
4472 iommu_disable_protect_mem_regions(iommu);
4476 disable_dmar_iommu(iommu);
4478 free_dmar_iommu(iommu);
4482 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4485 struct intel_iommu *iommu = dmaru->iommu;
4487 if (!intel_iommu_enabled)
4493 ret = intel_iommu_add(dmaru);
4495 disable_dmar_iommu(iommu);
4496 free_dmar_iommu(iommu);
4502 static void intel_iommu_free_dmars(void)
4504 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4505 struct dmar_atsr_unit *atsru, *atsr_n;
4507 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4508 list_del(&rmrru->list);
4509 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4513 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4514 list_del(&atsru->list);
4515 intel_iommu_free_atsr(atsru);
4519 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4522 struct pci_bus *bus;
4523 struct pci_dev *bridge = NULL;
4525 struct acpi_dmar_atsr *atsr;
4526 struct dmar_atsr_unit *atsru;
4528 dev = pci_physfn(dev);
4529 for (bus = dev->bus; bus; bus = bus->parent) {
4531 /* If it's an integrated device, allow ATS */
4534 /* Connected via non-PCIe: no ATS */
4535 if (!pci_is_pcie(bridge) ||
4536 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4538 /* If we found the root port, look it up in the ATSR */
4539 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4544 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4545 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4546 if (atsr->segment != pci_domain_nr(dev->bus))
4549 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4550 if (tmp == &bridge->dev)
4553 if (atsru->include_all)
4563 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4566 struct dmar_rmrr_unit *rmrru;
4567 struct dmar_atsr_unit *atsru;
4568 struct acpi_dmar_atsr *atsr;
4569 struct acpi_dmar_reserved_memory *rmrr;
4571 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4574 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4575 rmrr = container_of(rmrru->hdr,
4576 struct acpi_dmar_reserved_memory, header);
4577 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4578 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4579 ((void *)rmrr) + rmrr->header.length,
4580 rmrr->segment, rmrru->devices,
4581 rmrru->devices_cnt);
4584 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4585 dmar_remove_dev_scope(info, rmrr->segment,
4586 rmrru->devices, rmrru->devices_cnt);
4590 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4591 if (atsru->include_all)
4594 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4595 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4596 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4597 (void *)atsr + atsr->header.length,
4598 atsr->segment, atsru->devices,
4599 atsru->devices_cnt);
4604 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4605 if (dmar_remove_dev_scope(info, atsr->segment,
4606 atsru->devices, atsru->devices_cnt))
4614 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4615 unsigned long val, void *v)
4617 struct memory_notify *mhp = v;
4618 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4619 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4623 case MEM_GOING_ONLINE:
4624 if (iommu_domain_identity_map(si_domain,
4625 start_vpfn, last_vpfn)) {
4626 pr_warn("Failed to build identity map for [%lx-%lx]\n",
4627 start_vpfn, last_vpfn);
4633 case MEM_CANCEL_ONLINE:
4635 struct dmar_drhd_unit *drhd;
4636 struct intel_iommu *iommu;
4637 struct page *freelist;
4639 freelist = domain_unmap(si_domain,
4640 start_vpfn, last_vpfn,
4644 for_each_active_iommu(iommu, drhd)
4645 iommu_flush_iotlb_psi(iommu, si_domain,
4646 start_vpfn, mhp->nr_pages,
4649 dma_free_pagelist(freelist);
4657 static struct notifier_block intel_iommu_memory_nb = {
4658 .notifier_call = intel_iommu_memory_notifier,
4662 static void free_all_cpu_cached_iovas(unsigned int cpu)
4666 for (i = 0; i < g_num_of_iommus; i++) {
4667 struct intel_iommu *iommu = g_iommus[i];
4668 struct dmar_domain *domain;
4674 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4675 domain = get_iommu_domain(iommu, (u16)did);
4677 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4680 free_cpu_cached_iovas(cpu, &domain->iovad);
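/*
 * Used by the CPU hotplug callback below: a CPU that goes offline can no
 * longer return the IOVAs sitting in its per-CPU rcache, so they are
 * flushed back to the global IOVA pools here.
 */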
4685 static int intel_iommu_cpu_dead(unsigned int cpu)
4687 free_all_cpu_cached_iovas(cpu);
4691 static void intel_disable_iommus(void)
4693 struct intel_iommu *iommu = NULL;
4694 struct dmar_drhd_unit *drhd;
4696 for_each_iommu(iommu, drhd)
4697 iommu_disable_translation(iommu);
4700 void intel_iommu_shutdown(void)
4702 struct dmar_drhd_unit *drhd;
4703 struct intel_iommu *iommu = NULL;
4705 if (no_iommu || dmar_disabled)
4708 down_write(&dmar_global_lock);
4710 /* Disable PMRs explicitly here. */
4711 for_each_iommu(iommu, drhd)
4712 iommu_disable_protect_mem_regions(iommu);
4714 /* Make sure the IOMMUs are switched off */
4715 intel_disable_iommus();
4717 up_write(&dmar_global_lock);
4720 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4722 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4724 return container_of(iommu_dev, struct intel_iommu, iommu);
4727 static ssize_t intel_iommu_show_version(struct device *dev,
4728 struct device_attribute *attr,
4731 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4732 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4733 return sprintf(buf, "%d:%d\n",
4734 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4736 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4738 static ssize_t intel_iommu_show_address(struct device *dev,
4739 struct device_attribute *attr,
4742 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4743 return sprintf(buf, "%llx\n", iommu->reg_phys);
4745 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4747 static ssize_t intel_iommu_show_cap(struct device *dev,
4748 struct device_attribute *attr,
4751 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4752 return sprintf(buf, "%llx\n", iommu->cap);
4754 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4756 static ssize_t intel_iommu_show_ecap(struct device *dev,
4757 struct device_attribute *attr,
4760 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4761 return sprintf(buf, "%llx\n", iommu->ecap);
4763 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4765 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4766 struct device_attribute *attr,
4769 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4770 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4772 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4774 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4775 struct device_attribute *attr,
4778 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4779 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4780 cap_ndoms(iommu->cap)));
4782 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4784 static struct attribute *intel_iommu_attrs[] = {
4785 &dev_attr_version.attr,
4786 &dev_attr_address.attr,
4788 &dev_attr_ecap.attr,
4789 &dev_attr_domains_supported.attr,
4790 &dev_attr_domains_used.attr,
4794 static struct attribute_group intel_iommu_group = {
4795 .name = "intel-iommu",
4796 .attrs = intel_iommu_attrs,
4799 const struct attribute_group *intel_iommu_groups[] = {
4804 static inline bool has_external_pci(void)
4806 struct pci_dev *pdev = NULL;
4808 for_each_pci_dev(pdev)
4809 if (pdev->external_facing)
4815 static int __init platform_optin_force_iommu(void)
4817 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4820 if (no_iommu || dmar_disabled)
4821 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4824 * If Intel-IOMMU is disabled by default, we will apply identity
4825 * map for all devices except those marked as being untrusted.
4828 iommu_set_default_passthrough(false);
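/*
 * When the platform opts in to DMA protection and an external-facing PCI
 * port exists, the IOMMU is forced on even if it was disabled on the
 * command line, while the default domain type is set to passthrough so
 * that all devices except untrusted ones stay identity mapped.
 */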
4836 static int __init probe_acpi_namespace_devices(void)
4838 struct dmar_drhd_unit *drhd;
4839 /* To avoid a -Wunused-but-set-variable warning. */
4840 struct intel_iommu *iommu __maybe_unused;
4844 for_each_active_iommu(iommu, drhd) {
4845 for_each_active_dev_scope(drhd->devices,
4846 drhd->devices_cnt, i, dev) {
4847 struct acpi_device_physical_node *pn;
4848 struct iommu_group *group;
4849 struct acpi_device *adev;
4851 if (dev->bus != &acpi_bus_type)
4854 adev = to_acpi_device(dev);
4855 mutex_lock(&adev->physical_node_lock);
4856 list_for_each_entry(pn,
4857 &adev->physical_node_list, node) {
4858 group = iommu_group_get(pn->dev);
4860 iommu_group_put(group);
4864 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4865 ret = iommu_probe_device(pn->dev);
4869 mutex_unlock(&adev->physical_node_lock);
4879 int __init intel_iommu_init(void)
4882 struct dmar_drhd_unit *drhd;
4883 struct intel_iommu *iommu;
4886 * Intel IOMMU is required for a TXT/tboot launch or platform
4887 * opt in, so enforce that.
4889 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4891 if (iommu_init_mempool()) {
4892 if (force_on)
4893 panic("tboot: Failed to initialize iommu memory\n");
4894 return -ENOMEM;
4895 }
4897 down_write(&dmar_global_lock);
4898 if (dmar_table_init()) {
4899 if (force_on)
4900 panic("tboot: Failed to initialize DMAR table\n");
4901 goto out_free_dmar;
4902 }
4904 if (dmar_dev_scope_init() < 0) {
4905 if (force_on)
4906 panic("tboot: Failed to initialize DMAR device scope\n");
4907 goto out_free_dmar;
4908 }
4910 up_write(&dmar_global_lock);
4913 * The bus notifier takes the dmar_global_lock, so lockdep will
4914 * complain later when we register it under the lock.
4916 dmar_register_bus_notifier();
4918 down_write(&dmar_global_lock);
4920 if (!no_iommu)
4921 intel_iommu_debugfs_init();
4923 if (no_iommu || dmar_disabled) {
4924 /*
4925 * We exit the function here to ensure IOMMU's remapping and
4926 * mempool aren't setup, which means that the IOMMU's PMRs
4927 * won't be disabled via the call to init_dmars(). So disable
4928 * it explicitly here. The PMRs were setup by tboot prior to
4929 * calling SENTER, but the kernel is expected to reset/tear
4930 * down the PMRs.
4931 */
4932 if (intel_iommu_tboot_noforce) {
4933 for_each_iommu(iommu, drhd)
4934 iommu_disable_protect_mem_regions(iommu);
4935 }
4937 /*
4938 * Make sure the IOMMUs are switched off, even when we
4939 * boot into a kexec kernel and the previous kernel left
4940 * them enabled
4941 */
4942 intel_disable_iommus();
4943 goto out_free_dmar;
4944 }
4946 if (list_empty(&dmar_rmrr_units))
4947 pr_info("No RMRR found\n");
4949 if (list_empty(&dmar_atsr_units))
4950 pr_info("No ATSR found\n");
4952 if (dmar_init_reserved_ranges()) {
4953 if (force_on)
4954 panic("tboot: Failed to reserve iommu ranges\n");
4955 goto out_free_reserved_range;
4956 }
4958 if (dmar_map_gfx)
4959 intel_iommu_gfx_mapped = 1;
4961 init_no_remapping_devices();
4963 ret = init_dmars();
4964 if (ret) {
4965 if (force_on)
4966 panic("tboot: Failed to initialize DMARs\n");
4967 pr_err("Initialization failed\n");
4968 goto out_free_reserved_range;
4969 }
4970 up_write(&dmar_global_lock);
4972 init_iommu_pm_ops();
4974 down_read(&dmar_global_lock);
4975 for_each_active_iommu(iommu, drhd) {
4976 iommu_device_sysfs_add(&iommu->iommu, NULL,
4977 intel_iommu_groups,
4978 "%s", iommu->name);
4979 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4980 iommu_device_register(&iommu->iommu);
4981 }
4982 up_read(&dmar_global_lock);
4984 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4985 if (si_domain && !hw_pass_through)
4986 register_memory_notifier(&intel_iommu_memory_nb);
4987 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4988 intel_iommu_cpu_dead);
4990 down_read(&dmar_global_lock);
4991 if (probe_acpi_namespace_devices())
4992 pr_warn("ACPI name space devices didn't probe correctly\n");
4994 /* Finally, we enable the DMA remapping hardware. */
4995 for_each_iommu(iommu, drhd) {
4996 if (!drhd->ignored && !translation_pre_enabled(iommu))
4997 iommu_enable_translation(iommu);
4999 iommu_disable_protect_mem_regions(iommu);
5000 }
5001 up_read(&dmar_global_lock);
5003 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5005 intel_iommu_enabled = 1;
5007 return 0;
5009 out_free_reserved_range:
5010 put_iova_domain(&reserved_iova_list);
5011 out_free_dmar:
5012 intel_iommu_free_dmars();
5013 up_write(&dmar_global_lock);
5014 iommu_exit_mempool();
5015 return ret;
5016 }
5018 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5019 {
5020 struct intel_iommu *iommu = opaque;
5022 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5023 return 0;
5024 }
5026 /*
5027 * NB - intel-iommu lacks any sort of reference counting for the users of
5028 * dependent devices. If multiple endpoints have intersecting dependent
5029 * devices, unbinding the driver from any one of them will possibly leave
5030 * the others unable to operate.
5031 */
5032 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5034 if (!iommu || !dev || !dev_is_pci(dev))
5035 return;
5037 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5038 }
5040 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5042 struct dmar_domain *domain;
5043 struct intel_iommu *iommu;
5044 unsigned long flags;
5046 assert_spin_locked(&device_domain_lock);
5051 iommu = info->iommu;
5052 domain = info->domain;
5055 if (dev_is_pci(info->dev) && sm_supported(iommu))
5056 intel_pasid_tear_down_entry(iommu, info->dev,
5057 PASID_RID2PASID, false);
5059 iommu_disable_dev_iotlb(info);
5060 if (!dev_is_real_dma_subdevice(info->dev))
5061 domain_context_clear(iommu, info->dev);
5062 intel_pasid_free_table(info->dev);
5065 unlink_domain_info(info);
5067 spin_lock_irqsave(&iommu->lock, flags);
5068 domain_detach_iommu(domain, iommu);
5069 spin_unlock_irqrestore(&iommu->lock, flags);
5071 free_devinfo_mem(info);
5074 static void dmar_remove_one_dev_info(struct device *dev)
5075 {
5076 struct device_domain_info *info;
5077 unsigned long flags;
5079 spin_lock_irqsave(&device_domain_lock, flags);
5080 info = get_domain_info(dev);
5081 if (info)
5082 __dmar_remove_one_dev_info(info);
5083 spin_unlock_irqrestore(&device_domain_lock, flags);
5084 }
5086 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5087 {
5088 int adjust_width;
5090 /* calculate AGAW */
5091 domain->gaw = guest_width;
5092 adjust_width = guestwidth_to_adjustwidth(guest_width);
5093 domain->agaw = width_to_agaw(adjust_width);
5095 domain->iommu_coherency = 0;
5096 domain->iommu_snooping = 0;
5097 domain->iommu_superpage = 0;
5098 domain->max_addr = 0;
5100 /* always allocate the top pgd */
5101 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5102 if (!domain->pgd)
5103 return -ENOMEM;
5104 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5105 return 0;
5106 }
5108 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5110 init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5111 copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5113 if (!intel_iommu_strict &&
5114 init_iova_flush_queue(&dmar_domain->iovad,
5115 iommu_flush_iova, iova_entry_free))
5116 pr_info("iova flush queue initialization failed\n");
5119 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5120 {
5121 struct dmar_domain *dmar_domain;
5122 struct iommu_domain *domain;
5124 switch (type) {
5125 case IOMMU_DOMAIN_DMA:
5126 case IOMMU_DOMAIN_UNMANAGED:
5127 dmar_domain = alloc_domain(0);
5128 if (!dmar_domain) {
5129 pr_err("Can't allocate dmar_domain\n");
5130 return NULL;
5131 }
5132 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5133 pr_err("Domain initialization failed\n");
5134 domain_exit(dmar_domain);
5135 return NULL;
5136 }
5138 if (type == IOMMU_DOMAIN_DMA)
5139 intel_init_iova_domain(dmar_domain);
5141 domain = &dmar_domain->domain;
5142 domain->geometry.aperture_start = 0;
5143 domain->geometry.aperture_end =
5144 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5145 domain->geometry.force_aperture = true;
5147 return domain;
5148 case IOMMU_DOMAIN_IDENTITY:
5149 return &si_domain->domain;
5150 default:
5151 return NULL;
5152 }
5154 return NULL;
5155 }
5157 static void intel_iommu_domain_free(struct iommu_domain *domain)
5159 if (domain != &si_domain->domain)
5160 domain_exit(to_dmar_domain(domain));
5163 /*
5164 * Check whether a @domain could be attached to the @dev through the
5165 * aux-domain attach/detach APIs.
5166 */
5167 static bool
5168 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5169 {
5170 struct device_domain_info *info = get_domain_info(dev);
5172 return info && info->auxd_enabled &&
5173 domain->type == IOMMU_DOMAIN_UNMANAGED;
5174 }
5176 static void auxiliary_link_device(struct dmar_domain *domain,
5179 struct device_domain_info *info = get_domain_info(dev);
5181 assert_spin_locked(&device_domain_lock);
5185 domain->auxd_refcnt++;
5186 list_add(&domain->auxd, &info->auxiliary_domains);
5189 static void auxiliary_unlink_device(struct dmar_domain *domain,
5192 struct device_domain_info *info = get_domain_info(dev);
5194 assert_spin_locked(&device_domain_lock);
5198 list_del(&domain->auxd);
5199 domain->auxd_refcnt--;
5201 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5202 ioasid_free(domain->default_pasid);
5205 static int aux_domain_add_dev(struct dmar_domain *domain,
5209 unsigned long flags;
5210 struct intel_iommu *iommu;
5212 iommu = device_to_iommu(dev, NULL, NULL);
5216 if (domain->default_pasid <= 0) {
5219 /* No private data needed for the default pasid */
5220 pasid = ioasid_alloc(NULL, PASID_MIN,
5221 pci_max_pasids(to_pci_dev(dev)) - 1,
5223 if (pasid == INVALID_IOASID) {
5224 pr_err("Can't allocate default pasid\n");
5227 domain->default_pasid = pasid;
5230 spin_lock_irqsave(&device_domain_lock, flags);
5232 * iommu->lock must be held to attach domain to iommu and setup the
5233 * pasid entry for second level translation.
5235 spin_lock(&iommu->lock);
5236 ret = domain_attach_iommu(domain, iommu);
5240 /* Setup the PASID entry for mediated devices: */
5241 if (domain_use_first_level(domain))
5242 ret = domain_setup_first_level(iommu, domain, dev,
5243 domain->default_pasid);
5245 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5246 domain->default_pasid);
5249 spin_unlock(&iommu->lock);
5251 auxiliary_link_device(domain, dev);
5253 spin_unlock_irqrestore(&device_domain_lock, flags);
5258 domain_detach_iommu(domain, iommu);
5260 spin_unlock(&iommu->lock);
5261 spin_unlock_irqrestore(&device_domain_lock, flags);
5262 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5263 ioasid_free(domain->default_pasid);
5268 static void aux_domain_remove_dev(struct dmar_domain *domain,
5271 struct device_domain_info *info;
5272 struct intel_iommu *iommu;
5273 unsigned long flags;
5275 if (!is_aux_domain(dev, &domain->domain))
5278 spin_lock_irqsave(&device_domain_lock, flags);
5279 info = get_domain_info(dev);
5280 iommu = info->iommu;
5282 auxiliary_unlink_device(domain, dev);
5284 spin_lock(&iommu->lock);
5285 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5286 domain_detach_iommu(domain, iommu);
5287 spin_unlock(&iommu->lock);
5289 spin_unlock_irqrestore(&device_domain_lock, flags);
5292 static int prepare_domain_attach_device(struct iommu_domain *domain,
5295 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5296 struct intel_iommu *iommu;
5299 iommu = device_to_iommu(dev, NULL, NULL);
5303 /* check if this iommu agaw is sufficient for max mapped address */
5304 addr_width = agaw_to_width(iommu->agaw);
5305 if (addr_width > cap_mgaw(iommu->cap))
5306 addr_width = cap_mgaw(iommu->cap);
5308 if (dmar_domain->max_addr > (1LL << addr_width)) {
5309 dev_err(dev, "%s: iommu width (%d) is not "
5310 "sufficient for the mapped address (%llx)\n",
5311 __func__, addr_width, dmar_domain->max_addr);
5314 dmar_domain->gaw = addr_width;
5317 * Knock out extra levels of page tables if necessary
5319 while (iommu->agaw < dmar_domain->agaw) {
5320 struct dma_pte *pte;
5322 pte = dmar_domain->pgd;
5323 if (dma_pte_present(pte)) {
5324 dmar_domain->pgd = (struct dma_pte *)
5325 phys_to_virt(dma_pte_addr(pte));
5326 free_pgtable_page(pte);
5328 dmar_domain->agaw--;
5334 static int intel_iommu_attach_device(struct iommu_domain *domain,
5339 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5340 device_is_rmrr_locked(dev)) {
5341 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5345 if (is_aux_domain(dev, domain))
5348 /* normally dev is not mapped */
5349 if (unlikely(domain_context_mapped(dev))) {
5350 struct dmar_domain *old_domain;
5352 old_domain = find_domain(dev);
5354 dmar_remove_one_dev_info(dev);
5357 ret = prepare_domain_attach_device(domain, dev);
5361 return domain_add_dev_info(to_dmar_domain(domain), dev);
5364 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5369 if (!is_aux_domain(dev, domain))
5372 ret = prepare_domain_attach_device(domain, dev);
5376 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5379 static void intel_iommu_detach_device(struct iommu_domain *domain,
5382 dmar_remove_one_dev_info(dev);
5385 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5388 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5391 #ifdef CONFIG_INTEL_IOMMU_SVM
5392 /*
5393 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5394 * VT-d granularity. Invalidation is typically included in the unmap operation
5395 * as a result of DMA or VFIO unmap. However, for assigned devices guest
5396 * owns the first level page tables. Invalidations of translation caches in the
5397 * guest are trapped and passed down to the host.
5398 *
5399 * vIOMMU in the guest will only expose first level page tables, therefore
5400 * we do not support IOTLB granularity for requests without PASID (second level).
5401 *
5402 * For example, to find the VT-d granularity encoding for IOTLB
5403 * type and page selective granularity within PASID:
5404 * X: indexed by iommu cache type
5405 * Y: indexed by enum iommu_inv_granularity
5406 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5407 */
5409 static const int
5410 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5411 /*
5412 * PASID based IOTLB invalidation: PASID selective (per PASID),
5413 * page selective (address granularity)
5414 */
5415 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5416 /* PASID based dev TLBs */
5417 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5418 /* PASID cache */
5419 {-EINVAL, -EINVAL, -EINVAL}
5420 };
5422 static inline int to_vtd_granularity(int type, int granu)
5423 {
5424 return inv_type_granu_table[type][granu];
5425 }
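/*
 * Illustrative lookup (values taken from the table above): an IOTLB
 * invalidation (cache type bit 0) requested with IOMMU_INV_GRANU_ADDR
 * granularity resolves to QI_GRAN_PSI_PASID, while any slot holding
 * -EINVAL marks a combination the caller must reject:
 *
 *	int granu = to_vtd_granularity(0, IOMMU_INV_GRANU_ADDR);
 *	// granu == QI_GRAN_PSI_PASID
 */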
5427 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5428 {
5429 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5431 /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
5432 * IOMMU cache invalidate API passes granu_size in bytes, and number of
5433 * granu size in contiguous memory.
5434 */
5435 return order_base_2(nr_pages);
5436 }
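/*
 * Worked example of the encoding above: granu_size = 4KiB and
 * nr_granules = 512 cover 2MiB, i.e. 512 4K pages, so
 * to_vtd_size(SZ_4K, 512) returns order_base_2(512) == 9.
 */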
5438 static int
5439 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5440 struct iommu_cache_invalidate_info *inv_info)
5441 {
5442 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5443 struct device_domain_info *info;
5444 struct intel_iommu *iommu;
5445 unsigned long flags;
5446 int cache_type;
5447 u8 bus, devfn;
5448 u16 did, sid;
5449 int ret = 0;
5450 u64 size = 0;
5452 if (!inv_info || !dmar_domain)
5453 return -EINVAL;
5455 if (!dev || !dev_is_pci(dev))
5456 return -ENODEV;
5458 iommu = device_to_iommu(dev, &bus, &devfn);
5459 if (!iommu)
5460 return -ENODEV;
5462 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5463 return -EINVAL;
5465 spin_lock_irqsave(&device_domain_lock, flags);
5466 spin_lock(&iommu->lock);
5467 info = get_domain_info(dev);
5468 if (!info) {
5469 ret = -EINVAL;
5470 goto out_unlock;
5471 }
5472 did = dmar_domain->iommu_did[iommu->seq_id];
5473 sid = PCI_DEVID(bus, devfn);
5475 /* Size is only valid in address selective invalidation */
5476 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5477 size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5478 inv_info->granu.addr_info.nb_granules);
5480 for_each_set_bit(cache_type,
5481 (unsigned long *)&inv_info->cache,
5482 IOMMU_CACHE_INV_TYPE_NR) {
5483 int granu = 0;
5484 u64 pasid = 0;
5485 u64 addr = 0;
5487 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5488 if (granu == -EINVAL) {
5489 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5490 cache_type, inv_info->granularity);
5495 * PASID is stored in different locations based on the
5498 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5499 (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5500 pasid = inv_info->granu.pasid_info.pasid;
5501 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5502 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5503 pasid = inv_info->granu.addr_info.pasid;
5505 switch (BIT(cache_type)) {
5506 case IOMMU_CACHE_INV_TYPE_IOTLB:
5507 /* HW will ignore LSB bits based on address mask */
5508 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5510 (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5511 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5512 inv_info->granu.addr_info.addr, size);
5516 * If granu is PASID-selective, address is ignored.
5517 * We use npages = -1 to indicate that.
5519 qi_flush_piotlb(iommu, did, pasid,
5520 mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5521 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5522 inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5524 if (!info->ats_enabled)
5527 * Always flush device IOTLB if ATS is enabled. vIOMMU
5528 * in the guest may assume IOTLB flush is inclusive,
5529 * which is more efficient.
5532 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5534 * PASID based device TLB invalidation does not support
5535 * IOMMU_INV_GRANU_PASID granularity but only supports
5536 * IOMMU_INV_GRANU_ADDR.
5537 * The equivalent of that is we set the size to be the
5538 * entire range of 64 bit. User only provides PASID info
5539 * without address info. So we set addr to 0.
5541 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5542 size = 64 - VTD_PAGE_SHIFT;
5544 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5545 addr = inv_info->granu.addr_info.addr;
5548 if (info->ats_enabled)
5549 qi_flush_dev_iotlb_pasid(iommu, sid,
5551 info->ats_qdep, addr,
5554 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5557 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5562 out_unlock:
5563 spin_unlock(&iommu->lock);
5564 spin_unlock_irqrestore(&device_domain_lock, flags);
5566 return ret;
5567 }
5568 #endif
5570 static int intel_iommu_map(struct iommu_domain *domain,
5571 unsigned long iova, phys_addr_t hpa,
5572 size_t size, int iommu_prot, gfp_t gfp)
5573 {
5574 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5575 u64 max_addr;
5576 int prot = 0;
5577 int ret;
5579 if (iommu_prot & IOMMU_READ)
5580 prot |= DMA_PTE_READ;
5581 if (iommu_prot & IOMMU_WRITE)
5582 prot |= DMA_PTE_WRITE;
5583 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5584 prot |= DMA_PTE_SNP;
5586 max_addr = iova + size;
5587 if (dmar_domain->max_addr < max_addr) {
5588 u64 end;
5590 /* check if minimum agaw is sufficient for mapped address */
5591 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5592 if (end < max_addr) {
5593 pr_err("%s: iommu width (%d) is not "
5594 "sufficient for the mapped address (%llx)\n",
5595 __func__, dmar_domain->gaw, max_addr);
5596 return -EFAULT;
5597 }
5598 dmar_domain->max_addr = max_addr;
5599 }
5600 /* Round up size to next multiple of PAGE_SIZE, if it and
5601 the low bits of hpa would take us onto the next page */
5602 size = aligned_nrpages(hpa, size);
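/*
 * For example, hpa = 0x1234 with size = 0x2000 touches bytes up to 0x3233,
 * i.e. three 4KiB pages, so aligned_nrpages() returns 3 here even though
 * the caller only asked for two pages worth of bytes.
 */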
5603 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5604 hpa >> VTD_PAGE_SHIFT, size, prot);
5606 return ret;
5607 }
5608 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5609 unsigned long iova, size_t size,
5610 struct iommu_iotlb_gather *gather)
5612 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5613 unsigned long start_pfn, last_pfn;
5614 int level = 0;
5616 /* Cope with horrid API which requires us to unmap more than the
5617 size argument if it happens to be a large-page mapping. */
5618 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5620 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5621 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
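/*
 * E.g. a 4KiB unmap request that happens to land in a 2MiB superpage
 * (level 2) is widened here: level_to_offset_bits(2) is 9, so size becomes
 * VTD_PAGE_SIZE << 9 and the whole 2MiB mapping is torn down.
 */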
5623 start_pfn = iova >> VTD_PAGE_SHIFT;
5624 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5626 gather->freelist = domain_unmap(dmar_domain, start_pfn,
5627 last_pfn, gather->freelist);
5629 if (dmar_domain->max_addr == iova + size)
5630 dmar_domain->max_addr = iova;
5632 iommu_iotlb_gather_add_page(domain, gather, iova, size);
5634 return size;
5635 }
5637 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5638 struct iommu_iotlb_gather *gather)
5640 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5641 unsigned long iova_pfn = IOVA_PFN(gather->start);
5642 size_t size = gather->end - gather->start;
5643 unsigned long start_pfn, last_pfn;
5644 unsigned long nrpages;
5647 nrpages = aligned_nrpages(gather->start, size);
5648 start_pfn = mm_to_dma_pfn(iova_pfn);
5649 last_pfn = start_pfn + nrpages - 1;
5651 for_each_domain_iommu(iommu_id, dmar_domain)
5652 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5653 start_pfn, nrpages, !gather->freelist, 0);
5655 dma_free_pagelist(gather->freelist);
5658 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5659 dma_addr_t iova)
5660 {
5661 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5662 struct dma_pte *pte;
5663 int level = 0;
5664 u64 phys = 0;
5666 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5667 if (pte && dma_pte_present(pte))
5668 phys = dma_pte_addr(pte) +
5669 (iova & (BIT_MASK(level_to_offset_bits(level) +
5670 VTD_PAGE_SHIFT) - 1));
5672 return phys;
5673 }
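/*
 * Worked example for the offset math above: a hit on a 2MiB superpage
 * returns level 2, so level_to_offset_bits(2) + VTD_PAGE_SHIFT = 21 and the
 * low 21 bits of the IOVA are added to the page frame address from the PTE.
 */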
5675 static inline bool scalable_mode_support(void)
5677 struct dmar_drhd_unit *drhd;
5678 struct intel_iommu *iommu;
5682 for_each_active_iommu(iommu, drhd) {
5683 if (!sm_supported(iommu)) {
5693 static inline bool iommu_pasid_support(void)
5695 struct dmar_drhd_unit *drhd;
5696 struct intel_iommu *iommu;
5700 for_each_active_iommu(iommu, drhd) {
5701 if (!pasid_supported(iommu)) {
5711 static inline bool nested_mode_support(void)
5713 struct dmar_drhd_unit *drhd;
5714 struct intel_iommu *iommu;
5718 for_each_active_iommu(iommu, drhd) {
5719 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5729 static bool intel_iommu_capable(enum iommu_cap cap)
5730 {
5731 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5732 return domain_update_iommu_snooping(NULL) == 1;
5733 if (cap == IOMMU_CAP_INTR_REMAP)
5734 return irq_remapping_enabled == 1;
5736 return false;
5737 }
5739 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5741 struct intel_iommu *iommu;
5743 iommu = device_to_iommu(dev, NULL, NULL);
5744 if (!iommu)
5745 return ERR_PTR(-ENODEV);
5747 if (translation_pre_enabled(iommu))
5748 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5750 return &iommu->iommu;
5753 static void intel_iommu_release_device(struct device *dev)
5755 struct intel_iommu *iommu;
5757 iommu = device_to_iommu(dev, NULL, NULL);
5758 if (!iommu)
5759 return;
5761 dmar_remove_one_dev_info(dev);
5763 set_dma_ops(dev, NULL);
5766 static void intel_iommu_probe_finalize(struct device *dev)
5768 struct iommu_domain *domain;
5770 domain = iommu_get_domain_for_dev(dev);
5771 if (device_needs_bounce(dev))
5772 set_dma_ops(dev, &bounce_dma_ops);
5773 else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5774 set_dma_ops(dev, &intel_dma_ops);
5775 else
5776 set_dma_ops(dev, NULL);
5777 }
5779 static void intel_iommu_get_resv_regions(struct device *device,
5780 struct list_head *head)
5782 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5783 struct iommu_resv_region *reg;
5784 struct dmar_rmrr_unit *rmrr;
5785 struct device *i_dev;
5788 down_read(&dmar_global_lock);
5789 for_each_rmrr_units(rmrr) {
5790 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5792 struct iommu_resv_region *resv;
5793 enum iommu_resv_type type;
5796 if (i_dev != device &&
5797 !is_downstream_to_pci_bridge(device, i_dev))
5800 length = rmrr->end_address - rmrr->base_address + 1;
5802 type = device_rmrr_is_relaxable(device) ?
5803 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5805 resv = iommu_alloc_resv_region(rmrr->base_address,
5806 length, prot, type);
5810 list_add_tail(&resv->list, head);
5813 up_read(&dmar_global_lock);
5815 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5816 if (dev_is_pci(device)) {
5817 struct pci_dev *pdev = to_pci_dev(device);
5819 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5820 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5821 IOMMU_RESV_DIRECT_RELAXABLE);
5822 if (reg)
5823 list_add_tail(&reg->list, head);
5824 }
5825 }
5826 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5828 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5829 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5830 0, IOMMU_RESV_MSI);
5831 if (!reg)
5832 return;
5833 list_add_tail(&reg->list, head);
5834 }
5836 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5838 struct device_domain_info *info;
5839 struct context_entry *context;
5840 struct dmar_domain *domain;
5841 unsigned long flags;
5842 u64 ctx_lo;
5843 int ret;
5845 domain = find_domain(dev);
5846 if (!domain)
5847 return -EINVAL;
5849 spin_lock_irqsave(&device_domain_lock, flags);
5850 spin_lock(&iommu->lock);
5853 info = get_domain_info(dev);
5854 if (!info || !info->pasid_supported)
5857 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5858 if (WARN_ON(!context))
5861 ctx_lo = context[0].lo;
5863 if (!(ctx_lo & CONTEXT_PASIDE)) {
5864 ctx_lo |= CONTEXT_PASIDE;
5865 context[0].lo = ctx_lo;
5867 iommu->flush.flush_context(iommu,
5868 domain->iommu_did[iommu->seq_id],
5869 PCI_DEVID(info->bus, info->devfn),
5870 DMA_CCMD_MASK_NOBIT,
5871 DMA_CCMD_DEVICE_INVL);
5874 /* Enable PASID support in the device, if it wasn't already */
5875 if (!info->pasid_enabled)
5876 iommu_enable_dev_iotlb(info);
5881 spin_unlock(&iommu->lock);
5882 spin_unlock_irqrestore(&device_domain_lock, flags);
5887 static void intel_iommu_apply_resv_region(struct device *dev,
5888 struct iommu_domain *domain,
5889 struct iommu_resv_region *region)
5891 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5892 unsigned long start, end;
5894 start = IOVA_PFN(region->start);
5895 end = IOVA_PFN(region->start + region->length - 1);
5897 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5900 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5902 if (dev_is_pci(dev))
5903 return pci_device_group(dev);
5904 return generic_device_group(dev);
5907 static int intel_iommu_enable_auxd(struct device *dev)
5909 struct device_domain_info *info;
5910 struct intel_iommu *iommu;
5911 unsigned long flags;
5914 iommu = device_to_iommu(dev, NULL, NULL);
5915 if (!iommu || dmar_disabled)
5918 if (!sm_supported(iommu) || !pasid_supported(iommu))
5921 ret = intel_iommu_enable_pasid(iommu, dev);
5925 spin_lock_irqsave(&device_domain_lock, flags);
5926 info = get_domain_info(dev);
5927 info->auxd_enabled = 1;
5928 spin_unlock_irqrestore(&device_domain_lock, flags);
5933 static int intel_iommu_disable_auxd(struct device *dev)
5935 struct device_domain_info *info;
5936 unsigned long flags;
5938 spin_lock_irqsave(&device_domain_lock, flags);
5939 info = get_domain_info(dev);
5940 if (!WARN_ON(!info))
5941 info->auxd_enabled = 0;
5942 spin_unlock_irqrestore(&device_domain_lock, flags);
5947 /*
5948 * A PCI Express Designated Vendor-Specific Extended Capability is defined
5949 * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
5950 * for system software and tools to detect endpoint devices supporting the
5951 * Intel Scalable I/O Virtualization without host driver dependency.
5952 *
5953 * Returns the address of the matching extended capability structure within
5954 * the device's PCI configuration space or 0 if the device does not support
5955 * it.
5956 */
5957 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5958 {
5959 int pos;
5960 u16 vendor, id;
5962 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5963 while (pos) {
5964 pci_read_config_word(pdev, pos + 4, &vendor);
5965 pci_read_config_word(pdev, pos + 8, &id);
5966 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5967 return pos;
5969 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5970 }
5972 return 0;
5973 }
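/*
 * The walk above relies on the standard DVSEC layout: extended capability
 * ID 0x23 is the PCIe Designated Vendor-Specific Extended Capability, with
 * the DVSEC vendor ID at offset 0x4 and the DVSEC ID at offset 0x8. Intel
 * advertises Scalable IOV with DVSEC ID 5, so (illustratively) a caller
 * only needs to test the return value:
 *
 *	if (siov_find_pci_dvsec(pdev))
 *		;	// pdev exposes the Intel Scalable IOV DVSEC
 */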
5976 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5978 if (feat == IOMMU_DEV_FEAT_AUX) {
5981 if (!dev_is_pci(dev) || dmar_disabled ||
5982 !scalable_mode_support() || !iommu_pasid_support())
5985 ret = pci_pasid_features(to_pci_dev(dev));
5989 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5992 if (feat == IOMMU_DEV_FEAT_SVA) {
5993 struct device_domain_info *info = get_domain_info(dev);
5995 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5996 info->pasid_supported && info->pri_supported &&
5997 info->ats_supported;
6004 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6006 if (feat == IOMMU_DEV_FEAT_AUX)
6007 return intel_iommu_enable_auxd(dev);
6009 if (feat == IOMMU_DEV_FEAT_SVA) {
6010 struct device_domain_info *info = get_domain_info(dev);
6015 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
6023 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6025 if (feat == IOMMU_DEV_FEAT_AUX)
6026 return intel_iommu_disable_auxd(dev);
6032 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6034 struct device_domain_info *info = get_domain_info(dev);
6036 if (feat == IOMMU_DEV_FEAT_AUX)
6037 return scalable_mode_support() && info && info->auxd_enabled;
6043 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6045 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6047 return dmar_domain->default_pasid > 0 ?
6048 dmar_domain->default_pasid : -EINVAL;
6051 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6054 return attach_deferred(dev);
6058 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6059 enum iommu_attr attr, void *data)
6061 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6062 unsigned long flags;
6065 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6069 case DOMAIN_ATTR_NESTING:
6070 spin_lock_irqsave(&device_domain_lock, flags);
6071 if (nested_mode_support() &&
6072 list_empty(&dmar_domain->devices)) {
6073 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6074 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6078 spin_unlock_irqrestore(&device_domain_lock, flags);
6088 /*
6089 * Check that the device does not live on an external facing PCI port that is
6090 * marked as untrusted. Such devices should not be able to apply quirks and
6091 * thus not be able to bypass the IOMMU restrictions.
6092 */
6093 static bool risky_device(struct pci_dev *pdev)
6094 {
6095 if (pdev->untrusted) {
6096 pci_info(pdev,
6097 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6098 pdev->vendor, pdev->device);
6099 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6100 return true;
6101 }
6103 return false;
6104 }
6105 const struct iommu_ops intel_iommu_ops = {
6106 .capable = intel_iommu_capable,
6107 .domain_alloc = intel_iommu_domain_alloc,
6108 .domain_free = intel_iommu_domain_free,
6109 .domain_set_attr = intel_iommu_domain_set_attr,
6110 .attach_dev = intel_iommu_attach_device,
6111 .detach_dev = intel_iommu_detach_device,
6112 .aux_attach_dev = intel_iommu_aux_attach_device,
6113 .aux_detach_dev = intel_iommu_aux_detach_device,
6114 .aux_get_pasid = intel_iommu_aux_get_pasid,
6115 .map = intel_iommu_map,
6116 .unmap = intel_iommu_unmap,
6117 .iotlb_sync = intel_iommu_tlb_sync,
6118 .iova_to_phys = intel_iommu_iova_to_phys,
6119 .probe_device = intel_iommu_probe_device,
6120 .probe_finalize = intel_iommu_probe_finalize,
6121 .release_device = intel_iommu_release_device,
6122 .get_resv_regions = intel_iommu_get_resv_regions,
6123 .put_resv_regions = generic_iommu_put_resv_regions,
6124 .apply_resv_region = intel_iommu_apply_resv_region,
6125 .device_group = intel_iommu_device_group,
6126 .dev_has_feat = intel_iommu_dev_has_feat,
6127 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6128 .dev_enable_feat = intel_iommu_dev_enable_feat,
6129 .dev_disable_feat = intel_iommu_dev_disable_feat,
6130 .is_attach_deferred = intel_iommu_is_attach_deferred,
6131 .def_domain_type = device_def_domain_type,
6132 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6133 #ifdef CONFIG_INTEL_IOMMU_SVM
6134 .cache_invalidate = intel_iommu_sva_invalidate,
6135 .sva_bind_gpasid = intel_svm_bind_gpasid,
6136 .sva_unbind_gpasid = intel_svm_unbind_gpasid,
6137 .sva_bind = intel_svm_bind,
6138 .sva_unbind = intel_svm_unbind,
6139 .sva_get_pasid = intel_svm_get_pasid,
6140 .page_response = intel_svm_page_response,
6141 #endif
6142 };
6144 static void quirk_iommu_igfx(struct pci_dev *dev)
6145 {
6146 if (risky_device(dev))
6147 return;
6149 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6150 dmar_map_gfx = 0;
6151 }
6153 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6154 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6155 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6156 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6157 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6158 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6159 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6160 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6162 /* Broadwell igfx malfunctions with dmar */
6163 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6164 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6165 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6166 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6167 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6168 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6169 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6170 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6171 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6172 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6173 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6174 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6175 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6176 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6177 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6178 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6179 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6180 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6181 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6182 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6183 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6184 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6185 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6186 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6188 static void quirk_iommu_rwbf(struct pci_dev *dev)
6189 {
6190 if (risky_device(dev))
6191 return;
6193 /*
6194 * Mobile 4 Series Chipset neglects to set RWBF capability,
6195 * but needs it. Same seems to hold for the desktop versions.
6196 */
6197 pci_info(dev, "Forcing write-buffer flush capability\n");
6198 rwbf_quirk = 1;
6199 }
6201 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6202 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6203 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6204 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6205 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6206 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6207 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6210 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6211 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6212 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6213 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6214 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6215 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6216 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6217 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
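/*
 * Example decode (value is illustrative): a GGC read of 0x0b00 matches
 * GGC_MEMORY_SIZE_4M_VT, which includes the GGC_MEMORY_VT_ENABLED bit, so
 * the quirk below leaves dmar_map_gfx alone; a read of 0x0100
 * (GGC_MEMORY_SIZE_1M, VT bit clear) would disable IOMMU for graphics.
 */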
6219 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6220 {
6221 unsigned short ggc;
6223 if (risky_device(dev))
6224 return;
6226 if (pci_read_config_word(dev, GGC, &ggc))
6227 return;
6229 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6230 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6231 dmar_map_gfx = 0;
6232 } else if (dmar_map_gfx) {
6233 /* we have to ensure the gfx device is idle before we flush */
6234 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6235 intel_iommu_strict = 1;
6236 }
6237 }
6238 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6240 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6241 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6243 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6244 {
6245 u8 ver;
6247 if (!IS_GFX_DEVICE(dev))
6248 return;
6250 ver = (dev->device >> 8) & 0xff;
6251 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6252 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6253 ver != 0x9a)
6254 return;
6256 if (risky_device(dev))
6257 return;
6259 pci_info(dev, "Skip IOMMU disabling for graphics\n");
6260 iommu_skip_te_disable = 1;
6261 }
6262 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6264 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6265 ISOCH DMAR unit for the Azalia sound device, but not give it any
6266 TLB entries, which causes it to deadlock. Check for that. We do
6267 this in a function called from init_dmars(), instead of in a PCI
6268 quirk, because we don't want to print the obnoxious "BIOS broken"
6269 message if VT-d is actually disabled.
6270 */
6271 static void __init check_tylersburg_isoch(void)
6273 struct pci_dev *pdev;
6274 uint32_t vtisochctrl;
6276 /* If there's no Azalia in the system anyway, forget it. */
6277 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6281 if (risky_device(pdev)) {
6288 /* System Management Registers. Might be hidden, in which case
6289 we can't do the sanity check. But that's OK, because the
6290 known-broken BIOSes _don't_ actually hide it, so far. */
6291 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6295 if (risky_device(pdev)) {
6300 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6307 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6308 if (vtisochctrl & 1)
6311 /* Drop all bits other than the number of TLB entries */
6312 vtisochctrl &= 0x1c;
6314 /* If we have the recommended number of TLB entries (16), fine. */
6315 if (vtisochctrl == 0x10)
6318 /* Zero TLB entries? You get to ride the short bus to school. */
6319 if (!vtisochctrl) {
6320 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6321 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6322 dmi_get_system_info(DMI_BIOS_VENDOR),
6323 dmi_get_system_info(DMI_BIOS_VERSION),
6324 dmi_get_system_info(DMI_PRODUCT_VERSION));
6325 iommu_identity_mapping |= IDENTMAP_AZALIA;
6326 return;
6327 }
6329 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6330 vtisochctrl);
6331 }