1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "../irq_remapping.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
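/*
 * Worked example (illustrative): with a 48-bit guest address width,
 * __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1, the highest addressable 4KiB
 * page frame, and DOMAIN_MAX_ADDR(48) is that value shifted left by
 * VTD_PAGE_SHIFT. The min_t() clamp above only matters on 32-bit
 * kernels, where the PFN must still fit in an unsigned long.
 */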
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes that we support.
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is a power-of-two multiple of 4KiB and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are a power-of-two multiple of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
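/*
 * Note: ~0xFFFUL sets every bit above bit 11, so all power-of-two sizes
 * of 4KiB and larger (4KiB, 8KiB, ..., 2MiB, 1GiB, ...) are advertised
 * to the IOMMU core as supported page sizes.
 */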
106 static inline int agaw_to_level(int agaw)
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(u64 pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline u64 level_mask(int level)
133 return -1ULL << level_to_offset_bits(level);
136 static inline u64 level_size(int level)
138 return 1ULL << level_to_offset_bits(level);
141 static inline u64 align_to_level(u64 pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
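/*
 * Worked example of the conversions above: a 48-bit address width maps
 * to agaw 2, since width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2 and
 * agaw_to_width(2) = 30 + 2 * 9 = 48 (a 4-level page table). Level 1
 * PTEs cover 4KiB pages; level_size(2) = 512 pages, i.e. a level 2
 * entry spans 2MiB.
 */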
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
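/*
 * Note on the PFN conversions above: when the kernel page size is 4KiB,
 * PAGE_SHIFT equals VTD_PAGE_SHIFT and dma_to_mm_pfn()/mm_to_dma_pfn()
 * are identity operations. If MM pages were larger (say 64KiB), each MM
 * PFN would expand to several VT-d PFNs, which is why VT-d pages must
 * never be larger than MM pages.
 */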
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
179 * (used when the kernel is launched with TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
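/*
 * Summary of the legacy context entry layout manipulated by the helpers
 * above: the low 64 bits hold the present bit (bit 0), the fault
 * processing disable bit cleared by context_set_fault_enable() (bit 1),
 * the translation type (bits 3:2) and the page table root (bits 63:12);
 * the high 64 bits hold the address width (bits 2:0) and the domain id
 * (bits 23:8). Bit 11 of the low word tracks PASID enable and bit 3 of
 * the high word marks an entry copied from a previous kernel (kdump).
 */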
291 * This domain is a static identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 #define for_each_domain_iommu(idx, domain) \
300 for (idx = 0; idx < g_num_of_iommus; idx++) \
301 if (domain->iommu_refcnt[idx])
303 struct dmar_rmrr_unit {
304 struct list_head list; /* list of rmrr units */
305 struct acpi_dmar_header *hdr; /* ACPI header */
306 u64 base_address; /* reserved base address*/
307 u64 end_address; /* reserved end address */
308 struct dmar_dev_scope *devices; /* target devices */
309 int devices_cnt; /* target device count */
312 struct dmar_atsr_unit {
313 struct list_head list; /* list of ATSR units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 struct dmar_dev_scope *devices; /* target devices */
316 int devices_cnt; /* target device count */
317 u8 include_all:1; /* include all ports */
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
323 #define for_each_rmrr_units(rmrr) \
324 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
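/*
 * Illustrative use of the iterator above:
 *
 *	struct dmar_rmrr_unit *rmrr;
 *
 *	for_each_rmrr_units(rmrr)
 *		handle_range(rmrr->base_address, rmrr->end_address);
 *
 * (handle_range() is a placeholder, not a real helper.)
 */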
326 /* number of registered IOMMUs; bounds indexing into g_iommus */
327 static int g_num_of_iommus;
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
361 #define IDENTMAP_GFX 2
362 #define IDENTMAP_AZALIA 4
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
370 struct device_domain_info *info;
375 info = dev_iommu_priv_get(dev);
376 if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
382 DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
386 to_pci_dev(d)->untrusted)
389 * Iterate over elements in device_domain_list and call the specified
390 * callback @fn against each element.
392 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393 void *data), void *data)
397 struct device_domain_info *info;
399 spin_lock_irqsave(&device_domain_lock, flags);
400 list_for_each_entry(info, &device_domain_list, global) {
401 ret = fn(info, data);
403 spin_unlock_irqrestore(&device_domain_lock, flags);
407 spin_unlock_irqrestore(&device_domain_lock, flags);
412 const struct iommu_ops intel_iommu_ops;
414 static bool translation_pre_enabled(struct intel_iommu *iommu)
416 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
419 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
421 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
424 static void init_translation_status(struct intel_iommu *iommu)
428 gsts = readl(iommu->reg + DMAR_GSTS_REG);
429 if (gsts & DMA_GSTS_TES)
430 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
433 static int __init intel_iommu_setup(char *str)
438 if (!strncmp(str, "on", 2)) {
440 pr_info("IOMMU enabled\n");
441 } else if (!strncmp(str, "off", 3)) {
443 no_platform_optin = 1;
444 pr_info("IOMMU disabled\n");
445 } else if (!strncmp(str, "igfx_off", 8)) {
447 pr_info("Disable GFX device mapping\n");
448 } else if (!strncmp(str, "forcedac", 8)) {
449 pr_info("Forcing DAC for PCI devices\n");
451 } else if (!strncmp(str, "strict", 6)) {
452 pr_info("Disable batched IOTLB flush\n");
453 intel_iommu_strict = 1;
454 } else if (!strncmp(str, "sp_off", 6)) {
455 pr_info("Disable supported super page\n");
456 intel_iommu_superpage = 0;
457 } else if (!strncmp(str, "sm_on", 5)) {
458 pr_info("Intel-IOMMU: scalable mode supported\n");
460 } else if (!strncmp(str, "tboot_noforce", 13)) {
461 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462 intel_iommu_tboot_noforce = 1;
463 } else if (!strncmp(str, "nobounce", 8)) {
464 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
468 str += strcspn(str, ",");
474 __setup("intel_iommu=", intel_iommu_setup);
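/*
 * Illustrative command line usage of the options parsed above, e.g.:
 *
 *	intel_iommu=on,sm_on,strict
 *	intel_iommu=off
 *	intel_iommu=on,igfx_off,sp_off
 *
 * Multiple options may be combined, separated by commas.
 */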
476 static struct kmem_cache *iommu_domain_cache;
477 static struct kmem_cache *iommu_devinfo_cache;
479 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
481 struct dmar_domain **domains;
484 domains = iommu->domains[idx];
488 return domains[did & 0xff];
491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492 struct dmar_domain *domain)
494 struct dmar_domain **domains;
497 if (!iommu->domains[idx]) {
498 size_t size = 256 * sizeof(struct dmar_domain *);
499 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
502 domains = iommu->domains[idx];
503 if (WARN_ON(!domains))
506 domains[did & 0xff] = domain;
509 void *alloc_pgtable_page(int node)
514 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
516 vaddr = page_address(page);
520 void free_pgtable_page(void *vaddr)
522 free_page((unsigned long)vaddr);
525 static inline void *alloc_domain_mem(void)
527 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
530 static void free_domain_mem(void *vaddr)
532 kmem_cache_free(iommu_domain_cache, vaddr);
535 static inline void * alloc_devinfo_mem(void)
537 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
540 static inline void free_devinfo_mem(void *vaddr)
542 kmem_cache_free(iommu_devinfo_cache, vaddr);
545 static inline int domain_type_is_si(struct dmar_domain *domain)
547 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
550 static inline bool domain_use_first_level(struct dmar_domain *domain)
552 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
558 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
560 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
568 sagaw = cap_sagaw(iommu->cap);
569 for (agaw = width_to_agaw(max_gaw);
571 if (test_bit(agaw, &sagaw))
579 * Calculate max SAGAW for each iommu.
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
583 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
587 * calculate agaw for each iommu.
588 * "SAGAW" may be different across iommus; use a default agaw, and
589 * fall back to a smaller supported agaw for iommus that don't support the default.
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
593 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
596 /* This function only returns a single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
601 /* si_domain and vm domain should not get here. */
602 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
605 for_each_domain_iommu(iommu_id, domain)
608 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
611 return g_iommus[iommu_id];
614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
616 return sm_supported(iommu) ?
617 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
622 struct dmar_drhd_unit *drhd;
623 struct intel_iommu *iommu;
627 domain->iommu_coherency = 1;
629 for_each_domain_iommu(i, domain) {
631 if (!iommu_paging_structure_coherency(g_iommus[i])) {
632 domain->iommu_coherency = 0;
639 /* No hardware attached; use lowest common denominator */
641 for_each_active_iommu(iommu, drhd) {
642 if (!iommu_paging_structure_coherency(iommu)) {
643 domain->iommu_coherency = 0;
650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
652 struct dmar_drhd_unit *drhd;
653 struct intel_iommu *iommu;
657 for_each_active_iommu(iommu, drhd) {
659 if (!ecap_sc_support(iommu->ecap)) {
670 static int domain_update_iommu_superpage(struct dmar_domain *domain,
671 struct intel_iommu *skip)
673 struct dmar_drhd_unit *drhd;
674 struct intel_iommu *iommu;
677 if (!intel_iommu_superpage) {
681 /* set iommu_superpage to the smallest common denominator */
683 for_each_active_iommu(iommu, drhd) {
685 if (domain && domain_use_first_level(domain)) {
686 if (!cap_fl1gp_support(iommu->cap))
689 mask &= cap_super_page_val(iommu->cap);
701 /* Some capabilities may be different across iommus */
702 static void domain_update_iommu_cap(struct dmar_domain *domain)
704 domain_update_iommu_coherency(domain);
705 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
706 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
709 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
712 struct root_entry *root = &iommu->root_entry[bus];
713 struct context_entry *context;
717 if (sm_supported(iommu)) {
725 context = phys_to_virt(*entry & VTD_PAGE_MASK);
727 unsigned long phy_addr;
731 context = alloc_pgtable_page(iommu->node);
735 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
736 phy_addr = virt_to_phys((void *)context);
737 *entry = phy_addr | 1;
738 __iommu_flush_cache(iommu, entry, sizeof(*entry));
740 return &context[devfn];
743 static bool attach_deferred(struct device *dev)
745 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
749 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
750 * sub-hierarchy of a candidate PCI-PCI bridge
751 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
752 * @bridge: the candidate PCI-PCI bridge
754 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
757 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
759 struct pci_dev *pdev, *pbridge;
761 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
764 pdev = to_pci_dev(dev);
765 pbridge = to_pci_dev(bridge);
767 if (pbridge->subordinate &&
768 pbridge->subordinate->number <= pdev->bus->number &&
769 pbridge->subordinate->busn_res.end >= pdev->bus->number)
775 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
777 struct dmar_drhd_unit *drhd;
781 /* We know that this device on this chipset has its own IOMMU.
782 * If we find it under a different IOMMU, then the BIOS is lying
783 * to us. Hope that the IOMMU for this device is actually
784 * disabled, and it needs no translation...
786 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
789 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
794 /* we know that this iommu should be at offset 0xa000 from vtbar */
795 drhd = dmar_find_matched_drhd_unit(pdev);
796 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
797 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
798 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
805 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
807 if (!iommu || iommu->drhd->ignored)
810 if (dev_is_pci(dev)) {
811 struct pci_dev *pdev = to_pci_dev(dev);
813 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
814 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
815 quirk_ioat_snb_local_iommu(pdev))
822 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
824 struct dmar_drhd_unit *drhd = NULL;
825 struct pci_dev *pdev = NULL;
826 struct intel_iommu *iommu;
834 if (dev_is_pci(dev)) {
835 struct pci_dev *pf_pdev;
837 pdev = pci_real_dma_dev(to_pci_dev(dev));
839 /* VFs aren't listed in scope tables; we need to look up
840 * the PF instead to find the IOMMU. */
841 pf_pdev = pci_physfn(pdev);
843 segment = pci_domain_nr(pdev->bus);
844 } else if (has_acpi_companion(dev))
845 dev = &ACPI_COMPANION(dev)->dev;
848 for_each_iommu(iommu, drhd) {
849 if (pdev && segment != drhd->segment)
852 for_each_active_dev_scope(drhd->devices,
853 drhd->devices_cnt, i, tmp) {
855 /* For a VF use its original BDF# not that of the PF
856 * which we used for the IOMMU lookup. Strictly speaking
857 * we could do this for all PCI devices; we only need to
858 * get the BDF# from the scope table for ACPI matches. */
859 if (pdev && pdev->is_virtfn)
863 *bus = drhd->devices[i].bus;
864 *devfn = drhd->devices[i].devfn;
869 if (is_downstream_to_pci_bridge(dev, tmp))
873 if (pdev && drhd->include_all) {
876 *bus = pdev->bus->number;
877 *devfn = pdev->devfn;
884 if (iommu_is_dummy(iommu, dev))
892 static void domain_flush_cache(struct dmar_domain *domain,
893 void *addr, int size)
895 if (!domain->iommu_coherency)
896 clflush_cache_range(addr, size);
899 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
901 struct context_entry *context;
905 spin_lock_irqsave(&iommu->lock, flags);
906 context = iommu_context_addr(iommu, bus, devfn, 0);
908 ret = context_present(context);
909 spin_unlock_irqrestore(&iommu->lock, flags);
913 static void free_context_table(struct intel_iommu *iommu)
917 struct context_entry *context;
919 spin_lock_irqsave(&iommu->lock, flags);
920 if (!iommu->root_entry) {
923 for (i = 0; i < ROOT_ENTRY_NR; i++) {
924 context = iommu_context_addr(iommu, i, 0, 0);
926 free_pgtable_page(context);
928 if (!sm_supported(iommu))
931 context = iommu_context_addr(iommu, i, 0x80, 0);
933 free_pgtable_page(context);
936 free_pgtable_page(iommu->root_entry);
937 iommu->root_entry = NULL;
939 spin_unlock_irqrestore(&iommu->lock, flags);
942 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
943 unsigned long pfn, int *target_level)
945 struct dma_pte *parent, *pte;
946 int level = agaw_to_level(domain->agaw);
949 BUG_ON(!domain->pgd);
951 if (!domain_pfn_supported(domain, pfn))
952 /* Address beyond IOMMU's addressing capabilities. */
955 parent = domain->pgd;
960 offset = pfn_level_offset(pfn, level);
961 pte = &parent[offset];
962 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
964 if (level == *target_level)
967 if (!dma_pte_present(pte)) {
970 tmp_page = alloc_pgtable_page(domain->nid);
975 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
976 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
977 if (domain_use_first_level(domain))
978 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
979 if (cmpxchg64(&pte->val, 0ULL, pteval))
980 /* Someone else set it while we were thinking; use theirs. */
981 free_pgtable_page(tmp_page);
983 domain_flush_cache(domain, pte, sizeof(*pte));
988 parent = phys_to_virt(dma_pte_addr(pte));
993 *target_level = level;
998 /* return the address's pte at a specific level */
999 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1001 int level, int *large_page)
1003 struct dma_pte *parent, *pte;
1004 int total = agaw_to_level(domain->agaw);
1007 parent = domain->pgd;
1008 while (level <= total) {
1009 offset = pfn_level_offset(pfn, total);
1010 pte = &parent[offset];
1014 if (!dma_pte_present(pte)) {
1015 *large_page = total;
1019 if (dma_pte_superpage(pte)) {
1020 *large_page = total;
1024 parent = phys_to_virt(dma_pte_addr(pte));
1030 /* clear last level pte; a tlb flush should follow */
1031 static void dma_pte_clear_range(struct dmar_domain *domain,
1032 unsigned long start_pfn,
1033 unsigned long last_pfn)
1035 unsigned int large_page;
1036 struct dma_pte *first_pte, *pte;
1038 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1039 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1040 BUG_ON(start_pfn > last_pfn);
1042 /* we don't need a lock here; nobody else touches the iova range */
1045 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1047 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1052 start_pfn += lvl_to_nr_pages(large_page);
1054 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1056 domain_flush_cache(domain, first_pte,
1057 (void *)pte - (void *)first_pte);
1059 } while (start_pfn && start_pfn <= last_pfn);
1062 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1063 int retain_level, struct dma_pte *pte,
1064 unsigned long pfn, unsigned long start_pfn,
1065 unsigned long last_pfn)
1067 pfn = max(start_pfn, pfn);
1068 pte = &pte[pfn_level_offset(pfn, level)];
1071 unsigned long level_pfn;
1072 struct dma_pte *level_pte;
1074 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1077 level_pfn = pfn & level_mask(level);
1078 level_pte = phys_to_virt(dma_pte_addr(pte));
1081 dma_pte_free_level(domain, level - 1, retain_level,
1082 level_pte, level_pfn, start_pfn,
1087 * Free the page table if we're below the level we want to
1088 * retain and the range covers the entire table.
1090 if (level < retain_level && !(start_pfn > level_pfn ||
1091 last_pfn < level_pfn + level_size(level) - 1)) {
1093 domain_flush_cache(domain, pte, sizeof(*pte));
1094 free_pgtable_page(level_pte);
1097 pfn += level_size(level);
1098 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1102 * clear last level (leaf) ptes and free page table pages below the
1103 * level we wish to keep intact.
1105 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1106 unsigned long start_pfn,
1107 unsigned long last_pfn,
1110 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1111 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1112 BUG_ON(start_pfn > last_pfn);
1114 dma_pte_clear_range(domain, start_pfn, last_pfn);
1116 /* We don't need a lock here; nobody else touches the iova range */
1117 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1118 domain->pgd, 0, start_pfn, last_pfn);
1121 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1122 free_pgtable_page(domain->pgd);
1127 /* When a page at a given level is being unlinked from its parent, we don't
1128 need to *modify* it at all. All we need to do is make a list of all the
1129 pages which can be freed just as soon as we've flushed the IOTLB and we
1130 know the hardware page-walk will no longer touch them.
1131 The 'pte' argument is the *parent* PTE, pointing to the page that is to be freed. */
1133 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1134 int level, struct dma_pte *pte,
1135 struct page *freelist)
1139 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1140 pg->freelist = freelist;
1146 pte = page_address(pg);
1148 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1149 freelist = dma_pte_list_pagetables(domain, level - 1,
1152 } while (!first_pte_in_page(pte));
1157 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1158 struct dma_pte *pte, unsigned long pfn,
1159 unsigned long start_pfn,
1160 unsigned long last_pfn,
1161 struct page *freelist)
1163 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1165 pfn = max(start_pfn, pfn);
1166 pte = &pte[pfn_level_offset(pfn, level)];
1169 unsigned long level_pfn;
1171 if (!dma_pte_present(pte))
1174 level_pfn = pfn & level_mask(level);
1176 /* If range covers entire pagetable, free it */
1177 if (start_pfn <= level_pfn &&
1178 last_pfn >= level_pfn + level_size(level) - 1) {
1179 /* These subordinate page tables are going away entirely. Don't
1180 bother to clear them; we're just going to *free* them. */
1181 if (level > 1 && !dma_pte_superpage(pte))
1182 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1188 } else if (level > 1) {
1189 /* Recurse down into a level that isn't *entirely* obsolete */
1190 freelist = dma_pte_clear_level(domain, level - 1,
1191 phys_to_virt(dma_pte_addr(pte)),
1192 level_pfn, start_pfn, last_pfn,
1196 pfn += level_size(level);
1197 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1200 domain_flush_cache(domain, first_pte,
1201 (void *)++last_pte - (void *)first_pte);
1206 /* We can't just free the pages because the IOMMU may still be walking
1207 the page tables, and may have cached the intermediate levels. The
1208 pages can only be freed after the IOTLB flush has been done. */
1209 static struct page *domain_unmap(struct dmar_domain *domain,
1210 unsigned long start_pfn,
1211 unsigned long last_pfn)
1213 struct page *freelist;
1215 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1216 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1217 BUG_ON(start_pfn > last_pfn);
1219 /* we don't need a lock here; nobody else touches the iova range */
1220 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1221 domain->pgd, 0, start_pfn, last_pfn, NULL);
1224 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1225 struct page *pgd_page = virt_to_page(domain->pgd);
1226 pgd_page->freelist = freelist;
1227 freelist = pgd_page;
1235 static void dma_free_pagelist(struct page *freelist)
1239 while ((pg = freelist)) {
1240 freelist = pg->freelist;
1241 free_pgtable_page(page_address(pg));
1245 static void iova_entry_free(unsigned long data)
1247 struct page *freelist = (struct page *)data;
1249 dma_free_pagelist(freelist);
1252 /* iommu handling */
1253 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1255 struct root_entry *root;
1256 unsigned long flags;
1258 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1260 pr_err("Allocating root entry for %s failed\n",
1265 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1267 spin_lock_irqsave(&iommu->lock, flags);
1268 iommu->root_entry = root;
1269 spin_unlock_irqrestore(&iommu->lock, flags);
1274 static void iommu_set_root_entry(struct intel_iommu *iommu)
1280 addr = virt_to_phys(iommu->root_entry);
1281 if (sm_supported(iommu))
1282 addr |= DMA_RTADDR_SMT;
1284 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1287 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1289 /* Make sure hardware completes it */
1290 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1291 readl, (sts & DMA_GSTS_RTPS), sts);
1293 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1296 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1301 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1304 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1305 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1307 /* Make sure hardware completes it */
1308 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1309 readl, (!(val & DMA_GSTS_WBFS)), val);
1311 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1314 /* return value determines if we need a write buffer flush */
1315 static void __iommu_flush_context(struct intel_iommu *iommu,
1316 u16 did, u16 source_id, u8 function_mask,
1323 case DMA_CCMD_GLOBAL_INVL:
1324 val = DMA_CCMD_GLOBAL_INVL;
1326 case DMA_CCMD_DOMAIN_INVL:
1327 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1329 case DMA_CCMD_DEVICE_INVL:
1330 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1331 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1336 val |= DMA_CCMD_ICC;
1338 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1339 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1341 /* Make sure hardware completes it */
1342 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1343 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1345 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1348 /* return value determines if we need a write buffer flush */
1349 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1350 u64 addr, unsigned int size_order, u64 type)
1352 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1353 u64 val = 0, val_iva = 0;
1357 case DMA_TLB_GLOBAL_FLUSH:
1358 /* global flush doesn't need to set IVA_REG */
1359 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1361 case DMA_TLB_DSI_FLUSH:
1362 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1364 case DMA_TLB_PSI_FLUSH:
1365 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1366 /* IH bit is passed in as part of address */
1367 val_iva = size_order | addr;
1372 /* Note: set drain read/write */
1375 * This is probably only needed to be extra safe; it looks like
1376 * we can ignore it without any impact.
1378 if (cap_read_drain(iommu->cap))
1379 val |= DMA_TLB_READ_DRAIN;
1381 if (cap_write_drain(iommu->cap))
1382 val |= DMA_TLB_WRITE_DRAIN;
1384 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1385 /* Note: Only uses first TLB reg currently */
1387 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1388 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1390 /* Make sure hardware completes it */
1391 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1392 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1394 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1396 /* check IOTLB invalidation granularity */
1397 if (DMA_TLB_IAIG(val) == 0)
1398 pr_err("Flush IOTLB failed\n");
1399 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1400 pr_debug("TLB flush request %Lx, actual %Lx\n",
1401 (unsigned long long)DMA_TLB_IIRG(type),
1402 (unsigned long long)DMA_TLB_IAIG(val));
1405 static struct device_domain_info *
1406 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1409 struct device_domain_info *info;
1411 assert_spin_locked(&device_domain_lock);
1416 list_for_each_entry(info, &domain->devices, link)
1417 if (info->iommu == iommu && info->bus == bus &&
1418 info->devfn == devfn) {
1419 if (info->ats_supported && info->dev)
1427 static void domain_update_iotlb(struct dmar_domain *domain)
1429 struct device_domain_info *info;
1430 bool has_iotlb_device = false;
1432 assert_spin_locked(&device_domain_lock);
1434 list_for_each_entry(info, &domain->devices, link) {
1435 struct pci_dev *pdev;
1437 if (!info->dev || !dev_is_pci(info->dev))
1440 pdev = to_pci_dev(info->dev);
1441 if (pdev->ats_enabled) {
1442 has_iotlb_device = true;
1447 domain->has_iotlb_device = has_iotlb_device;
1450 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1452 struct pci_dev *pdev;
1454 assert_spin_locked(&device_domain_lock);
1456 if (!info || !dev_is_pci(info->dev))
1459 pdev = to_pci_dev(info->dev);
1460 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1461 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1462 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1463 * reserved, which should be set to 0.
1465 if (!ecap_dit(info->iommu->ecap))
1468 struct pci_dev *pf_pdev;
1470 /* pdev will be returned if the device is not a VF */
1471 pf_pdev = pci_physfn(pdev);
1472 info->pfsid = pci_dev_id(pf_pdev);
1475 #ifdef CONFIG_INTEL_IOMMU_SVM
1476 /* The PCIe spec, in its wisdom, declares that the behaviour of
1477 the device is undefined if you enable PASID support after ATS
1478 support. So always enable PASID support on devices which have
1479 it, even if we can't yet know if we're ever going to use it. */
1481 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1482 info->pasid_enabled = 1;
1484 if (info->pri_supported &&
1485 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1486 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1487 info->pri_enabled = 1;
1489 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1490 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1491 info->ats_enabled = 1;
1492 domain_update_iotlb(info->domain);
1493 info->ats_qdep = pci_ats_queue_depth(pdev);
1497 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1499 struct pci_dev *pdev;
1501 assert_spin_locked(&device_domain_lock);
1503 if (!dev_is_pci(info->dev))
1506 pdev = to_pci_dev(info->dev);
1508 if (info->ats_enabled) {
1509 pci_disable_ats(pdev);
1510 info->ats_enabled = 0;
1511 domain_update_iotlb(info->domain);
1513 #ifdef CONFIG_INTEL_IOMMU_SVM
1514 if (info->pri_enabled) {
1515 pci_disable_pri(pdev);
1516 info->pri_enabled = 0;
1518 if (info->pasid_enabled) {
1519 pci_disable_pasid(pdev);
1520 info->pasid_enabled = 0;
1525 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1526 u64 addr, unsigned mask)
1529 unsigned long flags;
1530 struct device_domain_info *info;
1532 if (!domain->has_iotlb_device)
1535 spin_lock_irqsave(&device_domain_lock, flags);
1536 list_for_each_entry(info, &domain->devices, link) {
1537 if (!info->ats_enabled)
1540 sid = info->bus << 8 | info->devfn;
1541 qdep = info->ats_qdep;
1542 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1545 spin_unlock_irqrestore(&device_domain_lock, flags);
1548 static void domain_flush_piotlb(struct intel_iommu *iommu,
1549 struct dmar_domain *domain,
1550 u64 addr, unsigned long npages, bool ih)
1552 u16 did = domain->iommu_did[iommu->seq_id];
1554 if (domain->default_pasid)
1555 qi_flush_piotlb(iommu, did, domain->default_pasid,
1558 if (!list_empty(&domain->devices))
1559 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1562 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1563 struct dmar_domain *domain,
1564 unsigned long pfn, unsigned int pages,
1567 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1568 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1569 u16 did = domain->iommu_did[iommu->seq_id];
1576 if (domain_use_first_level(domain)) {
1577 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1580 * Fallback to domain selective flush if no PSI support or
1581 * the size is too big. PSI requires page size to be 2 ^ x,
1582 * and the base address is naturally aligned to the size.
1584 if (!cap_pgsel_inv(iommu->cap) ||
1585 mask > cap_max_amask_val(iommu->cap))
1586 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1589 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1594 * In caching mode, changes of pages from non-present to present require
1595 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1597 if (!cap_caching_mode(iommu->cap) || !map)
1598 iommu_flush_dev_iotlb(domain, addr, mask);
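/*
 * Example of the rounding above: a request covering 5 pages is rounded
 * up to a power of two, so mask = ilog2(8) = 3 and the hardware
 * invalidates a naturally aligned 8-page (32KiB) region. If PSI is not
 * supported, or mask exceeds cap_max_amask_val(), the code falls back
 * to the domain-selective flush instead.
 */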
1601 /* Notification for newly created mappings */
1602 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1603 struct dmar_domain *domain,
1604 unsigned long pfn, unsigned int pages)
1607 * It's a non-present to present mapping. Only flush if caching mode and second-level translation are in use.
1610 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1611 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1613 iommu_flush_write_buffer(iommu);
1616 static void iommu_flush_iova(struct iova_domain *iovad)
1618 struct dmar_domain *domain;
1621 domain = container_of(iovad, struct dmar_domain, iovad);
1623 for_each_domain_iommu(idx, domain) {
1624 struct intel_iommu *iommu = g_iommus[idx];
1625 u16 did = domain->iommu_did[iommu->seq_id];
1627 if (domain_use_first_level(domain))
1628 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1630 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1633 if (!cap_caching_mode(iommu->cap))
1634 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1635 0, MAX_AGAW_PFN_WIDTH);
1639 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1642 unsigned long flags;
1644 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1647 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1648 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1649 pmen &= ~DMA_PMEN_EPM;
1650 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1652 /* wait for the protected region status bit to clear */
1653 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1654 readl, !(pmen & DMA_PMEN_PRS), pmen);
1656 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1659 static void iommu_enable_translation(struct intel_iommu *iommu)
1662 unsigned long flags;
1664 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1665 iommu->gcmd |= DMA_GCMD_TE;
1666 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1668 /* Make sure hardware completes it */
1669 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1670 readl, (sts & DMA_GSTS_TES), sts);
1672 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1675 static void iommu_disable_translation(struct intel_iommu *iommu)
1680 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1681 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1684 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1685 iommu->gcmd &= ~DMA_GCMD_TE;
1686 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1688 /* Make sure hardware completes it */
1689 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1690 readl, (!(sts & DMA_GSTS_TES)), sts);
1692 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1695 static int iommu_init_domains(struct intel_iommu *iommu)
1697 u32 ndomains, nlongs;
1700 ndomains = cap_ndoms(iommu->cap);
1701 pr_debug("%s: Number of Domains supported <%d>\n",
1702 iommu->name, ndomains);
1703 nlongs = BITS_TO_LONGS(ndomains);
1705 spin_lock_init(&iommu->lock);
1707 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1708 if (!iommu->domain_ids) {
1709 pr_err("%s: Allocating domain id array failed\n",
1714 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1715 iommu->domains = kzalloc(size, GFP_KERNEL);
1717 if (iommu->domains) {
1718 size = 256 * sizeof(struct dmar_domain *);
1719 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1722 if (!iommu->domains || !iommu->domains[0]) {
1723 pr_err("%s: Allocating domain array failed\n",
1725 kfree(iommu->domain_ids);
1726 kfree(iommu->domains);
1727 iommu->domain_ids = NULL;
1728 iommu->domains = NULL;
1733 * If Caching mode is set, then invalid translations are tagged
1734 * with domain-id 0, hence we need to pre-allocate it. We also
1735 * use domain-id 0 as a marker for non-allocated domain-id, so
1736 * make sure it is not used for a real domain.
1738 set_bit(0, iommu->domain_ids);
1741 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1742 * entry for first-level or pass-through translation modes should
1743 * be programmed with a domain id different from those used for
1744 * second-level or nested translation. We reserve a domain id for this purpose.
1747 if (sm_supported(iommu))
1748 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1753 static void disable_dmar_iommu(struct intel_iommu *iommu)
1755 struct device_domain_info *info, *tmp;
1756 unsigned long flags;
1758 if (!iommu->domains || !iommu->domain_ids)
1761 spin_lock_irqsave(&device_domain_lock, flags);
1762 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1763 if (info->iommu != iommu)
1766 if (!info->dev || !info->domain)
1769 __dmar_remove_one_dev_info(info);
1771 spin_unlock_irqrestore(&device_domain_lock, flags);
1773 if (iommu->gcmd & DMA_GCMD_TE)
1774 iommu_disable_translation(iommu);
1777 static void free_dmar_iommu(struct intel_iommu *iommu)
1779 if ((iommu->domains) && (iommu->domain_ids)) {
1780 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1783 for (i = 0; i < elems; i++)
1784 kfree(iommu->domains[i]);
1785 kfree(iommu->domains);
1786 kfree(iommu->domain_ids);
1787 iommu->domains = NULL;
1788 iommu->domain_ids = NULL;
1791 g_iommus[iommu->seq_id] = NULL;
1793 /* free context mapping */
1794 free_context_table(iommu);
1796 #ifdef CONFIG_INTEL_IOMMU_SVM
1797 if (pasid_supported(iommu)) {
1798 if (ecap_prs(iommu->ecap))
1799 intel_svm_finish_prq(iommu);
1801 if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1802 ioasid_unregister_allocator(&iommu->pasid_allocator);
1808 * Check and return whether first level is used by default for DMA translation.
1811 static bool first_level_by_default(void)
1813 struct dmar_drhd_unit *drhd;
1814 struct intel_iommu *iommu;
1815 static int first_level_support = -1;
1817 if (likely(first_level_support != -1))
1818 return first_level_support;
1820 first_level_support = 1;
1823 for_each_active_iommu(iommu, drhd) {
1824 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1825 first_level_support = 0;
1831 return first_level_support;
1834 static struct dmar_domain *alloc_domain(int flags)
1836 struct dmar_domain *domain;
1838 domain = alloc_domain_mem();
1842 memset(domain, 0, sizeof(*domain));
1843 domain->nid = NUMA_NO_NODE;
1844 domain->flags = flags;
1845 if (first_level_by_default())
1846 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1847 domain->has_iotlb_device = false;
1848 INIT_LIST_HEAD(&domain->devices);
1853 /* Must be called with iommu->lock */
1854 static int domain_attach_iommu(struct dmar_domain *domain,
1855 struct intel_iommu *iommu)
1857 unsigned long ndomains;
1860 assert_spin_locked(&device_domain_lock);
1861 assert_spin_locked(&iommu->lock);
1863 domain->iommu_refcnt[iommu->seq_id] += 1;
1864 domain->iommu_count += 1;
1865 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1866 ndomains = cap_ndoms(iommu->cap);
1867 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1869 if (num >= ndomains) {
1870 pr_err("%s: No free domain ids\n", iommu->name);
1871 domain->iommu_refcnt[iommu->seq_id] -= 1;
1872 domain->iommu_count -= 1;
1876 set_bit(num, iommu->domain_ids);
1877 set_iommu_domain(iommu, num, domain);
1879 domain->iommu_did[iommu->seq_id] = num;
1880 domain->nid = iommu->node;
1882 domain_update_iommu_cap(domain);
1888 static int domain_detach_iommu(struct dmar_domain *domain,
1889 struct intel_iommu *iommu)
1893 assert_spin_locked(&device_domain_lock);
1894 assert_spin_locked(&iommu->lock);
1896 domain->iommu_refcnt[iommu->seq_id] -= 1;
1897 count = --domain->iommu_count;
1898 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1899 num = domain->iommu_did[iommu->seq_id];
1900 clear_bit(num, iommu->domain_ids);
1901 set_iommu_domain(iommu, num, NULL);
1903 domain_update_iommu_cap(domain);
1904 domain->iommu_did[iommu->seq_id] = 0;
1910 static struct iova_domain reserved_iova_list;
1911 static struct lock_class_key reserved_rbtree_key;
1913 static int dmar_init_reserved_ranges(void)
1915 struct pci_dev *pdev = NULL;
1919 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1921 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1922 &reserved_rbtree_key);
1924 /* IOAPIC ranges shouldn't be accessed by DMA */
1925 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1926 IOVA_PFN(IOAPIC_RANGE_END));
1928 pr_err("Reserve IOAPIC range failed\n");
1932 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1933 for_each_pci_dev(pdev) {
1936 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1937 r = &pdev->resource[i];
1938 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1940 iova = reserve_iova(&reserved_iova_list,
1944 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1952 static inline int guestwidth_to_adjustwidth(int gaw)
1955 int r = (gaw - 12) % 9;
1966 static void domain_exit(struct dmar_domain *domain)
1969 /* Remove associated devices and clear attached or cached domains */
1970 domain_remove_dev_info(domain);
1973 if (domain->domain.type == IOMMU_DOMAIN_DMA)
1974 put_iova_domain(&domain->iovad);
1977 struct page *freelist;
1979 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1980 dma_free_pagelist(freelist);
1983 free_domain_mem(domain);
1987 * Get the PASID directory size for a scalable mode context entry.
1988 * A value of X in the PDTS field of a scalable mode context entry
1989 * indicates a PASID directory with 2^(X + 7) entries.
1991 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1995 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1996 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2004 * Set the RID_PASID field of a scalable mode context entry. The
2005 * IOMMU hardware will use the PASID value set in this field for
2006 * DMA translations of DMA requests without PASID.
2009 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2011 context->hi |= pasid & ((1 << 20) - 1);
2015 * Set the DTE (Device-TLB Enable) field of a scalable mode context entry.
2018 static inline void context_set_sm_dte(struct context_entry *context)
2020 context->lo |= (1 << 2);
2024 * Set the PRE (Page Request Enable) field of a scalable mode context entry.
2027 static inline void context_set_sm_pre(struct context_entry *context)
2029 context->lo |= (1 << 4);
2032 /* Convert value to context PASID directory size field coding. */
2033 #define context_pdts(pds) (((pds) & 0x7) << 9)
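/*
 * Example of the PDTS coding: a PASID directory with 2^(3 + 7) = 1024
 * entries is encoded as pds = 3, which context_pdts() places in bits
 * 11:9 of the low 64 bits of the context entry.
 */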
2035 static int domain_context_mapping_one(struct dmar_domain *domain,
2036 struct intel_iommu *iommu,
2037 struct pasid_table *table,
2040 u16 did = domain->iommu_did[iommu->seq_id];
2041 int translation = CONTEXT_TT_MULTI_LEVEL;
2042 struct device_domain_info *info = NULL;
2043 struct context_entry *context;
2044 unsigned long flags;
2049 if (hw_pass_through && domain_type_is_si(domain))
2050 translation = CONTEXT_TT_PASS_THROUGH;
2052 pr_debug("Set context mapping for %02x:%02x.%d\n",
2053 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2055 BUG_ON(!domain->pgd);
2057 spin_lock_irqsave(&device_domain_lock, flags);
2058 spin_lock(&iommu->lock);
2061 context = iommu_context_addr(iommu, bus, devfn, 1);
2066 if (context_present(context))
2070 * For kdump cases, old valid entries may be cached due to the
2071 * in-flight DMA and copied pgtable, but there is no unmapping
2072 * behaviour for them, thus we need an explicit cache flush for
2073 * the newly-mapped device. For kdump, at this point, the device
2074 * is supposed to finish reset at its driver probe stage, so no
2075 * in-flight DMA will exist, and we don't need to worry about it hereafter.
2078 if (context_copied(context)) {
2079 u16 did_old = context_domain_id(context);
2081 if (did_old < cap_ndoms(iommu->cap)) {
2082 iommu->flush.flush_context(iommu, did_old,
2083 (((u16)bus) << 8) | devfn,
2084 DMA_CCMD_MASK_NOBIT,
2085 DMA_CCMD_DEVICE_INVL);
2086 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2091 context_clear_entry(context);
2093 if (sm_supported(iommu)) {
2098 /* Setup the PASID DIR pointer: */
2099 pds = context_get_sm_pds(table);
2100 context->lo = (u64)virt_to_phys(table->table) |
2103 /* Setup the RID_PASID field: */
2104 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2107 * Setup the Device-TLB enable bit and Page request enable bit:
2110 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2111 if (info && info->ats_supported)
2112 context_set_sm_dte(context);
2113 if (info && info->pri_supported)
2114 context_set_sm_pre(context);
2116 struct dma_pte *pgd = domain->pgd;
2119 context_set_domain_id(context, did);
2121 if (translation != CONTEXT_TT_PASS_THROUGH) {
2123 * Skip top levels of page tables for an iommu which has a smaller
2124 * agaw than the default. Unnecessary for PT mode.
2126 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2128 pgd = phys_to_virt(dma_pte_addr(pgd));
2129 if (!dma_pte_present(pgd))
2133 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2134 if (info && info->ats_supported)
2135 translation = CONTEXT_TT_DEV_IOTLB;
2137 translation = CONTEXT_TT_MULTI_LEVEL;
2139 context_set_address_root(context, virt_to_phys(pgd));
2140 context_set_address_width(context, agaw);
2143 * In pass through mode, AW must be programmed to
2144 * indicate the largest AGAW value supported by
2145 * hardware. And ASR is ignored by hardware.
2147 context_set_address_width(context, iommu->msagaw);
2150 context_set_translation_type(context, translation);
2153 context_set_fault_enable(context);
2154 context_set_present(context);
2155 if (!ecap_coherent(iommu->ecap))
2156 clflush_cache_range(context, sizeof(*context));
2159 * It's a non-present to present mapping. If hardware doesn't cache
2160 * non-present entries, we only need to flush the write-buffer. If it
2161 * _does_ cache non-present entries, then it does so in the special
2162 * domain #0, which we have to flush:
2164 if (cap_caching_mode(iommu->cap)) {
2165 iommu->flush.flush_context(iommu, 0,
2166 (((u16)bus) << 8) | devfn,
2167 DMA_CCMD_MASK_NOBIT,
2168 DMA_CCMD_DEVICE_INVL);
2169 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2171 iommu_flush_write_buffer(iommu);
2173 iommu_enable_dev_iotlb(info);
2178 spin_unlock(&iommu->lock);
2179 spin_unlock_irqrestore(&device_domain_lock, flags);
2184 struct domain_context_mapping_data {
2185 struct dmar_domain *domain;
2186 struct intel_iommu *iommu;
2187 struct pasid_table *table;
2190 static int domain_context_mapping_cb(struct pci_dev *pdev,
2191 u16 alias, void *opaque)
2193 struct domain_context_mapping_data *data = opaque;
2195 return domain_context_mapping_one(data->domain, data->iommu,
2196 data->table, PCI_BUS_NUM(alias),
2201 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2203 struct domain_context_mapping_data data;
2204 struct pasid_table *table;
2205 struct intel_iommu *iommu;
2208 iommu = device_to_iommu(dev, &bus, &devfn);
2212 table = intel_pasid_get_table(dev);
2214 if (!dev_is_pci(dev))
2215 return domain_context_mapping_one(domain, iommu, table,
2218 data.domain = domain;
2222 return pci_for_each_dma_alias(to_pci_dev(dev),
2223 &domain_context_mapping_cb, &data);
2226 static int domain_context_mapped_cb(struct pci_dev *pdev,
2227 u16 alias, void *opaque)
2229 struct intel_iommu *iommu = opaque;
2231 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2234 static int domain_context_mapped(struct device *dev)
2236 struct intel_iommu *iommu;
2239 iommu = device_to_iommu(dev, &bus, &devfn);
2243 if (!dev_is_pci(dev))
2244 return device_context_mapped(iommu, bus, devfn);
2246 return !pci_for_each_dma_alias(to_pci_dev(dev),
2247 domain_context_mapped_cb, iommu);
2250 /* Returns a number of VTD pages, but aligned to MM page size */
2251 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2254 host_addr &= ~PAGE_MASK;
2255 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
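/*
 * Illustrative example: a 0x100-byte buffer starting at page offset
 * 0xfff straddles a page boundary, so aligned_nrpages() returns 2 even
 * though the length is far below one page.
 */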
2258 /* Return largest possible superpage level for a given mapping */
2259 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2260 unsigned long iov_pfn,
2261 unsigned long phy_pfn,
2262 unsigned long pages)
2264 int support, level = 1;
2265 unsigned long pfnmerge;
2267 support = domain->iommu_superpage;
2269 /* To use a large page, the virtual *and* physical addresses
2270 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2271 of them will mean we have to use smaller pages. So just
2272 merge them and check both at once. */
2273 pfnmerge = iov_pfn | phy_pfn;
2275 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2276 pages >>= VTD_STRIDE_SHIFT;
2279 pfnmerge >>= VTD_STRIDE_SHIFT;
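/*
 * Example of the loop above: if both the IOVA and the physical PFN are
 * 512-page (2MiB) aligned and the region spans at least 512 pages,
 * level 2 (2MiB) mappings can be used; 1GiB alignment with at least
 * 262144 pages allows level 3, subject to domain->iommu_superpage.
 */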
2286 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2287 struct scatterlist *sg, unsigned long phys_pfn,
2288 unsigned long nr_pages, int prot)
2290 struct dma_pte *first_pte = NULL, *pte = NULL;
2292 unsigned long sg_res = 0;
2293 unsigned int largepage_lvl = 0;
2294 unsigned long lvl_pages = 0;
2297 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2299 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2302 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2303 if (domain_use_first_level(domain))
2304 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2308 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2311 while (nr_pages > 0) {
2315 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2317 sg_res = aligned_nrpages(sg->offset, sg->length);
2318 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2319 sg->dma_length = sg->length;
2320 pteval = (sg_phys(sg) - pgoff) | attr;
2321 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2325 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2327 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2330 /* It is a large page */
2331 if (largepage_lvl > 1) {
2332 unsigned long nr_superpages, end_pfn;
2334 pteval |= DMA_PTE_LARGE_PAGE;
2335 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2337 nr_superpages = sg_res / lvl_pages;
2338 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2341 * Ensure that old small page tables are
2342 * removed to make room for superpage(s).
2343 * We're adding new large pages, so make sure
2344 * we don't remove their parent tables.
2346 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2349 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2353 /* We don't need a lock here; nobody else
2354 * touches the iova range
2356 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2358 static int dumps = 5;
2359 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2360 iov_pfn, tmp, (unsigned long long)pteval);
2363 debug_dma_dump_mappings(NULL);
2368 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2370 BUG_ON(nr_pages < lvl_pages);
2371 BUG_ON(sg_res < lvl_pages);
2373 nr_pages -= lvl_pages;
2374 iov_pfn += lvl_pages;
2375 phys_pfn += lvl_pages;
2376 pteval += lvl_pages * VTD_PAGE_SIZE;
2377 sg_res -= lvl_pages;
2379 /* If the next PTE would be the first in a new page, then we
2380 need to flush the cache on the entries we've just written.
2381 And then we'll need to recalculate 'pte', so clear it and
2382 let it get set again in the if (!pte) block above.
2384 If we're done (!nr_pages) we need to flush the cache too.
2386 Also if we've been setting superpages, we may need to
2387 recalculate 'pte' and switch back to smaller pages for the
2388 end of the mapping, if the trailing size is not enough to
2389 use another superpage (i.e. sg_res < lvl_pages). */
2391 if (!nr_pages || first_pte_in_page(pte) ||
2392 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2393 domain_flush_cache(domain, first_pte,
2394 (void *)pte - (void *)first_pte);
2398 if (!sg_res && nr_pages)
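/*
 * Wrapper around __domain_mapping() that also notifies every IOMMU the
 * domain is attached to, so that caches or write buffers can be flushed
 * where the hardware requires it.
 */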
2404 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2405 struct scatterlist *sg, unsigned long phys_pfn,
2406 unsigned long nr_pages, int prot)
2409 struct intel_iommu *iommu;
2411 /* Do the real mapping first */
2412 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2416 for_each_domain_iommu(iommu_id, domain) {
2417 iommu = g_iommus[iommu_id];
2418 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2424 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2425 struct scatterlist *sg, unsigned long nr_pages,
2428 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2431 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2432 unsigned long phys_pfn, unsigned long nr_pages,
2435 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
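/*
 * Clear the context entry for one (bus, devfn) and invalidate the
 * context cache and IOTLB for the domain ID that was programmed there.
 */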
2438 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2440 unsigned long flags;
2441 struct context_entry *context;
2447 spin_lock_irqsave(&iommu->lock, flags);
2448 context = iommu_context_addr(iommu, bus, devfn, 0);
2450 spin_unlock_irqrestore(&iommu->lock, flags);
2453 did_old = context_domain_id(context);
2454 context_clear_entry(context);
2455 __iommu_flush_cache(iommu, context, sizeof(*context));
2456 spin_unlock_irqrestore(&iommu->lock, flags);
2457 iommu->flush.flush_context(iommu,
2459 (((u16)bus) << 8) | devfn,
2460 DMA_CCMD_MASK_NOBIT,
2461 DMA_CCMD_DEVICE_INVL);
2462 iommu->flush.flush_iotlb(iommu,
2469 static inline void unlink_domain_info(struct device_domain_info *info)
2471 assert_spin_locked(&device_domain_lock);
2472 list_del(&info->link);
2473 list_del(&info->global);
2475 dev_iommu_priv_set(info->dev, NULL);
2478 static void domain_remove_dev_info(struct dmar_domain *domain)
2480 struct device_domain_info *info, *tmp;
2481 unsigned long flags;
2483 spin_lock_irqsave(&device_domain_lock, flags);
2484 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2485 __dmar_remove_one_dev_info(info);
2486 spin_unlock_irqrestore(&device_domain_lock, flags);
2489 struct dmar_domain *find_domain(struct device *dev)
2491 struct device_domain_info *info;
2493 if (unlikely(attach_deferred(dev)))
2496 /* No lock here, assumes no domain exit in normal case */
2497 info = get_domain_info(dev);
2499 return info->domain;
2504 static void do_deferred_attach(struct device *dev)
2506 struct iommu_domain *domain;
2508 dev_iommu_priv_set(dev, NULL);
2509 domain = iommu_get_domain_for_dev(dev);
2511 intel_iommu_attach_device(domain, dev);
2514 static inline struct device_domain_info *
2515 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2517 struct device_domain_info *info;
2519 list_for_each_entry(info, &device_domain_list, global)
2520 if (info->segment == segment && info->bus == bus &&
2521 info->devfn == devfn)
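/*
 * Install first-level (scalable mode) translation in the PASID entry of
 * @dev, using the domain's page table trimmed down to the page-table
 * levels this IOMMU actually supports.
 */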
2527 static int domain_setup_first_level(struct intel_iommu *iommu,
2528 struct dmar_domain *domain,
2532 int flags = PASID_FLAG_SUPERVISOR_MODE;
2533 struct dma_pte *pgd = domain->pgd;
2537 * Skip top levels of page tables for iommu which has
2538 * less agaw than default. Unnecessary for PT mode.
2540 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2541 pgd = phys_to_virt(dma_pte_addr(pgd));
2542 if (!dma_pte_present(pgd))
2546 level = agaw_to_level(agaw);
2547 if (level != 4 && level != 5)
2550 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2552 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2553 domain->iommu_did[iommu->seq_id],
2557 static bool dev_is_real_dma_subdevice(struct device *dev)
2559 return dev && dev_is_pci(dev) &&
2560 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
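/*
 * Allocate and register a device_domain_info for @dev, attach @domain to
 * @iommu and, in scalable mode, set up the PASID table and the RID2PASID
 * entry. Returns the domain actually in use, which may differ from
 * @domain if the device was attached concurrently.
 */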
2563 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2566 struct dmar_domain *domain)
2568 struct dmar_domain *found = NULL;
2569 struct device_domain_info *info;
2570 unsigned long flags;
2573 info = alloc_devinfo_mem();
2577 if (!dev_is_real_dma_subdevice(dev)) {
2579 info->devfn = devfn;
2580 info->segment = iommu->segment;
2582 struct pci_dev *pdev = to_pci_dev(dev);
2584 info->bus = pdev->bus->number;
2585 info->devfn = pdev->devfn;
2586 info->segment = pci_domain_nr(pdev->bus);
2589 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2590 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2593 info->domain = domain;
2594 info->iommu = iommu;
2595 info->pasid_table = NULL;
2596 info->auxd_enabled = 0;
2597 INIT_LIST_HEAD(&info->auxiliary_domains);
2599 if (dev && dev_is_pci(dev)) {
2600 struct pci_dev *pdev = to_pci_dev(info->dev);
2602 if (ecap_dev_iotlb_support(iommu->ecap) &&
2603 pci_ats_supported(pdev) &&
2604 dmar_find_matched_atsr_unit(pdev))
2605 info->ats_supported = 1;
2607 if (sm_supported(iommu)) {
2608 if (pasid_supported(iommu)) {
2609 int features = pci_pasid_features(pdev);
2611 info->pasid_supported = features | 1;
2614 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2615 pci_pri_supported(pdev))
2616 info->pri_supported = 1;
2620 spin_lock_irqsave(&device_domain_lock, flags);
2622 found = find_domain(dev);
2625 struct device_domain_info *info2;
2626 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2629 found = info2->domain;
2635 spin_unlock_irqrestore(&device_domain_lock, flags);
2636 free_devinfo_mem(info);
2637 /* Caller must free the original domain */
2641 spin_lock(&iommu->lock);
2642 ret = domain_attach_iommu(domain, iommu);
2643 spin_unlock(&iommu->lock);
2646 spin_unlock_irqrestore(&device_domain_lock, flags);
2647 free_devinfo_mem(info);
2651 list_add(&info->link, &domain->devices);
2652 list_add(&info->global, &device_domain_list);
2654 dev_iommu_priv_set(dev, info);
2655 spin_unlock_irqrestore(&device_domain_lock, flags);
2657 /* PASID table is mandatory for a PCI device in scalable mode. */
2658 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2659 ret = intel_pasid_alloc_table(dev);
2661 dev_err(dev, "PASID table allocation failed\n");
2662 dmar_remove_one_dev_info(dev);
2666 /* Setup the PASID entry for requests without PASID: */
2667 spin_lock_irqsave(&iommu->lock, flags);
2668 if (hw_pass_through && domain_type_is_si(domain))
2669 ret = intel_pasid_setup_pass_through(iommu, domain,
2670 dev, PASID_RID2PASID);
2671 else if (domain_use_first_level(domain))
2672 ret = domain_setup_first_level(iommu, domain, dev,
2675 ret = intel_pasid_setup_second_level(iommu, domain,
2676 dev, PASID_RID2PASID);
2677 spin_unlock_irqrestore(&iommu->lock, flags);
2679 dev_err(dev, "Setup RID2PASID failed\n");
2680 dmar_remove_one_dev_info(dev);
2685 if (dev && domain_context_mapping(domain, dev)) {
2686 dev_err(dev, "Domain context map failed\n");
2687 dmar_remove_one_dev_info(dev);
2694 static int iommu_domain_identity_map(struct dmar_domain *domain,
2695 unsigned long first_vpfn,
2696 unsigned long last_vpfn)
2699 * RMRR range might overlap with a physical memory range, so clear it first.
2702 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2704 return __domain_mapping(domain, first_vpfn, NULL,
2705 first_vpfn, last_vpfn - first_vpfn + 1,
2706 DMA_PTE_READ|DMA_PTE_WRITE);
2709 static int md_domain_init(struct dmar_domain *domain, int guest_width);
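/*
 * Build the static identity (si) domain: identity-map every usable
 * physical memory range on each node as well as all RMRR regions.
 */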
2711 static int __init si_domain_init(int hw)
2713 struct dmar_rmrr_unit *rmrr;
2717 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2721 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2722 domain_exit(si_domain);
2729 for_each_online_node(nid) {
2730 unsigned long start_pfn, end_pfn;
2733 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2734 ret = iommu_domain_identity_map(si_domain,
2735 mm_to_dma_pfn(start_pfn),
2736 mm_to_dma_pfn(end_pfn));
2743 * Identity map the RMRRs so that devices with RMRRs can also use the si_domain.
2746 for_each_rmrr_units(rmrr) {
2747 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2749 unsigned long long start = rmrr->base_address;
2750 unsigned long long end = rmrr->end_address;
2752 if (WARN_ON(end < start ||
2753 end >> agaw_to_width(si_domain->agaw)))
2756 ret = iommu_domain_identity_map(si_domain,
2757 mm_to_dma_pfn(start >> PAGE_SHIFT),
2758 mm_to_dma_pfn(end >> PAGE_SHIFT));
2767 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2769 struct dmar_domain *ndomain;
2770 struct intel_iommu *iommu;
2773 iommu = device_to_iommu(dev, &bus, &devfn);
2777 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2778 if (ndomain != domain)
2784 static bool device_has_rmrr(struct device *dev)
2786 struct dmar_rmrr_unit *rmrr;
2791 for_each_rmrr_units(rmrr) {
2793 * Return TRUE if this RMRR contains the device we are looking for.
2796 for_each_active_dev_scope(rmrr->devices,
2797 rmrr->devices_cnt, i, tmp)
2799 is_downstream_to_pci_bridge(dev, tmp)) {
2809 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2810 * is relaxable (ie. is allowed to be not enforced under some conditions)
2811 * @dev: device handle
2813 * We assume that PCI USB devices with RMRRs have them largely
2814 * for historical reasons and that the RMRR space is not actively used post
2815 * boot. This exclusion may change if vendors begin to abuse it.
2817 * The same exception is made for graphics devices, with the requirement that
2818 * any use of the RMRR regions will be torn down before assigning the device
2821 * Return: true if the RMRR is relaxable, false otherwise
2823 static bool device_rmrr_is_relaxable(struct device *dev)
2825 struct pci_dev *pdev;
2827 if (!dev_is_pci(dev))
2830 pdev = to_pci_dev(dev);
2831 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2838 * There are a couple of cases where we need to restrict the functionality of
2839 * devices associated with RMRRs. The first is when evaluating a device for
2840 * identity mapping because problems exist when devices are moved in and out
2841 * of domains and their respective RMRR information is lost. This means that
2842 * a device with associated RMRRs will never be in a "passthrough" domain.
2843 * The second is use of the device through the IOMMU API. This interface
2844 * expects to have full control of the IOVA space for the device. We cannot
2845 * satisfy both the requirement that RMRR access is maintained and have an
2846 * unencumbered IOVA space. We also have no ability to quiesce the device's
2847 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2848 * We therefore prevent devices associated with an RMRR from participating in
2849 * the IOMMU API, which eliminates them from device assignment.
2851 * In both cases, devices which have relaxable RMRRs are not concerned by this
2852 * restriction. See device_rmrr_is_relaxable comment.
2854 static bool device_is_rmrr_locked(struct device *dev)
2856 if (!device_has_rmrr(dev))
2859 if (device_rmrr_is_relaxable(dev))
2866 * Return the required default domain type for a specific device.
2868 * @dev: the device in query
2869 * @startup: true if this is during early boot
2872 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2873 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2874 * - 0: both identity and dynamic domains work for this device
2876 static int device_def_domain_type(struct device *dev)
2878 if (dev_is_pci(dev)) {
2879 struct pci_dev *pdev = to_pci_dev(dev);
2882 * Prevent any device marked as untrusted from getting
2883 * placed into the static identity mapping domain.
2885 if (pdev->untrusted)
2886 return IOMMU_DOMAIN_DMA;
2888 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2889 return IOMMU_DOMAIN_IDENTITY;
2891 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2892 return IOMMU_DOMAIN_IDENTITY;
2898 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2901 * Start from a sane IOMMU hardware state.
2902 * If queued invalidation was already initialized by us
2903 * (for example, while enabling interrupt remapping) then
2904 * things are already rolling from a sane state.
2908 * Clear any previous faults.
2910 dmar_fault(-1, iommu);
2912 * Disable queued invalidation if supported and already enabled
2913 * before OS handover.
2915 dmar_disable_qi(iommu);
2918 if (dmar_enable_qi(iommu)) {
2920 * Queued Invalidate not enabled, use Register Based Invalidate
2922 iommu->flush.flush_context = __iommu_flush_context;
2923 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2924 pr_info("%s: Using Register based invalidation\n",
2927 iommu->flush.flush_context = qi_flush_context;
2928 iommu->flush.flush_iotlb = qi_flush_iotlb;
2929 pr_info("%s: Using Queued invalidation\n", iommu->name);
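/*
 * Copy one bus worth of context entries from the previous kernel's
 * tables (reached via @old_re) into freshly allocated tables, reserving
 * the domain IDs found there and marking each entry as copied.
 */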
2933 static int copy_context_table(struct intel_iommu *iommu,
2934 struct root_entry *old_re,
2935 struct context_entry **tbl,
2938 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2939 struct context_entry *new_ce = NULL, ce;
2940 struct context_entry *old_ce = NULL;
2941 struct root_entry re;
2942 phys_addr_t old_ce_phys;
2944 tbl_idx = ext ? bus * 2 : bus;
2945 memcpy(&re, old_re, sizeof(re));
2947 for (devfn = 0; devfn < 256; devfn++) {
2948 /* First calculate the correct index */
2949 idx = (ext ? devfn * 2 : devfn) % 256;
2952 /* First save what we may have and clean up */
2954 tbl[tbl_idx] = new_ce;
2955 __iommu_flush_cache(iommu, new_ce,
2965 old_ce_phys = root_entry_lctp(&re);
2967 old_ce_phys = root_entry_uctp(&re);
2970 if (ext && devfn == 0) {
2971 /* No LCTP, try UCTP */
2980 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2985 new_ce = alloc_pgtable_page(iommu->node);
2992 /* Now copy the context entry */
2993 memcpy(&ce, old_ce + idx, sizeof(ce));
2995 if (!__context_present(&ce))
2998 did = context_domain_id(&ce);
2999 if (did >= 0 && did < cap_ndoms(iommu->cap))
3000 set_bit(did, iommu->domain_ids);
3003 * We need a marker for copied context entries. This
3004 * marker needs to work for the old format as well as
3005 * for extended context entries.
3007 * Bit 67 of the context entry is used. In the old
3008 * format this bit is available to software, in the
3009 * extended format it is the PGE bit, but PGE is ignored
3010 * by HW if PASIDs are disabled (and thus still available).
3013 * So disable PASIDs first and then mark the entry
3014 * copied. This means that we don't copy PASID
3015 * translations from the old kernel, but this is fine as
3016 * faults there are not fatal.
3018 context_clear_pasid_enable(&ce);
3019 context_set_copied(&ce);
3024 tbl[tbl_idx + pos] = new_ce;
3026 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
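/*
 * Used when the previous kernel left translation enabled (e.g. in the
 * kdump path): read the old root table from DMAR_RTADDR_REG and copy all
 * 256 bus context tables into this kernel's root entries so that DMA set
 * up by the previous kernel is not broken.
 */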
3035 static int copy_translation_tables(struct intel_iommu *iommu)
3037 struct context_entry **ctxt_tbls;
3038 struct root_entry *old_rt;
3039 phys_addr_t old_rt_phys;
3040 int ctxt_table_entries;
3041 unsigned long flags;
3046 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3047 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3048 new_ext = !!ecap_ecs(iommu->ecap);
3051 * The RTT bit can only be changed when translation is disabled,
3052 * but disabling translation means to open a window for data
3053 * corruption. So bail out and don't copy anything if we would
3054 * have to change the bit.
3059 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3063 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3067 /* This is too big for the stack - allocate it from slab */
3068 ctxt_table_entries = ext ? 512 : 256;
3070 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3074 for (bus = 0; bus < 256; bus++) {
3075 ret = copy_context_table(iommu, &old_rt[bus],
3076 ctxt_tbls, bus, ext);
3078 pr_err("%s: Failed to copy context table for bus %d\n",
3084 spin_lock_irqsave(&iommu->lock, flags);
3086 /* Context tables are copied, now write them to the root_entry table */
3087 for (bus = 0; bus < 256; bus++) {
3088 int idx = ext ? bus * 2 : bus;
3091 if (ctxt_tbls[idx]) {
3092 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3093 iommu->root_entry[bus].lo = val;
3096 if (!ext || !ctxt_tbls[idx + 1])
3099 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3100 iommu->root_entry[bus].hi = val;
3103 spin_unlock_irqrestore(&iommu->lock, flags);
3107 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3117 #ifdef CONFIG_INTEL_IOMMU_SVM
3118 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3120 struct intel_iommu *iommu = data;
3124 return INVALID_IOASID;
3126 * VT-d virtual command interface always uses the full 20 bit
3127 * PASID range. The host can partition the guest PASID range based on
3128 * policies, but this is out of the guest's control.
3130 if (min < PASID_MIN || max > intel_pasid_max_id)
3131 return INVALID_IOASID;
3133 if (vcmd_alloc_pasid(iommu, &ioasid))
3134 return INVALID_IOASID;
3139 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3141 struct intel_iommu *iommu = data;
3146 * The sanity check of the ioasid owner is done at an upper layer, e.g. VFIO.
3147 * We can only free the PASID when all the devices are unbound.
3149 if (ioasid_find(NULL, ioasid, NULL)) {
3150 pr_alert("Cannot free active IOASID %d\n", ioasid);
3153 vcmd_free_pasid(iommu, ioasid);
3156 static void register_pasid_allocator(struct intel_iommu *iommu)
3159 * If we are running in the host, there is no need for a custom allocator
3160 * since PASIDs are allocated from the host system-wide.
3162 if (!cap_caching_mode(iommu->cap))
3165 if (!sm_supported(iommu)) {
3166 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3171 * Register a custom PASID allocator if we are running in a guest;
3172 * guest PASIDs must be obtained via the virtual command interface.
3173 * There can be multiple vIOMMUs in each guest but only one allocator
3174 * is active. All vIOMMU allocators will eventually be calling the same host allocator.
3177 if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3180 pr_info("Register custom PASID allocator\n");
3181 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3182 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3183 iommu->pasid_allocator.pdata = (void *)iommu;
3184 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3185 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3187 * Disable scalable mode on this IOMMU if there
3188 * is no custom allocator. Mixing SM-capable vIOMMUs
3189 * and non-SM vIOMMUs is not supported.
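/*
 * Boot-time initialization: allocate the global IOMMU array, set up
 * domains, root entries and queued invalidation for every DRHD unit,
 * copy translation tables from the previous kernel where applicable,
 * build the si_domain and finally enable page requests and interrupts.
 */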
3196 static int __init init_dmars(void)
3198 struct dmar_drhd_unit *drhd;
3199 struct intel_iommu *iommu;
3205 * initialize and program root entry to not present
3208 for_each_drhd_unit(drhd) {
3210 * Lock not needed as this is only incremented in the single-
3211 * threaded kernel __init code path; all other access is read-only.
3214 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3218 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3221 /* Preallocate enough resources for IOMMU hot-addition */
3222 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3223 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3225 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3228 pr_err("Allocating global iommu array failed\n");
3233 for_each_iommu(iommu, drhd) {
3234 if (drhd->ignored) {
3235 iommu_disable_translation(iommu);
3240 * Find the max pasid size of all IOMMU's in the system.
3241 * We need to ensure the system pasid table is no bigger
3242 * than the smallest supported.
3244 if (pasid_supported(iommu)) {
3245 u32 temp = 2 << ecap_pss(iommu->ecap);
3247 intel_pasid_max_id = min_t(u32, temp,
3248 intel_pasid_max_id);
3251 g_iommus[iommu->seq_id] = iommu;
3253 intel_iommu_init_qi(iommu);
3255 ret = iommu_init_domains(iommu);
3259 init_translation_status(iommu);
3261 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3262 iommu_disable_translation(iommu);
3263 clear_translation_pre_enabled(iommu);
3264 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3270 * We could share the same root & context tables
3271 * among all IOMMUs; this needs to be split out later.
3273 ret = iommu_alloc_root_entry(iommu);
3277 if (translation_pre_enabled(iommu)) {
3278 pr_info("Translation already enabled - trying to copy translation structures\n");
3280 ret = copy_translation_tables(iommu);
3283 * We found the IOMMU with translation
3284 * enabled - but failed to copy over the
3285 * old root-entry table. Try to proceed
3286 * by disabling translation now and
3287 * allocating a clean root-entry table.
3288 * This might cause DMAR faults, but
3289 * probably the dump will still succeed.
3291 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3293 iommu_disable_translation(iommu);
3294 clear_translation_pre_enabled(iommu);
3296 pr_info("Copied translation tables from previous kernel for %s\n",
3301 if (!ecap_pass_through(iommu->ecap))
3302 hw_pass_through = 0;
3303 intel_svm_check(iommu);
3307 * Now that qi is enabled on all iommus, set the root entry and flush
3308 * caches. This is required on some Intel X58 chipsets, otherwise the
3309 * flush_context function will loop forever and the boot hangs.
3311 for_each_active_iommu(iommu, drhd) {
3312 iommu_flush_write_buffer(iommu);
3313 #ifdef CONFIG_INTEL_IOMMU_SVM
3314 register_pasid_allocator(iommu);
3316 iommu_set_root_entry(iommu);
3317 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3318 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3321 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3326 iommu_identity_mapping |= IDENTMAP_GFX;
3328 check_tylersburg_isoch();
3330 ret = si_domain_init(hw_pass_through);
3337 * global invalidate context cache
3338 * global invalidate iotlb
3339 * enable translation
3341 for_each_iommu(iommu, drhd) {
3342 if (drhd->ignored) {
3344 * we always have to disable PMRs or DMA may fail on this device.
3348 iommu_disable_protect_mem_regions(iommu);
3352 iommu_flush_write_buffer(iommu);
3354 #ifdef CONFIG_INTEL_IOMMU_SVM
3355 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3357 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3358 * could cause possible lock race condition.
3360 up_write(&dmar_global_lock);
3361 ret = intel_svm_enable_prq(iommu);
3362 down_write(&dmar_global_lock);
3367 ret = dmar_set_interrupt(iommu);
3375 for_each_active_iommu(iommu, drhd) {
3376 disable_dmar_iommu(iommu);
3377 free_dmar_iommu(iommu);
3386 /* This takes a number of _MM_ pages, not VTD pages */
3387 static unsigned long intel_alloc_iova(struct device *dev,
3388 struct dmar_domain *domain,
3389 unsigned long nrpages, uint64_t dma_mask)
3391 unsigned long iova_pfn;
3394 * Restrict dma_mask to the width that the iommu can handle.
3395 * First-level translation restricts the input-address to a
3396 * canonical address (i.e., address bits 63:N have the same
3397 * value as address bit [N-1], where N is 48-bits with 4-level
3398 * paging and 57-bits with 5-level paging). Hence, skip bit [N-1].
3401 if (domain_use_first_level(domain))
3402 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3405 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3408 /* Ensure we reserve the whole size-aligned region */
3409 nrpages = __roundup_pow_of_two(nrpages);
3411 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3413 * First try to allocate an io virtual address in
3414 * DMA_BIT_MASK(32) and if that fails then try allocating from the higher range.
3417 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3418 IOVA_PFN(DMA_BIT_MASK(32)), false);
3422 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3423 IOVA_PFN(dma_mask), true);
3424 if (unlikely(!iova_pfn)) {
3425 dev_err_once(dev, "Allocating %ld-page iova failed\n",
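/*
 * Allocate an IOVA range large enough for @size bytes and map it to the
 * physical range starting at @paddr, with read/write permissions derived
 * from the DMA direction. Returns the resulting DMA address.
 */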
3433 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3434 size_t size, int dir, u64 dma_mask)
3436 struct dmar_domain *domain;
3437 phys_addr_t start_paddr;
3438 unsigned long iova_pfn;
3441 struct intel_iommu *iommu;
3442 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3444 BUG_ON(dir == DMA_NONE);
3446 if (unlikely(attach_deferred(dev)))
3447 do_deferred_attach(dev);
3449 domain = find_domain(dev);
3451 return DMA_MAPPING_ERROR;
3453 iommu = domain_get_iommu(domain);
3454 size = aligned_nrpages(paddr, size);
3456 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3461 * Check if DMAR supports zero-length reads on write-only mappings.
3464 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3465 !cap_zlr(iommu->cap))
3466 prot |= DMA_PTE_READ;
3467 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3468 prot |= DMA_PTE_WRITE;
3470 * paddr to (paddr + size) might cover a partial page, so we should map the whole
3471 * page. Note: if two parts of one page are separately mapped, we
3472 * might have two guest_addr mappings to the same host paddr, but this
3473 * is not a big problem
3475 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3476 mm_to_dma_pfn(paddr_pfn), size, prot);
3480 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3481 start_paddr += paddr & ~PAGE_MASK;
3483 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3489 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3490 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3491 size, (unsigned long long)paddr, dir);
3492 return DMA_MAPPING_ERROR;
3495 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3496 unsigned long offset, size_t size,
3497 enum dma_data_direction dir,
3498 unsigned long attrs)
3500 return __intel_map_single(dev, page_to_phys(page) + offset,
3501 size, dir, *dev->dma_mask);
3504 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3505 size_t size, enum dma_data_direction dir,
3506 unsigned long attrs)
3508 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
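/*
 * Tear down the mapping for [dev_addr, dev_addr + size). The IOTLB is
 * flushed immediately in strict mode or for untrusted devices; otherwise
 * the flush and the IOVA release are deferred to the flush queue.
 */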
3511 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3513 struct dmar_domain *domain;
3514 unsigned long start_pfn, last_pfn;
3515 unsigned long nrpages;
3516 unsigned long iova_pfn;
3517 struct intel_iommu *iommu;
3518 struct page *freelist;
3519 struct pci_dev *pdev = NULL;
3521 domain = find_domain(dev);
3524 iommu = domain_get_iommu(domain);
3526 iova_pfn = IOVA_PFN(dev_addr);
3528 nrpages = aligned_nrpages(dev_addr, size);
3529 start_pfn = mm_to_dma_pfn(iova_pfn);
3530 last_pfn = start_pfn + nrpages - 1;
3532 if (dev_is_pci(dev))
3533 pdev = to_pci_dev(dev);
3535 freelist = domain_unmap(domain, start_pfn, last_pfn);
3536 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3537 !has_iova_flush_queue(&domain->iovad)) {
3538 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3539 nrpages, !freelist, 0);
3541 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3542 dma_free_pagelist(freelist);
3544 queue_iova(&domain->iovad, iova_pfn, nrpages,
3545 (unsigned long)freelist);
3547 * queue up the release of the unmap to save the 1/6th of the
3548 * cpu used up by the iotlb flush operation...
3552 trace_unmap_single(dev, dev_addr, size);
3555 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3556 size_t size, enum dma_data_direction dir,
3557 unsigned long attrs)
3559 intel_unmap(dev, dev_addr, size);
3562 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3563 size_t size, enum dma_data_direction dir, unsigned long attrs)
3565 intel_unmap(dev, dev_addr, size);
3568 static void *intel_alloc_coherent(struct device *dev, size_t size,
3569 dma_addr_t *dma_handle, gfp_t flags,
3570 unsigned long attrs)
3572 struct page *page = NULL;
3575 if (unlikely(attach_deferred(dev)))
3576 do_deferred_attach(dev);
3578 size = PAGE_ALIGN(size);
3579 order = get_order(size);
3581 if (gfpflags_allow_blocking(flags)) {
3582 unsigned int count = size >> PAGE_SHIFT;
3584 page = dma_alloc_from_contiguous(dev, count, order,
3585 flags & __GFP_NOWARN);
3589 page = alloc_pages(flags, order);
3592 memset(page_address(page), 0, size);
3594 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3596 dev->coherent_dma_mask);
3597 if (*dma_handle != DMA_MAPPING_ERROR)
3598 return page_address(page);
3599 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3600 __free_pages(page, order);
3605 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3606 dma_addr_t dma_handle, unsigned long attrs)
3609 struct page *page = virt_to_page(vaddr);
3611 size = PAGE_ALIGN(size);
3612 order = get_order(size);
3614 intel_unmap(dev, dma_handle, size);
3615 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3616 __free_pages(page, order);
3619 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3620 int nelems, enum dma_data_direction dir,
3621 unsigned long attrs)
3623 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3624 unsigned long nrpages = 0;
3625 struct scatterlist *sg;
3628 for_each_sg(sglist, sg, nelems, i) {
3629 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3632 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3634 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3637 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3638 enum dma_data_direction dir, unsigned long attrs)
3641 struct dmar_domain *domain;
3644 unsigned long iova_pfn;
3646 struct scatterlist *sg;
3647 unsigned long start_vpfn;
3648 struct intel_iommu *iommu;
3650 BUG_ON(dir == DMA_NONE);
3652 if (unlikely(attach_deferred(dev)))
3653 do_deferred_attach(dev);
3655 domain = find_domain(dev);
3659 iommu = domain_get_iommu(domain);
3661 for_each_sg(sglist, sg, nelems, i)
3662 size += aligned_nrpages(sg->offset, sg->length);
3664 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3667 sglist->dma_length = 0;
3672 * Check if DMAR supports zero-length reads on write-only mappings.
3675 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3676 !cap_zlr(iommu->cap))
3677 prot |= DMA_PTE_READ;
3678 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3679 prot |= DMA_PTE_WRITE;
3681 start_vpfn = mm_to_dma_pfn(iova_pfn);
3683 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3684 if (unlikely(ret)) {
3685 dma_pte_free_pagetable(domain, start_vpfn,
3686 start_vpfn + size - 1,
3687 agaw_to_level(domain->agaw) + 1);
3688 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3692 for_each_sg(sglist, sg, nelems, i)
3693 trace_map_sg(dev, i + 1, nelems, sg);
3698 static u64 intel_get_required_mask(struct device *dev)
3700 return DMA_BIT_MASK(32);
3703 static const struct dma_map_ops intel_dma_ops = {
3704 .alloc = intel_alloc_coherent,
3705 .free = intel_free_coherent,
3706 .map_sg = intel_map_sg,
3707 .unmap_sg = intel_unmap_sg,
3708 .map_page = intel_map_page,
3709 .unmap_page = intel_unmap_page,
3710 .map_resource = intel_map_resource,
3711 .unmap_resource = intel_unmap_resource,
3712 .dma_supported = dma_direct_supported,
3713 .mmap = dma_common_mmap,
3714 .get_sgtable = dma_common_get_sgtable,
3715 .get_required_mask = intel_get_required_mask,
3719 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3720 enum dma_data_direction dir, enum dma_sync_target target)
3722 struct dmar_domain *domain;
3723 phys_addr_t tlb_addr;
3725 domain = find_domain(dev);
3726 if (WARN_ON(!domain))
3729 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3730 if (is_swiotlb_buffer(tlb_addr))
3731 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
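/*
 * Map @paddr for a device using the bounce DMA ops. Buffers that are not
 * aligned to a whole VT-d page are bounced through swiotlb first so that
 * no memory outside the buffer is exposed to the device.
 */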
3735 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3736 enum dma_data_direction dir, unsigned long attrs,
3739 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3740 struct dmar_domain *domain;
3741 struct intel_iommu *iommu;
3742 unsigned long iova_pfn;
3743 unsigned long nrpages;
3744 phys_addr_t tlb_addr;
3748 if (unlikely(attach_deferred(dev)))
3749 do_deferred_attach(dev);
3751 domain = find_domain(dev);
3753 if (WARN_ON(dir == DMA_NONE || !domain))
3754 return DMA_MAPPING_ERROR;
3756 iommu = domain_get_iommu(domain);
3757 if (WARN_ON(!iommu))
3758 return DMA_MAPPING_ERROR;
3760 nrpages = aligned_nrpages(0, size);
3761 iova_pfn = intel_alloc_iova(dev, domain,
3762 dma_to_mm_pfn(nrpages), dma_mask);
3764 return DMA_MAPPING_ERROR;
3767 * Check if DMAR supports zero-length reads on write-only mappings.
3770 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3771 !cap_zlr(iommu->cap))
3772 prot |= DMA_PTE_READ;
3773 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3774 prot |= DMA_PTE_WRITE;
3777 * If both the physical buffer start address and size are
3778 * page aligned, we don't need to use a bounce page.
3780 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3781 tlb_addr = swiotlb_tbl_map_single(dev,
3782 __phys_to_dma(dev, io_tlb_start),
3783 paddr, size, aligned_size, dir, attrs);
3784 if (tlb_addr == DMA_MAPPING_ERROR) {
3787 /* Cleanup the padding area. */
3788 void *padding_start = phys_to_virt(tlb_addr);
3789 size_t padding_size = aligned_size;
3791 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3792 (dir == DMA_TO_DEVICE ||
3793 dir == DMA_BIDIRECTIONAL)) {
3794 padding_start += size;
3795 padding_size -= size;
3798 memset(padding_start, 0, padding_size);
3804 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3805 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3809 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3811 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3814 if (is_swiotlb_buffer(tlb_addr))
3815 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3816 aligned_size, dir, attrs);
3818 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3819 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3820 size, (unsigned long long)paddr, dir);
3822 return DMA_MAPPING_ERROR;
3826 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3827 enum dma_data_direction dir, unsigned long attrs)
3829 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3830 struct dmar_domain *domain;
3831 phys_addr_t tlb_addr;
3833 domain = find_domain(dev);
3834 if (WARN_ON(!domain))
3837 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3838 if (WARN_ON(!tlb_addr))
3841 intel_unmap(dev, dev_addr, size);
3842 if (is_swiotlb_buffer(tlb_addr))
3843 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3844 aligned_size, dir, attrs);
3846 trace_bounce_unmap_single(dev, dev_addr, size);
3850 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3851 size_t size, enum dma_data_direction dir, unsigned long attrs)
3853 return bounce_map_single(dev, page_to_phys(page) + offset,
3854 size, dir, attrs, *dev->dma_mask);
3858 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3859 enum dma_data_direction dir, unsigned long attrs)
3861 return bounce_map_single(dev, phys_addr, size,
3862 dir, attrs, *dev->dma_mask);
3866 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3867 enum dma_data_direction dir, unsigned long attrs)
3869 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3873 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3874 enum dma_data_direction dir, unsigned long attrs)
3876 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3880 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3881 enum dma_data_direction dir, unsigned long attrs)
3883 struct scatterlist *sg;
3886 for_each_sg(sglist, sg, nelems, i)
3887 bounce_unmap_page(dev, sg->dma_address,
3888 sg_dma_len(sg), dir, attrs);
3892 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3893 enum dma_data_direction dir, unsigned long attrs)
3896 struct scatterlist *sg;
3898 for_each_sg(sglist, sg, nelems, i) {
3899 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3900 sg->offset, sg->length,
3902 if (sg->dma_address == DMA_MAPPING_ERROR)
3904 sg_dma_len(sg) = sg->length;
3907 for_each_sg(sglist, sg, nelems, i)
3908 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3913 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3918 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3919 size_t size, enum dma_data_direction dir)
3921 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3925 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3926 size_t size, enum dma_data_direction dir)
3928 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3932 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3933 int nelems, enum dma_data_direction dir)
3935 struct scatterlist *sg;
3938 for_each_sg(sglist, sg, nelems, i)
3939 bounce_sync_single(dev, sg_dma_address(sg),
3940 sg_dma_len(sg), dir, SYNC_FOR_CPU);
3944 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3945 int nelems, enum dma_data_direction dir)
3947 struct scatterlist *sg;
3950 for_each_sg(sglist, sg, nelems, i)
3951 bounce_sync_single(dev, sg_dma_address(sg),
3952 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3955 static const struct dma_map_ops bounce_dma_ops = {
3956 .alloc = intel_alloc_coherent,
3957 .free = intel_free_coherent,
3958 .map_sg = bounce_map_sg,
3959 .unmap_sg = bounce_unmap_sg,
3960 .map_page = bounce_map_page,
3961 .unmap_page = bounce_unmap_page,
3962 .sync_single_for_cpu = bounce_sync_single_for_cpu,
3963 .sync_single_for_device = bounce_sync_single_for_device,
3964 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
3965 .sync_sg_for_device = bounce_sync_sg_for_device,
3966 .map_resource = bounce_map_resource,
3967 .unmap_resource = bounce_unmap_resource,
3968 .dma_supported = dma_direct_supported,
3971 static inline int iommu_domain_cache_init(void)
3975 iommu_domain_cache = kmem_cache_create("iommu_domain",
3976 sizeof(struct dmar_domain),
3981 if (!iommu_domain_cache) {
3982 pr_err("Couldn't create iommu_domain cache\n");
3989 static inline int iommu_devinfo_cache_init(void)
3993 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3994 sizeof(struct device_domain_info),
3998 if (!iommu_devinfo_cache) {
3999 pr_err("Couldn't create devinfo cache\n");
4006 static int __init iommu_init_mempool(void)
4009 ret = iova_cache_get();
4013 ret = iommu_domain_cache_init();
4017 ret = iommu_devinfo_cache_init();
4021 kmem_cache_destroy(iommu_domain_cache);
4028 static void __init iommu_exit_mempool(void)
4030 kmem_cache_destroy(iommu_devinfo_cache);
4031 kmem_cache_destroy(iommu_domain_cache);
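/*
 * Ignore DMAR units whose device scope contains no devices at all, and
 * mark units that cover only graphics devices as gfx_dedicated.
 */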
4035 static void __init init_no_remapping_devices(void)
4037 struct dmar_drhd_unit *drhd;
4041 for_each_drhd_unit(drhd) {
4042 if (!drhd->include_all) {
4043 for_each_active_dev_scope(drhd->devices,
4044 drhd->devices_cnt, i, dev)
4046 /* ignore DMAR unit if no devices exist */
4047 if (i == drhd->devices_cnt)
4052 for_each_active_drhd_unit(drhd) {
4053 if (drhd->include_all)
4056 for_each_active_dev_scope(drhd->devices,
4057 drhd->devices_cnt, i, dev)
4058 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4060 if (i < drhd->devices_cnt)
4063 /* This IOMMU has *only* gfx devices. Either bypass it or
4064 set the gfx_mapped flag, as appropriate */
4065 drhd->gfx_dedicated = 1;
4071 #ifdef CONFIG_SUSPEND
4072 static int init_iommu_hw(void)
4074 struct dmar_drhd_unit *drhd;
4075 struct intel_iommu *iommu = NULL;
4077 for_each_active_iommu(iommu, drhd)
4079 dmar_reenable_qi(iommu);
4081 for_each_iommu(iommu, drhd) {
4082 if (drhd->ignored) {
4084 * we always have to disable PMRs or DMA may fail on this device.
4088 iommu_disable_protect_mem_regions(iommu);
4092 iommu_flush_write_buffer(iommu);
4094 iommu_set_root_entry(iommu);
4096 iommu->flush.flush_context(iommu, 0, 0, 0,
4097 DMA_CCMD_GLOBAL_INVL);
4098 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4099 iommu_enable_translation(iommu);
4100 iommu_disable_protect_mem_regions(iommu);
4106 static void iommu_flush_all(void)
4108 struct dmar_drhd_unit *drhd;
4109 struct intel_iommu *iommu;
4111 for_each_active_iommu(iommu, drhd) {
4112 iommu->flush.flush_context(iommu, 0, 0, 0,
4113 DMA_CCMD_GLOBAL_INVL);
4114 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4115 DMA_TLB_GLOBAL_FLUSH);
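/*
 * Suspend: disable translation and save the fault-event registers of
 * every active IOMMU so that iommu_resume() can restore them.
 */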
4119 static int iommu_suspend(void)
4121 struct dmar_drhd_unit *drhd;
4122 struct intel_iommu *iommu = NULL;
4125 for_each_active_iommu(iommu, drhd) {
4126 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4128 if (!iommu->iommu_state)
4134 for_each_active_iommu(iommu, drhd) {
4135 iommu_disable_translation(iommu);
4137 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4139 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4140 readl(iommu->reg + DMAR_FECTL_REG);
4141 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4142 readl(iommu->reg + DMAR_FEDATA_REG);
4143 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4144 readl(iommu->reg + DMAR_FEADDR_REG);
4145 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4146 readl(iommu->reg + DMAR_FEUADDR_REG);
4148 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4153 for_each_active_iommu(iommu, drhd)
4154 kfree(iommu->iommu_state);
4159 static void iommu_resume(void)
4161 struct dmar_drhd_unit *drhd;
4162 struct intel_iommu *iommu = NULL;
4165 if (init_iommu_hw()) {
4167 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4169 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4173 for_each_active_iommu(iommu, drhd) {
4175 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4177 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4178 iommu->reg + DMAR_FECTL_REG);
4179 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4180 iommu->reg + DMAR_FEDATA_REG);
4181 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4182 iommu->reg + DMAR_FEADDR_REG);
4183 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4184 iommu->reg + DMAR_FEUADDR_REG);
4186 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4189 for_each_active_iommu(iommu, drhd)
4190 kfree(iommu->iommu_state);
4193 static struct syscore_ops iommu_syscore_ops = {
4194 .resume = iommu_resume,
4195 .suspend = iommu_suspend,
4198 static void __init init_iommu_pm_ops(void)
4200 register_syscore_ops(&iommu_syscore_ops);
4204 static inline void init_iommu_pm_ops(void) {}
4205 #endif /* CONFIG_PM */
4207 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4209 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4210 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4211 rmrr->end_address <= rmrr->base_address ||
4212 arch_rmrr_sanity_check(rmrr))
4218 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4220 struct acpi_dmar_reserved_memory *rmrr;
4221 struct dmar_rmrr_unit *rmrru;
4223 rmrr = (struct acpi_dmar_reserved_memory *)header;
4224 if (rmrr_sanity_check(rmrr)) {
4226 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4227 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4228 rmrr->base_address, rmrr->end_address,
4229 dmi_get_system_info(DMI_BIOS_VENDOR),
4230 dmi_get_system_info(DMI_BIOS_VERSION),
4231 dmi_get_system_info(DMI_PRODUCT_VERSION));
4232 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4235 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4239 rmrru->hdr = header;
4241 rmrru->base_address = rmrr->base_address;
4242 rmrru->end_address = rmrr->end_address;
4244 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4245 ((void *)rmrr) + rmrr->header.length,
4246 &rmrru->devices_cnt);
4247 if (rmrru->devices_cnt && rmrru->devices == NULL)
4250 list_add(&rmrru->list, &dmar_rmrr_units);
4259 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4261 struct dmar_atsr_unit *atsru;
4262 struct acpi_dmar_atsr *tmp;
4264 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4266 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4267 if (atsr->segment != tmp->segment)
4269 if (atsr->header.length != tmp->header.length)
4271 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4278 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4280 struct acpi_dmar_atsr *atsr;
4281 struct dmar_atsr_unit *atsru;
4283 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4286 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4287 atsru = dmar_find_atsr(atsr);
4291 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4296 * If memory is allocated from slab by ACPI _DSM method, we need to
4297 * copy the memory content because the memory buffer will be freed on return.
4300 atsru->hdr = (void *)(atsru + 1);
4301 memcpy(atsru->hdr, hdr, hdr->length);
4302 atsru->include_all = atsr->flags & 0x1;
4303 if (!atsru->include_all) {
4304 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4305 (void *)atsr + atsr->header.length,
4306 &atsru->devices_cnt);
4307 if (atsru->devices_cnt && atsru->devices == NULL) {
4313 list_add_rcu(&atsru->list, &dmar_atsr_units);
4318 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4320 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4324 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4326 struct acpi_dmar_atsr *atsr;
4327 struct dmar_atsr_unit *atsru;
4329 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4330 atsru = dmar_find_atsr(atsr);
4332 list_del_rcu(&atsru->list);
4334 intel_iommu_free_atsr(atsru);
4340 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4344 struct acpi_dmar_atsr *atsr;
4345 struct dmar_atsr_unit *atsru;
4347 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4348 atsru = dmar_find_atsr(atsr);
4352 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4353 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4361 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4364 struct intel_iommu *iommu = dmaru->iommu;
4366 if (g_iommus[iommu->seq_id])
4369 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4370 pr_warn("%s: Doesn't support hardware pass through.\n",
4374 if (!ecap_sc_support(iommu->ecap) &&
4375 domain_update_iommu_snooping(iommu)) {
4376 pr_warn("%s: Doesn't support snooping.\n",
4380 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4381 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4382 pr_warn("%s: Doesn't support large page.\n",
4388 * Disable translation if already enabled prior to OS handover.
4390 if (iommu->gcmd & DMA_GCMD_TE)
4391 iommu_disable_translation(iommu);
4393 g_iommus[iommu->seq_id] = iommu;
4394 ret = iommu_init_domains(iommu);
4396 ret = iommu_alloc_root_entry(iommu);
4400 intel_svm_check(iommu);
4402 if (dmaru->ignored) {
4404 * we always have to disable PMRs or DMA may fail on this device
4407 iommu_disable_protect_mem_regions(iommu);
4411 intel_iommu_init_qi(iommu);
4412 iommu_flush_write_buffer(iommu);
4414 #ifdef CONFIG_INTEL_IOMMU_SVM
4415 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4416 ret = intel_svm_enable_prq(iommu);
4421 ret = dmar_set_interrupt(iommu);
4425 iommu_set_root_entry(iommu);
4426 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4427 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4428 iommu_enable_translation(iommu);
4430 iommu_disable_protect_mem_regions(iommu);
4434 disable_dmar_iommu(iommu);
4436 free_dmar_iommu(iommu);
4440 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4443 struct intel_iommu *iommu = dmaru->iommu;
4445 if (!intel_iommu_enabled)
4451 ret = intel_iommu_add(dmaru);
4453 disable_dmar_iommu(iommu);
4454 free_dmar_iommu(iommu);
4460 static void intel_iommu_free_dmars(void)
4462 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4463 struct dmar_atsr_unit *atsru, *atsr_n;
4465 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4466 list_del(&rmrru->list);
4467 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4471 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4472 list_del(&atsru->list);
4473 intel_iommu_free_atsr(atsru);
4477 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4480 struct pci_bus *bus;
4481 struct pci_dev *bridge = NULL;
4483 struct acpi_dmar_atsr *atsr;
4484 struct dmar_atsr_unit *atsru;
4486 dev = pci_physfn(dev);
4487 for (bus = dev->bus; bus; bus = bus->parent) {
4489 /* If it's an integrated device, allow ATS */
4492 /* Connected via non-PCIe: no ATS */
4493 if (!pci_is_pcie(bridge) ||
4494 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4496 /* If we found the root port, look it up in the ATSR */
4497 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4502 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4503 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4504 if (atsr->segment != pci_domain_nr(dev->bus))
4507 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4508 if (tmp == &bridge->dev)
4511 if (atsru->include_all)
4521 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4524 struct dmar_rmrr_unit *rmrru;
4525 struct dmar_atsr_unit *atsru;
4526 struct acpi_dmar_atsr *atsr;
4527 struct acpi_dmar_reserved_memory *rmrr;
4529 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4532 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4533 rmrr = container_of(rmrru->hdr,
4534 struct acpi_dmar_reserved_memory, header);
4535 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4536 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4537 ((void *)rmrr) + rmrr->header.length,
4538 rmrr->segment, rmrru->devices,
4539 rmrru->devices_cnt);
4542 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4543 dmar_remove_dev_scope(info, rmrr->segment,
4544 rmrru->devices, rmrru->devices_cnt);
4548 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4549 if (atsru->include_all)
4552 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4553 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4554 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4555 (void *)atsr + atsr->header.length,
4556 atsr->segment, atsru->devices,
4557 atsru->devices_cnt);
4562 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4563 if (dmar_remove_dev_scope(info, atsr->segment,
4564 atsru->devices, atsru->devices_cnt))
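/*
 * Memory hotplug notifier: keep the si_domain identity map in sync by
 * mapping ranges that come online and unmapping (and flushing) ranges
 * that go offline.
 */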
4572 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4573 unsigned long val, void *v)
4575 struct memory_notify *mhp = v;
4576 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4577 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4581 case MEM_GOING_ONLINE:
4582 if (iommu_domain_identity_map(si_domain,
4583 start_vpfn, last_vpfn)) {
4584 pr_warn("Failed to build identity map for [%lx-%lx]\n",
4585 start_vpfn, last_vpfn);
4591 case MEM_CANCEL_ONLINE:
4593 struct dmar_drhd_unit *drhd;
4594 struct intel_iommu *iommu;
4595 struct page *freelist;
4597 freelist = domain_unmap(si_domain,
4598 start_vpfn, last_vpfn);
4601 for_each_active_iommu(iommu, drhd)
4602 iommu_flush_iotlb_psi(iommu, si_domain,
4603 start_vpfn, mhp->nr_pages,
4606 dma_free_pagelist(freelist);
4614 static struct notifier_block intel_iommu_memory_nb = {
4615 .notifier_call = intel_iommu_memory_notifier,
4619 static void free_all_cpu_cached_iovas(unsigned int cpu)
4623 for (i = 0; i < g_num_of_iommus; i++) {
4624 struct intel_iommu *iommu = g_iommus[i];
4625 struct dmar_domain *domain;
4631 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4632 domain = get_iommu_domain(iommu, (u16)did);
4634 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4637 free_cpu_cached_iovas(cpu, &domain->iovad);
4642 static int intel_iommu_cpu_dead(unsigned int cpu)
4644 free_all_cpu_cached_iovas(cpu);
4648 static void intel_disable_iommus(void)
4650 struct intel_iommu *iommu = NULL;
4651 struct dmar_drhd_unit *drhd;
4653 for_each_iommu(iommu, drhd)
4654 iommu_disable_translation(iommu);
4657 void intel_iommu_shutdown(void)
4659 struct dmar_drhd_unit *drhd;
4660 struct intel_iommu *iommu = NULL;
4662 if (no_iommu || dmar_disabled)
4665 down_write(&dmar_global_lock);
4667 /* Disable PMRs explicitly here. */
4668 for_each_iommu(iommu, drhd)
4669 iommu_disable_protect_mem_regions(iommu);
4671 /* Make sure the IOMMUs are switched off */
4672 intel_disable_iommus();
4674 up_write(&dmar_global_lock);
4677 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4679 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4681 return container_of(iommu_dev, struct intel_iommu, iommu);
4684 static ssize_t intel_iommu_show_version(struct device *dev,
4685 struct device_attribute *attr,
4688 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4689 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4690 return sprintf(buf, "%d:%d\n",
4691 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4693 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4695 static ssize_t intel_iommu_show_address(struct device *dev,
4696 struct device_attribute *attr,
4699 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4700 return sprintf(buf, "%llx\n", iommu->reg_phys);
4702 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4704 static ssize_t intel_iommu_show_cap(struct device *dev,
4705 struct device_attribute *attr,
4708 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4709 return sprintf(buf, "%llx\n", iommu->cap);
4711 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4713 static ssize_t intel_iommu_show_ecap(struct device *dev,
4714 struct device_attribute *attr,
4717 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4718 return sprintf(buf, "%llx\n", iommu->ecap);
4720 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4722 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4723 struct device_attribute *attr,
4726 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4727 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4729 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4731 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4732 struct device_attribute *attr,
4735 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4736 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4737 cap_ndoms(iommu->cap)));
4739 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4741 static struct attribute *intel_iommu_attrs[] = {
4742 &dev_attr_version.attr,
4743 &dev_attr_address.attr,
4745 &dev_attr_ecap.attr,
4746 &dev_attr_domains_supported.attr,
4747 &dev_attr_domains_used.attr,
4751 static struct attribute_group intel_iommu_group = {
4752 .name = "intel-iommu",
4753 .attrs = intel_iommu_attrs,
4756 const struct attribute_group *intel_iommu_groups[] = {
4761 static inline bool has_external_pci(void)
4763 struct pci_dev *pdev = NULL;
4765 for_each_pci_dev(pdev)
4766 if (pdev->external_facing)
4772 static int __init platform_optin_force_iommu(void)
4774 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4777 if (no_iommu || dmar_disabled)
4778 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4781 * If Intel-IOMMU is disabled by default, we will apply identity
4782 * map for all devices except those marked as being untrusted.
4785 iommu_set_default_passthrough(false);
4793 static int __init probe_acpi_namespace_devices(void)
4795 struct dmar_drhd_unit *drhd;
4796 /* To avoid a -Wunused-but-set-variable warning. */
4797 struct intel_iommu *iommu __maybe_unused;
4801 for_each_active_iommu(iommu, drhd) {
4802 for_each_active_dev_scope(drhd->devices,
4803 drhd->devices_cnt, i, dev) {
4804 struct acpi_device_physical_node *pn;
4805 struct iommu_group *group;
4806 struct acpi_device *adev;
4808 if (dev->bus != &acpi_bus_type)
4811 adev = to_acpi_device(dev);
4812 mutex_lock(&adev->physical_node_lock);
4813 list_for_each_entry(pn,
4814 &adev->physical_node_list, node) {
4815 group = iommu_group_get(pn->dev);
4817 iommu_group_put(group);
4821 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4822 ret = iommu_probe_device(pn->dev);
4826 mutex_unlock(&adev->physical_node_lock);
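/*
 * Main entry point: parse the DMAR table, initialize every IOMMU via
 * init_dmars(), register the sysfs attributes and the memory and CPU
 * hotplug notifiers, then enable translation on each unit.
 */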
4836 int __init intel_iommu_init(void)
4839 struct dmar_drhd_unit *drhd;
4840 struct intel_iommu *iommu;
4843 * Intel IOMMU is required for a TXT/tboot launch or platform
4844 * opt in, so enforce that.
4846 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4848 if (iommu_init_mempool()) {
4850 panic("tboot: Failed to initialize iommu memory\n");
4854 down_write(&dmar_global_lock);
4855 if (dmar_table_init()) {
4857 panic("tboot: Failed to initialize DMAR table\n");
4861 if (dmar_dev_scope_init() < 0) {
4863 panic("tboot: Failed to initialize DMAR device scope\n");
4867 up_write(&dmar_global_lock);
4870 * The bus notifier takes the dmar_global_lock, so lockdep will
4871 * complain later when we register it under the lock.
4873 dmar_register_bus_notifier();
4875 down_write(&dmar_global_lock);
4878 intel_iommu_debugfs_init();
4880 if (no_iommu || dmar_disabled) {
4882 * We exit the function here to ensure the IOMMU's remapping and
4883 * mempool aren't set up, which means that the IOMMU's PMRs
4884 * won't be disabled via the call to init_dmars(). So disable
4885 * them explicitly here. The PMRs were set up by tboot prior to
4886 * calling SENTER, but the kernel is expected to reset/tear them down.
4889 if (intel_iommu_tboot_noforce) {
4890 for_each_iommu(iommu, drhd)
4891 iommu_disable_protect_mem_regions(iommu);
4895 * Make sure the IOMMUs are switched off, even when we
4896 * boot into a kexec kernel and the previous kernel left
4899 intel_disable_iommus();
4903 if (list_empty(&dmar_rmrr_units))
4904 pr_info("No RMRR found\n");
4906 if (list_empty(&dmar_atsr_units))
4907 pr_info("No ATSR found\n");
4909 if (dmar_init_reserved_ranges()) {
4911 panic("tboot: Failed to reserve iommu ranges\n");
4912 goto out_free_reserved_range;
4916 intel_iommu_gfx_mapped = 1;
4918 init_no_remapping_devices();
4923 panic("tboot: Failed to initialize DMARs\n");
4924 pr_err("Initialization failed\n");
4925 goto out_free_reserved_range;
4927 up_write(&dmar_global_lock);
4929 init_iommu_pm_ops();
4931 down_read(&dmar_global_lock);
4932 for_each_active_iommu(iommu, drhd) {
4933 iommu_device_sysfs_add(&iommu->iommu, NULL,
4936 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4937 iommu_device_register(&iommu->iommu);
4939 up_read(&dmar_global_lock);
4941 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4942 if (si_domain && !hw_pass_through)
4943 register_memory_notifier(&intel_iommu_memory_nb);
4944 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4945 intel_iommu_cpu_dead);
4947 down_read(&dmar_global_lock);
4948 if (probe_acpi_namespace_devices())
4949 pr_warn("ACPI name space devices didn't probe correctly\n");
4951 /* Finally, we enable the DMA remapping hardware. */
4952 for_each_iommu(iommu, drhd) {
4953 if (!drhd->ignored && !translation_pre_enabled(iommu))
4954 iommu_enable_translation(iommu);
4956 iommu_disable_protect_mem_regions(iommu);
4958 up_read(&dmar_global_lock);
4960 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4962 intel_iommu_enabled = 1;
4966 out_free_reserved_range:
4967 put_iova_domain(&reserved_iova_list);
4969 intel_iommu_free_dmars();
4970 up_write(&dmar_global_lock);
4971 iommu_exit_mempool();
4975 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4977 struct intel_iommu *iommu = opaque;
4979 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
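/*
 * Worked example (added for clarity): a DMA alias is a 16-bit requester ID,
 * so for alias == 0x0310 the call above becomes
 * domain_context_clear_one(iommu, 0x03, 0x10), i.e. bus 0x03 and devfn 0x10,
 * since PCI_BUS_NUM(alias) is (alias >> 8) & 0xff.
 */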
4984 * NB - intel-iommu lacks any sort of reference counting for the users of
4985 * dependent devices. If multiple endpoints have intersecting dependent
4986 * devices, unbinding the driver from any one of them will possibly leave
4987 * the others unable to operate.
4989 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4991 if (!iommu || !dev || !dev_is_pci(dev))
4994 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4997 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4999 struct dmar_domain *domain;
5000 struct intel_iommu *iommu;
5001 unsigned long flags;
5003 assert_spin_locked(&device_domain_lock);
5008 iommu = info->iommu;
5009 domain = info->domain;
5012 if (dev_is_pci(info->dev) && sm_supported(iommu))
5013 intel_pasid_tear_down_entry(iommu, info->dev,
5014 PASID_RID2PASID, false);
5016 iommu_disable_dev_iotlb(info);
5017 if (!dev_is_real_dma_subdevice(info->dev))
5018 domain_context_clear(iommu, info->dev);
5019 intel_pasid_free_table(info->dev);
5022 unlink_domain_info(info);
5024 spin_lock_irqsave(&iommu->lock, flags);
5025 domain_detach_iommu(domain, iommu);
5026 spin_unlock_irqrestore(&iommu->lock, flags);
5028 free_devinfo_mem(info);
5031 static void dmar_remove_one_dev_info(struct device *dev)
5033 struct device_domain_info *info;
5034 unsigned long flags;
5036 spin_lock_irqsave(&device_domain_lock, flags);
5037 info = get_domain_info(dev);
5039 __dmar_remove_one_dev_info(info);
5040 spin_unlock_irqrestore(&device_domain_lock, flags);
5043 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5047 /* calculate AGAW */
5048 domain->gaw = guest_width;
5049 adjust_width = guestwidth_to_adjustwidth(guest_width);
5050 domain->agaw = width_to_agaw(adjust_width);
5052 domain->iommu_coherency = 0;
5053 domain->iommu_snooping = 0;
5054 domain->iommu_superpage = 0;
5055 domain->max_addr = 0;
5057 /* always allocate the top pgd */
5058 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5061 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
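/*
 * Worked example (added; assumes the usual VT-d AGAW helpers in this file):
 * for guest_width == 48, guestwidth_to_adjustwidth() keeps 48 because
 * (48 - 12) is a multiple of the 9-bit level stride, and width_to_agaw(48)
 * yields 2, i.e. a 4-level page table (agaw_to_level(2) == 4).
 */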
5065 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5067 init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5068 copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5070 if (!intel_iommu_strict &&
5071 init_iova_flush_queue(&dmar_domain->iovad,
5072 iommu_flush_iova, iova_entry_free))
5073 pr_info("iova flush queue initialization failed\n");
5076 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5078 struct dmar_domain *dmar_domain;
5079 struct iommu_domain *domain;
5082 case IOMMU_DOMAIN_DMA:
5083 case IOMMU_DOMAIN_UNMANAGED:
5084 dmar_domain = alloc_domain(0);
5086 pr_err("Can't allocate dmar_domain\n");
5089 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5090 pr_err("Domain initialization failed\n");
5091 domain_exit(dmar_domain);
5095 if (type == IOMMU_DOMAIN_DMA)
5096 intel_init_iova_domain(dmar_domain);
5098 domain_update_iommu_cap(dmar_domain);
5100 domain = &dmar_domain->domain;
5101 domain->geometry.aperture_start = 0;
5102 domain->geometry.aperture_end =
5103 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5104 domain->geometry.force_aperture = true;
5107 case IOMMU_DOMAIN_IDENTITY:
5108 return &si_domain->domain;
5116 static void intel_iommu_domain_free(struct iommu_domain *domain)
5118 if (domain != &si_domain->domain)
5119 domain_exit(to_dmar_domain(domain));
5123 * Check whether a @domain could be attached to the @dev through the
5124 * aux-domain attach/detach APIs.
5127 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5129 struct device_domain_info *info = get_domain_info(dev);
5131 return info && info->auxd_enabled &&
5132 domain->type == IOMMU_DOMAIN_UNMANAGED;
5135 static void auxiliary_link_device(struct dmar_domain *domain,
5138 struct device_domain_info *info = get_domain_info(dev);
5140 assert_spin_locked(&device_domain_lock);
5144 domain->auxd_refcnt++;
5145 list_add(&domain->auxd, &info->auxiliary_domains);
5148 static void auxiliary_unlink_device(struct dmar_domain *domain,
5151 struct device_domain_info *info = get_domain_info(dev);
5153 assert_spin_locked(&device_domain_lock);
5157 list_del(&domain->auxd);
5158 domain->auxd_refcnt--;
5160 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5161 ioasid_free(domain->default_pasid);
5164 static int aux_domain_add_dev(struct dmar_domain *domain,
5168 unsigned long flags;
5169 struct intel_iommu *iommu;
5171 iommu = device_to_iommu(dev, NULL, NULL);
5175 if (domain->default_pasid <= 0) {
5178 /* No private data needed for the default pasid */
5179 pasid = ioasid_alloc(NULL, PASID_MIN,
5180 pci_max_pasids(to_pci_dev(dev)) - 1,
5182 if (pasid == INVALID_IOASID) {
5183 pr_err("Can't allocate default pasid\n");
5186 domain->default_pasid = pasid;
5189 spin_lock_irqsave(&device_domain_lock, flags);
5191 * iommu->lock must be held to attach domain to iommu and setup the
5192 * pasid entry for second level translation.
5194 spin_lock(&iommu->lock);
5195 ret = domain_attach_iommu(domain, iommu);
5199 /* Setup the PASID entry for mediated devices: */
5200 if (domain_use_first_level(domain))
5201 ret = domain_setup_first_level(iommu, domain, dev,
5202 domain->default_pasid);
5204 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5205 domain->default_pasid);
5208 spin_unlock(&iommu->lock);
5210 auxiliary_link_device(domain, dev);
5212 spin_unlock_irqrestore(&device_domain_lock, flags);
5217 domain_detach_iommu(domain, iommu);
5219 spin_unlock(&iommu->lock);
5220 spin_unlock_irqrestore(&device_domain_lock, flags);
5221 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5222 ioasid_free(domain->default_pasid);
5227 static void aux_domain_remove_dev(struct dmar_domain *domain,
5230 struct device_domain_info *info;
5231 struct intel_iommu *iommu;
5232 unsigned long flags;
5234 if (!is_aux_domain(dev, &domain->domain))
5237 spin_lock_irqsave(&device_domain_lock, flags);
5238 info = get_domain_info(dev);
5239 iommu = info->iommu;
5241 auxiliary_unlink_device(domain, dev);
5243 spin_lock(&iommu->lock);
5244 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5245 domain_detach_iommu(domain, iommu);
5246 spin_unlock(&iommu->lock);
5248 spin_unlock_irqrestore(&device_domain_lock, flags);
5251 static int prepare_domain_attach_device(struct iommu_domain *domain,
5254 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5255 struct intel_iommu *iommu;
5258 iommu = device_to_iommu(dev, NULL, NULL);
5262 /* check if this iommu agaw is sufficient for max mapped address */
5263 addr_width = agaw_to_width(iommu->agaw);
5264 if (addr_width > cap_mgaw(iommu->cap))
5265 addr_width = cap_mgaw(iommu->cap);
5267 if (dmar_domain->max_addr > (1LL << addr_width)) {
5268 dev_err(dev, "%s: iommu width (%d) is not "
5269 "sufficient for the mapped address (%llx)\n",
5270 __func__, addr_width, dmar_domain->max_addr);
5273 dmar_domain->gaw = addr_width;
5276 * Knock out extra levels of page tables if necessary
5278 while (iommu->agaw < dmar_domain->agaw) {
5279 struct dma_pte *pte;
5281 pte = dmar_domain->pgd;
5282 if (dma_pte_present(pte)) {
5283 dmar_domain->pgd = (struct dma_pte *)
5284 phys_to_virt(dma_pte_addr(pte));
5285 free_pgtable_page(pte);
5287 dmar_domain->agaw--;
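/*
 * Illustrative example (added): a domain created with
 * DEFAULT_DOMAIN_ADDRESS_WIDTH (57 bits, agaw 3, five levels) that is attached
 * to an IOMMU whose agaw is 2 (48-bit, four levels) goes through this loop
 * once: the first PTE of the old top-level table is followed down, the old
 * top table is freed, and the table it pointed to becomes the new pgd.
 */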
5293 static int intel_iommu_attach_device(struct iommu_domain *domain,
5298 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5299 device_is_rmrr_locked(dev)) {
5300 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5304 if (is_aux_domain(dev, domain))
5307 /* normally dev is not mapped */
5308 if (unlikely(domain_context_mapped(dev))) {
5309 struct dmar_domain *old_domain;
5311 old_domain = find_domain(dev);
5313 dmar_remove_one_dev_info(dev);
5316 ret = prepare_domain_attach_device(domain, dev);
5320 return domain_add_dev_info(to_dmar_domain(domain), dev);
5323 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5328 if (!is_aux_domain(dev, domain))
5331 ret = prepare_domain_attach_device(domain, dev);
5335 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5338 static void intel_iommu_detach_device(struct iommu_domain *domain,
5341 dmar_remove_one_dev_info(dev);
5344 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5347 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5351 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5352 * VT-d granularity. Invalidation is typically included in the unmap operation
5353 * as a result of a DMA or VFIO unmap. However, for assigned devices the guest
5354 * owns the first-level page tables. Invalidations of translation caches in the
5355 * guest are trapped and passed down to the host.
5357 * vIOMMU in the guest will only expose first level page tables, therefore
5358 * we do not support IOTLB granularity for requests without PASID (second level).
5360 * For example, to find the VT-d granularity encoding for IOTLB
5361 * type and page selective granularity within PASID:
5362 * X: indexed by iommu cache type
5363 * Y: indexed by enum iommu_inv_granularity
5364 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5368 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5370 * PASID based IOTLB invalidation: PASID selective (per PASID),
5371 * page selective (address granularity)
5373 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5374 /* PASID based dev TLBs */
5375 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5377 {-EINVAL, -EINVAL, -EINVAL}
5380 static inline int to_vtd_granularity(int type, int granu)
5382 return inv_type_granu_table[type][granu];
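/*
 * Example lookups (added for clarity, derived from the table above):
 * to_vtd_granularity(IOMMU_CACHE_INV_TYPE_IOTLB, IOMMU_INV_GRANU_ADDR)
 * returns QI_GRAN_PSI_PASID (page selective within PASID), while
 * to_vtd_granularity(IOMMU_CACHE_INV_TYPE_DEV_IOTLB, IOMMU_INV_GRANU_PASID)
 * returns -EINVAL, since PASID-selective granularity is not supported for
 * device TLB invalidation.
 */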
5385 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5387 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5389 /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
5390 * IOMMU cache invalidate API passes granu_size in bytes, and the number of
5391 * granules of contiguous memory.
5393 return order_base_2(nr_pages);
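/*
 * Worked example (added): for granu_size == 4096 and nr_granules == 512,
 * nr_pages == 512 and order_base_2(512) == 9, which is the VT-d encoding
 * for a 2MB range, matching the comment above.
 */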
5396 #ifdef CONFIG_INTEL_IOMMU_SVM
5398 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5399 struct iommu_cache_invalidate_info *inv_info)
5401 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5402 struct device_domain_info *info;
5403 struct intel_iommu *iommu;
5404 unsigned long flags;
5411 if (!inv_info || !dmar_domain ||
5412 inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
5415 if (!dev || !dev_is_pci(dev))
5418 iommu = device_to_iommu(dev, &bus, &devfn);
5422 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5425 spin_lock_irqsave(&device_domain_lock, flags);
5426 spin_lock(&iommu->lock);
5427 info = get_domain_info(dev);
5432 did = dmar_domain->iommu_did[iommu->seq_id];
5433 sid = PCI_DEVID(bus, devfn);
5435 /* Size is only valid in address selective invalidation */
5436 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5437 size = to_vtd_size(inv_info->addr_info.granule_size,
5438 inv_info->addr_info.nb_granules);
5440 for_each_set_bit(cache_type,
5441 (unsigned long *)&inv_info->cache,
5442 IOMMU_CACHE_INV_TYPE_NR) {
5447 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5448 if (granu == -EINVAL) {
5449 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5450 cache_type, inv_info->granularity);
5455 * PASID is stored in different locations based on the granularity.
5458 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5459 (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5460 pasid = inv_info->pasid_info.pasid;
5461 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5462 (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5463 pasid = inv_info->addr_info.pasid;
5465 switch (BIT(cache_type)) {
5466 case IOMMU_CACHE_INV_TYPE_IOTLB:
5467 /* HW will ignore LSB bits based on address mask */
5468 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5470 (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5471 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5472 inv_info->addr_info.addr, size);
5476 * If granu is PASID-selective, address is ignored.
5477 * We use npages = -1 to indicate that.
5479 qi_flush_piotlb(iommu, did, pasid,
5480 mm_to_dma_pfn(inv_info->addr_info.addr),
5481 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5482 inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5484 if (!info->ats_enabled)
5487 * Always flush device IOTLB if ATS is enabled. vIOMMU
5488 * in the guest may assume IOTLB flush is inclusive,
5489 * which is more efficient.
5492 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5494 * PASID based device TLB invalidation does not support
5495 * IOMMU_INV_GRANU_PASID granularity but only supports
5496 * IOMMU_INV_GRANU_ADDR.
5497 * As an equivalent, we set the size to cover the entire 64-bit
5498 * address range; the user provides only PASID info without address
5499 * info, so we set addr to 0.
5501 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5502 size = 64 - VTD_PAGE_SHIFT;
5504 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5505 addr = inv_info->addr_info.addr;
5508 if (info->ats_enabled)
5509 qi_flush_dev_iotlb_pasid(iommu, sid,
5511 info->ats_qdep, addr,
5514 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5517 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5523 spin_unlock(&iommu->lock);
5524 spin_unlock_irqrestore(&device_domain_lock, flags);
5530 static int intel_iommu_map(struct iommu_domain *domain,
5531 unsigned long iova, phys_addr_t hpa,
5532 size_t size, int iommu_prot, gfp_t gfp)
5534 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5539 if (iommu_prot & IOMMU_READ)
5540 prot |= DMA_PTE_READ;
5541 if (iommu_prot & IOMMU_WRITE)
5542 prot |= DMA_PTE_WRITE;
5543 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5544 prot |= DMA_PTE_SNP;
5546 max_addr = iova + size;
5547 if (dmar_domain->max_addr < max_addr) {
5550 /* check if minimum agaw is sufficient for mapped address */
5551 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5552 if (end < max_addr) {
5553 pr_err("%s: iommu width (%d) is not "
5554 "sufficient for the mapped address (%llx)\n",
5555 __func__, dmar_domain->gaw, max_addr);
5558 dmar_domain->max_addr = max_addr;
5560 /* Round the size up to the next multiple of PAGE_SIZE if it, together with
5561 the low bits of hpa, would take us onto the next page */
5562 size = aligned_nrpages(hpa, size);
5563 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5564 hpa >> VTD_PAGE_SHIFT, size, prot);
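/*
 * Illustrative example (added): with hpa == 0x1ff0 and size == 0x20 the
 * mapping crosses a 4K boundary, so aligned_nrpages(hpa, size) returns
 * 2 pages rather than 1, because the low bits of hpa are accounted for
 * before rounding up.
 */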
5568 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5569 unsigned long iova, size_t size,
5570 struct iommu_iotlb_gather *gather)
5572 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5573 struct page *freelist = NULL;
5574 unsigned long start_pfn, last_pfn;
5575 unsigned int npages;
5576 int iommu_id, level = 0;
5578 /* Cope with horrid API which requires us to unmap more than the
5579 size argument if it happens to be a large-page mapping. */
5580 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5582 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5583 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5585 start_pfn = iova >> VTD_PAGE_SHIFT;
5586 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5588 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5590 npages = last_pfn - start_pfn + 1;
5592 for_each_domain_iommu(iommu_id, dmar_domain)
5593 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5594 start_pfn, npages, !freelist, 0);
5596 dma_free_pagelist(freelist);
5598 if (dmar_domain->max_addr == iova + size)
5599 dmar_domain->max_addr = iova;
5604 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5607 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5608 struct dma_pte *pte;
5612 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5613 if (pte && dma_pte_present(pte))
5614 phys = dma_pte_addr(pte) +
5615 (iova & (BIT_MASK(level_to_offset_bits(level) +
5616 VTD_PAGE_SHIFT) - 1));
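/*
 * Illustrative example (added; assumes level_to_offset_bits(level) ==
 * (level - 1) * 9 as elsewhere in this file): if the walk stops at a present
 * 2MB superpage PTE (level == 2), the mask above is BIT(9 + VTD_PAGE_SHIFT) - 1
 * == 0x1fffff, so phys is the superpage base plus the low 21 bits of the IOVA.
 */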
5621 static inline bool scalable_mode_support(void)
5623 struct dmar_drhd_unit *drhd;
5624 struct intel_iommu *iommu;
5628 for_each_active_iommu(iommu, drhd) {
5629 if (!sm_supported(iommu)) {
5639 static inline bool iommu_pasid_support(void)
5641 struct dmar_drhd_unit *drhd;
5642 struct intel_iommu *iommu;
5646 for_each_active_iommu(iommu, drhd) {
5647 if (!pasid_supported(iommu)) {
5657 static inline bool nested_mode_support(void)
5659 struct dmar_drhd_unit *drhd;
5660 struct intel_iommu *iommu;
5664 for_each_active_iommu(iommu, drhd) {
5665 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5675 static bool intel_iommu_capable(enum iommu_cap cap)
5677 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5678 return domain_update_iommu_snooping(NULL) == 1;
5679 if (cap == IOMMU_CAP_INTR_REMAP)
5680 return irq_remapping_enabled == 1;
5685 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5687 struct intel_iommu *iommu;
5689 iommu = device_to_iommu(dev, NULL, NULL);
5691 return ERR_PTR(-ENODEV);
5693 if (translation_pre_enabled(iommu))
5694 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5696 return &iommu->iommu;
5699 static void intel_iommu_release_device(struct device *dev)
5701 struct intel_iommu *iommu;
5703 iommu = device_to_iommu(dev, NULL, NULL);
5707 dmar_remove_one_dev_info(dev);
5709 set_dma_ops(dev, NULL);
5712 static void intel_iommu_probe_finalize(struct device *dev)
5714 struct iommu_domain *domain;
5716 domain = iommu_get_domain_for_dev(dev);
5717 if (device_needs_bounce(dev))
5718 set_dma_ops(dev, &bounce_dma_ops);
5719 else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5720 set_dma_ops(dev, &intel_dma_ops);
5722 set_dma_ops(dev, NULL);
5725 static void intel_iommu_get_resv_regions(struct device *device,
5726 struct list_head *head)
5728 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5729 struct iommu_resv_region *reg;
5730 struct dmar_rmrr_unit *rmrr;
5731 struct device *i_dev;
5734 down_read(&dmar_global_lock);
5735 for_each_rmrr_units(rmrr) {
5736 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5738 struct iommu_resv_region *resv;
5739 enum iommu_resv_type type;
5742 if (i_dev != device &&
5743 !is_downstream_to_pci_bridge(device, i_dev))
5746 length = rmrr->end_address - rmrr->base_address + 1;
5748 type = device_rmrr_is_relaxable(device) ?
5749 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5751 resv = iommu_alloc_resv_region(rmrr->base_address,
5752 length, prot, type);
5756 list_add_tail(&resv->list, head);
5759 up_read(&dmar_global_lock);
5761 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5762 if (dev_is_pci(device)) {
5763 struct pci_dev *pdev = to_pci_dev(device);
5765 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5766 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5767 IOMMU_RESV_DIRECT_RELAXABLE);
4769 list_add_tail(&reg->list, head);
5772 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5774 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5775 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4779 list_add_tail(&reg->list, head);
5782 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5784 struct device_domain_info *info;
5785 struct context_entry *context;
5786 struct dmar_domain *domain;
5787 unsigned long flags;
5791 domain = find_domain(dev);
5795 spin_lock_irqsave(&device_domain_lock, flags);
5796 spin_lock(&iommu->lock);
5799 info = get_domain_info(dev);
5800 if (!info || !info->pasid_supported)
5803 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5804 if (WARN_ON(!context))
5807 ctx_lo = context[0].lo;
5809 if (!(ctx_lo & CONTEXT_PASIDE)) {
5810 ctx_lo |= CONTEXT_PASIDE;
5811 context[0].lo = ctx_lo;
5813 iommu->flush.flush_context(iommu,
5814 domain->iommu_did[iommu->seq_id],
5815 PCI_DEVID(info->bus, info->devfn),
5816 DMA_CCMD_MASK_NOBIT,
5817 DMA_CCMD_DEVICE_INVL);
5820 /* Enable PASID support in the device, if it wasn't already */
5821 if (!info->pasid_enabled)
5822 iommu_enable_dev_iotlb(info);
5827 spin_unlock(&iommu->lock);
5828 spin_unlock_irqrestore(&device_domain_lock, flags);
5833 static void intel_iommu_apply_resv_region(struct device *dev,
5834 struct iommu_domain *domain,
5835 struct iommu_resv_region *region)
5837 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5838 unsigned long start, end;
5840 start = IOVA_PFN(region->start);
5841 end = IOVA_PFN(region->start + region->length - 1);
5843 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5846 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5848 if (dev_is_pci(dev))
5849 return pci_device_group(dev);
5850 return generic_device_group(dev);
5853 static int intel_iommu_enable_auxd(struct device *dev)
5855 struct device_domain_info *info;
5856 struct intel_iommu *iommu;
5857 unsigned long flags;
5860 iommu = device_to_iommu(dev, NULL, NULL);
5861 if (!iommu || dmar_disabled)
5864 if (!sm_supported(iommu) || !pasid_supported(iommu))
5867 ret = intel_iommu_enable_pasid(iommu, dev);
5871 spin_lock_irqsave(&device_domain_lock, flags);
5872 info = get_domain_info(dev);
5873 info->auxd_enabled = 1;
5874 spin_unlock_irqrestore(&device_domain_lock, flags);
5879 static int intel_iommu_disable_auxd(struct device *dev)
5881 struct device_domain_info *info;
5882 unsigned long flags;
5884 spin_lock_irqsave(&device_domain_lock, flags);
5885 info = get_domain_info(dev);
5886 if (!WARN_ON(!info))
5887 info->auxd_enabled = 0;
5888 spin_unlock_irqrestore(&device_domain_lock, flags);
5894 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5895 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5896 * spec so that system software and tools can detect endpoint devices that
5897 * support Intel Scalable I/O Virtualization without a host driver dependency.
5899 * Returns the address of the matching extended capability structure within
5900 * the device's PCI configuration space, or 0 if the device does not support it.
5903 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5908 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5910 pci_read_config_word(pdev, pos + 4, &vendor);
5911 pci_read_config_word(pdev, pos + 8, &id);
5912 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5915 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
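/*
 * Usage sketch (added; offsets follow the PCIe DVSEC layout): capability ID
 * 0x23 is the Designated Vendor-Specific Extended Capability, the word at
 * pos + 4 carries the DVSEC vendor ID and the word at pos + 8 the DVSEC ID,
 * and Intel Scalable IOV uses vendor PCI_VENDOR_ID_INTEL with DVSEC ID 5.
 * Callers such as intel_iommu_dev_has_feat() below only test the result:
 *
 *	if (siov_find_pci_dvsec(to_pci_dev(dev)))
 *		... the device advertises Scalable IOV ...
 */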
5922 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5924 if (feat == IOMMU_DEV_FEAT_AUX) {
5927 if (!dev_is_pci(dev) || dmar_disabled ||
5928 !scalable_mode_support() || !iommu_pasid_support())
5931 ret = pci_pasid_features(to_pci_dev(dev));
5935 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5938 if (feat == IOMMU_DEV_FEAT_SVA) {
5939 struct device_domain_info *info = get_domain_info(dev);
5941 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5942 info->pasid_supported && info->pri_supported &&
5943 info->ats_supported;
5950 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5952 if (feat == IOMMU_DEV_FEAT_AUX)
5953 return intel_iommu_enable_auxd(dev);
5955 if (feat == IOMMU_DEV_FEAT_SVA) {
5956 struct device_domain_info *info = get_domain_info(dev);
5961 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5969 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5971 if (feat == IOMMU_DEV_FEAT_AUX)
5972 return intel_iommu_disable_auxd(dev);
5978 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5980 struct device_domain_info *info = get_domain_info(dev);
5982 if (feat == IOMMU_DEV_FEAT_AUX)
5983 return scalable_mode_support() && info && info->auxd_enabled;
5989 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5991 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5993 return dmar_domain->default_pasid > 0 ?
5994 dmar_domain->default_pasid : -EINVAL;
5997 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6000 return attach_deferred(dev);
6004 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6005 enum iommu_attr attr, void *data)
6007 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6008 unsigned long flags;
6011 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6015 case DOMAIN_ATTR_NESTING:
6016 spin_lock_irqsave(&device_domain_lock, flags);
6017 if (nested_mode_support() &&
6018 list_empty(&dmar_domain->devices)) {
6019 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6020 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6024 spin_unlock_irqrestore(&device_domain_lock, flags);
6035 * Check that the device does not live on an external facing PCI port that is
6036 * marked as untrusted. Such devices should not be able to apply quirks and
6037 * thus not be able to bypass the IOMMU restrictions.
6039 static bool risky_device(struct pci_dev *pdev)
6041 if (pdev->untrusted) {
6043 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6044 pdev->vendor, pdev->device);
6045 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6051 const struct iommu_ops intel_iommu_ops = {
6052 .capable = intel_iommu_capable,
6053 .domain_alloc = intel_iommu_domain_alloc,
6054 .domain_free = intel_iommu_domain_free,
6055 .domain_set_attr = intel_iommu_domain_set_attr,
6056 .attach_dev = intel_iommu_attach_device,
6057 .detach_dev = intel_iommu_detach_device,
6058 .aux_attach_dev = intel_iommu_aux_attach_device,
6059 .aux_detach_dev = intel_iommu_aux_detach_device,
6060 .aux_get_pasid = intel_iommu_aux_get_pasid,
6061 .map = intel_iommu_map,
6062 .unmap = intel_iommu_unmap,
6063 .iova_to_phys = intel_iommu_iova_to_phys,
6064 .probe_device = intel_iommu_probe_device,
6065 .probe_finalize = intel_iommu_probe_finalize,
6066 .release_device = intel_iommu_release_device,
6067 .get_resv_regions = intel_iommu_get_resv_regions,
6068 .put_resv_regions = generic_iommu_put_resv_regions,
6069 .apply_resv_region = intel_iommu_apply_resv_region,
6070 .device_group = intel_iommu_device_group,
6071 .dev_has_feat = intel_iommu_dev_has_feat,
6072 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6073 .dev_enable_feat = intel_iommu_dev_enable_feat,
6074 .dev_disable_feat = intel_iommu_dev_disable_feat,
6075 .is_attach_deferred = intel_iommu_is_attach_deferred,
6076 .def_domain_type = device_def_domain_type,
6077 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6078 #ifdef CONFIG_INTEL_IOMMU_SVM
6079 .cache_invalidate = intel_iommu_sva_invalidate,
6080 .sva_bind_gpasid = intel_svm_bind_gpasid,
6081 .sva_unbind_gpasid = intel_svm_unbind_gpasid,
6082 .sva_bind = intel_svm_bind,
6083 .sva_unbind = intel_svm_unbind,
6084 .sva_get_pasid = intel_svm_get_pasid,
6085 .page_response = intel_svm_page_response,
6089 static void quirk_iommu_igfx(struct pci_dev *dev)
6091 if (risky_device(dev))
6094 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6098 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6099 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6100 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6101 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6102 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6103 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6104 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6105 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6107 /* Broadwell igfx malfunctions with dmar */
6108 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6109 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6110 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6111 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6112 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6113 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6114 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6115 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6116 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6117 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6118 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6119 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6120 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6121 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6122 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6123 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6124 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6125 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6126 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6127 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6128 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6129 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6130 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6131 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6133 static void quirk_iommu_rwbf(struct pci_dev *dev)
6135 if (risky_device(dev))
6139 * Mobile 4 Series Chipset neglects to set RWBF capability,
6140 * but needs it. Same seems to hold for the desktop versions.
6142 pci_info(dev, "Forcing write-buffer flush capability\n");
6146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6150 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6151 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6152 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6155 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6156 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6157 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6158 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6159 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6160 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6161 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6162 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
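/*
 * Illustrative note (added): GGC_MEMORY_VT_ENABLED is bit 11, and the *_VT
 * size encodings above (0x9, 0xa, 0xb << 8) already include that bit, while
 * the plain size encodings (0x1, 0x3 << 8) do not; the quirk below therefore
 * only needs to test this one bit to know whether the BIOS reserved a shadow
 * GTT for VT-d graphics translation.
 */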
6164 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6168 if (risky_device(dev))
6171 if (pci_read_config_word(dev, GGC, &ggc))
6174 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6175 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6177 } else if (dmar_map_gfx) {
6178 /* we have to ensure the gfx device is idle before we flush */
6179 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6180 intel_iommu_strict = 1;
6183 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6184 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6185 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6186 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6188 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6192 if (!IS_GFX_DEVICE(dev))
6195 ver = (dev->device >> 8) & 0xff;
6196 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6197 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6201 if (risky_device(dev))
6204 pci_info(dev, "Skip IOMMU disabling for graphics\n");
6205 iommu_skip_te_disable = 1;
6207 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6209 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6210 ISOCH DMAR unit for the Azalia sound device, but not give it any
6211 TLB entries, which causes it to deadlock. Check for that. We do
6212 this in a function called from init_dmars(), instead of in a PCI
6213 quirk, because we don't want to print the obnoxious "BIOS broken"
6214 message if VT-d is actually disabled.
6216 static void __init check_tylersburg_isoch(void)
6218 struct pci_dev *pdev;
6219 uint32_t vtisochctrl;
6221 /* If there's no Azalia in the system anyway, forget it. */
6222 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6226 if (risky_device(pdev)) {
6233 /* System Management Registers. Might be hidden, in which case
6234 we can't do the sanity check. But that's OK, because the
6235 known-broken BIOSes _don't_ actually hide it, so far. */
6236 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6240 if (risky_device(pdev)) {
6245 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6252 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6253 if (vtisochctrl & 1)
6256 /* Drop all bits other than the number of TLB entries */
6257 vtisochctrl &= 0x1c;
6259 /* If we have the recommended number of TLB entries (16), fine. */
6260 if (vtisochctrl == 0x10)
6263 /* Zero TLB entries? You get to ride the short bus to school. */
6265 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6266 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6267 dmi_get_system_info(DMI_BIOS_VENDOR),
6268 dmi_get_system_info(DMI_BIOS_VERSION),
6269 dmi_get_system_info(DMI_PRODUCT_VERSION));
6270 iommu_identity_mapping |= IDENTMAP_AZALIA;
6274 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",