// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */
#define pr_fmt(fmt)	"DMAR: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)
#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-contiguous.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <linux/numa.h>
#include <linux/swiotlb.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>
#include <trace/events/intel_iommu.h>

#include "irq_remapping.h"
#include "intel-pasid.h"
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size was a power-of-two multiple of 4KiB and
 * that the mapping had natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are a power-of-two multiple of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
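
/*
 * Illustrative worked example (not part of the driver logic): ~0xFFFUL
 * has every bit above bit 11 set, so the advertised set is 4KiB, 8KiB,
 * 16KiB, ... i.e. every power-of-two size >= 4KiB. Bit 12 (0x1000)
 * advertises 4KiB pages and bit 21 (0x200000) advertises 2MiB pages.
 */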
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}
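
/*
 * Illustrative worked example (not part of the driver logic): a 4-level
 * page table corresponds to agaw 2, since agaw_to_level(2) == 4 and
 * agaw_to_width(2) == 30 + 2 * 9 == 48 bits. A 5-level table is agaw 3:
 * agaw_to_width(3) == 57. Going the other way, width_to_agaw(48) == 2.
 * For pfn_level_offset(), level 1 selects pfn bits 0-8, level 2 selects
 * bits 9-17, and so on, nine bits per level.
 */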
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
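
/*
 * Illustrative note (a hypothetical example, not a configuration this
 * driver runs on): with PAGE_SHIFT == VTD_PAGE_SHIFT == 12 these
 * conversions are identity operations. If MM pages were 64KiB
 * (PAGE_SHIFT == 16), one MM pfn would span 16 VT-d pfns, e.g.
 * mm_to_dma_pfn(3) == 3 << 4 == 48.
 */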
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic the kernel if VT-d cannot be enabled
 * successfully (used when the kernel is launched with TXT)
 */
static int force_on = 0;
int intel_iommu_tboot_noforce;
static int no_platform_optin;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}
static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return (c->hi >> 8) & 0xffff;
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
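
/*
 * Illustrative worked example (not part of the driver logic): the
 * helpers above encode the fields of a legacy context entry, e.g. bit 0
 * of the low qword is Present, bits 3:2 are the translation type, and
 * bits 23:8 of the high qword hold the domain id. So after
 * context_set_domain_id(c, 0x2a), context_domain_id(c) returns 0x2a by
 * extracting (c->hi >> 8) & 0xffff.
 */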
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if (domain->iommu_refcnt[idx])
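
/*
 * Usage sketch (illustrative, not part of the driver logic): the macro
 * expands to a guarded loop, so a body follows it like any for
 * statement, e.g.:
 *
 *	int i;
 *	for_each_domain_iommu(i, domain)
 *		pr_debug("domain attached to iommu %d\n", i);
 *
 * Only sequence ids with a non-zero refcount in this domain are visited.
 */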
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
/* number of IOMMUs; sizes and indexes the g_iommus array */
static int g_num_of_iommus;

static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev);
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova);

#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
int intel_iommu_sm = 1;
#else
int intel_iommu_sm;
#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);
static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int intel_no_bounce;

#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
struct device_domain_info *get_domain_info(struct device *dev)
{
	struct device_domain_info *info;

	if (!dev)
		return NULL;

	info = dev->archdata.iommu;
	if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO ||
		     info == DEFER_DEVICE_DOMAIN_INFO))
		return NULL;

	return info;
}
DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&	\
				to_pci_dev(d)->untrusted)

/*
 * Iterate over elements in device_domain_list and call the specified
 * callback @fn against each element.
 */
int for_each_device_domain(int (*fn)(struct device_domain_info *info,
				     void *data), void *data)
{
	int ret = 0;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &device_domain_list, global) {
		ret = fn(info, data);
		if (ret) {
			spin_unlock_irqrestore(&device_domain_lock, flags);
			return ret;
		}
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
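
/*
 * Usage sketch (illustrative; the callback name is hypothetical): a
 * caller supplies a visitor that returns non-zero to stop the walk
 * early, e.g.:
 *
 *	static int match_seq_id(struct device_domain_info *info, void *data)
 *	{
 *		return info->iommu->seq_id == *(int *)data;
 *	}
 *
 *	int seq_id = 0;
 *	int found = for_each_device_domain(match_seq_id, &seq_id);
 *
 * device_domain_lock is held across every callback invocation, so @fn
 * must not sleep or take that lock itself.
 */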
const struct iommu_ops intel_iommu_ops;

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_info("Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			pr_info("Disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Intel-IOMMU: scalable mode supported\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		} else if (!strncmp(str, "nobounce", 8)) {
			pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
			intel_no_bounce = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
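
/*
 * Usage sketch (illustrative, not part of the driver logic): options
 * are comma separated on the kernel command line and parsed in one
 * pass, e.g.
 *
 *	intel_iommu=on,sm_on,strict
 *
 * enables the IOMMU, turns on scalable mode and disables batched IOTLB
 * flushing.
 */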
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	domains = iommu->domains[idx];
	if (!domains)
		return NULL;

	return domains[did & 0xff];
}

static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
			     struct dmar_domain *domain)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	if (!iommu->domains[idx]) {
		size_t size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
	}

	domains = iommu->domains[idx];
	if (WARN_ON(!domains))
		return;
	else
		domains[did & 0xff] = domain;
}
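
/*
 * Illustrative worked example (not part of the driver logic): the
 * domain array is a two-level table of 256-entry pages, indexed by the
 * 16-bit domain id. For did 0x1234, idx = 0x1234 >> 8 = 0x12 selects
 * the page and 0x1234 & 0xff = 0x34 selects the slot within it, so
 * pages are only allocated for domain-id ranges actually in use.
 */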
void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}
static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline bool domain_use_first_level(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
				       unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}

static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * Calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, so use a default agaw and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
/* This function only returns a single iommu in a domain */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
		return NULL;

	for_each_domain_iommu(iommu_id, domain)
		break;

	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}
static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool found = false;
	int i;

	domain->iommu_coherency = 1;

	for_each_domain_iommu(i, domain) {
		found = true;
		if (!ecap_coherent(g_iommus[i]->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!ecap_coherent(iommu->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	rcu_read_unlock();
}

static int domain_update_iommu_snooping(struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret = 1;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (!ecap_sc_support(iommu->ecap)) {
				ret = 0;
				break;
			}
		}
	}
	rcu_read_unlock();

	return ret;
}

static int domain_update_iommu_superpage(struct dmar_domain *domain,
					 struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int mask = 0x3;

	if (!intel_iommu_superpage) {
		return 0;
	}

	/* set iommu_superpage to the smallest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (domain && domain_use_first_level(domain)) {
				if (!cap_fl1gp_support(iommu->cap))
					mask = 0x1;
			} else {
				mask &= cap_super_page_val(iommu->cap);
			}

			if (!mask)
				break;
		}
	}
	rcu_read_unlock();

	return fls(mask);
}

/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
}
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;
	}
	if (*entry & 1)
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;

		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node);
		if (!context)
			return NULL;

		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}
static int iommu_dummy(struct device *dev)
{
	return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}

static bool attach_deferred(struct device *dev)
{
	return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
}
/**
 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 *				 sub-hierarchy of a candidate PCI-PCI bridge
 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 * @bridge: the candidate PCI-PCI bridge
 *
 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 */
static bool
is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
{
	struct pci_dev *pdev, *pbridge;

	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
		return false;

	pdev = to_pci_dev(dev);
	pbridge = to_pci_dev(bridge);

	if (pbridge->subordinate &&
	    pbridge->subordinate->number <= pdev->bus->number &&
	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
		return true;

	return false;
}
static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	struct pci_dev *pdev = NULL;
	u16 segment = 0;
	int i;

	if (iommu_dummy(dev))
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				*bus = drhd->devices[i].bus;
				*devfn = drhd->devices[i].devfn;
				goto out;
			}

			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
		got_pdev:
			*bus = pdev->bus->number;
			*devfn = pdev->devfn;
			goto out;
		}
	}
	iommu = NULL;
 out:
	rcu_read_unlock();

	return iommu;
}
static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}

static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct context_entry *context;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (context)
		ret = context_present(context);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}
static void free_context_table(struct intel_iommu *iommu)
{
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry) {
		goto out;
	}
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!sm_supported(iommu))
			continue;

		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);
	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain_use_first_level(domain))
				pteval |= DMA_FL_PTE_XD;
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}
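
/*
 * Illustrative worked example (not part of the driver logic): with a
 * 4-level table (agaw 2) and *target_level == 1, looking up dma pfn
 * 0x12345 walks levels 4..1 using pfn_level_offset(): offsets 0, 0,
 * 0x91 (0x12345 >> 9 & 0x1ff) and 0x145 (0x12345 & 0x1ff), allocating
 * any missing intermediate tables on the way down.
 */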
/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (dma_pte_superpage(pte)) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}
/* clear last level pte; must be followed by a TLB flush */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
					    int level, struct dma_pte *pte,
					    struct page *freelist)
{
	struct page *pg;

	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	pg->freelist = freelist;
	freelist = pg;

	if (level == 1)
		return freelist;

	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			freelist = dma_pte_list_pagetables(domain, level - 1,
							   pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));

	return freelist;
}
static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
					struct dma_pte *pte, unsigned long pfn,
					unsigned long start_pfn,
					unsigned long last_pfn,
					struct page *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;

		if (!dma_pte_present(pte))
			goto next;

		level_pfn = pfn & level_mask(level);

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			freelist = dma_pte_clear_level(domain, level - 1,
						       phys_to_virt(dma_pte_addr(pte)),
						       level_pfn, start_pfn, last_pfn,
						       freelist);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);

	return freelist;
}
/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static struct page *domain_unmap(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long last_pfn)
{
	struct page *freelist;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
				       domain->pgd, 0, start_pfn, last_pfn, NULL);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		pgd_page->freelist = freelist;
		freelist = pgd_page;

		domain->pgd = NULL;
	}

	return freelist;
}
static void dma_free_pagelist(struct page *freelist)
{
	struct page *pg;

	while ((pg = freelist)) {
		freelist = pg->freelist;
		free_pgtable_page(page_address(pg));
	}
}

static void iova_entry_free(unsigned long data)
{
	struct page *freelist = (struct page *)data;

	dma_free_pagelist(freelist);
}
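
/*
 * Design note (illustrative, not part of the driver logic): the
 * page->freelist field is reused here to chain page-table pages into a
 * singly linked list, so no allocation is needed on the unmap path.
 * domain_unmap() builds the chain, the caller flushes the IOTLB, and
 * only then does dma_free_pagelist() walk the chain and return the
 * pages to the allocator.
 */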
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
	if (!root) {
		pr_err("Allocating root entry for %s failed\n",
			iommu->name);
		return -ENOMEM;
	}

	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* Invalidate the context cache at the requested granularity. */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		      dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* Invalidate the IOTLB at the requested granularity. */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably meant to be extra safe. It looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		      dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
			u8 bus, u8 devfn)
{
	struct device_domain_info *info;

	assert_spin_locked(&device_domain_lock);

	if (!iommu->qi)
		return NULL;

	list_for_each_entry(info, &domain->devices, link)
		if (info->iommu == iommu && info->bus == bus &&
		    info->devfn == devfn) {
			if (info->ats_supported && info->dev)
				return info;
			break;
		}

	return NULL;
}

static void domain_update_iotlb(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	bool has_iotlb_device = false;

	assert_spin_locked(&device_domain_lock);

	list_for_each_entry(info, &domain->devices, link) {
		struct pci_dev *pdev;

		if (!info->dev || !dev_is_pci(info->dev))
			continue;

		pdev = to_pci_dev(info->dev);
		if (pdev->ats_enabled) {
			has_iotlb_device = true;
			break;
		}
	}

	domain->has_iotlb_device = has_iotlb_device;
}
static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!info || !dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);
	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
	 * reserved, which should be set to 0.
	 */
	if (!ecap_dit(info->iommu->ecap))
		info->pfsid = 0;
	else {
		struct pci_dev *pf_pdev;

		/* pdev will be returned if device is not a vf */
		pf_pdev = pci_physfn(pdev);
		info->pfsid = pci_dev_id(pf_pdev);
	}

#ifdef CONFIG_INTEL_IOMMU_SVM
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to
	   use it. */
	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
		info->pasid_enabled = 1;

	if (info->pri_supported &&
	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
		info->pri_enabled = 1;
#endif
	if (!pdev->untrusted && info->ats_supported &&
	    pci_ats_page_aligned(pdev) &&
	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
		info->ats_enabled = 1;
		domain_update_iotlb(info->domain);
		info->ats_qdep = pci_ats_queue_depth(pdev);
	}
}
static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);

	if (info->ats_enabled) {
		pci_disable_ats(pdev);
		info->ats_enabled = 0;
		domain_update_iotlb(info->domain);
	}
#ifdef CONFIG_INTEL_IOMMU_SVM
	if (info->pri_enabled) {
		pci_disable_pri(pdev);
		info->pri_enabled = 0;
	}
	if (info->pasid_enabled) {
		pci_disable_pasid(pdev);
		info->pasid_enabled = 0;
	}
#endif
}
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	u16 sid, qdep;
	unsigned long flags;
	struct device_domain_info *info;

	if (!domain->has_iotlb_device)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		if (!info->ats_enabled)
			continue;

		sid = info->bus << 8 | info->devfn;
		qdep = info->ats_qdep;
		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
				qdep, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static void domain_flush_piotlb(struct intel_iommu *iommu,
				struct dmar_domain *domain,
				u64 addr, unsigned long npages, bool ih)
{
	u16 did = domain->iommu_did[iommu->seq_id];

	if (domain->default_pasid)
		qi_flush_piotlb(iommu, did, domain->default_pasid,
				addr, npages, ih);

	if (!list_empty(&domain->devices))
		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
}
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
{
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
	u16 did = domain->iommu_did[iommu->seq_id];

	BUG_ON(pages == 0);

	if (ih)
		ih = 1 << 6;

	if (domain_use_first_level(domain)) {
		domain_flush_piotlb(iommu, domain, addr, pages, ih);
	} else {
		/*
		 * Fallback to domain selective flush if no PSI support or
		 * the size is too big. PSI requires page size to be 2 ^ x,
		 * and the base address is naturally aligned to the size.
		 */
		if (!cap_pgsel_inv(iommu->cap) ||
		    mask > cap_max_amask_val(iommu->cap))
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
							DMA_TLB_DSI_FLUSH);
		else
			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
							DMA_TLB_PSI_FLUSH);
	}

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(domain, addr, mask);
}
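
/*
 * Illustrative worked example (not part of the driver logic): a flush
 * of 5 pages rounds up to 8, so mask = ilog2(8) = 3 and the request
 * covers a naturally aligned 8-page (32KiB) window around addr. If the
 * IOMMU's max address mask value is smaller than 3, the code above
 * falls back to a domain-selective flush instead.
 */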
/* Notification for newly created mappings */
static inline void __mapping_notify_one(struct intel_iommu *iommu,
					struct dmar_domain *domain,
					unsigned long pfn, unsigned int pages)
{
	/*
	 * It's a non-present to present mapping. Only flush if caching mode
	 * and second level.
	 */
	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
	else
		iommu_flush_write_buffer(iommu);
}
static void iommu_flush_iova(struct iova_domain *iovad)
{
	struct dmar_domain *domain;
	int idx;

	domain = container_of(iovad, struct dmar_domain, iovad);

	for_each_domain_iommu(idx, domain) {
		struct intel_iommu *iommu = g_iommus[idx];
		u16 did = domain->iommu_did[iommu->seq_id];

		if (domain_use_first_level(domain))
			domain_flush_piotlb(iommu, domain, 0, -1, 0);
		else
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);

		if (!cap_caching_mode(iommu->cap))
			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
					      0, MAX_AGAW_PFN_WIDTH);
	}
}
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static int iommu_init_domains(struct intel_iommu *iommu)
{
	u32 ndomains, nlongs;
	size_t size;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("%s: Number of Domains supported <%d>\n",
		 iommu->name, ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	spin_lock_init(&iommu->lock);

	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		pr_err("%s: Allocating domain id array failed\n",
		       iommu->name);
		return -ENOMEM;
	}

	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
	iommu->domains = kzalloc(size, GFP_KERNEL);

	if (iommu->domains) {
		size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
	}

	if (!iommu->domains || !iommu->domains[0]) {
		pr_err("%s: Allocating domain array failed\n",
		       iommu->name);
		kfree(iommu->domain_ids);
		kfree(iommu->domains);
		iommu->domain_ids = NULL;
		iommu->domains = NULL;
		return -ENOMEM;
	}

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain-id 0, hence we need to pre-allocate it. We also
	 * use domain-id 0 as a marker for non-allocated domain-id, so
	 * make sure it is not used for a real domain.
	 */
	set_bit(0, iommu->domain_ids);

	/*
	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
	if (sm_supported(iommu))
		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);

	return 0;
}
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	if (!iommu->domains || !iommu->domain_ids)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
		if (info->iommu != iommu)
			continue;

		if (!info->dev || !info->domain)
			continue;

		__dmar_remove_one_dev_info(info);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}
static void free_dmar_iommu(struct intel_iommu *iommu)
{
	if ((iommu->domains) && (iommu->domain_ids)) {
		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
		int i;

		for (i = 0; i < elems; i++)
			kfree(iommu->domains[i]);
		kfree(iommu->domains);
		kfree(iommu->domain_ids);
		iommu->domains = NULL;
		iommu->domain_ids = NULL;
	}

	g_iommus[iommu->seq_id] = NULL;

	/* free context mapping */
	free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu)) {
		if (ecap_prs(iommu->ecap))
			intel_svm_finish_prq(iommu);

		if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
			ioasid_unregister_allocator(&iommu->pasid_allocator);
	}
#endif
}
/*
 * Check and return whether first level is used by default for
 * DMA translation.
 */
static bool first_level_by_default(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	static int first_level_support = -1;

	if (likely(first_level_support != -1))
		return first_level_support;

	first_level_support = 1;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
			first_level_support = 0;
			break;
		}
	}
	rcu_read_unlock();

	return first_level_support;
}
static struct dmar_domain *alloc_domain(int flags)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	memset(domain, 0, sizeof(*domain));
	domain->nid = NUMA_NO_NODE;
	domain->flags = flags;
	if (first_level_by_default())
		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
	domain->has_iotlb_device = false;
	INIT_LIST_HEAD(&domain->devices);

	return domain;
}
/* Must be called with iommu->lock */
static int domain_attach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	unsigned long ndomains;
	int num;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] += 1;
	domain->iommu_count += 1;
	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
		ndomains = cap_ndoms(iommu->cap);
		num = find_first_zero_bit(iommu->domain_ids, ndomains);

		if (num >= ndomains) {
			pr_err("%s: No free domain ids\n", iommu->name);
			domain->iommu_refcnt[iommu->seq_id] -= 1;
			domain->iommu_count -= 1;
			return -ENOSPC;
		}

		set_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, domain);

		domain->iommu_did[iommu->seq_id] = num;
		domain->nid = iommu->node;

		domain_update_iommu_cap(domain);
	}

	return 0;
}

static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num, count;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] -= 1;
	count = --domain->iommu_count;
	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
		num = domain->iommu_did[iommu->seq_id];
		clear_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, NULL);

		domain_update_iommu_cap(domain);
		domain->iommu_did[iommu->seq_id] = 0;
	}

	return count;
}
static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_rbtree_key;

static int dmar_init_reserved_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *iova;
	int i;

	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);

	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
		&reserved_rbtree_key);

	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
		IOVA_PFN(IOAPIC_RANGE_END));
	if (!iova) {
		pr_err("Reserve IOAPIC range failed\n");
		return -ENODEV;
	}

	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		struct resource *r;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
				continue;
			iova = reserve_iova(&reserved_iova_list,
					    IOVA_PFN(r->start),
					    IOVA_PFN(r->end));
			if (!iova) {
				pci_err(pdev, "Reserve iova for %pR failed\n", r);
				return -ENODEV;
			}
		}
	}
	return 0;
}
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}

static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}
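
/*
 * Illustrative worked example (not part of the driver logic): the
 * adjusted width rounds gaw up so that (agaw - 12) is a multiple of 9,
 * matching whole page-table levels. E.g. gaw 48 is already aligned
 * ((48 - 12) % 9 == 0) and stays 48, while gaw 50 becomes
 * 50 + 9 - ((50 - 12) % 9) = 50 + 9 - 2 = 57.
 */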
static void domain_exit(struct dmar_domain *domain)
{
	/* Remove associated devices and clear attached or cached domains */
	domain_remove_dev_info(domain);

	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	if (domain->pgd) {
		struct page *freelist;

		freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
		dma_free_pagelist(freelist);
	}

	free_domain_mem(domain);
}
/*
 * Get the PASID directory size for scalable mode context entry.
 * Value of X in the PDTS field of a scalable mode context entry
 * indicates PASID directory with 2^(X + 7) entries.
 */
static inline unsigned long context_get_sm_pds(struct pasid_table *table)
{
	int pds, max_pde;

	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
	if (pds < 7)
		return 0;

	return pds - 7;
}
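
/*
 * Illustrative worked example (not part of the driver logic), assuming
 * PASID_PDE_SHIFT == 6: with max_pasid = 0x100000 (2^20 PASIDs),
 * max_pde = 0x4000, the first set bit is bit 14, so the function
 * returns 14 - 7 = 7 and the PDTS field encodes a directory of
 * 2^(7 + 7) = 16384 entries.
 */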
/*
 * Set the RID_PASID field of a scalable mode context entry. The
 * IOMMU hardware will use the PASID value set in this field for
 * DMA translations of DMA requests without PASID.
 */
static inline void
context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
{
	context->hi |= pasid & ((1 << 20) - 1);
	context->hi |= (1 << 20);
}

/*
 * Set the DTE(Device-TLB Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_dte(struct context_entry *context)
{
	context->lo |= (1 << 2);
}

/*
 * Set the PRE(Page Request Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_pre(struct context_entry *context)
{
	context->lo |= (1 << 4);
}

/* Convert value to context PASID directory size field coding. */
#define context_pdts(pds)	(((pds) & 0x7) << 9)
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      struct pasid_table *table,
				      u8 bus, u8 devfn)
{
	u16 did = domain->iommu_did[iommu->seq_id];
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct device_domain_info *info = NULL;
	struct context_entry *context;
	unsigned long flags;
	int ret;

	WARN_ON(did == 0);

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -ENOMEM;
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	ret = 0;
	if (context_present(context))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
	if (context_copied(context)) {
		u16 did_old = context_domain_id(context);

		if (did_old < cap_ndoms(iommu->cap)) {
			iommu->flush.flush_context(iommu, did_old,
						   (((u16)bus) << 8) | devfn,
						   DMA_CCMD_MASK_NOBIT,
						   DMA_CCMD_DEVICE_INVL);
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}
	}

	context_clear_entry(context);

	if (sm_supported(iommu)) {
		unsigned long pds;

		WARN_ON(!table);

		/* Setup the PASID DIR pointer: */
		pds = context_get_sm_pds(table);
		context->lo = (u64)virt_to_phys(table->table) |
				context_pdts(pds);

		/* Setup the RID_PASID field: */
		context_set_sm_rid2pasid(context, PASID_RID2PASID);

		/*
		 * Setup the Device-TLB enable bit and Page request
		 * Enable bit:
		 */
		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
		if (info && info->ats_supported)
			context_set_sm_dte(context);
		if (info && info->pri_supported)
			context_set_sm_pre(context);
	} else {
		struct dma_pte *pgd = domain->pgd;
		int agaw;

		context_set_domain_id(context, did);

		if (translation != CONTEXT_TT_PASS_THROUGH) {
			/*
			 * Skip top levels of page tables for iommu which has
			 * less agaw than default. Unnecessary for PT mode.
			 */
			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
				ret = -ENOMEM;
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd))
					goto out_unlock;
			}

			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
			if (info && info->ats_supported)
				translation = CONTEXT_TT_DEV_IOTLB;
			else
				translation = CONTEXT_TT_MULTI_LEVEL;

			context_set_address_root(context, virt_to_phys(pgd));
			context_set_address_width(context, agaw);
		} else {
			/*
			 * In pass through mode, AW must be programmed to
			 * indicate the largest AGAW value supported by
			 * hardware. And ASR is ignored by hardware.
			 */
			context_set_address_width(context, iommu->msagaw);
		}

		context_set_translation_type(context, translation);
	}

	context_set_fault_enable(context);
	context_set_present(context);
	domain_flush_cache(domain, context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries, we only need to flush the write-buffer. If
	 * it _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	iommu_enable_dev_iotlb(info);

	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
struct domain_context_mapping_data {
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct pasid_table *table;
};

static int domain_context_mapping_cb(struct pci_dev *pdev,
				     u16 alias, void *opaque)
{
	struct domain_context_mapping_data *data = opaque;

	return domain_context_mapping_one(data->domain, data->iommu,
					  data->table, PCI_BUS_NUM(alias),
					  alias & 0xff);
}

static int
domain_context_mapping(struct dmar_domain *domain, struct device *dev)
{
	struct domain_context_mapping_data data;
	struct pasid_table *table;
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	table = intel_pasid_get_table(dev);

	if (!dev_is_pci(dev))
		return domain_context_mapping_one(domain, iommu, table,
						  bus, devfn);

	data.domain = domain;
	data.iommu = iommu;
	data.table = table;

	return pci_for_each_dma_alias(to_pci_dev(dev),
				      &domain_context_mapping_cb, &data);
}
static int domain_context_mapped_cb(struct pci_dev *pdev,
				    u16 alias, void *opaque)
{
	struct intel_iommu *iommu = opaque;

	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
}

static int domain_context_mapped(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!dev_is_pci(dev))
		return device_context_mapped(iommu, bus, devfn);

	return !pci_for_each_dma_alias(to_pci_dev(dev),
				       domain_context_mapped_cb, iommu);
}
/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
{
	host_addr &= ~PAGE_MASK;
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
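
/*
 * Illustrative worked example (not part of the driver logic): with
 * 4KiB pages, host_addr = 0x1234 and size = 0x2000 give
 * PAGE_ALIGN(0x234 + 0x2000) = 0x3000, i.e. 3 VT-d pages, because the
 * unaligned start pushes the mapping across an extra page boundary.
 */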
/* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
					  unsigned long iov_pfn,
					  unsigned long phy_pfn,
					  unsigned long pages)
{
	int support, level = 1;
	unsigned long pfnmerge;

	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}
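
/*
 * Illustrative worked example (not part of the driver logic): mapping
 * 0x400 pages (4MiB) with iov_pfn and phy_pfn both 2MiB-aligned (low 9
 * bits zero) and iommu_superpage >= 1 yields level 2, i.e. a 2MiB
 * superpage. If either pfn has a low bit set, the OR in pfnmerge stops
 * the loop immediately and level stays 1 (4KiB pages).
 */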
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	unsigned long sg_res = 0;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;
	u64 attr;

	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
	if (domain_use_first_level(domain))
		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;

	if (!sg) {
		sg_res = nr_pages;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!sg_res) {
			unsigned int pgoff = sg->offset & ~PAGE_MASK;

			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
			sg->dma_length = sg->length;
			pteval = (sg_phys(sg) - pgoff) | attr;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is large page*/
			if (largepage_lvl > 1) {
				unsigned long nr_superpages, end_pfn;

				pteval |= DMA_PTE_LARGE_PAGE;
				lvl_pages = lvl_to_nr_pages(largepage_lvl);

				nr_superpages = sg_res / lvl_pages;
				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;

				/*
				 * Ensure that old small page tables are
				 * removed to make room for superpage(s).
				 * We're adding new large pages, so make sure
				 * we don't remove their parent tables.
				 */
				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
						       largepage_lvl + 1);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			  struct scatterlist *sg, unsigned long phys_pfn,
			  unsigned long nr_pages, int prot)
{
	int iommu_id, ret;
	struct intel_iommu *iommu;

	/* Do the real mapping first */
	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
	if (ret)
		return ret;

	for_each_domain_iommu(iommu_id, domain) {
		iommu = g_iommus[iommu_id];
		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
	}

	return 0;
}

static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}

static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				     unsigned long phys_pfn, unsigned long nr_pages,
				     int prot)
{
	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
2385 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2387 unsigned long flags;
2388 struct context_entry *context;
2394 spin_lock_irqsave(&iommu->lock, flags);
2395 context = iommu_context_addr(iommu, bus, devfn, 0);
2397 spin_unlock_irqrestore(&iommu->lock, flags);
2400 did_old = context_domain_id(context);
2401 context_clear_entry(context);
2402 __iommu_flush_cache(iommu, context, sizeof(*context));
2403 spin_unlock_irqrestore(&iommu->lock, flags);
2404 iommu->flush.flush_context(iommu,
2406 (((u16)bus) << 8) | devfn,
2407 DMA_CCMD_MASK_NOBIT,
2408 DMA_CCMD_DEVICE_INVL);
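/*
 * The source-id passed to flush_context() above is the PCI
 * requester ID: e.g. bus 0x3a, device 2, function 0 (devfn 0x10)
 * encodes as (0x3a << 8) | 0x10 == 0x3a10.
 */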
2409 iommu->flush.flush_iotlb(iommu,
2416 static inline void unlink_domain_info(struct device_domain_info *info)
2418 assert_spin_locked(&device_domain_lock);
2419 list_del(&info->link);
2420 list_del(&info->global);
2422 info->dev->archdata.iommu = NULL;
2425 static void domain_remove_dev_info(struct dmar_domain *domain)
2427 struct device_domain_info *info, *tmp;
2428 unsigned long flags;
2430 spin_lock_irqsave(&device_domain_lock, flags);
2431 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2432 __dmar_remove_one_dev_info(info);
2433 spin_unlock_irqrestore(&device_domain_lock, flags);
2436 struct dmar_domain *find_domain(struct device *dev)
2438 struct device_domain_info *info;
2440 if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2443 if (dev_is_pci(dev))
2444 dev = &pci_real_dma_dev(to_pci_dev(dev))->dev;
2446 /* No lock here, assumes no domain exit in normal case */
2447 info = get_domain_info(dev);
2449 return info->domain;
2454 static void do_deferred_attach(struct device *dev)
2456 struct iommu_domain *domain;
2458 dev->archdata.iommu = NULL;
2459 domain = iommu_get_domain_for_dev(dev);
2461 intel_iommu_attach_device(domain, dev);
2464 static inline struct device_domain_info *
2465 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2467 struct device_domain_info *info;
2469 list_for_each_entry(info, &device_domain_list, global)
2470 if (info->iommu->segment == segment && info->bus == bus &&
2471 info->devfn == devfn)
2477 static int domain_setup_first_level(struct intel_iommu *iommu,
2478 struct dmar_domain *domain,
2482 int flags = PASID_FLAG_SUPERVISOR_MODE;
2483 struct dma_pte *pgd = domain->pgd;
2487 * Skip the top levels of the page tables for IOMMUs whose
2488 * agaw is less than the default. Unnecessary for PT mode.
*/
2490 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2491 pgd = phys_to_virt(dma_pte_addr(pgd));
2492 if (!dma_pte_present(pgd))
2496 level = agaw_to_level(agaw);
2497 if (level != 4 && level != 5)
2500 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
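/*
 * Example for the agaw handling above (a sketch, assuming the usual
 * encoding where agaw_to_level(agaw) == agaw + 2): a domain built
 * with agaw == 3 (57-bit, 5-level) attached to an IOMMU whose
 * iommu->agaw == 2 (48-bit, 4-level) walks down one table level, so
 * the pgd passed to intel_pasid_setup_first_level() is a 4-level
 * table and PASID_FLAG_FL5LP is left clear.
 */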
2502 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2503 domain->iommu_did[iommu->seq_id],
2507 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2510 struct dmar_domain *domain)
2512 struct dmar_domain *found = NULL;
2513 struct device_domain_info *info;
2514 unsigned long flags;
2517 info = alloc_devinfo_mem();
2522 info->devfn = devfn;
2523 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2524 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2527 info->domain = domain;
2528 info->iommu = iommu;
2529 info->pasid_table = NULL;
2530 info->auxd_enabled = 0;
2531 INIT_LIST_HEAD(&info->auxiliary_domains);
2533 if (dev && dev_is_pci(dev)) {
2534 struct pci_dev *pdev = to_pci_dev(info->dev);
2536 if (!pdev->untrusted &&
2537 !pci_ats_disabled() &&
2538 ecap_dev_iotlb_support(iommu->ecap) &&
2539 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2540 dmar_find_matched_atsr_unit(pdev))
2541 info->ats_supported = 1;
2543 if (sm_supported(iommu)) {
2544 if (pasid_supported(iommu)) {
2545 int features = pci_pasid_features(pdev);
2547 info->pasid_supported = features | 1;
2550 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2551 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2552 info->pri_supported = 1;
2556 spin_lock_irqsave(&device_domain_lock, flags);
2558 found = find_domain(dev);
2561 struct device_domain_info *info2;
2562 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2564 found = info2->domain;
2570 spin_unlock_irqrestore(&device_domain_lock, flags);
2571 free_devinfo_mem(info);
2572 /* Caller must free the original domain */
2576 spin_lock(&iommu->lock);
2577 ret = domain_attach_iommu(domain, iommu);
2578 spin_unlock(&iommu->lock);
2581 spin_unlock_irqrestore(&device_domain_lock, flags);
2582 free_devinfo_mem(info);
2586 list_add(&info->link, &domain->devices);
2587 list_add(&info->global, &device_domain_list);
2589 dev->archdata.iommu = info;
2590 spin_unlock_irqrestore(&device_domain_lock, flags);
2592 /* PASID table is mandatory for a PCI device in scalable mode. */
2593 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2594 ret = intel_pasid_alloc_table(dev);
2596 dev_err(dev, "PASID table allocation failed\n");
2597 dmar_remove_one_dev_info(dev);
2601 /* Setup the PASID entry for requests without PASID: */
2602 spin_lock(&iommu->lock);
2603 if (hw_pass_through && domain_type_is_si(domain))
2604 ret = intel_pasid_setup_pass_through(iommu, domain,
2605 dev, PASID_RID2PASID);
2606 else if (domain_use_first_level(domain))
2607 ret = domain_setup_first_level(iommu, domain, dev,
2610 ret = intel_pasid_setup_second_level(iommu, domain,
2611 dev, PASID_RID2PASID);
2612 spin_unlock(&iommu->lock);
2614 dev_err(dev, "Setup RID2PASID failed\n");
2615 dmar_remove_one_dev_info(dev);
2620 if (dev && domain_context_mapping(domain, dev)) {
2621 dev_err(dev, "Domain context map failed\n");
2622 dmar_remove_one_dev_info(dev);
2629 static int iommu_domain_identity_map(struct dmar_domain *domain,
2630 unsigned long long start,
2631 unsigned long long end)
2633 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2634 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2636 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2637 dma_to_mm_pfn(last_vpfn))) {
2638 pr_err("Reserving iova failed\n");
2642 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2644 * RMRR range might have overlap with physical memory range,
* clear it first.
*/
2647 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2649 return __domain_mapping(domain, first_vpfn, NULL,
2650 first_vpfn, last_vpfn - first_vpfn + 1,
2651 DMA_PTE_READ|DMA_PTE_WRITE);
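/*
 * Worked example (illustrative): identity-mapping the range
 * [0x1000000, 0x1ffffff] gives first_vpfn == 0x1000 and
 * last_vpfn == 0x1fff, so 0x1000 pages are reserved in the IOVA
 * allocator and mapped 1:1 (vPFN n -> pPFN n) with read/write
 * permission.
 */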
2654 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2656 static int __init si_domain_init(int hw)
2658 struct dmar_rmrr_unit *rmrr;
2662 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2666 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2667 domain_exit(si_domain);
2674 for_each_online_node(nid) {
2675 unsigned long start_pfn, end_pfn;
2678 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2679 ret = iommu_domain_identity_map(si_domain,
2680 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2687 * Identity map the RMRRs so that devices with RMRRs could also use
* the si_domain.
*/
2690 for_each_rmrr_units(rmrr) {
2691 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2693 unsigned long long start = rmrr->base_address;
2694 unsigned long long end = rmrr->end_address;
2696 if (WARN_ON(end < start ||
2697 end >> agaw_to_width(si_domain->agaw)))
2700 ret = iommu_domain_identity_map(si_domain, start, end);
2709 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2711 struct dmar_domain *ndomain;
2712 struct intel_iommu *iommu;
2715 iommu = device_to_iommu(dev, &bus, &devfn);
2719 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2720 if (ndomain != domain)
2726 static bool device_has_rmrr(struct device *dev)
2728 struct dmar_rmrr_unit *rmrr;
2733 for_each_rmrr_units(rmrr) {
2735 * Return TRUE if this RMRR contains the device that
* is passed in.
*/
2738 for_each_active_dev_scope(rmrr->devices,
2739 rmrr->devices_cnt, i, tmp)
2741 is_downstream_to_pci_bridge(dev, tmp)) {
2751 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2752 * is relaxable (i.e. allowed to go unenforced under some conditions)
2753 * @dev: device handle
2755 * We assume that PCI USB devices with RMRRs have them largely
2756 * for historical reasons and that the RMRR space is not actively used post
2757 * boot. This exclusion may change if vendors begin to abuse it.
2759 * The same exception is made for graphics devices, with the requirement that
2760 * any use of the RMRR regions will be torn down before assigning the device
* to a guest.
2763 * Return: true if the RMRR is relaxable, false otherwise
2765 static bool device_rmrr_is_relaxable(struct device *dev)
2767 struct pci_dev *pdev;
2769 if (!dev_is_pci(dev))
2772 pdev = to_pci_dev(dev);
2773 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2780 * There are a couple cases where we need to restrict the functionality of
2781 * devices associated with RMRRs. The first is when evaluating a device for
2782 * identity mapping because problems exist when devices are moved in and out
2783 * of domains and their respective RMRR information is lost. This means that
2784 * a device with associated RMRRs will never be in a "passthrough" domain.
2785 * The second is use of the device through the IOMMU API. This interface
2786 * expects to have full control of the IOVA space for the device. We cannot
2787 * satisfy both the requirement that RMRR access is maintained and have an
2788 * unencumbered IOVA space. We also have no ability to quiesce the device's
2789 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2790 * We therefore prevent devices associated with an RMRR from participating in
2791 * the IOMMU API, which eliminates them from device assignment.
2793 * In both cases, devices which have relaxable RMRRs are not concerned by this
2794 * restriction. See device_rmrr_is_relaxable comment.
2796 static bool device_is_rmrr_locked(struct device *dev)
2798 if (!device_has_rmrr(dev))
2801 if (device_rmrr_is_relaxable(dev))
2808 * Return the required default domain type for a specific device.
2810 * @dev: the device in question
2814 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2815 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2816 * - 0: both identity and dynamic domains work for this device
2818 static int device_def_domain_type(struct device *dev)
2820 if (dev_is_pci(dev)) {
2821 struct pci_dev *pdev = to_pci_dev(dev);
2824 * Prevent any device marked as untrusted from getting
2825 * placed into the static identity mapping domain.
*/
2827 if (pdev->untrusted)
2828 return IOMMU_DOMAIN_DMA;
2830 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2831 return IOMMU_DOMAIN_IDENTITY;
2833 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2834 return IOMMU_DOMAIN_IDENTITY;
2840 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2843 * Start from a sane IOMMU hardware state.
2844 * If queued invalidation was already initialized by us
2845 * (for example, while enabling interrupt remapping) then
2846 * things are already rolling from a sane state.
*/
2850 * Clear any previous faults.
2852 dmar_fault(-1, iommu);
2854 * Disable queued invalidation if supported and already enabled
2855 * before OS handover.
2857 dmar_disable_qi(iommu);
2860 if (dmar_enable_qi(iommu)) {
2862 * Queued invalidation not enabled; fall back to register-based
* invalidation.
*/
2864 iommu->flush.flush_context = __iommu_flush_context;
2865 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2866 pr_info("%s: Using Register based invalidation\n",
2869 iommu->flush.flush_context = qi_flush_context;
2870 iommu->flush.flush_iotlb = qi_flush_iotlb;
2871 pr_info("%s: Using Queued invalidation\n", iommu->name);
2875 static int copy_context_table(struct intel_iommu *iommu,
2876 struct root_entry *old_re,
2877 struct context_entry **tbl,
2880 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2881 struct context_entry *new_ce = NULL, ce;
2882 struct context_entry *old_ce = NULL;
2883 struct root_entry re;
2884 phys_addr_t old_ce_phys;
2886 tbl_idx = ext ? bus * 2 : bus;
2887 memcpy(&re, old_re, sizeof(re));
2889 for (devfn = 0; devfn < 256; devfn++) {
2890 /* First calculate the correct index */
2891 idx = (ext ? devfn * 2 : devfn) % 256;
2894 /* First save what we may have and clean up */
2896 tbl[tbl_idx] = new_ce;
2897 __iommu_flush_cache(iommu, new_ce,
2907 old_ce_phys = root_entry_lctp(&re);
2909 old_ce_phys = root_entry_uctp(&re);
2912 if (ext && devfn == 0) {
2913 /* No LCTP, try UCTP */
2922 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2927 new_ce = alloc_pgtable_page(iommu->node);
2934 /* Now copy the context entry */
2935 memcpy(&ce, old_ce + idx, sizeof(ce));
2937 if (!__context_present(&ce))
2940 did = context_domain_id(&ce);
2941 if (did >= 0 && did < cap_ndoms(iommu->cap))
2942 set_bit(did, iommu->domain_ids);
2945 * We need a marker for copied context entries. This
2946 * marker needs to work for the old format as well as
2947 * for extended context entries.
2949 * Bit 67 of the context entry is used. In the old
2950 * format this bit is available to software, in the
2951 * extended format it is the PGE bit, but PGE is ignored
2952 * by HW if PASIDs are disabled (and thus still
* available).
2955 * So disable PASIDs first and then mark the entry
2956 * copied. This means that we don't copy PASID
2957 * translations from the old kernel, but this is fine as
2958 * faults there are not fatal.
2960 context_clear_pasid_enable(&ce);
2961 context_set_copied(&ce);
2966 tbl[tbl_idx + pos] = new_ce;
2968 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2977 static int copy_translation_tables(struct intel_iommu *iommu)
2979 struct context_entry **ctxt_tbls;
2980 struct root_entry *old_rt;
2981 phys_addr_t old_rt_phys;
2982 int ctxt_table_entries;
2983 unsigned long flags;
2988 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2989 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
2990 new_ext = !!ecap_ecs(iommu->ecap);
2993 * The RTT bit can only be changed when translation is disabled,
2994 * but disabling translation would open a window for data
2995 * corruption. So bail out and don't copy anything if we would
2996 * have to change the bit.
*/
3001 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3005 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3009 /* This is too big for the stack - allocate it from slab */
3010 ctxt_table_entries = ext ? 512 : 256;
3012 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3016 for (bus = 0; bus < 256; bus++) {
3017 ret = copy_context_table(iommu, &old_rt[bus],
3018 ctxt_tbls, bus, ext);
3020 pr_err("%s: Failed to copy context table for bus %d\n",
3026 spin_lock_irqsave(&iommu->lock, flags);
3028 /* Context tables are copied, now write them to the root_entry table */
3029 for (bus = 0; bus < 256; bus++) {
3030 int idx = ext ? bus * 2 : bus;
3033 if (ctxt_tbls[idx]) {
3034 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3035 iommu->root_entry[bus].lo = val;
3038 if (!ext || !ctxt_tbls[idx + 1])
3041 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3042 iommu->root_entry[bus].hi = val;
3045 spin_unlock_irqrestore(&iommu->lock, flags);
3049 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
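/*
 * Layout note for the copy above (sketch): in the extended root
 * table format (ext != 0) each bus owns two context tables. For
 * bus 1, copy_context_table() fills ctxt_tbls[2] (devfn 0-127) and
 * ctxt_tbls[3] (devfn 128-255), which the loop above writes back as
 * root_entry[1].lo and root_entry[1].hi respectively.
 */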
3059 #ifdef CONFIG_INTEL_IOMMU_SVM
3060 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3062 struct intel_iommu *iommu = data;
3066 return INVALID_IOASID;
3068 * The VT-d virtual command interface always uses the full 20-bit
3069 * PASID range. The host can partition the guest PASID range based
3070 * on policies, but this is out of the guest's control.
*/
3072 if (min < PASID_MIN || max > intel_pasid_max_id)
3073 return INVALID_IOASID;
3075 if (vcmd_alloc_pasid(iommu, &ioasid))
3076 return INVALID_IOASID;
3081 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3083 struct intel_iommu *iommu = data;
3088 * The sanity check of the IOASID owner is done at an upper layer,
3089 * e.g. VFIO. We can only free the PASID when all devices are unbound.
*/
3091 if (ioasid_find(NULL, ioasid, NULL)) {
3092 pr_alert("Cannot free active IOASID %d\n", ioasid);
3095 vcmd_free_pasid(iommu, ioasid);
3098 static void register_pasid_allocator(struct intel_iommu *iommu)
3101 * If we are running in the host, there is no need for a custom
3102 * allocator since PASIDs are allocated host system-wide.
*/
3104 if (!cap_caching_mode(iommu->cap))
3107 if (!sm_supported(iommu)) {
3108 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3113 * Register a custom PASID allocator if we are running in a guest;
3114 * guest PASIDs must be obtained via the virtual command interface.
3115 * There can be multiple vIOMMUs in each guest but only one allocator
3116 * is active. All vIOMMU allocators will eventually call the same
* host allocator.
*/
3119 if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3122 pr_info("Register custom PASID allocator\n");
3123 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3124 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3125 iommu->pasid_allocator.pdata = (void *)iommu;
3126 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3127 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3129 * Disable scalable mode on this IOMMU if there
3130 * is no custom allocator. Mixing SM-capable and
3131 * non-SM vIOMMUs is not supported.
*/
3138 static int __init init_dmars(void)
3140 struct dmar_drhd_unit *drhd;
3141 struct intel_iommu *iommu;
3147 * initialize and program root entry to not present
3150 for_each_drhd_unit(drhd) {
3152 * lock not needed as this is only incremented in the
3153 * single-threaded kernel __init code path; all other access is
* read-only.
*/
3156 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3160 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3163 /* Preallocate enough resources for IOMMU hot-addition */
3164 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3165 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3167 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3170 pr_err("Allocating global iommu array failed\n");
3175 for_each_iommu(iommu, drhd) {
3176 if (drhd->ignored) {
3177 iommu_disable_translation(iommu);
3182 * Find the max PASID size of all IOMMUs in the system.
3183 * We need to ensure the system PASID table is no bigger
3184 * than the smallest supported size.
*/
3186 if (pasid_supported(iommu)) {
3187 u32 temp = 2 << ecap_pss(iommu->ecap);
3189 intel_pasid_max_id = min_t(u32, temp,
3190 intel_pasid_max_id);
3193 g_iommus[iommu->seq_id] = iommu;
3195 intel_iommu_init_qi(iommu);
3197 ret = iommu_init_domains(iommu);
3201 init_translation_status(iommu);
3203 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3204 iommu_disable_translation(iommu);
3205 clear_translation_pre_enabled(iommu);
3206 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3212 * we could share the same root & context tables
3213 * among all IOMMUs; this needs to be split later.
*/
3215 ret = iommu_alloc_root_entry(iommu);
3219 if (translation_pre_enabled(iommu)) {
3220 pr_info("Translation already enabled - trying to copy translation structures\n");
3222 ret = copy_translation_tables(iommu);
3225 * We found the IOMMU with translation
3226 * enabled - but failed to copy over the
3227 * old root-entry table. Try to proceed
3228 * by disabling translation now and
3229 * allocating a clean root-entry table.
3230 * This might cause DMAR faults, but
3231 * probably the dump will still succeed.
3233 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3235 iommu_disable_translation(iommu);
3236 clear_translation_pre_enabled(iommu);
3238 pr_info("Copied translation tables from previous kernel for %s\n",
3243 if (!ecap_pass_through(iommu->ecap))
3244 hw_pass_through = 0;
3245 intel_svm_check(iommu);
3249 * Now that qi is enabled on all iommus, set the root entry and flush
3250 * caches. This is required on some Intel X58 chipsets, otherwise the
3251 * flush_context function will loop forever and the boot hangs.
3253 for_each_active_iommu(iommu, drhd) {
3254 iommu_flush_write_buffer(iommu);
3255 #ifdef CONFIG_INTEL_IOMMU_SVM
3256 register_pasid_allocator(iommu);
3258 iommu_set_root_entry(iommu);
3259 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3260 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3263 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3268 iommu_identity_mapping |= IDENTMAP_GFX;
3270 check_tylersburg_isoch();
3272 ret = si_domain_init(hw_pass_through);
3279 * global invalidate context cache
3280 * global invalidate iotlb
3281 * enable translation
3283 for_each_iommu(iommu, drhd) {
3284 if (drhd->ignored) {
3286 * we always have to disable PMRs or DMA may fail on
* this device.
*/
3290 iommu_disable_protect_mem_regions(iommu);
3294 iommu_flush_write_buffer(iommu);
3296 #ifdef CONFIG_INTEL_IOMMU_SVM
3297 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3299 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3300 * could cause a lock race, so drop the lock around the call.
*/
3302 up_write(&dmar_global_lock);
3303 ret = intel_svm_enable_prq(iommu);
3304 down_write(&dmar_global_lock);
3309 ret = dmar_set_interrupt(iommu);
3317 for_each_active_iommu(iommu, drhd) {
3318 disable_dmar_iommu(iommu);
3319 free_dmar_iommu(iommu);
3328 /* This takes a number of _MM_ pages, not VTD pages */
3329 static unsigned long intel_alloc_iova(struct device *dev,
3330 struct dmar_domain *domain,
3331 unsigned long nrpages, uint64_t dma_mask)
3333 unsigned long iova_pfn;
3336 * Restrict dma_mask to the width that the iommu can handle.
3337 * First-level translation restricts the input-address to a
3338 * canonical address (i.e., address bits 63:N have the same
3339 * value as address bit [N-1], where N is 48-bits with 4-level
3340 * paging and 57-bits with 5-level paging). Hence, skip bit
3343 if (domain_use_first_level(domain))
3344 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3347 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3350 /* Ensure we reserve the whole size-aligned region */
3351 nrpages = __roundup_pow_of_two(nrpages);
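/*
 * e.g. a 5-page request is rounded up to 8 pages here, so the IOVA
 * handed out below is aligned to the rounded size, per the comment
 * above.
 */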
3353 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3355 * First try to allocate an IO virtual address in
3356 * DMA_BIT_MASK(32) and if that fails then try allocating
* from the higher range.
*/
3359 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3360 IOVA_PFN(DMA_BIT_MASK(32)), false);
3364 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3365 IOVA_PFN(dma_mask), true);
3366 if (unlikely(!iova_pfn)) {
3367 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3375 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3376 size_t size, int dir, u64 dma_mask)
3378 struct dmar_domain *domain;
3379 phys_addr_t start_paddr;
3380 unsigned long iova_pfn;
3383 struct intel_iommu *iommu;
3384 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3386 BUG_ON(dir == DMA_NONE);
3388 if (unlikely(attach_deferred(dev)))
3389 do_deferred_attach(dev);
3391 domain = find_domain(dev);
3393 return DMA_MAPPING_ERROR;
3395 iommu = domain_get_iommu(domain);
3396 size = aligned_nrpages(paddr, size);
3398 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3403 * Check if DMAR supports zero-length reads on write only
* mappings.
*/
3406 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3407 !cap_zlr(iommu->cap))
3408 prot |= DMA_PTE_READ;
3409 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3410 prot |= DMA_PTE_WRITE;
3412 * paddr .. (paddr + size) might span partial pages, so map whole
3413 * pages. Note: if two parts of one page are mapped separately, we
3414 * might have two guest_addrs mapping to the same host paddr, but this
3415 * is not a big problem.
*/
3417 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3418 mm_to_dma_pfn(paddr_pfn), size, prot);
3422 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3423 start_paddr += paddr & ~PAGE_MASK;
3425 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3431 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3432 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3433 size, (unsigned long long)paddr, dir);
3434 return DMA_MAPPING_ERROR;
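/*
 * Worked example for __intel_map_single() (illustrative): mapping
 * paddr 0x12345678 with size 0x100 gives paddr_pfn 0x12345 and a
 * single-page mapping; the returned handle is the allocated IOVA
 * page plus the sub-page offset, i.e.
 * ((phys_addr_t)iova_pfn << PAGE_SHIFT) + 0x678.
 */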
3437 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3438 unsigned long offset, size_t size,
3439 enum dma_data_direction dir,
3440 unsigned long attrs)
3442 return __intel_map_single(dev, page_to_phys(page) + offset,
3443 size, dir, *dev->dma_mask);
3446 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3447 size_t size, enum dma_data_direction dir,
3448 unsigned long attrs)
3450 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3453 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3455 struct dmar_domain *domain;
3456 unsigned long start_pfn, last_pfn;
3457 unsigned long nrpages;
3458 unsigned long iova_pfn;
3459 struct intel_iommu *iommu;
3460 struct page *freelist;
3461 struct pci_dev *pdev = NULL;
3463 domain = find_domain(dev);
3466 iommu = domain_get_iommu(domain);
3468 iova_pfn = IOVA_PFN(dev_addr);
3470 nrpages = aligned_nrpages(dev_addr, size);
3471 start_pfn = mm_to_dma_pfn(iova_pfn);
3472 last_pfn = start_pfn + nrpages - 1;
3474 if (dev_is_pci(dev))
3475 pdev = to_pci_dev(dev);
3477 freelist = domain_unmap(domain, start_pfn, last_pfn);
3478 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3479 !has_iova_flush_queue(&domain->iovad)) {
3480 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3481 nrpages, !freelist, 0);
3483 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3484 dma_free_pagelist(freelist);
3486 queue_iova(&domain->iovad, iova_pfn, nrpages,
3487 (unsigned long)freelist);
3489 * queue up the release of the unmap to save roughly 1/6th of
3490 * the CPU time consumed by the IOTLB flush operation...
*/
3494 trace_unmap_single(dev, dev_addr, size);
3497 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3498 size_t size, enum dma_data_direction dir,
3499 unsigned long attrs)
3501 intel_unmap(dev, dev_addr, size);
3504 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3505 size_t size, enum dma_data_direction dir, unsigned long attrs)
3507 intel_unmap(dev, dev_addr, size);
3510 static void *intel_alloc_coherent(struct device *dev, size_t size,
3511 dma_addr_t *dma_handle, gfp_t flags,
3512 unsigned long attrs)
3514 struct page *page = NULL;
3517 if (unlikely(attach_deferred(dev)))
3518 do_deferred_attach(dev);
3520 size = PAGE_ALIGN(size);
3521 order = get_order(size);
3523 if (gfpflags_allow_blocking(flags)) {
3524 unsigned int count = size >> PAGE_SHIFT;
3526 page = dma_alloc_from_contiguous(dev, count, order,
3527 flags & __GFP_NOWARN);
3531 page = alloc_pages(flags, order);
3534 memset(page_address(page), 0, size);
3536 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3538 dev->coherent_dma_mask);
3539 if (*dma_handle != DMA_MAPPING_ERROR)
3540 return page_address(page);
3541 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3542 __free_pages(page, order);
3547 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3548 dma_addr_t dma_handle, unsigned long attrs)
3551 struct page *page = virt_to_page(vaddr);
3553 size = PAGE_ALIGN(size);
3554 order = get_order(size);
3556 intel_unmap(dev, dma_handle, size);
3557 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3558 __free_pages(page, order);
3561 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3562 int nelems, enum dma_data_direction dir,
3563 unsigned long attrs)
3565 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3566 unsigned long nrpages = 0;
3567 struct scatterlist *sg;
3570 for_each_sg(sglist, sg, nelems, i) {
3571 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3574 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3576 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
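/*
 * Only the start address and the summed page count are needed above
 * because intel_map_sg() below allocates one contiguous IOVA range
 * for the whole scatterlist, so a single intel_unmap() call covers
 * every element.
 */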
3579 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3580 enum dma_data_direction dir, unsigned long attrs)
3583 struct dmar_domain *domain;
3586 unsigned long iova_pfn;
3588 struct scatterlist *sg;
3589 unsigned long start_vpfn;
3590 struct intel_iommu *iommu;
3592 BUG_ON(dir == DMA_NONE);
3594 if (unlikely(attach_deferred(dev)))
3595 do_deferred_attach(dev);
3597 domain = find_domain(dev);
3601 iommu = domain_get_iommu(domain);
3603 for_each_sg(sglist, sg, nelems, i)
3604 size += aligned_nrpages(sg->offset, sg->length);
3606 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3609 sglist->dma_length = 0;
3614 * Check if DMAR supports zero-length reads on write only
* mappings.
*/
3617 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3618 !cap_zlr(iommu->cap))
3619 prot |= DMA_PTE_READ;
3620 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3621 prot |= DMA_PTE_WRITE;
3623 start_vpfn = mm_to_dma_pfn(iova_pfn);
3625 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3626 if (unlikely(ret)) {
3627 dma_pte_free_pagetable(domain, start_vpfn,
3628 start_vpfn + size - 1,
3629 agaw_to_level(domain->agaw) + 1);
3630 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3634 for_each_sg(sglist, sg, nelems, i)
3635 trace_map_sg(dev, i + 1, nelems, sg);
3640 static u64 intel_get_required_mask(struct device *dev)
3642 return DMA_BIT_MASK(32);
3645 static const struct dma_map_ops intel_dma_ops = {
3646 .alloc = intel_alloc_coherent,
3647 .free = intel_free_coherent,
3648 .map_sg = intel_map_sg,
3649 .unmap_sg = intel_unmap_sg,
3650 .map_page = intel_map_page,
3651 .unmap_page = intel_unmap_page,
3652 .map_resource = intel_map_resource,
3653 .unmap_resource = intel_unmap_resource,
3654 .dma_supported = dma_direct_supported,
3655 .mmap = dma_common_mmap,
3656 .get_sgtable = dma_common_get_sgtable,
3657 .get_required_mask = intel_get_required_mask,
3661 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3662 enum dma_data_direction dir, enum dma_sync_target target)
3664 struct dmar_domain *domain;
3665 phys_addr_t tlb_addr;
3667 domain = find_domain(dev);
3668 if (WARN_ON(!domain))
3671 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3672 if (is_swiotlb_buffer(tlb_addr))
3673 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3677 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3678 enum dma_data_direction dir, unsigned long attrs,
3681 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3682 struct dmar_domain *domain;
3683 struct intel_iommu *iommu;
3684 unsigned long iova_pfn;
3685 unsigned long nrpages;
3686 phys_addr_t tlb_addr;
3690 if (unlikely(attach_deferred(dev)))
3691 do_deferred_attach(dev);
3693 domain = find_domain(dev);
3695 if (WARN_ON(dir == DMA_NONE || !domain))
3696 return DMA_MAPPING_ERROR;
3698 iommu = domain_get_iommu(domain);
3699 if (WARN_ON(!iommu))
3700 return DMA_MAPPING_ERROR;
3702 nrpages = aligned_nrpages(0, size);
3703 iova_pfn = intel_alloc_iova(dev, domain,
3704 dma_to_mm_pfn(nrpages), dma_mask);
3706 return DMA_MAPPING_ERROR;
3709 * Check if DMAR supports zero-length reads on write only
* mappings.
*/
3712 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3713 !cap_zlr(iommu->cap))
3714 prot |= DMA_PTE_READ;
3715 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3716 prot |= DMA_PTE_WRITE;
3719 * If both the physical buffer start address and size are
3720 * page aligned, we don't need to use a bounce page.
3722 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3723 tlb_addr = swiotlb_tbl_map_single(dev,
3724 __phys_to_dma(dev, io_tlb_start),
3725 paddr, size, aligned_size, dir, attrs);
3726 if (tlb_addr == DMA_MAPPING_ERROR) {
3729 /* Cleanup the padding area. */
3730 void *padding_start = phys_to_virt(tlb_addr);
3731 size_t padding_size = aligned_size;
3733 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3734 (dir == DMA_TO_DEVICE ||
3735 dir == DMA_BIDIRECTIONAL)) {
3736 padding_start += size;
3737 padding_size -= size;
3740 memset(padding_start, 0, padding_size);
3746 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3747 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3751 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3753 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3756 if (is_swiotlb_buffer(tlb_addr))
3757 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3758 aligned_size, dir, attrs);
3760 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3761 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3762 size, (unsigned long long)paddr, dir);
3764 return DMA_MAPPING_ERROR;
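/*
 * Worked example for the bounce path above (illustrative): a 0x100
 * byte buffer that is not VTD_PAGE_SIZE-aligned is bounced through
 * a swiotlb slot of aligned_size == 0x1000. For DMA_TO_DEVICE only
 * the 0xf00 padding bytes beyond the data are zeroed (swiotlb has
 * already copied the data in); for DMA_FROM_DEVICE the whole slot
 * is zeroed, so stale swiotlb contents are never exposed to the
 * device.
 */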
3768 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3769 enum dma_data_direction dir, unsigned long attrs)
3771 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3772 struct dmar_domain *domain;
3773 phys_addr_t tlb_addr;
3775 domain = find_domain(dev);
3776 if (WARN_ON(!domain))
3779 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3780 if (WARN_ON(!tlb_addr))
3783 intel_unmap(dev, dev_addr, size);
3784 if (is_swiotlb_buffer(tlb_addr))
3785 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3786 aligned_size, dir, attrs);
3788 trace_bounce_unmap_single(dev, dev_addr, size);
3792 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3793 size_t size, enum dma_data_direction dir, unsigned long attrs)
3795 return bounce_map_single(dev, page_to_phys(page) + offset,
3796 size, dir, attrs, *dev->dma_mask);
3800 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3801 enum dma_data_direction dir, unsigned long attrs)
3803 return bounce_map_single(dev, phys_addr, size,
3804 dir, attrs, *dev->dma_mask);
3808 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3809 enum dma_data_direction dir, unsigned long attrs)
3811 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3815 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3816 enum dma_data_direction dir, unsigned long attrs)
3818 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3822 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3823 enum dma_data_direction dir, unsigned long attrs)
3825 struct scatterlist *sg;
3828 for_each_sg(sglist, sg, nelems, i)
3829 bounce_unmap_page(dev, sg->dma_address,
3830 sg_dma_len(sg), dir, attrs);
3834 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3835 enum dma_data_direction dir, unsigned long attrs)
3838 struct scatterlist *sg;
3840 for_each_sg(sglist, sg, nelems, i) {
3841 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3842 sg->offset, sg->length,
3844 if (sg->dma_address == DMA_MAPPING_ERROR)
3846 sg_dma_len(sg) = sg->length;
3849 for_each_sg(sglist, sg, nelems, i)
3850 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3855 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3860 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3861 size_t size, enum dma_data_direction dir)
3863 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3867 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3868 size_t size, enum dma_data_direction dir)
3870 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3874 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3875 int nelems, enum dma_data_direction dir)
3877 struct scatterlist *sg;
3880 for_each_sg(sglist, sg, nelems, i)
3881 bounce_sync_single(dev, sg_dma_address(sg),
3882 sg_dma_len(sg), dir, SYNC_FOR_CPU);
3886 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3887 int nelems, enum dma_data_direction dir)
3889 struct scatterlist *sg;
3892 for_each_sg(sglist, sg, nelems, i)
3893 bounce_sync_single(dev, sg_dma_address(sg),
3894 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3897 static const struct dma_map_ops bounce_dma_ops = {
3898 .alloc = intel_alloc_coherent,
3899 .free = intel_free_coherent,
3900 .map_sg = bounce_map_sg,
3901 .unmap_sg = bounce_unmap_sg,
3902 .map_page = bounce_map_page,
3903 .unmap_page = bounce_unmap_page,
3904 .sync_single_for_cpu = bounce_sync_single_for_cpu,
3905 .sync_single_for_device = bounce_sync_single_for_device,
3906 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
3907 .sync_sg_for_device = bounce_sync_sg_for_device,
3908 .map_resource = bounce_map_resource,
3909 .unmap_resource = bounce_unmap_resource,
3910 .dma_supported = dma_direct_supported,
3913 static inline int iommu_domain_cache_init(void)
3917 iommu_domain_cache = kmem_cache_create("iommu_domain",
3918 sizeof(struct dmar_domain),
3923 if (!iommu_domain_cache) {
3924 pr_err("Couldn't create iommu_domain cache\n");
3931 static inline int iommu_devinfo_cache_init(void)
3935 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3936 sizeof(struct device_domain_info),
3940 if (!iommu_devinfo_cache) {
3941 pr_err("Couldn't create devinfo cache\n");
3948 static int __init iommu_init_mempool(void)
3951 ret = iova_cache_get();
3955 ret = iommu_domain_cache_init();
3959 ret = iommu_devinfo_cache_init();
3963 kmem_cache_destroy(iommu_domain_cache);
3970 static void __init iommu_exit_mempool(void)
3972 kmem_cache_destroy(iommu_devinfo_cache);
3973 kmem_cache_destroy(iommu_domain_cache);
3977 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3979 struct dmar_drhd_unit *drhd;
3983 /* We know that this device on this chipset has its own IOMMU.
3984 * If we find it under a different IOMMU, then the BIOS is lying
3985 * to us. Hope that the IOMMU for this device is actually
3986 * disabled, and it needs no translation...
3988 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3990 /* "can't" happen */
3991 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3994 vtbar &= 0xffff0000;
3996 /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3997 drhd = dmar_find_matched_drhd_unit(pdev);
3998 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
3999 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4000 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4001 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4004 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4006 static void __init init_no_remapping_devices(void)
4008 struct dmar_drhd_unit *drhd;
4012 for_each_drhd_unit(drhd) {
4013 if (!drhd->include_all) {
4014 for_each_active_dev_scope(drhd->devices,
4015 drhd->devices_cnt, i, dev)
4017 /* ignore DMAR unit if no devices exist */
4018 if (i == drhd->devices_cnt)
4023 for_each_active_drhd_unit(drhd) {
4024 if (drhd->include_all)
4027 for_each_active_dev_scope(drhd->devices,
4028 drhd->devices_cnt, i, dev)
4029 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4031 if (i < drhd->devices_cnt)
4034 /* This IOMMU has *only* gfx devices. Either bypass it or
4035 set the gfx_mapped flag, as appropriate */
4036 if (!dmar_map_gfx) {
4038 for_each_active_dev_scope(drhd->devices,
4039 drhd->devices_cnt, i, dev)
4040 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4045 #ifdef CONFIG_SUSPEND
4046 static int init_iommu_hw(void)
4048 struct dmar_drhd_unit *drhd;
4049 struct intel_iommu *iommu = NULL;
4051 for_each_active_iommu(iommu, drhd)
4053 dmar_reenable_qi(iommu);
4055 for_each_iommu(iommu, drhd) {
4056 if (drhd->ignored) {
4058 * we always have to disable PMRs or DMA may fail on
* this device.
*/
4062 iommu_disable_protect_mem_regions(iommu);
4066 iommu_flush_write_buffer(iommu);
4068 iommu_set_root_entry(iommu);
4070 iommu->flush.flush_context(iommu, 0, 0, 0,
4071 DMA_CCMD_GLOBAL_INVL);
4072 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4073 iommu_enable_translation(iommu);
4074 iommu_disable_protect_mem_regions(iommu);
4080 static void iommu_flush_all(void)
4082 struct dmar_drhd_unit *drhd;
4083 struct intel_iommu *iommu;
4085 for_each_active_iommu(iommu, drhd) {
4086 iommu->flush.flush_context(iommu, 0, 0, 0,
4087 DMA_CCMD_GLOBAL_INVL);
4088 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4089 DMA_TLB_GLOBAL_FLUSH);
4093 static int iommu_suspend(void)
4095 struct dmar_drhd_unit *drhd;
4096 struct intel_iommu *iommu = NULL;
4099 for_each_active_iommu(iommu, drhd) {
4100 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4102 if (!iommu->iommu_state)
4108 for_each_active_iommu(iommu, drhd) {
4109 iommu_disable_translation(iommu);
4111 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4113 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4114 readl(iommu->reg + DMAR_FECTL_REG);
4115 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4116 readl(iommu->reg + DMAR_FEDATA_REG);
4117 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4118 readl(iommu->reg + DMAR_FEADDR_REG);
4119 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4120 readl(iommu->reg + DMAR_FEUADDR_REG);
4122 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4127 for_each_active_iommu(iommu, drhd)
4128 kfree(iommu->iommu_state);
4133 static void iommu_resume(void)
4135 struct dmar_drhd_unit *drhd;
4136 struct intel_iommu *iommu = NULL;
4139 if (init_iommu_hw()) {
4141 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4143 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4147 for_each_active_iommu(iommu, drhd) {
4149 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4151 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4152 iommu->reg + DMAR_FECTL_REG);
4153 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4154 iommu->reg + DMAR_FEDATA_REG);
4155 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4156 iommu->reg + DMAR_FEADDR_REG);
4157 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4158 iommu->reg + DMAR_FEUADDR_REG);
4160 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4163 for_each_active_iommu(iommu, drhd)
4164 kfree(iommu->iommu_state);
4167 static struct syscore_ops iommu_syscore_ops = {
4168 .resume = iommu_resume,
4169 .suspend = iommu_suspend,
4172 static void __init init_iommu_pm_ops(void)
4174 register_syscore_ops(&iommu_syscore_ops);
4178 static inline void init_iommu_pm_ops(void) {}
4179 #endif /* CONFIG_PM */
4181 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4183 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4184 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4185 rmrr->end_address <= rmrr->base_address ||
4186 arch_rmrr_sanity_check(rmrr))
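/*
 * Example for the checks above (illustrative): an RMRR covering
 * [0x000e0000, 0x000effff] passes (0xe0000 and 0xf0000 are both
 * page-aligned and end > base), while [0x000e0100, 0x000effff] is
 * rejected because its base address is not page-aligned.
 */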
4192 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4194 struct acpi_dmar_reserved_memory *rmrr;
4195 struct dmar_rmrr_unit *rmrru;
4197 rmrr = (struct acpi_dmar_reserved_memory *)header;
4198 if (rmrr_sanity_check(rmrr)) {
4200 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4201 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4202 rmrr->base_address, rmrr->end_address,
4203 dmi_get_system_info(DMI_BIOS_VENDOR),
4204 dmi_get_system_info(DMI_BIOS_VERSION),
4205 dmi_get_system_info(DMI_PRODUCT_VERSION));
4206 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4209 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4213 rmrru->hdr = header;
4215 rmrru->base_address = rmrr->base_address;
4216 rmrru->end_address = rmrr->end_address;
4218 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4219 ((void *)rmrr) + rmrr->header.length,
4220 &rmrru->devices_cnt);
4221 if (rmrru->devices_cnt && rmrru->devices == NULL)
4224 list_add(&rmrru->list, &dmar_rmrr_units);
4233 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4235 struct dmar_atsr_unit *atsru;
4236 struct acpi_dmar_atsr *tmp;
4238 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4240 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4241 if (atsr->segment != tmp->segment)
4243 if (atsr->header.length != tmp->header.length)
4245 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4252 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4254 struct acpi_dmar_atsr *atsr;
4255 struct dmar_atsr_unit *atsru;
4257 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4260 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4261 atsru = dmar_find_atsr(atsr);
4265 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4270 * If memory is allocated from slab by ACPI _DSM method, we need to
4271 * copy the memory content because the memory buffer will be freed
* on exit.
*/
4274 atsru->hdr = (void *)(atsru + 1);
4275 memcpy(atsru->hdr, hdr, hdr->length);
4276 atsru->include_all = atsr->flags & 0x1;
4277 if (!atsru->include_all) {
4278 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4279 (void *)atsr + atsr->header.length,
4280 &atsru->devices_cnt);
4281 if (atsru->devices_cnt && atsru->devices == NULL) {
4287 list_add_rcu(&atsru->list, &dmar_atsr_units);
4292 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4294 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4298 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4300 struct acpi_dmar_atsr *atsr;
4301 struct dmar_atsr_unit *atsru;
4303 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4304 atsru = dmar_find_atsr(atsr);
4306 list_del_rcu(&atsru->list);
4308 intel_iommu_free_atsr(atsru);
4314 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4318 struct acpi_dmar_atsr *atsr;
4319 struct dmar_atsr_unit *atsru;
4321 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4322 atsru = dmar_find_atsr(atsr);
4326 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4327 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4335 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4338 struct intel_iommu *iommu = dmaru->iommu;
4340 if (g_iommus[iommu->seq_id])
4343 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4344 pr_warn("%s: Doesn't support hardware pass through.\n",
4348 if (!ecap_sc_support(iommu->ecap) &&
4349 domain_update_iommu_snooping(iommu)) {
4350 pr_warn("%s: Doesn't support snooping.\n",
4354 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4355 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4356 pr_warn("%s: Doesn't support large page.\n",
4362 * Disable translation if already enabled prior to OS handover.
4364 if (iommu->gcmd & DMA_GCMD_TE)
4365 iommu_disable_translation(iommu);
4367 g_iommus[iommu->seq_id] = iommu;
4368 ret = iommu_init_domains(iommu);
4370 ret = iommu_alloc_root_entry(iommu);
4374 intel_svm_check(iommu);
4376 if (dmaru->ignored) {
4378 * we always have to disable PMRs or DMA may fail on this device
4381 iommu_disable_protect_mem_regions(iommu);
4385 intel_iommu_init_qi(iommu);
4386 iommu_flush_write_buffer(iommu);
4388 #ifdef CONFIG_INTEL_IOMMU_SVM
4389 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4390 ret = intel_svm_enable_prq(iommu);
4395 ret = dmar_set_interrupt(iommu);
4399 iommu_set_root_entry(iommu);
4400 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4401 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4402 iommu_enable_translation(iommu);
4404 iommu_disable_protect_mem_regions(iommu);
4408 disable_dmar_iommu(iommu);
4410 free_dmar_iommu(iommu);
4414 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4417 struct intel_iommu *iommu = dmaru->iommu;
4419 if (!intel_iommu_enabled)
4425 ret = intel_iommu_add(dmaru);
4427 disable_dmar_iommu(iommu);
4428 free_dmar_iommu(iommu);
4434 static void intel_iommu_free_dmars(void)
4436 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4437 struct dmar_atsr_unit *atsru, *atsr_n;
4439 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4440 list_del(&rmrru->list);
4441 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4445 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4446 list_del(&atsru->list);
4447 intel_iommu_free_atsr(atsru);
4451 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4454 struct pci_bus *bus;
4455 struct pci_dev *bridge = NULL;
4457 struct acpi_dmar_atsr *atsr;
4458 struct dmar_atsr_unit *atsru;
4460 dev = pci_physfn(dev);
4461 for (bus = dev->bus; bus; bus = bus->parent) {
4463 /* If it's an integrated device, allow ATS */
4466 /* Connected via non-PCIe: no ATS */
4467 if (!pci_is_pcie(bridge) ||
4468 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4470 /* If we found the root port, look it up in the ATSR */
4471 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4476 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4477 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4478 if (atsr->segment != pci_domain_nr(dev->bus))
4481 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4482 if (tmp == &bridge->dev)
4485 if (atsru->include_all)
4495 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4498 struct dmar_rmrr_unit *rmrru;
4499 struct dmar_atsr_unit *atsru;
4500 struct acpi_dmar_atsr *atsr;
4501 struct acpi_dmar_reserved_memory *rmrr;
4503 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4506 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4507 rmrr = container_of(rmrru->hdr,
4508 struct acpi_dmar_reserved_memory, header);
4509 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4510 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4511 ((void *)rmrr) + rmrr->header.length,
4512 rmrr->segment, rmrru->devices,
4513 rmrru->devices_cnt);
4516 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4517 dmar_remove_dev_scope(info, rmrr->segment,
4518 rmrru->devices, rmrru->devices_cnt);
4522 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4523 if (atsru->include_all)
4526 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4527 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4528 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4529 (void *)atsr + atsr->header.length,
4530 atsr->segment, atsru->devices,
4531 atsru->devices_cnt);
4536 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4537 if (dmar_remove_dev_scope(info, atsr->segment,
4538 atsru->devices, atsru->devices_cnt))
4546 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4547 unsigned long val, void *v)
4549 struct memory_notify *mhp = v;
4550 unsigned long long start, end;
4551 unsigned long start_vpfn, last_vpfn;
4554 case MEM_GOING_ONLINE:
4555 start = mhp->start_pfn << PAGE_SHIFT;
4556 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4557 if (iommu_domain_identity_map(si_domain, start, end)) {
4558 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4565 case MEM_CANCEL_ONLINE:
4566 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4567 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4568 while (start_vpfn <= last_vpfn) {
4570 struct dmar_drhd_unit *drhd;
4571 struct intel_iommu *iommu;
4572 struct page *freelist;
4574 iova = find_iova(&si_domain->iovad, start_vpfn);
4576 pr_debug("Failed get IOVA for PFN %lx\n",
4581 iova = split_and_remove_iova(&si_domain->iovad, iova,
4582 start_vpfn, last_vpfn);
4584 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4585 start_vpfn, last_vpfn);
4589 freelist = domain_unmap(si_domain, iova->pfn_lo,
4593 for_each_active_iommu(iommu, drhd)
4594 iommu_flush_iotlb_psi(iommu, si_domain,
4595 iova->pfn_lo, iova_size(iova),
4598 dma_free_pagelist(freelist);
4600 start_vpfn = iova->pfn_hi + 1;
4601 free_iova_mem(iova);
4609 static struct notifier_block intel_iommu_memory_nb = {
4610 .notifier_call = intel_iommu_memory_notifier,
4614 static void free_all_cpu_cached_iovas(unsigned int cpu)
4618 for (i = 0; i < g_num_of_iommus; i++) {
4619 struct intel_iommu *iommu = g_iommus[i];
4620 struct dmar_domain *domain;
4626 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4627 domain = get_iommu_domain(iommu, (u16)did);
4631 free_cpu_cached_iovas(cpu, &domain->iovad);
4636 static int intel_iommu_cpu_dead(unsigned int cpu)
4638 free_all_cpu_cached_iovas(cpu);
4642 static void intel_disable_iommus(void)
4644 struct intel_iommu *iommu = NULL;
4645 struct dmar_drhd_unit *drhd;
4647 for_each_iommu(iommu, drhd)
4648 iommu_disable_translation(iommu);
4651 void intel_iommu_shutdown(void)
4653 struct dmar_drhd_unit *drhd;
4654 struct intel_iommu *iommu = NULL;
4656 if (no_iommu || dmar_disabled)
4659 down_write(&dmar_global_lock);
4661 /* Disable PMRs explicitly here. */
4662 for_each_iommu(iommu, drhd)
4663 iommu_disable_protect_mem_regions(iommu);
4665 /* Make sure the IOMMUs are switched off */
4666 intel_disable_iommus();
4668 up_write(&dmar_global_lock);
4671 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4673 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4675 return container_of(iommu_dev, struct intel_iommu, iommu);
4678 static ssize_t intel_iommu_show_version(struct device *dev,
4679 struct device_attribute *attr,
4682 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4683 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4684 return sprintf(buf, "%d:%d\n",
4685 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4687 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4689 static ssize_t intel_iommu_show_address(struct device *dev,
4690 struct device_attribute *attr,
4693 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4694 return sprintf(buf, "%llx\n", iommu->reg_phys);
4696 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4698 static ssize_t intel_iommu_show_cap(struct device *dev,
4699 struct device_attribute *attr,
4702 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4703 return sprintf(buf, "%llx\n", iommu->cap);
4705 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4707 static ssize_t intel_iommu_show_ecap(struct device *dev,
4708 struct device_attribute *attr,
4711 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4712 return sprintf(buf, "%llx\n", iommu->ecap);
4714 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4716 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4717 struct device_attribute *attr,
4720 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4721 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4723 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4725 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4726 struct device_attribute *attr,
4729 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4730 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4731 cap_ndoms(iommu->cap)));
4733 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4735 static struct attribute *intel_iommu_attrs[] = {
4736 &dev_attr_version.attr,
4737 &dev_attr_address.attr,
4739 &dev_attr_ecap.attr,
4740 &dev_attr_domains_supported.attr,
4741 &dev_attr_domains_used.attr,
4745 static struct attribute_group intel_iommu_group = {
4746 .name = "intel-iommu",
4747 .attrs = intel_iommu_attrs,
4750 const struct attribute_group *intel_iommu_groups[] = {
4755 static inline bool has_untrusted_dev(void)
4757 struct pci_dev *pdev = NULL;
4759 for_each_pci_dev(pdev)
4760 if (pdev->untrusted)
4766 static int __init platform_optin_force_iommu(void)
4768 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4771 if (no_iommu || dmar_disabled)
4772 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4775 * If Intel-IOMMU is disabled by default, we will apply identity
4776 * map for all devices except those marked as being untrusted.
4779 iommu_set_default_passthrough(false);
4787 static int __init probe_acpi_namespace_devices(void)
4789 struct dmar_drhd_unit *drhd;
4790 /* To avoid a -Wunused-but-set-variable warning. */
4791 struct intel_iommu *iommu __maybe_unused;
4795 for_each_active_iommu(iommu, drhd) {
4796 for_each_active_dev_scope(drhd->devices,
4797 drhd->devices_cnt, i, dev) {
4798 struct acpi_device_physical_node *pn;
4799 struct iommu_group *group;
4800 struct acpi_device *adev;
4802 if (dev->bus != &acpi_bus_type)
4805 adev = to_acpi_device(dev);
4806 mutex_lock(&adev->physical_node_lock);
4807 list_for_each_entry(pn,
4808 &adev->physical_node_list, node) {
4809 group = iommu_group_get(pn->dev);
4811 iommu_group_put(group);
4815 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4816 ret = iommu_probe_device(pn->dev);
4820 mutex_unlock(&adev->physical_node_lock);
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = tboot_force_iommu() || platform_optin_force_iommu();

	if (iommu_init_mempool()) {
		if (force_on)
			panic("tboot: Failed to initialize iommu memory\n");
		return -ENOMEM;
	}

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (dmar_init_reserved_ranges()) {
		if (force_on)
			panic("tboot: Failed to reserve iommu ranges\n");
		goto out_free_reserved_range;
	}

	if (dmar_map_gfx)
		intel_iommu_gfx_mapped = 1;

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_reserved_range;
	}
	up_write(&dmar_global_lock);

	init_iommu_pm_ops();

	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
		iommu_device_register(&iommu->iommu);
	}
	up_read(&dmar_global_lock);

	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);
	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
			  intel_iommu_cpu_dead);

	down_read(&dmar_global_lock);
	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

out_free_reserved_range:
	put_iova_domain(&reserved_iova_list);
out_free_dmar:
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	iommu_exit_mempool();
	return ret;
}
static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
{
	struct intel_iommu *iommu = opaque;

	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
	return 0;
}

/*
 * NB - intel-iommu lacks any sort of reference counting for the users of
 * dependent devices. If multiple endpoints have intersecting dependent
 * devices, unbinding the driver from any one of them will possibly leave
 * the others unable to operate.
 */
static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
{
	if (!iommu || !dev || !dev_is_pci(dev))
		return;

	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
}
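
/*
 * Clarifying note (not in the original source): pci_for_each_dma_alias()
 * runs the callback for the device's own requester ID and for every alias
 * it may use on the fabric -- for example, a device behind a PCIe-to-PCI
 * bridge can issue DMA with the bridge's ID, so the context entries of
 * those aliases must be torn down as well. That is also why the NB above
 * matters: two endpoints that share an alias share those context entries.
 */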
static void __dmar_remove_one_dev_info(struct device_domain_info *info)
{
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	unsigned long flags;

	assert_spin_locked(&device_domain_lock);

	if (WARN_ON(!info))
		return;

	iommu = info->iommu;
	domain = info->domain;

	if (info->dev) {
		if (dev_is_pci(info->dev) && sm_supported(iommu))
			intel_pasid_tear_down_entry(iommu, info->dev,
					PASID_RID2PASID, false);

		iommu_disable_dev_iotlb(info);
		domain_context_clear(iommu, info->dev);
		intel_pasid_free_table(info->dev);
	}

	unlink_domain_info(info);

	spin_lock_irqsave(&iommu->lock, flags);
	domain_detach_iommu(domain, iommu);
	spin_unlock_irqrestore(&iommu->lock, flags);

	free_devinfo_mem(info);
}
static void dmar_remove_one_dev_info(struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	if (info)
		__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	domain->iommu_coherency = 0;
	domain->iommu_snooping = 0;
	domain->iommu_superpage = 0;
	domain->max_addr = 0;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
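
/*
 * Worked example (illustrative, assuming the width_to_agaw() and
 * agaw_to_level() helpers defined earlier in this file): guest_width = 48
 * is already a supported width, so adjust_width = 48 and
 * agaw = (48 - 30) / 9 = 2, i.e. a 4-level page table; the
 * DEFAULT_DOMAIN_ADDRESS_WIDTH of 57 likewise yields agaw = 3, a
 * 5-level table.
 */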
static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
{
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;
	int ret;

	switch (type) {
	case IOMMU_DOMAIN_DMA:
	/* fallthrough */
	case IOMMU_DOMAIN_UNMANAGED:
		dmar_domain = alloc_domain(0);
		if (!dmar_domain) {
			pr_err("Can't allocate dmar_domain\n");
			return NULL;
		}
		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
			pr_err("Domain initialization failed\n");
			domain_exit(dmar_domain);
			return NULL;
		}

		if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
			ret = init_iova_flush_queue(&dmar_domain->iovad,
						    iommu_flush_iova,
						    iova_entry_free);
			if (ret)
				pr_info("iova flush queue initialization failed\n");
		}

		domain_update_iommu_cap(dmar_domain);

		domain = &dmar_domain->domain;
		domain->geometry.aperture_start = 0;
		domain->geometry.aperture_end   =
				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
		domain->geometry.force_aperture = true;

		return domain;
	case IOMMU_DOMAIN_IDENTITY:
		return &si_domain->domain;
	default:
		return NULL;
	}

	return NULL;
}

static void intel_iommu_domain_free(struct iommu_domain *domain)
{
	if (domain != &si_domain->domain)
		domain_exit(to_dmar_domain(domain));
}
/*
 * Check whether a @domain could be attached to the @dev through the
 * aux-domain attach/detach APIs.
 */
static inline bool
is_aux_domain(struct device *dev, struct iommu_domain *domain)
{
	struct device_domain_info *info = get_domain_info(dev);

	return info && info->auxd_enabled &&
			domain->type == IOMMU_DOMAIN_UNMANAGED;
}

static void auxiliary_link_device(struct dmar_domain *domain,
				  struct device *dev)
{
	struct device_domain_info *info = get_domain_info(dev);

	assert_spin_locked(&device_domain_lock);
	if (WARN_ON(!info))
		return;

	domain->auxd_refcnt++;
	list_add(&domain->auxd, &info->auxiliary_domains);
}

static void auxiliary_unlink_device(struct dmar_domain *domain,
				    struct device *dev)
{
	struct device_domain_info *info = get_domain_info(dev);

	assert_spin_locked(&device_domain_lock);
	if (WARN_ON(!info))
		return;

	list_del(&domain->auxd);
	domain->auxd_refcnt--;

	if (!domain->auxd_refcnt && domain->default_pasid > 0)
		ioasid_free(domain->default_pasid);
}
static int aux_domain_add_dev(struct dmar_domain *domain,
			      struct device *dev)
{
	int ret;
	u8 bus, devfn;
	unsigned long flags;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (domain->default_pasid <= 0) {
		int pasid;

		/* No private data needed for the default pasid */
		pasid = ioasid_alloc(NULL, PASID_MIN,
				     pci_max_pasids(to_pci_dev(dev)) - 1,
				     NULL);
		if (pasid == INVALID_IOASID) {
			pr_err("Can't allocate default pasid\n");
			return -ENODEV;
		}
		domain->default_pasid = pasid;
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	/*
	 * iommu->lock must be held to attach domain to iommu and setup the
	 * pasid entry for second level translation.
	 */
	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	if (ret)
		goto attach_failed;

	/* Setup the PASID entry for mediated devices: */
	if (domain_use_first_level(domain))
		ret = domain_setup_first_level(iommu, domain, dev,
					       domain->default_pasid);
	else
		ret = intel_pasid_setup_second_level(iommu, domain, dev,
						     domain->default_pasid);
	if (ret)
		goto table_failed;
	spin_unlock(&iommu->lock);

	auxiliary_link_device(domain, dev);

	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;

table_failed:
	domain_detach_iommu(domain, iommu);
attach_failed:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);
	if (!domain->auxd_refcnt && domain->default_pasid > 0)
		ioasid_free(domain->default_pasid);

	return ret;
}
static void aux_domain_remove_dev(struct dmar_domain *domain,
				  struct device *dev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;

	if (!is_aux_domain(dev, &domain->domain))
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	iommu = info->iommu;

	auxiliary_unlink_device(domain, dev);

	spin_lock(&iommu->lock);
	intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
	domain_detach_iommu(domain, iommu);
	spin_unlock(&iommu->lock);

	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static int prepare_domain_attach_device(struct iommu_domain *domain,
					struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		dev_err(dev, "%s: iommu width (%d) is not "
			"sufficient for the mapped address (%llx)\n",
			__func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return 0;
}
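
/*
 * Illustrative example (not from the original source): a domain created
 * with DEFAULT_DOMAIN_ADDRESS_WIDTH (57 bits, agaw 3) that is attached to
 * hardware whose iommu->agaw is 2 (48 bits, 4-level tables) first has its
 * gaw clamped via cap_mgaw(), then the while loop above pops the unused
 * top-level page table once so dmar_domain->agaw matches the hardware.
 */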
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	int ret;

	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
	    device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
		return -EPERM;
	}

	if (is_aux_domain(dev, domain))
		return -EPERM;

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain)
			dmar_remove_one_dev_info(dev);
	}

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	return domain_add_dev_info(to_dmar_domain(domain), dev);
}

static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
					 struct device *dev)
{
	int ret;

	if (!is_aux_domain(dev, domain))
		return -EPERM;

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	return aux_domain_add_dev(to_dmar_domain(domain), dev);
}

static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(dev);
}

static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
					  struct device *dev)
{
	aux_domain_remove_dev(to_dmar_domain(domain), dev);
}
/*
 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
 * VT-d granularity. Invalidation is typically included in the unmap operation
 * as a result of DMA or VFIO unmap. However, for assigned devices the guest
 * owns the first level page tables. Invalidations of translation caches in
 * the guest are trapped and passed down to the host.
 *
 * vIOMMU in the guest will only expose first level page tables, therefore
 * we do not support IOTLB granularity for request without PASID (second level).
 *
 * For example, to find the VT-d granularity encoding for IOTLB
 * type and page selective granularity within PASID:
 * X: indexed by iommu cache type
 * Y: indexed by enum iommu_inv_granularity
 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
 */

static const int
inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
	/*
	 * PASID based IOTLB invalidation: PASID selective (per PASID),
	 * page selective (address granularity)
	 */
	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
	/* PASID based dev TLBs */
	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
	/* PASID cache */
	{-EINVAL, -EINVAL, -EINVAL}
};

static inline int to_vtd_granularity(int type, int granu)
{
	return inv_type_granu_table[type][granu];
}
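
/*
 * Worked lookup (illustrative): for an IOTLB invalidation (cache type
 * bit 0) at address granularity, to_vtd_granularity() reads row 0, column
 * IOMMU_INV_GRANU_ADDR of the table above and returns QI_GRAN_PSI_PASID;
 * a domain-selective request on the same row returns -EINVAL, since
 * second-level (no-PASID) IOTLB granularity is not supported here.
 */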
static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
{
	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;

	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
	 * granu size in contiguous memory.
	 */
	return order_base_2(nr_pages);
}
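
/*
 * Worked example (illustrative): granu_size = 4KiB and nr_granules = 512
 * cover 2MiB, so nr_pages = 512 and order_base_2(512) = 9, matching the
 * "9 for 2MB" encoding above. Note that order_base_2() rounds up, so a
 * non-power-of-two range (e.g. 5 pages) is widened to the next order
 * (8 pages, order 3).
 */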
#ifdef CONFIG_INTEL_IOMMU_SVM
static int
intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
			   struct iommu_cache_invalidate_info *inv_info)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	int cache_type;
	u8 bus, devfn;
	u16 did, sid;
	int ret = 0;
	u64 size = 0;

	if (!inv_info || !dmar_domain ||
	    inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
		return -EINVAL;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);
	info = get_domain_info(dev);
	if (!info) {
		ret = -EINVAL;
		goto out_unlock;
	}
	did = dmar_domain->iommu_did[iommu->seq_id];
	sid = PCI_DEVID(bus, devfn);

	/* Size is only valid in address selective invalidation */
	if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
		size = to_vtd_size(inv_info->addr_info.granule_size,
				   inv_info->addr_info.nb_granules);

	for_each_set_bit(cache_type,
			 (unsigned long *)&inv_info->cache,
			 IOMMU_CACHE_INV_TYPE_NR) {
		int granu = 0;
		u64 pasid = 0;

		granu = to_vtd_granularity(cache_type, inv_info->granularity);
		if (granu == -EINVAL) {
			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
					   cache_type, inv_info->granularity);
			break;
		}

		/*
		 * PASID is stored in different locations based on the
		 * granularity.
		 */
		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
		    (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
			pasid = inv_info->pasid_info.pasid;
		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
			 (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
			pasid = inv_info->addr_info.pasid;

		switch (BIT(cache_type)) {
		case IOMMU_CACHE_INV_TYPE_IOTLB:
			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
			    size &&
			    (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
				pr_err_ratelimited("Address out of range, 0x%llx, size order %llu\n",
						   inv_info->addr_info.addr, size);
				ret = -ERANGE;
				goto out_unlock;
			}

			/*
			 * If granu is PASID-selective, address is ignored.
			 * We use npages = -1 to indicate that.
			 */
			qi_flush_piotlb(iommu, did, pasid,
					mm_to_dma_pfn(inv_info->addr_info.addr),
					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
					inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);

			/*
			 * Always flush device IOTLB if ATS is enabled. vIOMMU
			 * in the guest may assume IOTLB flush is inclusive,
			 * which is more efficient.
			 */
			if (info->ats_enabled)
				qi_flush_dev_iotlb_pasid(iommu, sid,
						info->pfsid, pasid,
						info->ats_qdep,
						inv_info->addr_info.addr,
						size, granu);
			break;
		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
			if (info->ats_enabled)
				qi_flush_dev_iotlb_pasid(iommu, sid,
						info->pfsid, pasid,
						info->ats_qdep,
						inv_info->addr_info.addr,
						size, granu);
			else
				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
			break;
		default:
			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
					    cache_type);
			ret = -EINVAL;
		}
	}
out_unlock:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
#endif
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot, gfp_t gfp)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
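
/*
 * Illustrative example (not in the original source): hpa = 0x1800 with
 * size = 0x1000 touches bytes 0x1800-0x27ff, which straddles two 4KiB
 * pages, so aligned_nrpages() returns 2 here even though the raw size is
 * only one page's worth.
 */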
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size,
				struct iommu_iotlb_gather *gather)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, npages, !freelist, 0);

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
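
/*
 * Illustrative example (not in the original source): if the PTE found
 * above sits at level 2 (a 2MiB superpage), level_to_offset_bits(2) is 9,
 * so a 4KiB unmap request is widened to VTD_PAGE_SIZE << 9 = 2MiB and the
 * whole superpage is unmapped and flushed.
 */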
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte && dma_pte_present(pte))
		phys = dma_pte_addr(pte) +
			(iova & (BIT_MASK(level_to_offset_bits(level) +
						VTD_PAGE_SHIFT) - 1));

	return phys;
}
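
/*
 * Worked example (illustrative): with a 2MiB superpage mapping
 * iova 0x200000 -> phys 0x40000000, a lookup of iova 0x203000 finds the
 * level-2 PTE, and phys = 0x40000000 + (0x203000 & (BIT_MASK(9 + 12) - 1))
 * = 0x40003000.
 */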
static inline bool scalable_mode_support(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool ret = true;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!sm_supported(iommu)) {
			ret = false;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}

static inline bool iommu_pasid_support(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool ret = true;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!pasid_supported(iommu)) {
			ret = false;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}

static inline bool nested_mode_support(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool ret = true;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
			ret = false;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}
static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}

static struct iommu_device *intel_iommu_probe_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return ERR_PTR(-ENODEV);

	if (translation_pre_enabled(iommu))
		dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;

	return &iommu->iommu;
}
static void intel_iommu_release_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	dmar_remove_one_dev_info(dev);

	set_dma_ops(dev, NULL);
}

static void intel_iommu_probe_finalize(struct device *dev)
{
	struct iommu_domain *domain;

	domain = iommu_get_domain_for_dev(dev);
	if (device_needs_bounce(dev))
		set_dma_ops(dev, &bounce_dma_ops);
	else if (domain && domain->type == IOMMU_DOMAIN_DMA)
		set_dma_ops(dev, &intel_dma_ops);
	else
		set_dma_ops(dev, NULL);
}
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	down_read(&dmar_global_lock);
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			struct iommu_resv_region *resv;
			enum iommu_resv_type type;
			size_t length;

			if (i_dev != device &&
			    !is_downstream_to_pci_bridge(device, i_dev))
				continue;

			length = rmrr->end_address - rmrr->base_address + 1;

			type = device_rmrr_is_relaxable(device) ?
				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;

			resv = iommu_alloc_resv_region(rmrr->base_address,
						       length, prot, type);
			if (!resv)
				break;

			list_add_tail(&resv->list, head);
		}
	}
	up_read(&dmar_global_lock);

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
	if (dev_is_pci(device)) {
		struct pci_dev *pdev = to_pci_dev(device);

		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
						IOMMU_RESV_DIRECT_RELAXABLE);
			if (reg)
				list_add_tail(&reg->list, head);
		}
	}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = find_domain(dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = get_domain_info(dev);
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		ctx_lo |= CONTEXT_PASIDE;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu,
					   domain->iommu_did[iommu->seq_id],
					   PCI_DEVID(info->bus, info->devfn),
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	ret = 0;

 out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
static void intel_iommu_apply_resv_region(struct device *dev,
					  struct iommu_domain *domain,
					  struct iommu_resv_region *region)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long start, end;

	start = IOVA_PFN(region->start);
	end   = IOVA_PFN(region->start + region->length - 1);

	WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
}

static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	if (dev_is_pci(dev))
		return pci_device_group(dev);
	return generic_device_group(dev);
}
#ifdef CONFIG_INTEL_IOMMU_SVM
struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */
static int intel_iommu_enable_auxd(struct device *dev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	u8 bus, devfn;
	int ret;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!sm_supported(iommu) || !pasid_supported(iommu))
		return -EINVAL;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret)
		return -ENODEV;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	info->auxd_enabled = 1;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}

static int intel_iommu_disable_auxd(struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	if (!WARN_ON(!info))
		info->auxd_enabled = 0;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
/*
 * A PCI express designated vendor specific extended capability is defined
 * in the section 3.7 of Intel scalable I/O virtualization technical spec
 * for system software and tools to detect endpoint devices supporting the
 * Intel scalable IO virtualization without host driver dependency.
 *
 * Returns the address of the matching extended capability structure within
 * the device's PCI configuration space or 0 if the device does not support
 * it.
 */
static int siov_find_pci_dvsec(struct pci_dev *pdev)
{
	int pos;
	u16 vendor, id;

	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
	while (pos) {
		pci_read_config_word(pdev, pos + 4, &vendor);
		pci_read_config_word(pdev, pos + 8, &id);
		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
			return pos;

		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
	}

	return 0;
}
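
/*
 * Clarifying note (not in the original source): within each Designated
 * Vendor Specific Extended Capability (extended capability ID 0x23), the
 * DVSEC vendor ID lives in the 16 bits at offset 4 and the DVSEC ID in the
 * 16 bits at offset 8, which is what the two config reads above fetch; a
 * match here is vendor 0x8086 (PCI_VENDOR_ID_INTEL) with DVSEC ID 5, per
 * the Intel Scalable IOV spec referenced above.
 */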
static bool
intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX) {
		int ret;

		if (!dev_is_pci(dev) || dmar_disabled ||
		    !scalable_mode_support() || !iommu_pasid_support())
			return false;

		ret = pci_pasid_features(to_pci_dev(dev));
		if (ret < 0)
			return false;

		return !!siov_find_pci_dvsec(to_pci_dev(dev));
	}

	if (feat == IOMMU_DEV_FEAT_SVA) {
		struct device_domain_info *info = get_domain_info(dev);

		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
			info->pasid_supported && info->pri_supported &&
			info->ats_supported;
	}

	return false;
}

static int
intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX)
		return intel_iommu_enable_auxd(dev);

	if (feat == IOMMU_DEV_FEAT_SVA) {
		struct device_domain_info *info = get_domain_info(dev);

		if (!info)
			return -EINVAL;

		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
			return 0;
	}

	return -ENODEV;
}

static int
intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX)
		return intel_iommu_disable_auxd(dev);

	return -ENODEV;
}

static bool
intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
{
	struct device_domain_info *info = get_domain_info(dev);

	if (feat == IOMMU_DEV_FEAT_AUX)
		return scalable_mode_support() && info && info->auxd_enabled;

	return false;
}
static int
intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	return dmar_domain->default_pasid > 0 ?
			dmar_domain->default_pasid : -EINVAL;
}

static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
					   struct device *dev)
{
	return attach_deferred(dev);
}
static int
intel_iommu_domain_set_attr(struct iommu_domain *domain,
			    enum iommu_attr attr, void *data)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long flags;
	int ret = 0;

	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
		return -EINVAL;

	switch (attr) {
	case DOMAIN_ATTR_NESTING:
		spin_lock_irqsave(&device_domain_lock, flags);
		if (nested_mode_support() &&
		    list_empty(&dmar_domain->devices)) {
			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
		} else {
			ret = -ENODEV;
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.domain_set_attr	= intel_iommu_domain_set_attr,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.aux_attach_dev		= intel_iommu_aux_attach_device,
	.aux_detach_dev		= intel_iommu_aux_detach_device,
	.aux_get_pasid		= intel_iommu_aux_get_pasid,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= generic_iommu_put_resv_regions,
	.apply_resv_region	= intel_iommu_apply_resv_region,
	.device_group		= intel_iommu_device_group,
	.dev_has_feat		= intel_iommu_dev_has_feat,
	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.cache_invalidate	= intel_iommu_sva_invalidate,
	.sva_bind_gpasid	= intel_svm_bind_gpasid,
	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
	.sva_bind		= intel_svm_bind,
	.sva_unbind		= intel_svm_unbind,
	.sva_get_pasid		= intel_svm_get_pasid,
#endif
};
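
/*
 * Note (added for clarity, not in the original source): the IOMMU core
 * dispatches through this table once intel_iommu_init() has called
 * bus_set_iommu(&pci_bus_type, &intel_iommu_ops); e.g. a VFIO or DMA-API
 * iommu_map() call lands in intel_iommu_map() above, and hot-added PCI
 * devices are routed to intel_iommu_probe_device().
 */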
static void quirk_iommu_igfx(struct pci_dev *dev)
{
	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}