1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
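/*
 * Illustrative sketch, not part of the driver: because ~0xFFFUL has every
 * bit at or above bit 12 (VTD_PAGE_SHIFT) set, any power-of-two size of
 * 4KiB or larger tests as supported against this bitmap: 4KiB, 2MiB and
 * 1GiB all match.
 */
static inline bool example_pgsize_advertised(unsigned long size)
{
	/* true for 0x1000 (4KiB), 0x200000 (2MiB), 0x40000000 (1GiB), ... */
	return !!(INTEL_IOMMU_PGSIZES & size);
}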
106 static inline int agaw_to_level(int agaw)
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(unsigned long pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline unsigned long level_mask(int level)
133 return -1UL << level_to_offset_bits(level);
136 static inline unsigned long level_size(int level)
138 return 1UL << level_to_offset_bits(level);
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
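/*
 * Worked example (illustration only): for a 48-bit address width,
 * width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2 and agaw_to_width(2) =
 * min(30 + 2 * 9, 64) = 48.  A level-2 entry covers level_size(2) =
 * 1 << 9 = 512 pages of 4KiB, i.e. a 2MiB superpage, and lvl_to_nr_pages(2)
 * returns the same 512.
 */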
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
179 * (used when the kernel is launched with TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
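/*
 * Illustrative sketch, not part of the driver: the helpers above are the
 * building blocks later used by domain_context_mapping_one() to fill a
 * legacy-mode context entry, roughly in this order.
 */
static inline void example_fill_legacy_context(struct context_entry *ce,
					       u16 did, phys_addr_t pgd_pa,
					       int agaw)
{
	context_clear_entry(ce);
	context_set_domain_id(ce, did);
	context_set_address_root(ce, pgd_pa);
	context_set_address_width(ce, agaw);
	context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(ce);
	context_set_present(ce);
}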
291 * This domain is a static identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305 * been chosen to use a private domain. We should avoid using the
306 * map/unmap/iova_to_phys APIs on it.
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
310 #define for_each_domain_iommu(idx, domain) \
311 for (idx = 0; idx < g_num_of_iommus; idx++) \
312 if (domain->iommu_refcnt[idx])
314 struct dmar_rmrr_unit {
315 struct list_head list; /* list of rmrr units */
316 struct acpi_dmar_header *hdr; /* ACPI header */
317 u64 base_address; /* reserved base address*/
318 u64 end_address; /* reserved end address */
319 struct dmar_dev_scope *devices; /* target devices */
320 int devices_cnt; /* target device count */
323 struct dmar_atsr_unit {
324 struct list_head list; /* list of ATSR units */
325 struct acpi_dmar_header *hdr; /* ACPI header */
326 struct dmar_dev_scope *devices; /* target devices */
327 int devices_cnt; /* target device count */
328 u8 include_all:1; /* include all ports */
331 static LIST_HEAD(dmar_atsr_units);
332 static LIST_HEAD(dmar_rmrr_units);
334 #define for_each_rmrr_units(rmrr) \
335 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
337 /* bitmap for indexing intel_iommus */
338 static int g_num_of_iommus;
340 static void domain_exit(struct dmar_domain *domain);
341 static void domain_remove_dev_info(struct dmar_domain *domain);
342 static void dmar_remove_one_dev_info(struct device *dev);
343 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
344 static void domain_context_clear(struct intel_iommu *iommu,
346 static int domain_detach_iommu(struct dmar_domain *domain,
347 struct intel_iommu *iommu);
348 static bool device_is_rmrr_locked(struct device *dev);
349 static int intel_iommu_attach_device(struct iommu_domain *domain,
351 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
357 int dmar_disabled = 1;
358 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
360 #ifdef INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
361 int intel_iommu_sm = 1;
364 #endif /* INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
366 int intel_iommu_enabled = 0;
367 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
369 static int dmar_map_gfx = 1;
370 static int dmar_forcedac;
371 static int intel_iommu_strict;
372 static int intel_iommu_superpage = 1;
373 static int iommu_identity_mapping;
374 static int intel_no_bounce;
376 #define IDENTMAP_ALL 1
377 #define IDENTMAP_GFX 2
378 #define IDENTMAP_AZALIA 4
380 int intel_iommu_gfx_mapped;
381 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
383 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
384 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
385 static DEFINE_SPINLOCK(device_domain_lock);
386 static LIST_HEAD(device_domain_list);
388 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
389 to_pci_dev(d)->untrusted)
392 * Iterate over elements in device_domain_list and call the specified
393 * callback @fn against each element.
395 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
396 void *data), void *data)
400 struct device_domain_info *info;
402 spin_lock_irqsave(&device_domain_lock, flags);
403 list_for_each_entry(info, &device_domain_list, global) {
404 ret = fn(info, data);
406 spin_unlock_irqrestore(&device_domain_lock, flags);
410 spin_unlock_irqrestore(&device_domain_lock, flags);
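/*
 * Hypothetical usage sketch (not part of the driver): a callback passed to
 * for_each_device_domain() above is called once per tracked device under
 * device_domain_lock; returning non-zero stops the walk early.  This
 * example just counts the entries, e.g.:
 *
 *	unsigned int n = 0;
 *	for_each_device_domain(example_count_device_domain_info, &n);
 */
static int __maybe_unused
example_count_device_domain_info(struct device_domain_info *info, void *data)
{
	unsigned int *count = data;

	(*count)++;
	return 0;	/* keep walking the list */
}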
415 const struct iommu_ops intel_iommu_ops;
417 static bool translation_pre_enabled(struct intel_iommu *iommu)
419 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
422 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
424 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
427 static void init_translation_status(struct intel_iommu *iommu)
431 gsts = readl(iommu->reg + DMAR_GSTS_REG);
432 if (gsts & DMA_GSTS_TES)
433 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
436 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
437 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
439 return container_of(dom, struct dmar_domain, domain);
442 static int __init intel_iommu_setup(char *str)
447 if (!strncmp(str, "on", 2)) {
449 pr_info("IOMMU enabled\n");
450 } else if (!strncmp(str, "off", 3)) {
452 no_platform_optin = 1;
453 pr_info("IOMMU disabled\n");
454 } else if (!strncmp(str, "igfx_off", 8)) {
456 pr_info("Disable GFX device mapping\n");
457 } else if (!strncmp(str, "forcedac", 8)) {
458 pr_info("Forcing DAC for PCI devices\n");
460 } else if (!strncmp(str, "strict", 6)) {
461 pr_info("Disable batched IOTLB flush\n");
462 intel_iommu_strict = 1;
463 } else if (!strncmp(str, "sp_off", 6)) {
464 pr_info("Disable supported super page\n");
465 intel_iommu_superpage = 0;
466 } else if (!strncmp(str, "sm_on", 5)) {
467 pr_info("Intel-IOMMU: scalable mode supported\n");
469 } else if (!strncmp(str, "tboot_noforce", 13)) {
471 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
472 intel_iommu_tboot_noforce = 1;
473 } else if (!strncmp(str, "nobounce", 8)) {
474 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
478 str += strcspn(str, ",");
484 __setup("intel_iommu=", intel_iommu_setup);
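/*
 * Example (illustration only): booting with
 *
 *	intel_iommu=on,sm_on,strict
 *
 * is parsed by intel_iommu_setup() above one comma-separated token at a
 * time, enabling the IOMMU, scalable mode and strict (unbatched) IOTLB
 * flushing.
 */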
486 static struct kmem_cache *iommu_domain_cache;
487 static struct kmem_cache *iommu_devinfo_cache;
489 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
491 struct dmar_domain **domains;
494 domains = iommu->domains[idx];
498 return domains[did & 0xff];
501 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
502 struct dmar_domain *domain)
504 struct dmar_domain **domains;
507 if (!iommu->domains[idx]) {
508 size_t size = 256 * sizeof(struct dmar_domain *);
509 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
512 domains = iommu->domains[idx];
513 if (WARN_ON(!domains))
516 domains[did & 0xff] = domain;
519 void *alloc_pgtable_page(int node)
524 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
526 vaddr = page_address(page);
530 void free_pgtable_page(void *vaddr)
532 free_page((unsigned long)vaddr);
535 static inline void *alloc_domain_mem(void)
537 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
540 static void free_domain_mem(void *vaddr)
542 kmem_cache_free(iommu_domain_cache, vaddr);
545 static inline void * alloc_devinfo_mem(void)
547 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
550 static inline void free_devinfo_mem(void *vaddr)
552 kmem_cache_free(iommu_devinfo_cache, vaddr);
555 static inline int domain_type_is_si(struct dmar_domain *domain)
557 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
560 static inline int domain_pfn_supported(struct dmar_domain *domain,
563 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
565 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
568 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
573 sagaw = cap_sagaw(iommu->cap);
574 for (agaw = width_to_agaw(max_gaw);
576 if (test_bit(agaw, &sagaw))
584 * Calculate max SAGAW for each iommu.
586 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
588 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
592 * Calculate agaw for each iommu.
593 * "SAGAW" may be different across iommus; use a default agaw, and
594 * fall back to a smaller supported agaw for iommus that don't support the default agaw.
596 int iommu_calculate_agaw(struct intel_iommu *iommu)
598 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
601 /* This function only returns a single iommu in a domain */
602 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
606 /* si_domain and vm domain should not get here. */
607 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
610 for_each_domain_iommu(iommu_id, domain)
613 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
616 return g_iommus[iommu_id];
619 static void domain_update_iommu_coherency(struct dmar_domain *domain)
621 struct dmar_drhd_unit *drhd;
622 struct intel_iommu *iommu;
626 domain->iommu_coherency = 1;
628 for_each_domain_iommu(i, domain) {
630 if (!ecap_coherent(g_iommus[i]->ecap)) {
631 domain->iommu_coherency = 0;
638 /* No hardware attached; use lowest common denominator */
640 for_each_active_iommu(iommu, drhd) {
641 if (!ecap_coherent(iommu->ecap)) {
642 domain->iommu_coherency = 0;
649 static int domain_update_iommu_snooping(struct intel_iommu *skip)
651 struct dmar_drhd_unit *drhd;
652 struct intel_iommu *iommu;
656 for_each_active_iommu(iommu, drhd) {
658 if (!ecap_sc_support(iommu->ecap)) {
669 static int domain_update_iommu_superpage(struct intel_iommu *skip)
671 struct dmar_drhd_unit *drhd;
672 struct intel_iommu *iommu;
675 if (!intel_iommu_superpage) {
679 /* set iommu_superpage to the smallest common denominator */
681 for_each_active_iommu(iommu, drhd) {
683 mask &= cap_super_page_val(iommu->cap);
693 /* Some capabilities may be different across iommus */
694 static void domain_update_iommu_cap(struct dmar_domain *domain)
696 domain_update_iommu_coherency(domain);
697 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
698 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
701 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
704 struct root_entry *root = &iommu->root_entry[bus];
705 struct context_entry *context;
709 if (sm_supported(iommu)) {
717 context = phys_to_virt(*entry & VTD_PAGE_MASK);
719 unsigned long phy_addr;
723 context = alloc_pgtable_page(iommu->node);
727 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
728 phy_addr = virt_to_phys((void *)context);
729 *entry = phy_addr | 1;
730 __iommu_flush_cache(iommu, entry, sizeof(*entry));
732 return &context[devfn];
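/*
 * Worked example (legacy mode, illustration only): for a device at
 * 0000:1a:02.0, bus = 0x1a and devfn = PCI_DEVFN(2, 0) = 0x10, so
 * iommu_context_addr() resolves &iommu->root_entry[0x1a] and then returns
 * entry 0x10 of the context table that root entry points to, allocating
 * the context table on demand when the caller asks for it.
 */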
735 static int iommu_dummy(struct device *dev)
737 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
741 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
742 * sub-hierarchy of a candidate PCI-PCI bridge
743 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
744 * @bridge: the candidate PCI-PCI bridge
746 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
749 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
751 struct pci_dev *pdev, *pbridge;
753 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
756 pdev = to_pci_dev(dev);
757 pbridge = to_pci_dev(bridge);
759 if (pbridge->subordinate &&
760 pbridge->subordinate->number <= pdev->bus->number &&
761 pbridge->subordinate->busn_res.end >= pdev->bus->number)
767 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
769 struct dmar_drhd_unit *drhd = NULL;
770 struct intel_iommu *iommu;
772 struct pci_dev *pdev = NULL;
776 if (iommu_dummy(dev))
779 if (dev_is_pci(dev)) {
780 struct pci_dev *pf_pdev;
782 pdev = to_pci_dev(dev);
785 /* VMD child devices currently cannot be handled individually */
786 if (is_vmd(pdev->bus))
790 /* VFs aren't listed in scope tables; we need to look up
791 * the PF instead to find the IOMMU. */
792 pf_pdev = pci_physfn(pdev);
794 segment = pci_domain_nr(pdev->bus);
795 } else if (has_acpi_companion(dev))
796 dev = &ACPI_COMPANION(dev)->dev;
799 for_each_active_iommu(iommu, drhd) {
800 if (pdev && segment != drhd->segment)
803 for_each_active_dev_scope(drhd->devices,
804 drhd->devices_cnt, i, tmp) {
806 /* For a VF use its original BDF# not that of the PF
807 * which we used for the IOMMU lookup. Strictly speaking
808 * we could do this for all PCI devices; we only need to
809 * get the BDF# from the scope table for ACPI matches. */
810 if (pdev && pdev->is_virtfn)
813 *bus = drhd->devices[i].bus;
814 *devfn = drhd->devices[i].devfn;
818 if (is_downstream_to_pci_bridge(dev, tmp))
822 if (pdev && drhd->include_all) {
824 *bus = pdev->bus->number;
825 *devfn = pdev->devfn;
836 static void domain_flush_cache(struct dmar_domain *domain,
837 void *addr, int size)
839 if (!domain->iommu_coherency)
840 clflush_cache_range(addr, size);
843 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
845 struct context_entry *context;
849 spin_lock_irqsave(&iommu->lock, flags);
850 context = iommu_context_addr(iommu, bus, devfn, 0);
852 ret = context_present(context);
853 spin_unlock_irqrestore(&iommu->lock, flags);
857 static void free_context_table(struct intel_iommu *iommu)
861 struct context_entry *context;
863 spin_lock_irqsave(&iommu->lock, flags);
864 if (!iommu->root_entry) {
867 for (i = 0; i < ROOT_ENTRY_NR; i++) {
868 context = iommu_context_addr(iommu, i, 0, 0);
870 free_pgtable_page(context);
872 if (!sm_supported(iommu))
875 context = iommu_context_addr(iommu, i, 0x80, 0);
877 free_pgtable_page(context);
880 free_pgtable_page(iommu->root_entry);
881 iommu->root_entry = NULL;
883 spin_unlock_irqrestore(&iommu->lock, flags);
886 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
887 unsigned long pfn, int *target_level)
889 struct dma_pte *parent, *pte;
890 int level = agaw_to_level(domain->agaw);
893 BUG_ON(!domain->pgd);
895 if (!domain_pfn_supported(domain, pfn))
896 /* Address beyond IOMMU's addressing capabilities. */
899 parent = domain->pgd;
904 offset = pfn_level_offset(pfn, level);
905 pte = &parent[offset];
906 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
908 if (level == *target_level)
911 if (!dma_pte_present(pte)) {
914 tmp_page = alloc_pgtable_page(domain->nid);
919 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
920 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
921 if (cmpxchg64(&pte->val, 0ULL, pteval))
922 /* Someone else set it while we were thinking; use theirs. */
923 free_pgtable_page(tmp_page);
925 domain_flush_cache(domain, pte, sizeof(*pte));
930 parent = phys_to_virt(dma_pte_addr(pte));
935 *target_level = level;
940 /* return address's pte at specific level */
941 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
943 int level, int *large_page)
945 struct dma_pte *parent, *pte;
946 int total = agaw_to_level(domain->agaw);
949 parent = domain->pgd;
950 while (level <= total) {
951 offset = pfn_level_offset(pfn, total);
952 pte = &parent[offset];
956 if (!dma_pte_present(pte)) {
961 if (dma_pte_superpage(pte)) {
966 parent = phys_to_virt(dma_pte_addr(pte));
972 /* clear last level pte; a tlb flush should follow */
973 static void dma_pte_clear_range(struct dmar_domain *domain,
974 unsigned long start_pfn,
975 unsigned long last_pfn)
977 unsigned int large_page;
978 struct dma_pte *first_pte, *pte;
980 BUG_ON(!domain_pfn_supported(domain, start_pfn));
981 BUG_ON(!domain_pfn_supported(domain, last_pfn));
982 BUG_ON(start_pfn > last_pfn);
984 /* we don't need lock here; nobody else touches the iova range */
987 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
989 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
994 start_pfn += lvl_to_nr_pages(large_page);
996 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
998 domain_flush_cache(domain, first_pte,
999 (void *)pte - (void *)first_pte);
1001 } while (start_pfn && start_pfn <= last_pfn);
1004 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1005 int retain_level, struct dma_pte *pte,
1006 unsigned long pfn, unsigned long start_pfn,
1007 unsigned long last_pfn)
1009 pfn = max(start_pfn, pfn);
1010 pte = &pte[pfn_level_offset(pfn, level)];
1013 unsigned long level_pfn;
1014 struct dma_pte *level_pte;
1016 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1019 level_pfn = pfn & level_mask(level);
1020 level_pte = phys_to_virt(dma_pte_addr(pte));
1023 dma_pte_free_level(domain, level - 1, retain_level,
1024 level_pte, level_pfn, start_pfn,
1029 * Free the page table if we're below the level we want to
1030 * retain and the range covers the entire table.
1032 if (level < retain_level && !(start_pfn > level_pfn ||
1033 last_pfn < level_pfn + level_size(level) - 1)) {
1035 domain_flush_cache(domain, pte, sizeof(*pte));
1036 free_pgtable_page(level_pte);
1039 pfn += level_size(level);
1040 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1044 * clear last level (leaf) ptes and free page table pages below the
1045 * level we wish to keep intact.
1047 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1048 unsigned long start_pfn,
1049 unsigned long last_pfn,
1052 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1053 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1054 BUG_ON(start_pfn > last_pfn);
1056 dma_pte_clear_range(domain, start_pfn, last_pfn);
1058 /* We don't need lock here; nobody else touches the iova range */
1059 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1060 domain->pgd, 0, start_pfn, last_pfn);
1063 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1064 free_pgtable_page(domain->pgd);
1069 /* When a page at a given level is being unlinked from its parent, we don't
1070 need to *modify* it at all. All we need to do is make a list of all the
1071 pages which can be freed just as soon as we've flushed the IOTLB and we
1072 know the hardware page-walk will no longer touch them.
1073 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1075 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1076 int level, struct dma_pte *pte,
1077 struct page *freelist)
1081 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1082 pg->freelist = freelist;
1088 pte = page_address(pg);
1090 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1091 freelist = dma_pte_list_pagetables(domain, level - 1,
1094 } while (!first_pte_in_page(pte));
1099 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1100 struct dma_pte *pte, unsigned long pfn,
1101 unsigned long start_pfn,
1102 unsigned long last_pfn,
1103 struct page *freelist)
1105 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1107 pfn = max(start_pfn, pfn);
1108 pte = &pte[pfn_level_offset(pfn, level)];
1111 unsigned long level_pfn;
1113 if (!dma_pte_present(pte))
1116 level_pfn = pfn & level_mask(level);
1118 /* If range covers entire pagetable, free it */
1119 if (start_pfn <= level_pfn &&
1120 last_pfn >= level_pfn + level_size(level) - 1) {
1121 /* These subordinate page tables are going away entirely. Don't
1122 bother to clear them; we're just going to *free* them. */
1123 if (level > 1 && !dma_pte_superpage(pte))
1124 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1130 } else if (level > 1) {
1131 /* Recurse down into a level that isn't *entirely* obsolete */
1132 freelist = dma_pte_clear_level(domain, level - 1,
1133 phys_to_virt(dma_pte_addr(pte)),
1134 level_pfn, start_pfn, last_pfn,
1138 pfn += level_size(level);
1139 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1142 domain_flush_cache(domain, first_pte,
1143 (void *)++last_pte - (void *)first_pte);
1148 /* We can't just free the pages because the IOMMU may still be walking
1149 the page tables, and may have cached the intermediate levels. The
1150 pages can only be freed after the IOTLB flush has been done. */
1151 static struct page *domain_unmap(struct dmar_domain *domain,
1152 unsigned long start_pfn,
1153 unsigned long last_pfn)
1155 struct page *freelist;
1157 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1158 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1159 BUG_ON(start_pfn > last_pfn);
1161 /* we don't need lock here; nobody else touches the iova range */
1162 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1163 domain->pgd, 0, start_pfn, last_pfn, NULL);
1166 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1167 struct page *pgd_page = virt_to_page(domain->pgd);
1168 pgd_page->freelist = freelist;
1169 freelist = pgd_page;
1177 static void dma_free_pagelist(struct page *freelist)
1181 while ((pg = freelist)) {
1182 freelist = pg->freelist;
1183 free_pgtable_page(page_address(pg));
1187 static void iova_entry_free(unsigned long data)
1189 struct page *freelist = (struct page *)data;
1191 dma_free_pagelist(freelist);
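/*
 * Ordering sketch (illustration only): callers tear mappings down in three
 * steps so the hardware never walks freed page tables:
 *
 *	freelist = domain_unmap(domain, start_pfn, last_pfn);
 *	... flush the IOTLB for the affected range ...
 *	dma_free_pagelist(freelist);
 *
 * domain_unmap() threads the no-longer-needed page-table pages through
 * page->freelist, and dma_free_pagelist() walks that chain.
 */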
1194 /* iommu handling */
1195 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1197 struct root_entry *root;
1198 unsigned long flags;
1200 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1202 pr_err("Allocating root entry for %s failed\n",
1207 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1209 spin_lock_irqsave(&iommu->lock, flags);
1210 iommu->root_entry = root;
1211 spin_unlock_irqrestore(&iommu->lock, flags);
1216 static void iommu_set_root_entry(struct intel_iommu *iommu)
1222 addr = virt_to_phys(iommu->root_entry);
1223 if (sm_supported(iommu))
1224 addr |= DMA_RTADDR_SMT;
1226 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1227 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1229 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1231 /* Make sure hardware completes it */
1232 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1233 readl, (sts & DMA_GSTS_RTPS), sts);
1235 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1238 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1243 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1246 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1247 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1249 /* Make sure hardware completes it */
1250 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1251 readl, (!(val & DMA_GSTS_WBFS)), val);
1253 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1256 /* return value determines if we need a write buffer flush */
1257 static void __iommu_flush_context(struct intel_iommu *iommu,
1258 u16 did, u16 source_id, u8 function_mask,
1265 case DMA_CCMD_GLOBAL_INVL:
1266 val = DMA_CCMD_GLOBAL_INVL;
1268 case DMA_CCMD_DOMAIN_INVL:
1269 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1271 case DMA_CCMD_DEVICE_INVL:
1272 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1273 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1278 val |= DMA_CCMD_ICC;
1280 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1281 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1283 /* Make sure hardware completes it */
1284 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1285 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1287 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1290 /* return value determines if we need a write buffer flush */
1291 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1292 u64 addr, unsigned int size_order, u64 type)
1294 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1295 u64 val = 0, val_iva = 0;
1299 case DMA_TLB_GLOBAL_FLUSH:
1300 /* global flush doesn't need to set IVA_REG */
1301 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1303 case DMA_TLB_DSI_FLUSH:
1304 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1306 case DMA_TLB_PSI_FLUSH:
1307 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1308 /* IH bit is passed in as part of address */
1309 val_iva = size_order | addr;
1314 /* Note: set drain read/write */
1317 * This is probably just to be extra safe. Looks like we can
1318 * ignore it without any impact.
1320 if (cap_read_drain(iommu->cap))
1321 val |= DMA_TLB_READ_DRAIN;
1323 if (cap_write_drain(iommu->cap))
1324 val |= DMA_TLB_WRITE_DRAIN;
1326 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1327 /* Note: Only uses first TLB reg currently */
1329 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1330 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1332 /* Make sure hardware completes it */
1333 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1334 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1336 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1338 /* check IOTLB invalidation granularity */
1339 if (DMA_TLB_IAIG(val) == 0)
1340 pr_err("Flush IOTLB failed\n");
1341 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1342 pr_debug("TLB flush request %Lx, actual %Lx\n",
1343 (unsigned long long)DMA_TLB_IIRG(type),
1344 (unsigned long long)DMA_TLB_IAIG(val));
1347 static struct device_domain_info *
1348 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1351 struct device_domain_info *info;
1353 assert_spin_locked(&device_domain_lock);
1358 list_for_each_entry(info, &domain->devices, link)
1359 if (info->iommu == iommu && info->bus == bus &&
1360 info->devfn == devfn) {
1361 if (info->ats_supported && info->dev)
1369 static void domain_update_iotlb(struct dmar_domain *domain)
1371 struct device_domain_info *info;
1372 bool has_iotlb_device = false;
1374 assert_spin_locked(&device_domain_lock);
1376 list_for_each_entry(info, &domain->devices, link) {
1377 struct pci_dev *pdev;
1379 if (!info->dev || !dev_is_pci(info->dev))
1382 pdev = to_pci_dev(info->dev);
1383 if (pdev->ats_enabled) {
1384 has_iotlb_device = true;
1389 domain->has_iotlb_device = has_iotlb_device;
1392 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1394 struct pci_dev *pdev;
1396 assert_spin_locked(&device_domain_lock);
1398 if (!info || !dev_is_pci(info->dev))
1401 pdev = to_pci_dev(info->dev);
1402 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1403 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1404 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1405 * reserved, which should be set to 0.
1407 if (!ecap_dit(info->iommu->ecap))
1410 struct pci_dev *pf_pdev;
1412 /* pdev will be returned if device is not a vf */
1413 pf_pdev = pci_physfn(pdev);
1414 info->pfsid = pci_dev_id(pf_pdev);
1417 #ifdef CONFIG_INTEL_IOMMU_SVM
1418 /* The PCIe spec, in its wisdom, declares that the behaviour of
1419 the device if you enable PASID support after ATS support is
1420 undefined. So always enable PASID support on devices which
1421 have it, even if we can't yet know if we're ever going to
1423 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1424 info->pasid_enabled = 1;
1426 if (info->pri_supported &&
1427 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1428 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1429 info->pri_enabled = 1;
1431 if (!pdev->untrusted && info->ats_supported &&
1432 pci_ats_page_aligned(pdev) &&
1433 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1434 info->ats_enabled = 1;
1435 domain_update_iotlb(info->domain);
1436 info->ats_qdep = pci_ats_queue_depth(pdev);
1440 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1442 struct pci_dev *pdev;
1444 assert_spin_locked(&device_domain_lock);
1446 if (!dev_is_pci(info->dev))
1449 pdev = to_pci_dev(info->dev);
1451 if (info->ats_enabled) {
1452 pci_disable_ats(pdev);
1453 info->ats_enabled = 0;
1454 domain_update_iotlb(info->domain);
1456 #ifdef CONFIG_INTEL_IOMMU_SVM
1457 if (info->pri_enabled) {
1458 pci_disable_pri(pdev);
1459 info->pri_enabled = 0;
1461 if (info->pasid_enabled) {
1462 pci_disable_pasid(pdev);
1463 info->pasid_enabled = 0;
1468 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1469 u64 addr, unsigned mask)
1472 unsigned long flags;
1473 struct device_domain_info *info;
1475 if (!domain->has_iotlb_device)
1478 spin_lock_irqsave(&device_domain_lock, flags);
1479 list_for_each_entry(info, &domain->devices, link) {
1480 if (!info->ats_enabled)
1483 sid = info->bus << 8 | info->devfn;
1484 qdep = info->ats_qdep;
1485 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1488 spin_unlock_irqrestore(&device_domain_lock, flags);
1491 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1492 struct dmar_domain *domain,
1493 unsigned long pfn, unsigned int pages,
1496 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1497 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1498 u16 did = domain->iommu_did[iommu->seq_id];
1505 * Fallback to domain selective flush if no PSI support or the size is
1507 * PSI requires page size to be 2 ^ x, and the base address is naturally
1508 * aligned to the size
1510 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1511 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1514 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1518 * In caching mode, changes of pages from non-present to present require
1519 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1521 if (!cap_caching_mode(iommu->cap) || !map)
1522 iommu_flush_dev_iotlb(domain, addr, mask);
1525 /* Notification for newly created mappings */
1526 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1527 struct dmar_domain *domain,
1528 unsigned long pfn, unsigned int pages)
1530 /* It's a non-present to present mapping. Only flush if caching mode */
1531 if (cap_caching_mode(iommu->cap))
1532 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1534 iommu_flush_write_buffer(iommu);
1537 static void iommu_flush_iova(struct iova_domain *iovad)
1539 struct dmar_domain *domain;
1542 domain = container_of(iovad, struct dmar_domain, iovad);
1544 for_each_domain_iommu(idx, domain) {
1545 struct intel_iommu *iommu = g_iommus[idx];
1546 u16 did = domain->iommu_did[iommu->seq_id];
1548 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1550 if (!cap_caching_mode(iommu->cap))
1551 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1552 0, MAX_AGAW_PFN_WIDTH);
1556 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1559 unsigned long flags;
1561 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1564 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1565 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1566 pmen &= ~DMA_PMEN_EPM;
1567 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1569 /* wait for the protected region status bit to clear */
1570 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1571 readl, !(pmen & DMA_PMEN_PRS), pmen);
1573 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1576 static void iommu_enable_translation(struct intel_iommu *iommu)
1579 unsigned long flags;
1581 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1582 iommu->gcmd |= DMA_GCMD_TE;
1583 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1585 /* Make sure hardware completes it */
1586 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1587 readl, (sts & DMA_GSTS_TES), sts);
1589 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1592 static void iommu_disable_translation(struct intel_iommu *iommu)
1597 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1598 iommu->gcmd &= ~DMA_GCMD_TE;
1599 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1601 /* Make sure hardware completes it */
1602 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1603 readl, (!(sts & DMA_GSTS_TES)), sts);
1605 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1608 static int iommu_init_domains(struct intel_iommu *iommu)
1610 u32 ndomains, nlongs;
1613 ndomains = cap_ndoms(iommu->cap);
1614 pr_debug("%s: Number of Domains supported <%d>\n",
1615 iommu->name, ndomains);
1616 nlongs = BITS_TO_LONGS(ndomains);
1618 spin_lock_init(&iommu->lock);
1620 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1621 if (!iommu->domain_ids) {
1622 pr_err("%s: Allocating domain id array failed\n",
1627 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1628 iommu->domains = kzalloc(size, GFP_KERNEL);
1630 if (iommu->domains) {
1631 size = 256 * sizeof(struct dmar_domain *);
1632 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1635 if (!iommu->domains || !iommu->domains[0]) {
1636 pr_err("%s: Allocating domain array failed\n",
1638 kfree(iommu->domain_ids);
1639 kfree(iommu->domains);
1640 iommu->domain_ids = NULL;
1641 iommu->domains = NULL;
1646 * If Caching mode is set, then invalid translations are tagged
1647 * with domain-id 0, hence we need to pre-allocate it. We also
1648 * use domain-id 0 as a marker for non-allocated domain-id, so
1649 * make sure it is not used for a real domain.
1651 set_bit(0, iommu->domain_ids);
1654 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1655 * entry for first-level or pass-through translation modes should
1656 * be programmed with a domain id different from those used for
1657 * second-level or nested translation. We reserve a domain id for
1660 if (sm_supported(iommu))
1661 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1666 static void disable_dmar_iommu(struct intel_iommu *iommu)
1668 struct device_domain_info *info, *tmp;
1669 unsigned long flags;
1671 if (!iommu->domains || !iommu->domain_ids)
1674 spin_lock_irqsave(&device_domain_lock, flags);
1675 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1676 if (info->iommu != iommu)
1679 if (!info->dev || !info->domain)
1682 __dmar_remove_one_dev_info(info);
1684 spin_unlock_irqrestore(&device_domain_lock, flags);
1686 if (iommu->gcmd & DMA_GCMD_TE)
1687 iommu_disable_translation(iommu);
1690 static void free_dmar_iommu(struct intel_iommu *iommu)
1692 if ((iommu->domains) && (iommu->domain_ids)) {
1693 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1696 for (i = 0; i < elems; i++)
1697 kfree(iommu->domains[i]);
1698 kfree(iommu->domains);
1699 kfree(iommu->domain_ids);
1700 iommu->domains = NULL;
1701 iommu->domain_ids = NULL;
1704 g_iommus[iommu->seq_id] = NULL;
1706 /* free context mapping */
1707 free_context_table(iommu);
1709 #ifdef CONFIG_INTEL_IOMMU_SVM
1710 if (pasid_supported(iommu)) {
1711 if (ecap_prs(iommu->ecap))
1712 intel_svm_finish_prq(iommu);
1717 static struct dmar_domain *alloc_domain(int flags)
1719 struct dmar_domain *domain;
1721 domain = alloc_domain_mem();
1725 memset(domain, 0, sizeof(*domain));
1726 domain->nid = NUMA_NO_NODE;
1727 domain->flags = flags;
1728 domain->has_iotlb_device = false;
1729 INIT_LIST_HEAD(&domain->devices);
1734 /* Must be called with iommu->lock */
1735 static int domain_attach_iommu(struct dmar_domain *domain,
1736 struct intel_iommu *iommu)
1738 unsigned long ndomains;
1741 assert_spin_locked(&device_domain_lock);
1742 assert_spin_locked(&iommu->lock);
1744 domain->iommu_refcnt[iommu->seq_id] += 1;
1745 domain->iommu_count += 1;
1746 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1747 ndomains = cap_ndoms(iommu->cap);
1748 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1750 if (num >= ndomains) {
1751 pr_err("%s: No free domain ids\n", iommu->name);
1752 domain->iommu_refcnt[iommu->seq_id] -= 1;
1753 domain->iommu_count -= 1;
1757 set_bit(num, iommu->domain_ids);
1758 set_iommu_domain(iommu, num, domain);
1760 domain->iommu_did[iommu->seq_id] = num;
1761 domain->nid = iommu->node;
1763 domain_update_iommu_cap(domain);
1769 static int domain_detach_iommu(struct dmar_domain *domain,
1770 struct intel_iommu *iommu)
1774 assert_spin_locked(&device_domain_lock);
1775 assert_spin_locked(&iommu->lock);
1777 domain->iommu_refcnt[iommu->seq_id] -= 1;
1778 count = --domain->iommu_count;
1779 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1780 num = domain->iommu_did[iommu->seq_id];
1781 clear_bit(num, iommu->domain_ids);
1782 set_iommu_domain(iommu, num, NULL);
1784 domain_update_iommu_cap(domain);
1785 domain->iommu_did[iommu->seq_id] = 0;
1791 static struct iova_domain reserved_iova_list;
1792 static struct lock_class_key reserved_rbtree_key;
1794 static int dmar_init_reserved_ranges(void)
1796 struct pci_dev *pdev = NULL;
1800 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1802 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1803 &reserved_rbtree_key);
1805 /* IOAPIC ranges shouldn't be accessed by DMA */
1806 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1807 IOVA_PFN(IOAPIC_RANGE_END));
1809 pr_err("Reserve IOAPIC range failed\n");
1813 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1814 for_each_pci_dev(pdev) {
1817 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1818 r = &pdev->resource[i];
1819 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1821 iova = reserve_iova(&reserved_iova_list,
1825 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1833 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1835 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1838 static inline int guestwidth_to_adjustwidth(int gaw)
1841 int r = (gaw - 12) % 9;
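/*
 * Worked example (illustration only): the adjusted width is gaw rounded up
 * to a whole number of 9-bit page-table levels above the 12-bit page
 * offset.  For gaw = 36, (36 - 12) % 9 = 6, so the width is bumped to
 * 36 + (9 - 6) = 39 bits; for gaw = 48 the remainder is 0 and the width
 * stays at 48.
 */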
1852 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1855 int adjust_width, agaw;
1856 unsigned long sagaw;
1859 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1861 err = init_iova_flush_queue(&domain->iovad,
1862 iommu_flush_iova, iova_entry_free);
1866 domain_reserve_special_ranges(domain);
1868 /* calculate AGAW */
1869 if (guest_width > cap_mgaw(iommu->cap))
1870 guest_width = cap_mgaw(iommu->cap);
1871 domain->gaw = guest_width;
1872 adjust_width = guestwidth_to_adjustwidth(guest_width);
1873 agaw = width_to_agaw(adjust_width);
1874 sagaw = cap_sagaw(iommu->cap);
1875 if (!test_bit(agaw, &sagaw)) {
1876 /* hardware doesn't support it, choose a bigger one */
1877 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1878 agaw = find_next_bit(&sagaw, 5, agaw);
1882 domain->agaw = agaw;
1884 if (ecap_coherent(iommu->ecap))
1885 domain->iommu_coherency = 1;
1887 domain->iommu_coherency = 0;
1889 if (ecap_sc_support(iommu->ecap))
1890 domain->iommu_snooping = 1;
1892 domain->iommu_snooping = 0;
1894 if (intel_iommu_superpage)
1895 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1897 domain->iommu_superpage = 0;
1899 domain->nid = iommu->node;
1901 /* always allocate the top pgd */
1902 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1905 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1909 static void domain_exit(struct dmar_domain *domain)
1912 /* Remove associated devices and clear attached or cached domains */
1913 domain_remove_dev_info(domain);
1916 put_iova_domain(&domain->iovad);
1919 struct page *freelist;
1921 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1922 dma_free_pagelist(freelist);
1925 free_domain_mem(domain);
1929 * Get the PASID directory size for scalable mode context entry.
1930 * Value of X in the PDTS field of a scalable mode context entry
1931 * indicates PASID directory with 2^(X + 7) entries.
1933 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1937 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1938 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1946 * Set the RID_PASID field of a scalable mode context entry. The
1947 * IOMMU hardware will use the PASID value set in this field for
1948 * DMA translations of DMA requests without PASID.
1951 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1953 context->hi |= pasid & ((1 << 20) - 1);
1954 context->hi |= (1 << 20);
1958 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1961 static inline void context_set_sm_dte(struct context_entry *context)
1963 context->lo |= (1 << 2);
1967 * Set the PRE(Page Request Enable) field of a scalable mode context
1970 static inline void context_set_sm_pre(struct context_entry *context)
1972 context->lo |= (1 << 4);
1975 /* Convert value to context PASID directory size field coding. */
1976 #define context_pdts(pds) (((pds) & 0x7) << 9)
1978 static int domain_context_mapping_one(struct dmar_domain *domain,
1979 struct intel_iommu *iommu,
1980 struct pasid_table *table,
1983 u16 did = domain->iommu_did[iommu->seq_id];
1984 int translation = CONTEXT_TT_MULTI_LEVEL;
1985 struct device_domain_info *info = NULL;
1986 struct context_entry *context;
1987 unsigned long flags;
1992 if (hw_pass_through && domain_type_is_si(domain))
1993 translation = CONTEXT_TT_PASS_THROUGH;
1995 pr_debug("Set context mapping for %02x:%02x.%d\n",
1996 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1998 BUG_ON(!domain->pgd);
2000 spin_lock_irqsave(&device_domain_lock, flags);
2001 spin_lock(&iommu->lock);
2004 context = iommu_context_addr(iommu, bus, devfn, 1);
2009 if (context_present(context))
2013 * For kdump cases, old valid entries may be cached due to the
2014 * in-flight DMA and copied pgtable, but there is no unmapping
2015 * behaviour for them, thus we need an explicit cache flush for
2016 * the newly-mapped device. For kdump, at this point, the device
2017 * is supposed to finish reset at its driver probe stage, so no
2018 * in-flight DMA will exist, and we don't need to worry anymore
2021 if (context_copied(context)) {
2022 u16 did_old = context_domain_id(context);
2024 if (did_old < cap_ndoms(iommu->cap)) {
2025 iommu->flush.flush_context(iommu, did_old,
2026 (((u16)bus) << 8) | devfn,
2027 DMA_CCMD_MASK_NOBIT,
2028 DMA_CCMD_DEVICE_INVL);
2029 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2034 context_clear_entry(context);
2036 if (sm_supported(iommu)) {
2041 /* Setup the PASID DIR pointer: */
2042 pds = context_get_sm_pds(table);
2043 context->lo = (u64)virt_to_phys(table->table) |
2046 /* Setup the RID_PASID field: */
2047 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2050 * Setup the Device-TLB enable bit and Page request
2053 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2054 if (info && info->ats_supported)
2055 context_set_sm_dte(context);
2056 if (info && info->pri_supported)
2057 context_set_sm_pre(context);
2059 struct dma_pte *pgd = domain->pgd;
2062 context_set_domain_id(context, did);
2064 if (translation != CONTEXT_TT_PASS_THROUGH) {
2066 * Skip top levels of page tables for iommu which has
2067 * less agaw than default. Unnecessary for PT mode.
2069 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2071 pgd = phys_to_virt(dma_pte_addr(pgd));
2072 if (!dma_pte_present(pgd))
2076 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2077 if (info && info->ats_supported)
2078 translation = CONTEXT_TT_DEV_IOTLB;
2080 translation = CONTEXT_TT_MULTI_LEVEL;
2082 context_set_address_root(context, virt_to_phys(pgd));
2083 context_set_address_width(context, agaw);
2086 * In pass through mode, AW must be programmed to
2087 * indicate the largest AGAW value supported by
2088 * hardware. And ASR is ignored by hardware.
2090 context_set_address_width(context, iommu->msagaw);
2093 context_set_translation_type(context, translation);
2096 context_set_fault_enable(context);
2097 context_set_present(context);
2098 domain_flush_cache(domain, context, sizeof(*context));
2101 * It's a non-present to present mapping. If hardware doesn't cache
2102 * non-present entries we only need to flush the write-buffer. If it
2103 * _does_ cache non-present entries, then it does so in the special
2104 * domain #0, which we have to flush:
2106 if (cap_caching_mode(iommu->cap)) {
2107 iommu->flush.flush_context(iommu, 0,
2108 (((u16)bus) << 8) | devfn,
2109 DMA_CCMD_MASK_NOBIT,
2110 DMA_CCMD_DEVICE_INVL);
2111 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2113 iommu_flush_write_buffer(iommu);
2115 iommu_enable_dev_iotlb(info);
2120 spin_unlock(&iommu->lock);
2121 spin_unlock_irqrestore(&device_domain_lock, flags);
2126 struct domain_context_mapping_data {
2127 struct dmar_domain *domain;
2128 struct intel_iommu *iommu;
2129 struct pasid_table *table;
2132 static int domain_context_mapping_cb(struct pci_dev *pdev,
2133 u16 alias, void *opaque)
2135 struct domain_context_mapping_data *data = opaque;
2137 return domain_context_mapping_one(data->domain, data->iommu,
2138 data->table, PCI_BUS_NUM(alias),
2143 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2145 struct domain_context_mapping_data data;
2146 struct pasid_table *table;
2147 struct intel_iommu *iommu;
2150 iommu = device_to_iommu(dev, &bus, &devfn);
2154 table = intel_pasid_get_table(dev);
2156 if (!dev_is_pci(dev))
2157 return domain_context_mapping_one(domain, iommu, table,
2160 data.domain = domain;
2164 return pci_for_each_dma_alias(to_pci_dev(dev),
2165 &domain_context_mapping_cb, &data);
2168 static int domain_context_mapped_cb(struct pci_dev *pdev,
2169 u16 alias, void *opaque)
2171 struct intel_iommu *iommu = opaque;
2173 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2176 static int domain_context_mapped(struct device *dev)
2178 struct intel_iommu *iommu;
2181 iommu = device_to_iommu(dev, &bus, &devfn);
2185 if (!dev_is_pci(dev))
2186 return device_context_mapped(iommu, bus, devfn);
2188 return !pci_for_each_dma_alias(to_pci_dev(dev),
2189 domain_context_mapped_cb, iommu);
2192 /* Returns a number of VTD pages, but aligned to MM page size */
2193 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2196 host_addr &= ~PAGE_MASK;
2197 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
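/*
 * Worked example (illustration only, 4KiB pages): for host_addr = 0x1234
 * and size = 0x2000, the offset within the page is 0x234, so
 * PAGE_ALIGN(0x234 + 0x2000) = 0x3000 and aligned_nrpages() returns 3:
 * an 8KiB buffer that straddles page boundaries needs three VT-d pages.
 */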
2200 /* Return largest possible superpage level for a given mapping */
2201 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2202 unsigned long iov_pfn,
2203 unsigned long phy_pfn,
2204 unsigned long pages)
2206 int support, level = 1;
2207 unsigned long pfnmerge;
2209 support = domain->iommu_superpage;
2211 /* To use a large page, the virtual *and* physical addresses
2212 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2213 of them will mean we have to use smaller pages. So just
2214 merge them and check both at once. */
2215 pfnmerge = iov_pfn | phy_pfn;
2217 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2218 pages >>= VTD_STRIDE_SHIFT;
2221 pfnmerge >>= VTD_STRIDE_SHIFT;
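/*
 * Worked example (illustration only): mapping 4MiB (1024 pages) at
 * IOVA 0x40000000 to physical 0x80000000 gives iov_pfn | phy_pfn with the
 * low 9 bits clear and pages >= 512, so the loop above settles on level 2
 * (2MiB superpages) provided domain->iommu_superpage allows it.
 */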
2228 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2229 struct scatterlist *sg, unsigned long phys_pfn,
2230 unsigned long nr_pages, int prot)
2232 struct dma_pte *first_pte = NULL, *pte = NULL;
2233 phys_addr_t uninitialized_var(pteval);
2234 unsigned long sg_res = 0;
2235 unsigned int largepage_lvl = 0;
2236 unsigned long lvl_pages = 0;
2238 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2240 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2243 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2247 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2250 while (nr_pages > 0) {
2254 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2256 sg_res = aligned_nrpages(sg->offset, sg->length);
2257 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2258 sg->dma_length = sg->length;
2259 pteval = (sg_phys(sg) - pgoff) | prot;
2260 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2264 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2266 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2269 /* It is a large page */
2270 if (largepage_lvl > 1) {
2271 unsigned long nr_superpages, end_pfn;
2273 pteval |= DMA_PTE_LARGE_PAGE;
2274 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2276 nr_superpages = sg_res / lvl_pages;
2277 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2280 * Ensure that old small page tables are
2281 * removed to make room for superpage(s).
2282 * We're adding new large pages, so make sure
2283 * we don't remove their parent tables.
2285 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2288 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2292 /* We don't need lock here, nobody else
2293 * touches the iova range
2295 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2297 static int dumps = 5;
2298 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2299 iov_pfn, tmp, (unsigned long long)pteval);
2302 debug_dma_dump_mappings(NULL);
2307 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2309 BUG_ON(nr_pages < lvl_pages);
2310 BUG_ON(sg_res < lvl_pages);
2312 nr_pages -= lvl_pages;
2313 iov_pfn += lvl_pages;
2314 phys_pfn += lvl_pages;
2315 pteval += lvl_pages * VTD_PAGE_SIZE;
2316 sg_res -= lvl_pages;
2318 /* If the next PTE would be the first in a new page, then we
2319 need to flush the cache on the entries we've just written.
2320 And then we'll need to recalculate 'pte', so clear it and
2321 let it get set again in the if (!pte) block above.
2323 If we're done (!nr_pages) we need to flush the cache too.
2325 Also if we've been setting superpages, we may need to
2326 recalculate 'pte' and switch back to smaller pages for the
2327 end of the mapping, if the trailing size is not enough to
2328 use another superpage (i.e. sg_res < lvl_pages). */
2330 if (!nr_pages || first_pte_in_page(pte) ||
2331 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2332 domain_flush_cache(domain, first_pte,
2333 (void *)pte - (void *)first_pte);
2337 if (!sg_res && nr_pages)
2343 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2344 struct scatterlist *sg, unsigned long phys_pfn,
2345 unsigned long nr_pages, int prot)
2348 struct intel_iommu *iommu;
2350 /* Do the real mapping first */
2351 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2355 for_each_domain_iommu(iommu_id, domain) {
2356 iommu = g_iommus[iommu_id];
2357 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2363 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2364 struct scatterlist *sg, unsigned long nr_pages,
2367 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2370 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2371 unsigned long phys_pfn, unsigned long nr_pages,
2374 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
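/*
 * Usage sketch with illustrative values: mapping a physically contiguous
 * 64KiB buffer at a given IOVA would look roughly like
 *
 *   domain_pfn_mapping(domain, iova >> VTD_PAGE_SHIFT,
 *                      phys >> VTD_PAGE_SHIFT, 16,
 *                      DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * where 16 is 64KiB expressed in 4KiB VT-d pages.
 */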
2377 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2379 unsigned long flags;
2380 struct context_entry *context;
2386 spin_lock_irqsave(&iommu->lock, flags);
2387 context = iommu_context_addr(iommu, bus, devfn, 0);
2389 spin_unlock_irqrestore(&iommu->lock, flags);
2392 did_old = context_domain_id(context);
2393 context_clear_entry(context);
2394 __iommu_flush_cache(iommu, context, sizeof(*context));
2395 spin_unlock_irqrestore(&iommu->lock, flags);
2396 iommu->flush.flush_context(iommu,
2398 (((u16)bus) << 8) | devfn,
2399 DMA_CCMD_MASK_NOBIT,
2400 DMA_CCMD_DEVICE_INVL);
2401 iommu->flush.flush_iotlb(iommu,
2408 static inline void unlink_domain_info(struct device_domain_info *info)
2410 assert_spin_locked(&device_domain_lock);
2411 list_del(&info->link);
2412 list_del(&info->global);
2414 info->dev->archdata.iommu = NULL;
2417 static void domain_remove_dev_info(struct dmar_domain *domain)
2419 struct device_domain_info *info, *tmp;
2420 unsigned long flags;
2422 spin_lock_irqsave(&device_domain_lock, flags);
2423 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2424 __dmar_remove_one_dev_info(info);
2425 spin_unlock_irqrestore(&device_domain_lock, flags);
2428 static struct dmar_domain *find_domain(struct device *dev)
2430 struct device_domain_info *info;
2432 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO ||
2433 dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO))
2436 /* No lock here, assumes no domain exit in normal case */
2437 info = dev->archdata.iommu;
2439 return info->domain;
2444 static struct dmar_domain *deferred_attach_domain(struct device *dev)
2446 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2447 struct iommu_domain *domain;
2449 dev->archdata.iommu = NULL;
2450 domain = iommu_get_domain_for_dev(dev);
2452 intel_iommu_attach_device(domain, dev);
2455 return find_domain(dev);
2458 static inline struct device_domain_info *
2459 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2461 struct device_domain_info *info;
2463 list_for_each_entry(info, &device_domain_list, global)
2464 if (info->iommu->segment == segment && info->bus == bus &&
2465 info->devfn == devfn)
2471 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2474 struct dmar_domain *domain)
2476 struct dmar_domain *found = NULL;
2477 struct device_domain_info *info;
2478 unsigned long flags;
2481 info = alloc_devinfo_mem();
2486 info->devfn = devfn;
2487 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2488 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2491 info->domain = domain;
2492 info->iommu = iommu;
2493 info->pasid_table = NULL;
2494 info->auxd_enabled = 0;
2495 INIT_LIST_HEAD(&info->auxiliary_domains);
2497 if (dev && dev_is_pci(dev)) {
2498 struct pci_dev *pdev = to_pci_dev(info->dev);
2500 if (!pdev->untrusted &&
2501 !pci_ats_disabled() &&
2502 ecap_dev_iotlb_support(iommu->ecap) &&
2503 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2504 dmar_find_matched_atsr_unit(pdev))
2505 info->ats_supported = 1;
2507 if (sm_supported(iommu)) {
2508 if (pasid_supported(iommu)) {
2509 int features = pci_pasid_features(pdev);
2511 info->pasid_supported = features | 1;
2514 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2515 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2516 info->pri_supported = 1;
2520 spin_lock_irqsave(&device_domain_lock, flags);
2522 found = find_domain(dev);
2525 struct device_domain_info *info2;
2526 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2528 found = info2->domain;
2534 spin_unlock_irqrestore(&device_domain_lock, flags);
2535 free_devinfo_mem(info);
2536 /* Caller must free the original domain */
2540 spin_lock(&iommu->lock);
2541 ret = domain_attach_iommu(domain, iommu);
2542 spin_unlock(&iommu->lock);
2545 spin_unlock_irqrestore(&device_domain_lock, flags);
2546 free_devinfo_mem(info);
2550 list_add(&info->link, &domain->devices);
2551 list_add(&info->global, &device_domain_list);
2553 dev->archdata.iommu = info;
2554 spin_unlock_irqrestore(&device_domain_lock, flags);
2556 /* PASID table is mandatory for a PCI device in scalable mode. */
2557 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2558 ret = intel_pasid_alloc_table(dev);
2560 dev_err(dev, "PASID table allocation failed\n");
2561 dmar_remove_one_dev_info(dev);
2565 /* Set up the PASID entry for requests without PASID: */
2566 spin_lock(&iommu->lock);
2567 if (hw_pass_through && domain_type_is_si(domain))
2568 ret = intel_pasid_setup_pass_through(iommu, domain,
2569 dev, PASID_RID2PASID);
2571 ret = intel_pasid_setup_second_level(iommu, domain,
2572 dev, PASID_RID2PASID);
2573 spin_unlock(&iommu->lock);
2575 dev_err(dev, "Setup RID2PASID failed\n");
2576 dmar_remove_one_dev_info(dev);
2581 if (dev && domain_context_mapping(domain, dev)) {
2582 dev_err(dev, "Domain context map failed\n");
2583 dmar_remove_one_dev_info(dev);
2590 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2592 *(u16 *)opaque = alias;
2596 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2598 struct device_domain_info *info;
2599 struct dmar_domain *domain = NULL;
2600 struct intel_iommu *iommu;
2602 unsigned long flags;
2605 iommu = device_to_iommu(dev, &bus, &devfn);
2609 if (dev_is_pci(dev)) {
2610 struct pci_dev *pdev = to_pci_dev(dev);
2612 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2614 spin_lock_irqsave(&device_domain_lock, flags);
2615 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2616 PCI_BUS_NUM(dma_alias),
2619 iommu = info->iommu;
2620 domain = info->domain;
2622 spin_unlock_irqrestore(&device_domain_lock, flags);
2624 /* DMA alias already has a domain, use it */
2629 /* Allocate and initialize new domain for the device */
2630 domain = alloc_domain(0);
2633 if (domain_init(domain, iommu, gaw)) {
2634 domain_exit(domain);
2642 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2643 struct dmar_domain *domain)
2645 struct intel_iommu *iommu;
2646 struct dmar_domain *tmp;
2647 u16 req_id, dma_alias;
2650 iommu = device_to_iommu(dev, &bus, &devfn);
2654 req_id = ((u16)bus << 8) | devfn;
2656 if (dev_is_pci(dev)) {
2657 struct pci_dev *pdev = to_pci_dev(dev);
2659 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2661 /* register PCI DMA alias device */
2662 if (req_id != dma_alias) {
2663 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2664 dma_alias & 0xff, NULL, domain);
2666 if (!tmp || tmp != domain)
2671 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2672 if (!tmp || tmp != domain)
2678 static int iommu_domain_identity_map(struct dmar_domain *domain,
2679 unsigned long long start,
2680 unsigned long long end)
2682 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2683 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2685 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2686 dma_to_mm_pfn(last_vpfn))) {
2687 pr_err("Reserving iova failed\n");
2691 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2692 /*
2693 * RMRR ranges might overlap with physical memory,
2694 * so clear any existing mappings first.
2695 */
2696 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
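/*
 * first_vpfn is passed below as both the IOVA and the physical PFN,
 * which is what makes this an identity map: e.g. an RMRR covering
 * 0xdf000000-0xdf01ffff (illustrative) becomes a 32-page 1:1 mapping.
 */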
2698 return __domain_mapping(domain, first_vpfn, NULL,
2699 first_vpfn, last_vpfn - first_vpfn + 1,
2700 DMA_PTE_READ|DMA_PTE_WRITE);
2703 static int domain_prepare_identity_map(struct device *dev,
2704 struct dmar_domain *domain,
2705 unsigned long long start,
2706 unsigned long long end)
2708 /* For _hardware_ passthrough, don't bother. But for software
2709 passthrough, we do it anyway -- it may indicate a memory
2710 range which is reserved in E820, and so didn't get set
2711 up to start with in the si_domain */
2712 if (domain == si_domain && hw_pass_through) {
2713 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2718 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2721 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2722 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2723 dmi_get_system_info(DMI_BIOS_VENDOR),
2724 dmi_get_system_info(DMI_BIOS_VERSION),
2725 dmi_get_system_info(DMI_PRODUCT_VERSION));
2729 if (end >> agaw_to_width(domain->agaw)) {
2730 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2731 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2732 agaw_to_width(domain->agaw),
2733 dmi_get_system_info(DMI_BIOS_VENDOR),
2734 dmi_get_system_info(DMI_BIOS_VERSION),
2735 dmi_get_system_info(DMI_PRODUCT_VERSION));
2739 return iommu_domain_identity_map(domain, start, end);
2742 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2744 static int __init si_domain_init(int hw)
2746 struct dmar_rmrr_unit *rmrr;
2750 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2754 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2755 domain_exit(si_domain);
2762 for_each_online_node(nid) {
2763 unsigned long start_pfn, end_pfn;
2766 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2767 ret = iommu_domain_identity_map(si_domain,
2768 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2774 /*
2775 * Normally we use DMA domains for devices which have RMRRs. But we
2776 * relax this requirement for graphics and USB devices. Identity map
2777 * the RMRRs for graphics and USB devices so that they can use the
2778 * si_domain.
2779 */
2780 for_each_rmrr_units(rmrr) {
2781 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2783 unsigned long long start = rmrr->base_address;
2784 unsigned long long end = rmrr->end_address;
2786 if (device_is_rmrr_locked(dev))
2789 if (WARN_ON(end < start ||
2790 end >> agaw_to_width(si_domain->agaw)))
2793 ret = iommu_domain_identity_map(si_domain, start, end);
2802 static int identity_mapping(struct device *dev)
2804 struct device_domain_info *info;
2806 info = dev->archdata.iommu;
2807 if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2808 return (info->domain == si_domain);
2813 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2815 struct dmar_domain *ndomain;
2816 struct intel_iommu *iommu;
2819 iommu = device_to_iommu(dev, &bus, &devfn);
2823 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2824 if (ndomain != domain)
2830 static bool device_has_rmrr(struct device *dev)
2832 struct dmar_rmrr_unit *rmrr;
2837 for_each_rmrr_units(rmrr) {
2839 * Return TRUE if this RMRR contains the device that
2842 for_each_active_dev_scope(rmrr->devices,
2843 rmrr->devices_cnt, i, tmp)
2845 is_downstream_to_pci_bridge(dev, tmp)) {
2855 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2856 * is relaxable (i.e. allowed to be left unenforced under some conditions)
2857 * @dev: device handle
2859 * We assume that PCI USB devices with RMRRs have them largely
2860 * for historical reasons and that the RMRR space is not actively used post
2861 * boot. This exclusion may change if vendors begin to abuse it.
2863 * The same exception is made for graphics devices, with the requirement that
2864 * any use of the RMRR regions will be torn down before assigning the device
2867 * Return: true if the RMRR is relaxable, false otherwise
2869 static bool device_rmrr_is_relaxable(struct device *dev)
2871 struct pci_dev *pdev;
2873 if (!dev_is_pci(dev))
2876 pdev = to_pci_dev(dev);
2877 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2884 * There are a couple cases where we need to restrict the functionality of
2885 * devices associated with RMRRs. The first is when evaluating a device for
2886 * identity mapping because problems exist when devices are moved in and out
2887 * of domains and their respective RMRR information is lost. This means that
2888 * a device with associated RMRRs will never be in a "passthrough" domain.
2889 * The second is use of the device through the IOMMU API. This interface
2890 * expects to have full control of the IOVA space for the device. We cannot
2891 * satisfy both the requirement that RMRR access is maintained and have an
2892 * unencumbered IOVA space. We also have no ability to quiesce the device's
2893 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2894 * We therefore prevent devices associated with an RMRR from participating in
2895 * the IOMMU API, which eliminates them from device assignment.
2897 * In both cases, devices which have relaxable RMRRs are not concerned by this
2898 * restriction. See device_rmrr_is_relaxable comment.
2900 static bool device_is_rmrr_locked(struct device *dev)
2902 if (!device_has_rmrr(dev))
2905 if (device_rmrr_is_relaxable(dev))
2912 * Return the required default domain type for a specific device.
2914 * @dev: the device in question
2918 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2919 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2920 * - 0: both identity and dynamic domains work for this device
2922 static int device_def_domain_type(struct device *dev)
2924 if (dev_is_pci(dev)) {
2925 struct pci_dev *pdev = to_pci_dev(dev);
2927 if (device_is_rmrr_locked(dev))
2928 return IOMMU_DOMAIN_DMA;
2931 * Prevent any device marked as untrusted from getting
2932 * placed into the statically identity mapping domain.
2934 if (pdev->untrusted)
2935 return IOMMU_DOMAIN_DMA;
2937 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2938 return IOMMU_DOMAIN_IDENTITY;
2940 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2941 return IOMMU_DOMAIN_IDENTITY;
2944 * We want to start off with all devices in the 1:1 domain, and
2945 * take them out later if we find they can't access all of memory.
2947 * However, we can't do this for PCI devices behind bridges,
2948 * because all PCI devices behind the same bridge will end up
2949 * with the same source-id on their transactions.
2951 * Practically speaking, we can't change things around for these
2952 * devices at run-time, because we can't be sure there'll be no
2953 * DMA transactions in flight for any of their siblings.
2955 * So PCI devices (unless they're on the root bus) as well as
2956 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2957 * the 1:1 domain, just in _case_ one of their siblings turns out
2958 * not to be able to map all of memory.
2960 if (!pci_is_pcie(pdev)) {
2961 if (!pci_is_root_bus(pdev->bus))
2962 return IOMMU_DOMAIN_DMA;
2963 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2964 return IOMMU_DOMAIN_DMA;
2965 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2966 return IOMMU_DOMAIN_DMA;
2968 if (device_has_rmrr(dev))
2969 return IOMMU_DOMAIN_DMA;
2972 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2973 IOMMU_DOMAIN_IDENTITY : 0;
2976 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2978 /*
2979 * Start from a sane IOMMU hardware state.
2980 * If queued invalidation was already initialized by us
2981 * (for example, while enabling interrupt remapping) then
2982 * things are already rolling from a sane state.
2983 */
2986 * Clear any previous faults.
2988 dmar_fault(-1, iommu);
2990 * Disable queued invalidation if supported and already enabled
2991 * before OS handover.
2993 dmar_disable_qi(iommu);
2996 if (dmar_enable_qi(iommu)) {
2998 * Queued invalidation not enabled, use register-based invalidation
3000 iommu->flush.flush_context = __iommu_flush_context;
3001 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3002 pr_info("%s: Using Register based invalidation\n",
3005 iommu->flush.flush_context = qi_flush_context;
3006 iommu->flush.flush_iotlb = qi_flush_iotlb;
3007 pr_info("%s: Using Queued invalidation\n", iommu->name);
3011 static int copy_context_table(struct intel_iommu *iommu,
3012 struct root_entry *old_re,
3013 struct context_entry **tbl,
3016 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3017 struct context_entry *new_ce = NULL, ce;
3018 struct context_entry *old_ce = NULL;
3019 struct root_entry re;
3020 phys_addr_t old_ce_phys;
3022 tbl_idx = ext ? bus * 2 : bus;
3023 memcpy(&re, old_re, sizeof(re));
3025 for (devfn = 0; devfn < 256; devfn++) {
3026 /* First calculate the correct index */
3027 idx = (ext ? devfn * 2 : devfn) % 256;
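/*
 * In extended mode a context entry is 256 bits, i.e. two struct
 * context_entry slots, so a 4KiB table covers only 128 devfns and each
 * bus needs two tables (tbl_idx == bus * 2); the % 256 wraps idx back
 * into the current table once devfn crosses 128.
 */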
3030 /* First save what we may have and clean up */
3032 tbl[tbl_idx] = new_ce;
3033 __iommu_flush_cache(iommu, new_ce,
3043 old_ce_phys = root_entry_lctp(&re);
3045 old_ce_phys = root_entry_uctp(&re);
3048 if (ext && devfn == 0) {
3049 /* No LCTP, try UCTP */
3058 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3063 new_ce = alloc_pgtable_page(iommu->node);
3070 /* Now copy the context entry */
3071 memcpy(&ce, old_ce + idx, sizeof(ce));
3073 if (!__context_present(&ce))
3076 did = context_domain_id(&ce);
3077 if (did >= 0 && did < cap_ndoms(iommu->cap))
3078 set_bit(did, iommu->domain_ids);
3081 * We need a marker for copied context entries. This
3082 * marker needs to work for the old format as well as
3083 * for extended context entries.
3085 * Bit 67 of the context entry is used. In the old
3086 * format this bit is available to software, in the
3087 * extended format it is the PGE bit, but PGE is ignored
3088 * by HW if PASIDs are disabled (and thus still
3091 * So disable PASIDs first and then mark the entry
3092 * copied. This means that we don't copy PASID
3093 * translations from the old kernel, but this is fine as
3094 * faults there are not fatal.
3096 context_clear_pasid_enable(&ce);
3097 context_set_copied(&ce);
3102 tbl[tbl_idx + pos] = new_ce;
3104 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3113 static int copy_translation_tables(struct intel_iommu *iommu)
3115 struct context_entry **ctxt_tbls;
3116 struct root_entry *old_rt;
3117 phys_addr_t old_rt_phys;
3118 int ctxt_table_entries;
3119 unsigned long flags;
3124 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3125 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3126 new_ext = !!ecap_ecs(iommu->ecap);
3129 * The RTT bit can only be changed when translation is disabled,
3130 * but disabling translation would open a window for data
3131 * corruption. So bail out and don't copy anything if we would
3132 * have to change the bit.
3137 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3141 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3145 /* This is too big for the stack - allocate it from slab */
3146 ctxt_table_entries = ext ? 512 : 256;
3148 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3152 for (bus = 0; bus < 256; bus++) {
3153 ret = copy_context_table(iommu, &old_rt[bus],
3154 ctxt_tbls, bus, ext);
3156 pr_err("%s: Failed to copy context table for bus %d\n",
3162 spin_lock_irqsave(&iommu->lock, flags);
3164 /* Context tables are copied, now write them to the root_entry table */
3165 for (bus = 0; bus < 256; bus++) {
3166 int idx = ext ? bus * 2 : bus;
3169 if (ctxt_tbls[idx]) {
3170 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3171 iommu->root_entry[bus].lo = val;
3174 if (!ext || !ctxt_tbls[idx + 1])
3177 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3178 iommu->root_entry[bus].hi = val;
3181 spin_unlock_irqrestore(&iommu->lock, flags);
3185 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3195 static int __init init_dmars(void)
3197 struct dmar_drhd_unit *drhd;
3198 struct intel_iommu *iommu;
3204 * initialize and program root entry to not present
3207 for_each_drhd_unit(drhd) {
3208 /*
3209 * lock not needed as this is only incremented in the single-
3210 * threaded kernel __init code path; all other accesses are read
3211 * only
3212 */
3213 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3217 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3220 /* Preallocate enough resources for IOMMU hot-addition */
3221 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3222 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3224 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3227 pr_err("Allocating global iommu array failed\n");
3232 for_each_iommu(iommu, drhd) {
3233 if (drhd->ignored) {
3234 iommu_disable_translation(iommu);
3239 * Find the max PASID size of all IOMMUs in the system.
3240 * We need to ensure the system PASID table is no bigger
3241 * than the smallest supported size.
3243 if (pasid_supported(iommu)) {
3244 u32 temp = 2 << ecap_pss(iommu->ecap);
3246 intel_pasid_max_id = min_t(u32, temp,
3247 intel_pasid_max_id);
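/*
 * ecap_pss() reports the supported PASID width minus one, so
 * 2 << pss is the PASID count: e.g. pss == 19 means up to 2^20
 * PASIDs. Taking the minimum across units keeps a shared PASID
 * table usable by the least capable IOMMU.
 */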
3250 g_iommus[iommu->seq_id] = iommu;
3252 intel_iommu_init_qi(iommu);
3254 ret = iommu_init_domains(iommu);
3258 init_translation_status(iommu);
3260 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3261 iommu_disable_translation(iommu);
3262 clear_translation_pre_enabled(iommu);
3263 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3269 * we could share the same root & context tables
3270 * among all IOMMUs. Need to split it later.
3272 ret = iommu_alloc_root_entry(iommu);
3276 if (translation_pre_enabled(iommu)) {
3277 pr_info("Translation already enabled - trying to copy translation structures\n");
3279 ret = copy_translation_tables(iommu);
3282 * We found the IOMMU with translation
3283 * enabled - but failed to copy over the
3284 * old root-entry table. Try to proceed
3285 * by disabling translation now and
3286 * allocating a clean root-entry table.
3287 * This might cause DMAR faults, but
3288 * probably the dump will still succeed.
3290 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3292 iommu_disable_translation(iommu);
3293 clear_translation_pre_enabled(iommu);
3295 pr_info("Copied translation tables from previous kernel for %s\n",
3300 if (!ecap_pass_through(iommu->ecap))
3301 hw_pass_through = 0;
3302 intel_svm_check(iommu);
3306 * Now that qi is enabled on all iommus, set the root entry and flush
3307 * caches. This is required on some Intel X58 chipsets, otherwise the
3308 * flush_context function will loop forever and the boot hangs.
3310 for_each_active_iommu(iommu, drhd) {
3311 iommu_flush_write_buffer(iommu);
3312 iommu_set_root_entry(iommu);
3313 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3314 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3317 if (iommu_default_passthrough())
3318 iommu_identity_mapping |= IDENTMAP_ALL;
3320 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3325 iommu_identity_mapping |= IDENTMAP_GFX;
3327 check_tylersburg_isoch();
3329 ret = si_domain_init(hw_pass_through);
3336 * global invalidate context cache
3337 * global invalidate iotlb
3338 * enable translation
3340 for_each_iommu(iommu, drhd) {
3341 if (drhd->ignored) {
3343 * we always have to disable PMRs or DMA may fail on
3347 iommu_disable_protect_mem_regions(iommu);
3351 iommu_flush_write_buffer(iommu);
3353 #ifdef CONFIG_INTEL_IOMMU_SVM
3354 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3355 /*
3356 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3357 * could cause a lock race, so drop the lock around it.
3358 */
3359 up_write(&dmar_global_lock);
3360 ret = intel_svm_enable_prq(iommu);
3361 down_write(&dmar_global_lock);
3366 ret = dmar_set_interrupt(iommu);
3374 for_each_active_iommu(iommu, drhd) {
3375 disable_dmar_iommu(iommu);
3376 free_dmar_iommu(iommu);
3385 /* This takes a number of _MM_ pages, not VTD pages */
3386 static unsigned long intel_alloc_iova(struct device *dev,
3387 struct dmar_domain *domain,
3388 unsigned long nrpages, uint64_t dma_mask)
3390 unsigned long iova_pfn;
3392 /* Restrict dma_mask to the width that the iommu can handle */
3393 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3394 /* Ensure we reserve the whole size-aligned region */
3395 nrpages = __roundup_pow_of_two(nrpages);
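/*
 * e.g. a 5-page request is rounded up to 8 pages so the resulting
 * IOVA is size-aligned (naturally aligned to the allocation size).
 */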
3397 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3399 * First try to allocate an io virtual address in
3400 * DMA_BIT_MASK(32) and if that fails then try allocating
3403 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3404 IOVA_PFN(DMA_BIT_MASK(32)), false);
3408 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3409 IOVA_PFN(dma_mask), true);
3410 if (unlikely(!iova_pfn)) {
3411 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3418 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3420 struct dmar_domain *domain, *tmp;
3421 struct dmar_rmrr_unit *rmrr;
3422 struct device *i_dev;
3425 /* The device shouldn't be attached to any domain yet. */
3426 domain = find_domain(dev);
3430 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3434 /* We have a new domain - set up possible RMRRs for the device */
3436 for_each_rmrr_units(rmrr) {
3437 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3442 ret = domain_prepare_identity_map(dev, domain,
3446 dev_err(dev, "Mapping reserved region failed\n");
3451 tmp = set_domain_for_dev(dev, domain);
3452 if (!tmp || domain != tmp) {
3453 domain_exit(domain);
3459 dev_err(dev, "Allocating domain failed\n");
3461 domain->domain.type = IOMMU_DOMAIN_DMA;
3466 /* Check if the dev needs to go through the non-identity map and unmap process. */
3467 static bool iommu_need_mapping(struct device *dev)
3471 if (iommu_dummy(dev))
3474 ret = identity_mapping(dev);
3476 u64 dma_mask = *dev->dma_mask;
3478 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3479 dma_mask = dev->coherent_dma_mask;
3481 if (dma_mask >= dma_direct_get_required_mask(dev))
3485 * 32 bit DMA is removed from si_domain and falls back to
3486 * non-identity mapping.
3488 dmar_remove_one_dev_info(dev);
3489 ret = iommu_request_dma_domain_for_dev(dev);
3491 struct iommu_domain *domain;
3492 struct dmar_domain *dmar_domain;
3494 domain = iommu_get_domain_for_dev(dev);
3496 dmar_domain = to_dmar_domain(domain);
3497 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3499 dmar_remove_one_dev_info(dev);
3500 get_private_domain_for_dev(dev);
3503 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3509 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3510 size_t size, int dir, u64 dma_mask)
3512 struct dmar_domain *domain;
3513 phys_addr_t start_paddr;
3514 unsigned long iova_pfn;
3517 struct intel_iommu *iommu;
3518 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3520 BUG_ON(dir == DMA_NONE);
3522 domain = deferred_attach_domain(dev);
3524 return DMA_MAPPING_ERROR;
3526 iommu = domain_get_iommu(domain);
3527 size = aligned_nrpages(paddr, size);
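/*
 * aligned_nrpages() counts the whole VT-d pages the buffer touches:
 * e.g. 0x2000 bytes starting at page offset 0x234 span 3 pages.
 */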
3529 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3534 * Check if DMAR supports zero-length reads on write only
3537 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3538 !cap_zlr(iommu->cap))
3539 prot |= DMA_PTE_READ;
3540 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3541 prot |= DMA_PTE_WRITE;
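/*
 * i.e. even a DMA_FROM_DEVICE mapping gets DMA_PTE_READ unless the
 * hardware can handle zero-length reads to write-only pages (ZLR).
 */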
3542 /*
3543 * paddr to (paddr + size) might span a partial page; we should map the
3544 * whole page. Note: if two parts of one page are mapped separately, we
3545 * might have two guest addresses mapping to the same host paddr, but this
3546 * is not a big problem
3547 */
3548 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3549 mm_to_dma_pfn(paddr_pfn), size, prot);
3553 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3554 start_paddr += paddr & ~PAGE_MASK;
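/*
 * Re-add the sub-page offset so the returned handle points at the
 * exact byte: e.g. paddr 0x12345678 maps to the allocated IOVA page
 * base plus offset 0x678.
 */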
3556 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3562 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3563 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3564 size, (unsigned long long)paddr, dir);
3565 return DMA_MAPPING_ERROR;
3568 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3569 unsigned long offset, size_t size,
3570 enum dma_data_direction dir,
3571 unsigned long attrs)
3573 if (iommu_need_mapping(dev))
3574 return __intel_map_single(dev, page_to_phys(page) + offset,
3575 size, dir, *dev->dma_mask);
3576 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3579 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3580 size_t size, enum dma_data_direction dir,
3581 unsigned long attrs)
3583 if (iommu_need_mapping(dev))
3584 return __intel_map_single(dev, phys_addr, size, dir,
3586 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3589 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3591 struct dmar_domain *domain;
3592 unsigned long start_pfn, last_pfn;
3593 unsigned long nrpages;
3594 unsigned long iova_pfn;
3595 struct intel_iommu *iommu;
3596 struct page *freelist;
3597 struct pci_dev *pdev = NULL;
3599 domain = find_domain(dev);
3602 iommu = domain_get_iommu(domain);
3604 iova_pfn = IOVA_PFN(dev_addr);
3606 nrpages = aligned_nrpages(dev_addr, size);
3607 start_pfn = mm_to_dma_pfn(iova_pfn);
3608 last_pfn = start_pfn + nrpages - 1;
3610 if (dev_is_pci(dev))
3611 pdev = to_pci_dev(dev);
3613 freelist = domain_unmap(domain, start_pfn, last_pfn);
3614 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3615 !has_iova_flush_queue(&domain->iovad)) {
3616 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3617 nrpages, !freelist, 0);
3619 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3620 dma_free_pagelist(freelist);
3622 queue_iova(&domain->iovad, iova_pfn, nrpages,
3623 (unsigned long)freelist);
3624 /*
3625 * queue up the release of the unmap to save roughly 1/6th of the
3626 * CPU time used up by the IOTLB flush operation...
3627 */
3630 trace_unmap_single(dev, dev_addr, size);
3633 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3634 size_t size, enum dma_data_direction dir,
3635 unsigned long attrs)
3637 if (iommu_need_mapping(dev))
3638 intel_unmap(dev, dev_addr, size);
3640 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3643 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3644 size_t size, enum dma_data_direction dir, unsigned long attrs)
3646 if (iommu_need_mapping(dev))
3647 intel_unmap(dev, dev_addr, size);
3650 static void *intel_alloc_coherent(struct device *dev, size_t size,
3651 dma_addr_t *dma_handle, gfp_t flags,
3652 unsigned long attrs)
3654 struct page *page = NULL;
3657 if (!iommu_need_mapping(dev))
3658 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3660 size = PAGE_ALIGN(size);
3661 order = get_order(size);
3663 if (gfpflags_allow_blocking(flags)) {
3664 unsigned int count = size >> PAGE_SHIFT;
3666 page = dma_alloc_from_contiguous(dev, count, order,
3667 flags & __GFP_NOWARN);
3671 page = alloc_pages(flags, order);
3674 memset(page_address(page), 0, size);
3676 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3678 dev->coherent_dma_mask);
3679 if (*dma_handle != DMA_MAPPING_ERROR)
3680 return page_address(page);
3681 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3682 __free_pages(page, order);
3687 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3688 dma_addr_t dma_handle, unsigned long attrs)
3691 struct page *page = virt_to_page(vaddr);
3693 if (!iommu_need_mapping(dev))
3694 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3696 size = PAGE_ALIGN(size);
3697 order = get_order(size);
3699 intel_unmap(dev, dma_handle, size);
3700 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3701 __free_pages(page, order);
3704 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3705 int nelems, enum dma_data_direction dir,
3706 unsigned long attrs)
3708 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3709 unsigned long nrpages = 0;
3710 struct scatterlist *sg;
3713 if (!iommu_need_mapping(dev))
3714 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3716 for_each_sg(sglist, sg, nelems, i) {
3717 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3720 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3722 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3725 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3726 enum dma_data_direction dir, unsigned long attrs)
3729 struct dmar_domain *domain;
3732 unsigned long iova_pfn;
3734 struct scatterlist *sg;
3735 unsigned long start_vpfn;
3736 struct intel_iommu *iommu;
3738 BUG_ON(dir == DMA_NONE);
3739 if (!iommu_need_mapping(dev))
3740 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3742 domain = deferred_attach_domain(dev);
3746 iommu = domain_get_iommu(domain);
3748 for_each_sg(sglist, sg, nelems, i)
3749 size += aligned_nrpages(sg->offset, sg->length);
3751 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3754 sglist->dma_length = 0;
3759 * Check if DMAR supports zero-length reads on write only
3762 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3763 !cap_zlr(iommu->cap))
3764 prot |= DMA_PTE_READ;
3765 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3766 prot |= DMA_PTE_WRITE;
3768 start_vpfn = mm_to_dma_pfn(iova_pfn);
3770 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3771 if (unlikely(ret)) {
3772 dma_pte_free_pagetable(domain, start_vpfn,
3773 start_vpfn + size - 1,
3774 agaw_to_level(domain->agaw) + 1);
3775 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3779 trace_map_sg(dev, iova_pfn << PAGE_SHIFT,
3780 sg_phys(sglist), size << VTD_PAGE_SHIFT);
3785 static u64 intel_get_required_mask(struct device *dev)
3787 if (!iommu_need_mapping(dev))
3788 return dma_direct_get_required_mask(dev);
3789 return DMA_BIT_MASK(32);
3792 static const struct dma_map_ops intel_dma_ops = {
3793 .alloc = intel_alloc_coherent,
3794 .free = intel_free_coherent,
3795 .map_sg = intel_map_sg,
3796 .unmap_sg = intel_unmap_sg,
3797 .map_page = intel_map_page,
3798 .unmap_page = intel_unmap_page,
3799 .map_resource = intel_map_resource,
3800 .unmap_resource = intel_unmap_resource,
3801 .dma_supported = dma_direct_supported,
3802 .mmap = dma_common_mmap,
3803 .get_sgtable = dma_common_get_sgtable,
3804 .get_required_mask = intel_get_required_mask,
3808 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3809 enum dma_data_direction dir, enum dma_sync_target target)
3811 struct dmar_domain *domain;
3812 phys_addr_t tlb_addr;
3814 domain = find_domain(dev);
3815 if (WARN_ON(!domain))
3818 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3819 if (is_swiotlb_buffer(tlb_addr))
3820 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3824 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3825 enum dma_data_direction dir, unsigned long attrs,
3828 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3829 struct dmar_domain *domain;
3830 struct intel_iommu *iommu;
3831 unsigned long iova_pfn;
3832 unsigned long nrpages;
3833 phys_addr_t tlb_addr;
3837 domain = deferred_attach_domain(dev);
3838 if (WARN_ON(dir == DMA_NONE || !domain))
3839 return DMA_MAPPING_ERROR;
3841 iommu = domain_get_iommu(domain);
3842 if (WARN_ON(!iommu))
3843 return DMA_MAPPING_ERROR;
3845 nrpages = aligned_nrpages(0, size);
3846 iova_pfn = intel_alloc_iova(dev, domain,
3847 dma_to_mm_pfn(nrpages), dma_mask);
3849 return DMA_MAPPING_ERROR;
3852 * Check if DMAR supports zero-length reads on write only
3855 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3856 !cap_zlr(iommu->cap))
3857 prot |= DMA_PTE_READ;
3858 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3859 prot |= DMA_PTE_WRITE;
3862 * If both the physical buffer start address and size are
3863 * page aligned, we don't need to use a bounce page.
3865 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3866 tlb_addr = swiotlb_tbl_map_single(dev,
3867 __phys_to_dma(dev, io_tlb_start),
3868 paddr, size, aligned_size, dir, attrs);
3869 if (tlb_addr == DMA_MAPPING_ERROR) {
3872 /* Clean up the padding area. */
3873 void *padding_start = phys_to_virt(tlb_addr);
3874 size_t padding_size = aligned_size;
3876 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3877 (dir == DMA_TO_DEVICE ||
3878 dir == DMA_BIDIRECTIONAL)) {
3879 padding_start += size;
3880 padding_size -= size;
3883 memset(padding_start, 0, padding_size);
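/*
 * e.g. bouncing a 0x1200-byte buffer for DMA_TO_DEVICE: the swiotlb
 * slot is 0x2000 bytes, the first 0x1200 bytes come from the caller,
 * and bytes 0x1200-0x1fff are zeroed here so the device cannot see
 * stale swiotlb contents past the real buffer.
 */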
3889 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3890 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3894 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3896 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3899 if (is_swiotlb_buffer(tlb_addr))
3900 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3901 aligned_size, dir, attrs);
3903 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3904 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3905 size, (unsigned long long)paddr, dir);
3907 return DMA_MAPPING_ERROR;
3911 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3912 enum dma_data_direction dir, unsigned long attrs)
3914 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3915 struct dmar_domain *domain;
3916 phys_addr_t tlb_addr;
3918 domain = find_domain(dev);
3919 if (WARN_ON(!domain))
3922 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3923 if (WARN_ON(!tlb_addr))
3926 intel_unmap(dev, dev_addr, size);
3927 if (is_swiotlb_buffer(tlb_addr))
3928 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3929 aligned_size, dir, attrs);
3931 trace_bounce_unmap_single(dev, dev_addr, size);
3935 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3936 size_t size, enum dma_data_direction dir, unsigned long attrs)
3938 return bounce_map_single(dev, page_to_phys(page) + offset,
3939 size, dir, attrs, *dev->dma_mask);
3943 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3944 enum dma_data_direction dir, unsigned long attrs)
3946 return bounce_map_single(dev, phys_addr, size,
3947 dir, attrs, *dev->dma_mask);
3951 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3952 enum dma_data_direction dir, unsigned long attrs)
3954 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3958 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3959 enum dma_data_direction dir, unsigned long attrs)
3961 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3965 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3966 enum dma_data_direction dir, unsigned long attrs)
3968 struct scatterlist *sg;
3971 for_each_sg(sglist, sg, nelems, i)
3972 bounce_unmap_page(dev, sg->dma_address,
3973 sg_dma_len(sg), dir, attrs);
3977 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3978 enum dma_data_direction dir, unsigned long attrs)
3981 struct scatterlist *sg;
3983 for_each_sg(sglist, sg, nelems, i) {
3984 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3985 sg->offset, sg->length,
3987 if (sg->dma_address == DMA_MAPPING_ERROR)
3989 sg_dma_len(sg) = sg->length;
3995 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4000 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4001 size_t size, enum dma_data_direction dir)
4003 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4007 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4008 size_t size, enum dma_data_direction dir)
4010 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4014 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4015 int nelems, enum dma_data_direction dir)
4017 struct scatterlist *sg;
4020 for_each_sg(sglist, sg, nelems, i)
4021 bounce_sync_single(dev, sg_dma_address(sg),
4022 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4026 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4027 int nelems, enum dma_data_direction dir)
4029 struct scatterlist *sg;
4032 for_each_sg(sglist, sg, nelems, i)
4033 bounce_sync_single(dev, sg_dma_address(sg),
4034 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4037 static const struct dma_map_ops bounce_dma_ops = {
4038 .alloc = intel_alloc_coherent,
4039 .free = intel_free_coherent,
4040 .map_sg = bounce_map_sg,
4041 .unmap_sg = bounce_unmap_sg,
4042 .map_page = bounce_map_page,
4043 .unmap_page = bounce_unmap_page,
4044 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4045 .sync_single_for_device = bounce_sync_single_for_device,
4046 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4047 .sync_sg_for_device = bounce_sync_sg_for_device,
4048 .map_resource = bounce_map_resource,
4049 .unmap_resource = bounce_unmap_resource,
4050 .dma_supported = dma_direct_supported,
4053 static inline int iommu_domain_cache_init(void)
4057 iommu_domain_cache = kmem_cache_create("iommu_domain",
4058 sizeof(struct dmar_domain),
4063 if (!iommu_domain_cache) {
4064 pr_err("Couldn't create iommu_domain cache\n");
4071 static inline int iommu_devinfo_cache_init(void)
4075 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4076 sizeof(struct device_domain_info),
4080 if (!iommu_devinfo_cache) {
4081 pr_err("Couldn't create devinfo cache\n");
4088 static int __init iommu_init_mempool(void)
4091 ret = iova_cache_get();
4095 ret = iommu_domain_cache_init();
4099 ret = iommu_devinfo_cache_init();
4103 kmem_cache_destroy(iommu_domain_cache);
4110 static void __init iommu_exit_mempool(void)
4112 kmem_cache_destroy(iommu_devinfo_cache);
4113 kmem_cache_destroy(iommu_domain_cache);
4117 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4119 struct dmar_drhd_unit *drhd;
4123 /* We know that this device on this chipset has its own IOMMU.
4124 * If we find it under a different IOMMU, then the BIOS is lying
4125 * to us. Hope that the IOMMU for this device is actually
4126 * disabled, and it needs no translation...
4128 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4130 /* "can't" happen */
4131 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4134 vtbar &= 0xffff0000;
4136 /* we know that this iommu should be at offset 0xa000 from vtbar */
4137 drhd = dmar_find_matched_drhd_unit(pdev);
4138 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4139 TAINT_FIRMWARE_WORKAROUND,
4140 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4141 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4143 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4145 static void __init init_no_remapping_devices(void)
4147 struct dmar_drhd_unit *drhd;
4151 for_each_drhd_unit(drhd) {
4152 if (!drhd->include_all) {
4153 for_each_active_dev_scope(drhd->devices,
4154 drhd->devices_cnt, i, dev)
4156 /* ignore DMAR unit if no devices exist */
4157 if (i == drhd->devices_cnt)
4162 for_each_active_drhd_unit(drhd) {
4163 if (drhd->include_all)
4166 for_each_active_dev_scope(drhd->devices,
4167 drhd->devices_cnt, i, dev)
4168 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4170 if (i < drhd->devices_cnt)
4173 /* This IOMMU has *only* gfx devices. Either bypass it or
4174 set the gfx_mapped flag, as appropriate */
4175 if (!dmar_map_gfx) {
4177 for_each_active_dev_scope(drhd->devices,
4178 drhd->devices_cnt, i, dev)
4179 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4184 #ifdef CONFIG_SUSPEND
4185 static int init_iommu_hw(void)
4187 struct dmar_drhd_unit *drhd;
4188 struct intel_iommu *iommu = NULL;
4190 for_each_active_iommu(iommu, drhd)
4192 dmar_reenable_qi(iommu);
4194 for_each_iommu(iommu, drhd) {
4195 if (drhd->ignored) {
4197 * we always have to disable PMRs or DMA may fail on
4201 iommu_disable_protect_mem_regions(iommu);
4205 iommu_flush_write_buffer(iommu);
4207 iommu_set_root_entry(iommu);
4209 iommu->flush.flush_context(iommu, 0, 0, 0,
4210 DMA_CCMD_GLOBAL_INVL);
4211 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4212 iommu_enable_translation(iommu);
4213 iommu_disable_protect_mem_regions(iommu);
4219 static void iommu_flush_all(void)
4221 struct dmar_drhd_unit *drhd;
4222 struct intel_iommu *iommu;
4224 for_each_active_iommu(iommu, drhd) {
4225 iommu->flush.flush_context(iommu, 0, 0, 0,
4226 DMA_CCMD_GLOBAL_INVL);
4227 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4228 DMA_TLB_GLOBAL_FLUSH);
4232 static int iommu_suspend(void)
4234 struct dmar_drhd_unit *drhd;
4235 struct intel_iommu *iommu = NULL;
4238 for_each_active_iommu(iommu, drhd) {
4239 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4241 if (!iommu->iommu_state)
4247 for_each_active_iommu(iommu, drhd) {
4248 iommu_disable_translation(iommu);
4250 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4252 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4253 readl(iommu->reg + DMAR_FECTL_REG);
4254 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4255 readl(iommu->reg + DMAR_FEDATA_REG);
4256 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4257 readl(iommu->reg + DMAR_FEADDR_REG);
4258 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4259 readl(iommu->reg + DMAR_FEUADDR_REG);
4261 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4266 for_each_active_iommu(iommu, drhd)
4267 kfree(iommu->iommu_state);
4272 static void iommu_resume(void)
4274 struct dmar_drhd_unit *drhd;
4275 struct intel_iommu *iommu = NULL;
4278 if (init_iommu_hw()) {
4280 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4282 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4286 for_each_active_iommu(iommu, drhd) {
4288 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4290 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4291 iommu->reg + DMAR_FECTL_REG);
4292 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4293 iommu->reg + DMAR_FEDATA_REG);
4294 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4295 iommu->reg + DMAR_FEADDR_REG);
4296 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4297 iommu->reg + DMAR_FEUADDR_REG);
4299 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4302 for_each_active_iommu(iommu, drhd)
4303 kfree(iommu->iommu_state);
4306 static struct syscore_ops iommu_syscore_ops = {
4307 .resume = iommu_resume,
4308 .suspend = iommu_suspend,
4311 static void __init init_iommu_pm_ops(void)
4313 register_syscore_ops(&iommu_syscore_ops);
4317 static inline void init_iommu_pm_ops(void) {}
4318 #endif /* CONFIG_PM */
4320 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4322 struct acpi_dmar_reserved_memory *rmrr;
4323 struct dmar_rmrr_unit *rmrru;
4326 rmrr = (struct acpi_dmar_reserved_memory *)header;
4327 ret = arch_rmrr_sanity_check(rmrr);
4331 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4335 rmrru->hdr = header;
4337 rmrru->base_address = rmrr->base_address;
4338 rmrru->end_address = rmrr->end_address;
4340 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4341 ((void *)rmrr) + rmrr->header.length,
4342 &rmrru->devices_cnt);
4343 if (rmrru->devices_cnt && rmrru->devices == NULL)
4346 list_add(&rmrru->list, &dmar_rmrr_units);
4355 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4357 struct dmar_atsr_unit *atsru;
4358 struct acpi_dmar_atsr *tmp;
4360 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4361 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4362 if (atsr->segment != tmp->segment)
4364 if (atsr->header.length != tmp->header.length)
4366 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4373 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4375 struct acpi_dmar_atsr *atsr;
4376 struct dmar_atsr_unit *atsru;
4378 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4381 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4382 atsru = dmar_find_atsr(atsr);
4386 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4391 * If memory is allocated from slab by ACPI _DSM method, we need to
4392 * copy the memory content because the memory buffer will be freed
4395 atsru->hdr = (void *)(atsru + 1);
4396 memcpy(atsru->hdr, hdr, hdr->length);
4397 atsru->include_all = atsr->flags & 0x1;
4398 if (!atsru->include_all) {
4399 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4400 (void *)atsr + atsr->header.length,
4401 &atsru->devices_cnt);
4402 if (atsru->devices_cnt && atsru->devices == NULL) {
4408 list_add_rcu(&atsru->list, &dmar_atsr_units);
4413 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4415 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4419 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4421 struct acpi_dmar_atsr *atsr;
4422 struct dmar_atsr_unit *atsru;
4424 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4425 atsru = dmar_find_atsr(atsr);
4427 list_del_rcu(&atsru->list);
4429 intel_iommu_free_atsr(atsru);
4435 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4439 struct acpi_dmar_atsr *atsr;
4440 struct dmar_atsr_unit *atsru;
4442 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4443 atsru = dmar_find_atsr(atsr);
4447 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4448 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4456 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4459 struct intel_iommu *iommu = dmaru->iommu;
4461 if (g_iommus[iommu->seq_id])
4464 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4465 pr_warn("%s: Doesn't support hardware pass through.\n",
4469 if (!ecap_sc_support(iommu->ecap) &&
4470 domain_update_iommu_snooping(iommu)) {
4471 pr_warn("%s: Doesn't support snooping.\n",
4475 sp = domain_update_iommu_superpage(iommu) - 1;
4476 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4477 pr_warn("%s: Doesn't support large page.\n",
4483 * Disable translation if already enabled prior to OS handover.
4485 if (iommu->gcmd & DMA_GCMD_TE)
4486 iommu_disable_translation(iommu);
4488 g_iommus[iommu->seq_id] = iommu;
4489 ret = iommu_init_domains(iommu);
4491 ret = iommu_alloc_root_entry(iommu);
4495 intel_svm_check(iommu);
4497 if (dmaru->ignored) {
4499 * we always have to disable PMRs or DMA may fail on this device
4502 iommu_disable_protect_mem_regions(iommu);
4506 intel_iommu_init_qi(iommu);
4507 iommu_flush_write_buffer(iommu);
4509 #ifdef CONFIG_INTEL_IOMMU_SVM
4510 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4511 ret = intel_svm_enable_prq(iommu);
4516 ret = dmar_set_interrupt(iommu);
4520 iommu_set_root_entry(iommu);
4521 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4522 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4523 iommu_enable_translation(iommu);
4525 iommu_disable_protect_mem_regions(iommu);
4529 disable_dmar_iommu(iommu);
4531 free_dmar_iommu(iommu);
4535 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4538 struct intel_iommu *iommu = dmaru->iommu;
4540 if (!intel_iommu_enabled)
4546 ret = intel_iommu_add(dmaru);
4548 disable_dmar_iommu(iommu);
4549 free_dmar_iommu(iommu);
4555 static void intel_iommu_free_dmars(void)
4557 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4558 struct dmar_atsr_unit *atsru, *atsr_n;
4560 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4561 list_del(&rmrru->list);
4562 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4566 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4567 list_del(&atsru->list);
4568 intel_iommu_free_atsr(atsru);
4572 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4575 struct pci_bus *bus;
4576 struct pci_dev *bridge = NULL;
4578 struct acpi_dmar_atsr *atsr;
4579 struct dmar_atsr_unit *atsru;
4581 dev = pci_physfn(dev);
4582 for (bus = dev->bus; bus; bus = bus->parent) {
4584 /* If it's an integrated device, allow ATS */
4587 /* Connected via non-PCIe: no ATS */
4588 if (!pci_is_pcie(bridge) ||
4589 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4591 /* If we found the root port, look it up in the ATSR */
4592 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4597 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4598 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4599 if (atsr->segment != pci_domain_nr(dev->bus))
4602 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4603 if (tmp == &bridge->dev)
4606 if (atsru->include_all)
4616 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4619 struct dmar_rmrr_unit *rmrru;
4620 struct dmar_atsr_unit *atsru;
4621 struct acpi_dmar_atsr *atsr;
4622 struct acpi_dmar_reserved_memory *rmrr;
4624 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4627 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4628 rmrr = container_of(rmrru->hdr,
4629 struct acpi_dmar_reserved_memory, header);
4630 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4631 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4632 ((void *)rmrr) + rmrr->header.length,
4633 rmrr->segment, rmrru->devices,
4634 rmrru->devices_cnt);
4637 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4638 dmar_remove_dev_scope(info, rmrr->segment,
4639 rmrru->devices, rmrru->devices_cnt);
4643 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4644 if (atsru->include_all)
4647 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4648 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4649 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4650 (void *)atsr + atsr->header.length,
4651 atsr->segment, atsru->devices,
4652 atsru->devices_cnt);
4657 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4658 if (dmar_remove_dev_scope(info, atsr->segment,
4659 atsru->devices, atsru->devices_cnt))
4667 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4668 unsigned long val, void *v)
4670 struct memory_notify *mhp = v;
4671 unsigned long long start, end;
4672 unsigned long start_vpfn, last_vpfn;
4675 case MEM_GOING_ONLINE:
4676 start = mhp->start_pfn << PAGE_SHIFT;
4677 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
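/*
 * e.g. onlining 128MiB at PFN 0x100000 makes start == 0x100000000 and
 * end == 0x107ffffff; the new range is identity mapped into si_domain
 * so identity-mapped devices can DMA to the hot-added memory.
 */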
4678 if (iommu_domain_identity_map(si_domain, start, end)) {
4679 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4686 case MEM_CANCEL_ONLINE:
4687 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4688 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4689 while (start_vpfn <= last_vpfn) {
4691 struct dmar_drhd_unit *drhd;
4692 struct intel_iommu *iommu;
4693 struct page *freelist;
4695 iova = find_iova(&si_domain->iovad, start_vpfn);
4697 pr_debug("Failed get IOVA for PFN %lx\n",
4702 iova = split_and_remove_iova(&si_domain->iovad, iova,
4703 start_vpfn, last_vpfn);
4705 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4706 start_vpfn, last_vpfn);
4710 freelist = domain_unmap(si_domain, iova->pfn_lo,
4714 for_each_active_iommu(iommu, drhd)
4715 iommu_flush_iotlb_psi(iommu, si_domain,
4716 iova->pfn_lo, iova_size(iova),
4719 dma_free_pagelist(freelist);
4721 start_vpfn = iova->pfn_hi + 1;
4722 free_iova_mem(iova);
4730 static struct notifier_block intel_iommu_memory_nb = {
4731 .notifier_call = intel_iommu_memory_notifier,
4735 static void free_all_cpu_cached_iovas(unsigned int cpu)
4739 for (i = 0; i < g_num_of_iommus; i++) {
4740 struct intel_iommu *iommu = g_iommus[i];
4741 struct dmar_domain *domain;
4747 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4748 domain = get_iommu_domain(iommu, (u16)did);
4752 free_cpu_cached_iovas(cpu, &domain->iovad);
4757 static int intel_iommu_cpu_dead(unsigned int cpu)
4759 free_all_cpu_cached_iovas(cpu);
4763 static void intel_disable_iommus(void)
4765 struct intel_iommu *iommu = NULL;
4766 struct dmar_drhd_unit *drhd;
4768 for_each_iommu(iommu, drhd)
4769 iommu_disable_translation(iommu);
4772 void intel_iommu_shutdown(void)
4774 struct dmar_drhd_unit *drhd;
4775 struct intel_iommu *iommu = NULL;
4777 if (no_iommu || dmar_disabled)
4780 down_write(&dmar_global_lock);
4782 /* Disable PMRs explicitly here. */
4783 for_each_iommu(iommu, drhd)
4784 iommu_disable_protect_mem_regions(iommu);
4786 /* Make sure the IOMMUs are switched off */
4787 intel_disable_iommus();
4789 up_write(&dmar_global_lock);
4792 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4794 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4796 return container_of(iommu_dev, struct intel_iommu, iommu);
4799 static ssize_t intel_iommu_show_version(struct device *dev,
4800 struct device_attribute *attr,
4803 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4804 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4805 return sprintf(buf, "%d:%d\n",
4806 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4808 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4810 static ssize_t intel_iommu_show_address(struct device *dev,
4811 struct device_attribute *attr,
4814 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4815 return sprintf(buf, "%llx\n", iommu->reg_phys);
4817 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4819 static ssize_t intel_iommu_show_cap(struct device *dev,
4820 struct device_attribute *attr,
4823 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4824 return sprintf(buf, "%llx\n", iommu->cap);
4826 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4828 static ssize_t intel_iommu_show_ecap(struct device *dev,
4829 struct device_attribute *attr,
4832 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4833 return sprintf(buf, "%llx\n", iommu->ecap);
4835 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4837 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4838 struct device_attribute *attr,
4841 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4842 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4844 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4846 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4847 struct device_attribute *attr,
4850 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4851 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4852 cap_ndoms(iommu->cap)));
4854 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4856 static struct attribute *intel_iommu_attrs[] = {
4857 &dev_attr_version.attr,
4858 &dev_attr_address.attr,
4860 &dev_attr_ecap.attr,
4861 &dev_attr_domains_supported.attr,
4862 &dev_attr_domains_used.attr,
4866 static struct attribute_group intel_iommu_group = {
4867 .name = "intel-iommu",
4868 .attrs = intel_iommu_attrs,
4871 const struct attribute_group *intel_iommu_groups[] = {
4876 static inline bool has_untrusted_dev(void)
4878 struct pci_dev *pdev = NULL;
4880 for_each_pci_dev(pdev)
4881 if (pdev->untrusted)
4887 static int __init platform_optin_force_iommu(void)
4889 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4892 if (no_iommu || dmar_disabled)
4893 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4896 * If Intel-IOMMU is disabled by default, we will apply identity
4897 * map for all devices except those marked as being untrusted.
4900 iommu_identity_mapping |= IDENTMAP_ALL;
4908 static int __init probe_acpi_namespace_devices(void)
4910 struct dmar_drhd_unit *drhd;
4911 /* To avoid a -Wunused-but-set-variable warning. */
4912 struct intel_iommu *iommu __maybe_unused;
4916 for_each_active_iommu(iommu, drhd) {
4917 for_each_active_dev_scope(drhd->devices,
4918 drhd->devices_cnt, i, dev) {
4919 struct acpi_device_physical_node *pn;
4920 struct iommu_group *group;
4921 struct acpi_device *adev;
4923 if (dev->bus != &acpi_bus_type)
4926 adev = to_acpi_device(dev);
4927 mutex_lock(&adev->physical_node_lock);
4928 list_for_each_entry(pn,
4929 &adev->physical_node_list, node) {
4930 group = iommu_group_get(pn->dev);
4932 iommu_group_put(group);
4936 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4937 ret = iommu_probe_device(pn->dev);
4941 mutex_unlock(&adev->physical_node_lock);
4951 int __init intel_iommu_init(void)
4954 struct dmar_drhd_unit *drhd;
4955 struct intel_iommu *iommu;
4958 * Intel IOMMU is required for a TXT/tboot launch or platform
4959 * opt in, so enforce that.
4961 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4963 if (iommu_init_mempool()) {
4965 panic("tboot: Failed to initialize iommu memory\n");
4969 down_write(&dmar_global_lock);
4970 if (dmar_table_init()) {
4972 panic("tboot: Failed to initialize DMAR table\n");
4976 if (dmar_dev_scope_init() < 0) {
4978 panic("tboot: Failed to initialize DMAR device scope\n");
4982 up_write(&dmar_global_lock);
4985 * The bus notifier takes the dmar_global_lock, so lockdep will
4986 * complain later when we register it under the lock.
4988 dmar_register_bus_notifier();
4990 down_write(&dmar_global_lock);
4992 if (no_iommu || dmar_disabled) {
4994 * We exit the function here to ensure IOMMU's remapping and
4995 * mempool aren't setup, which means that the IOMMU's PMRs
4996 * won't be disabled via the call to init_dmars(). So disable
4997 * it explicitly here. The PMRs were set up by tboot prior to
4998 * calling SENTER, but the kernel is expected to reset/tear down the PMRs.
5001 if (intel_iommu_tboot_noforce) {
5002 for_each_iommu(iommu, drhd)
5003 iommu_disable_protect_mem_regions(iommu);
5007 * Make sure the IOMMUs are switched off, even when we
5008 * boot into a kexec kernel and the previous kernel left them enabled.
5011 intel_disable_iommus();
5015 if (list_empty(&dmar_rmrr_units))
5016 pr_info("No RMRR found\n");
5018 if (list_empty(&dmar_atsr_units))
5019 pr_info("No ATSR found\n");
5021 if (dmar_init_reserved_ranges()) {
5023 panic("tboot: Failed to reserve iommu ranges\n");
5024 goto out_free_reserved_range;
5028 intel_iommu_gfx_mapped = 1;
5030 init_no_remapping_devices();
5035 panic("tboot: Failed to initialize DMARs\n");
5036 pr_err("Initialization failed\n");
5037 goto out_free_reserved_range;
5039 up_write(&dmar_global_lock);
5041 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5043 * If the system has no untrusted device or the user has decided
5044 * to disable the bounce page mechanisms, we don't need swiotlb.
5045 * Mark this so that the pre-allocated bounce pages will be released later.
5048 if (!has_untrusted_dev() || intel_no_bounce)
5051 dma_ops = &intel_dma_ops;
5053 init_iommu_pm_ops();
5055 for_each_active_iommu(iommu, drhd) {
5056 iommu_device_sysfs_add(&iommu->iommu, NULL, intel_iommu_groups, "%s", iommu->name);
5059 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5060 iommu_device_register(&iommu->iommu);
5063 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5064 if (si_domain && !hw_pass_through)
5065 register_memory_notifier(&intel_iommu_memory_nb);
5066 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5067 intel_iommu_cpu_dead);
5069 down_read(&dmar_global_lock);
5070 if (probe_acpi_namespace_devices())
5071 pr_warn("ACPI namespace devices didn't probe correctly\n");
5072 up_read(&dmar_global_lock);
5074 /* Finally, we enable the DMA remapping hardware. */
5075 for_each_iommu(iommu, drhd) {
5076 if (!drhd->ignored && !translation_pre_enabled(iommu))
5077 iommu_enable_translation(iommu);
5079 iommu_disable_protect_mem_regions(iommu);
5081 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5083 intel_iommu_enabled = 1;
5084 intel_iommu_debugfs_init();
5088 out_free_reserved_range:
5089 put_iova_domain(&reserved_iova_list);
5091 intel_iommu_free_dmars();
5092 up_write(&dmar_global_lock);
5093 iommu_exit_mempool();
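/*
 * Ordering note for intel_iommu_init() above: translation is only enabled
 * at the very end, after the DMA ops, sysfs entries, bus IOMMU ops and the
 * memory/CPU hotplug notifiers are all in place.  Units whose translation
 * was already live (e.g. inherited across kexec) are left enabled as-is and
 * merely have their protected memory regions disabled.
 */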
5097 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5099 struct intel_iommu *iommu = opaque;
5101 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5106 * NB - intel-iommu lacks any sort of reference counting for the users of
5107 * dependent devices. If multiple endpoints have intersecting dependent
5108 * devices, unbinding the driver from any one of them will possibly leave
5109 * the others unable to operate.
5111 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5113 if (!iommu || !dev || !dev_is_pci(dev))
5116 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5119 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5121 struct dmar_domain *domain;
5122 struct intel_iommu *iommu;
5123 unsigned long flags;
5125 assert_spin_locked(&device_domain_lock);
5130 iommu = info->iommu;
5131 domain = info->domain;
5134 if (dev_is_pci(info->dev) && sm_supported(iommu))
5135 intel_pasid_tear_down_entry(iommu, info->dev, PASID_RID2PASID);
5138 iommu_disable_dev_iotlb(info);
5139 domain_context_clear(iommu, info->dev);
5140 intel_pasid_free_table(info->dev);
5143 unlink_domain_info(info);
5145 spin_lock_irqsave(&iommu->lock, flags);
5146 domain_detach_iommu(domain, iommu);
5147 spin_unlock_irqrestore(&iommu->lock, flags);
5149 /* free the private domain */
5150 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5151 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5152 list_empty(&domain->devices))
5153 domain_exit(info->domain);
5155 free_devinfo_mem(info);
5158 static void dmar_remove_one_dev_info(struct device *dev)
5160 struct device_domain_info *info;
5161 unsigned long flags;
5163 spin_lock_irqsave(&device_domain_lock, flags);
5164 info = dev->archdata.iommu;
5166 __dmar_remove_one_dev_info(info);
5167 spin_unlock_irqrestore(&device_domain_lock, flags);
5170 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5174 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5175 domain_reserve_special_ranges(domain);
5177 /* calculate AGAW */
5178 domain->gaw = guest_width;
5179 adjust_width = guestwidth_to_adjustwidth(guest_width);
5180 domain->agaw = width_to_agaw(adjust_width);
5182 domain->iommu_coherency = 0;
5183 domain->iommu_snooping = 0;
5184 domain->iommu_superpage = 0;
5185 domain->max_addr = 0;
5187 /* always allocate the top pgd */
5188 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5191 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5195 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5197 struct dmar_domain *dmar_domain;
5198 struct iommu_domain *domain;
5201 case IOMMU_DOMAIN_DMA:
5203 case IOMMU_DOMAIN_UNMANAGED:
5204 dmar_domain = alloc_domain(0);
5206 pr_err("Can't allocate dmar_domain\n");
5209 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5210 pr_err("Domain initialization failed\n");
5211 domain_exit(dmar_domain);
5215 if (type == IOMMU_DOMAIN_DMA &&
5216 init_iova_flush_queue(&dmar_domain->iovad,
5217 iommu_flush_iova, iova_entry_free)) {
5218 pr_warn("iova flush queue initialization failed\n");
5219 intel_iommu_strict = 1;
5222 domain_update_iommu_cap(dmar_domain);
5224 domain = &dmar_domain->domain;
5225 domain->geometry.aperture_start = 0;
5226 domain->geometry.aperture_end =
5227 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5228 domain->geometry.force_aperture = true;
5231 case IOMMU_DOMAIN_IDENTITY:
5232 return &si_domain->domain;
5240 static void intel_iommu_domain_free(struct iommu_domain *domain)
5242 if (domain != &si_domain->domain)
5243 domain_exit(to_dmar_domain(domain));
5247 * Check whether a @domain could be attached to the @dev through the
5248 * aux-domain attach/detach APIs.
5251 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5253 struct device_domain_info *info = dev->archdata.iommu;
5255 return info && info->auxd_enabled &&
5256 domain->type == IOMMU_DOMAIN_UNMANAGED;
5259 static void auxiliary_link_device(struct dmar_domain *domain,
5262 struct device_domain_info *info = dev->archdata.iommu;
5264 assert_spin_locked(&device_domain_lock);
5268 domain->auxd_refcnt++;
5269 list_add(&domain->auxd, &info->auxiliary_domains);
5272 static void auxiliary_unlink_device(struct dmar_domain *domain,
5275 struct device_domain_info *info = dev->archdata.iommu;
5277 assert_spin_locked(&device_domain_lock);
5281 list_del(&domain->auxd);
5282 domain->auxd_refcnt--;
5284 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5285 ioasid_free(domain->default_pasid);
5288 static int aux_domain_add_dev(struct dmar_domain *domain,
5293 unsigned long flags;
5294 struct intel_iommu *iommu;
5296 iommu = device_to_iommu(dev, &bus, &devfn);
5300 if (domain->default_pasid <= 0) {
5303 /* No private data needed for the default pasid */
5304 pasid = ioasid_alloc(NULL, PASID_MIN,
5305 pci_max_pasids(to_pci_dev(dev)) - 1, NULL);
5307 if (pasid == INVALID_IOASID) {
5308 pr_err("Can't allocate default pasid\n");
5311 domain->default_pasid = pasid;
5314 spin_lock_irqsave(&device_domain_lock, flags);
5316 * iommu->lock must be held to attach domain to iommu and setup the
5317 * pasid entry for second level translation.
5319 spin_lock(&iommu->lock);
5320 ret = domain_attach_iommu(domain, iommu);
5324 /* Setup the PASID entry for mediated devices: */
5325 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5326 domain->default_pasid);
5329 spin_unlock(&iommu->lock);
5331 auxiliary_link_device(domain, dev);
5333 spin_unlock_irqrestore(&device_domain_lock, flags);
5338 domain_detach_iommu(domain, iommu);
5340 spin_unlock(&iommu->lock);
5341 spin_unlock_irqrestore(&device_domain_lock, flags);
5342 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5343 ioasid_free(domain->default_pasid);
5348 static void aux_domain_remove_dev(struct dmar_domain *domain,
5351 struct device_domain_info *info;
5352 struct intel_iommu *iommu;
5353 unsigned long flags;
5355 if (!is_aux_domain(dev, &domain->domain))
5358 spin_lock_irqsave(&device_domain_lock, flags);
5359 info = dev->archdata.iommu;
5360 iommu = info->iommu;
5362 auxiliary_unlink_device(domain, dev);
5364 spin_lock(&iommu->lock);
5365 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5366 domain_detach_iommu(domain, iommu);
5367 spin_unlock(&iommu->lock);
5369 spin_unlock_irqrestore(&device_domain_lock, flags);
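/*
 * Rough usage sketch for the aux-domain path above, as seen from a driver
 * going through the generic IOMMU core wrappers (core API names, not
 * helpers from this file):
 *
 *	if (iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX) &&
 *	    !iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX)) {
 *		domain = iommu_domain_alloc(dev->bus);
 *		iommu_aux_attach_device(domain, dev);
 *		pasid = iommu_aux_get_pasid(domain, dev);
 *		... tag DMA from the mediated device with pasid ...
 *		iommu_aux_detach_device(domain, dev);
 *	}
 *
 * The PASID handed back is the default_pasid allocated in
 * aux_domain_add_dev().
 */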
5372 static int prepare_domain_attach_device(struct iommu_domain *domain,
5375 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5376 struct intel_iommu *iommu;
5380 iommu = device_to_iommu(dev, &bus, &devfn);
5384 /* check if this iommu agaw is sufficient for max mapped address */
5385 addr_width = agaw_to_width(iommu->agaw);
5386 if (addr_width > cap_mgaw(iommu->cap))
5387 addr_width = cap_mgaw(iommu->cap);
5389 if (dmar_domain->max_addr > (1LL << addr_width)) {
5390 dev_err(dev, "%s: iommu width (%d) is not "
5391 "sufficient for the mapped address (%llx)\n",
5392 __func__, addr_width, dmar_domain->max_addr);
5395 dmar_domain->gaw = addr_width;
5398 * Knock out extra levels of page tables if necessary
5400 while (iommu->agaw < dmar_domain->agaw) {
5401 struct dma_pte *pte;
5403 pte = dmar_domain->pgd;
5404 if (dma_pte_present(pte)) {
5405 dmar_domain->pgd = (struct dma_pte *)
5406 phys_to_virt(dma_pte_addr(pte));
5407 free_pgtable_page(pte);
5409 dmar_domain->agaw--;
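/*
 * Example of the loop above: if the domain's page table was built with one
 * more level than this IOMMU can walk (say a 4-level/48-bit table attached
 * to a unit that only supports 3-level/39-bit AGAW), the top-level page is
 * freed and domain->pgd is re-pointed at the table referenced by its first
 * entry, repeating until the software and hardware walk depths match.
 */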
5415 static int intel_iommu_attach_device(struct iommu_domain *domain,
5420 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5421 device_is_rmrr_locked(dev)) {
5422 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5426 if (is_aux_domain(dev, domain))
5429 /* normally dev is not mapped */
5430 if (unlikely(domain_context_mapped(dev))) {
5431 struct dmar_domain *old_domain;
5433 old_domain = find_domain(dev);
5435 dmar_remove_one_dev_info(dev);
5438 ret = prepare_domain_attach_device(domain, dev);
5442 return domain_add_dev_info(to_dmar_domain(domain), dev);
5445 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5450 if (!is_aux_domain(dev, domain))
5453 ret = prepare_domain_attach_device(domain, dev);
5457 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5460 static void intel_iommu_detach_device(struct iommu_domain *domain,
5463 dmar_remove_one_dev_info(dev);
5466 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5469 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5472 static int intel_iommu_map(struct iommu_domain *domain,
5473 unsigned long iova, phys_addr_t hpa,
5474 size_t size, int iommu_prot, gfp_t gfp)
5476 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5481 if (iommu_prot & IOMMU_READ)
5482 prot |= DMA_PTE_READ;
5483 if (iommu_prot & IOMMU_WRITE)
5484 prot |= DMA_PTE_WRITE;
5485 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5486 prot |= DMA_PTE_SNP;
5488 max_addr = iova + size;
5489 if (dmar_domain->max_addr < max_addr) {
5492 /* check if minimum agaw is sufficient for mapped address */
5493 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5494 if (end < max_addr) {
5495 pr_err("%s: iommu width (%d) is not "
5496 "sufficient for the mapped address (%llx)\n",
5497 __func__, dmar_domain->gaw, max_addr);
5500 dmar_domain->max_addr = max_addr;
5502 /* Round up size to next multiple of PAGE_SIZE, if it and
5503 the low bits of hpa would take us onto the next page */
5504 size = aligned_nrpages(hpa, size);
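/*
 * Worked example of the rounding above (4KiB VTD_PAGE_SIZE assumed):
 * hpa = 0x1234, size = 0x2000 touches bytes 0x1234..0x3233, i.e. three
 * 4KiB pages, so aligned_nrpages() yields 3 even though the size alone
 * would suggest 2.
 */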
5505 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5506 hpa >> VTD_PAGE_SHIFT, size, prot);
5510 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5511 unsigned long iova, size_t size,
5512 struct iommu_iotlb_gather *gather)
5514 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5515 struct page *freelist = NULL;
5516 unsigned long start_pfn, last_pfn;
5517 unsigned int npages;
5518 int iommu_id, level = 0;
5520 /* Cope with horrid API which requires us to unmap more than the
5521 size argument if it happens to be a large-page mapping. */
5522 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5524 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5525 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
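/*
 * Example: if @iova falls inside a 2MiB superpage, pfn_to_dma_pte() reports
 * level 2, level_to_offset_bits() is 9, and a 4KiB unmap request is widened
 * to the full 2MiB covered by that PTE; the enlarged size is what gets
 * unmapped, flushed and returned to the caller.
 */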
5527 start_pfn = iova >> VTD_PAGE_SHIFT;
5528 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5530 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5532 npages = last_pfn - start_pfn + 1;
5534 for_each_domain_iommu(iommu_id, dmar_domain)
5535 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5536 start_pfn, npages, !freelist, 0);
5538 dma_free_pagelist(freelist);
5540 if (dmar_domain->max_addr == iova + size)
5541 dmar_domain->max_addr = iova;
5546 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5549 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5550 struct dma_pte *pte;
5554 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5556 phys = dma_pte_addr(pte);
5561 static inline bool scalable_mode_support(void)
5563 struct dmar_drhd_unit *drhd;
5564 struct intel_iommu *iommu;
5568 for_each_active_iommu(iommu, drhd) {
5569 if (!sm_supported(iommu)) {
5579 static inline bool iommu_pasid_support(void)
5581 struct dmar_drhd_unit *drhd;
5582 struct intel_iommu *iommu;
5586 for_each_active_iommu(iommu, drhd) {
5587 if (!pasid_supported(iommu)) {
5597 static bool intel_iommu_capable(enum iommu_cap cap)
5599 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5600 return domain_update_iommu_snooping(NULL) == 1;
5601 if (cap == IOMMU_CAP_INTR_REMAP)
5602 return irq_remapping_enabled == 1;
5607 static int intel_iommu_add_device(struct device *dev)
5609 struct dmar_domain *dmar_domain;
5610 struct iommu_domain *domain;
5611 struct intel_iommu *iommu;
5612 struct iommu_group *group;
5616 iommu = device_to_iommu(dev, &bus, &devfn);
5620 iommu_device_link(&iommu->iommu, dev);
5622 if (translation_pre_enabled(iommu))
5623 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5625 group = iommu_group_get_for_dev(dev);
5628 return PTR_ERR(group);
5630 iommu_group_put(group);
5632 domain = iommu_get_domain_for_dev(dev);
5633 dmar_domain = to_dmar_domain(domain);
5634 if (domain->type == IOMMU_DOMAIN_DMA) {
5635 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5636 ret = iommu_request_dm_for_dev(dev);
5638 dmar_remove_one_dev_info(dev);
5639 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5640 domain_add_dev_info(si_domain, dev);
5642 "Device uses a private identity domain.\n");
5646 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5647 ret = iommu_request_dma_domain_for_dev(dev);
5649 dmar_remove_one_dev_info(dev);
5650 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5651 if (!get_private_domain_for_dev(dev)) {
5653 "Failed to get a private domain.\n");
5658 "Device uses a private dma domain.\n");
5663 if (device_needs_bounce(dev)) {
5664 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5665 set_dma_ops(dev, &bounce_dma_ops);
5671 static void intel_iommu_remove_device(struct device *dev)
5673 struct intel_iommu *iommu;
5676 iommu = device_to_iommu(dev, &bus, &devfn);
5680 dmar_remove_one_dev_info(dev);
5682 iommu_group_remove_device(dev);
5684 iommu_device_unlink(&iommu->iommu, dev);
5686 if (device_needs_bounce(dev))
5687 set_dma_ops(dev, NULL);
5690 static void intel_iommu_get_resv_regions(struct device *device,
5691 struct list_head *head)
5693 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5694 struct iommu_resv_region *reg;
5695 struct dmar_rmrr_unit *rmrr;
5696 struct device *i_dev;
5699 down_read(&dmar_global_lock);
5700 for_each_rmrr_units(rmrr) {
5701 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5703 struct iommu_resv_region *resv;
5704 enum iommu_resv_type type;
5707 if (i_dev != device &&
5708 !is_downstream_to_pci_bridge(device, i_dev))
5711 length = rmrr->end_address - rmrr->base_address + 1;
5713 type = device_rmrr_is_relaxable(device) ?
5714 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5716 resv = iommu_alloc_resv_region(rmrr->base_address,
5717 length, prot, type);
5721 list_add_tail(&resv->list, head);
5724 up_read(&dmar_global_lock);
5726 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5727 if (dev_is_pci(device)) {
5728 struct pci_dev *pdev = to_pci_dev(device);
5730 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5731 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5732 IOMMU_RESV_DIRECT_RELAXABLE);
5734 list_add_tail(&reg->list, head);
5737 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5739 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5740 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 0, IOMMU_RESV_MSI);
5744 list_add_tail(&reg->list, head);
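/*
 * Regions reported to the core above, in order: RMRR ranges that target the
 * device (direct-mapped, relaxable when device_rmrr_is_relaxable() allows),
 * the low 16MiB ISA window for the legacy floppy workaround on ISA bridges
 * (CONFIG_INTEL_IOMMU_FLOPPY_WA), and finally the IOAPIC/MSI interrupt
 * window described by IOAPIC_RANGE_START..IOAPIC_RANGE_END, which must stay
 * out of any DMA mapping.
 */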
5747 static void intel_iommu_put_resv_regions(struct device *dev,
5748 struct list_head *head)
5750 struct iommu_resv_region *entry, *next;
5752 list_for_each_entry_safe(entry, next, head, list) kfree(entry);
5756 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5758 struct device_domain_info *info;
5759 struct context_entry *context;
5760 struct dmar_domain *domain;
5761 unsigned long flags;
5765 domain = find_domain(dev);
5769 spin_lock_irqsave(&device_domain_lock, flags);
5770 spin_lock(&iommu->lock);
5773 info = dev->archdata.iommu;
5774 if (!info || !info->pasid_supported)
5777 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5778 if (WARN_ON(!context))
5781 ctx_lo = context[0].lo;
5783 if (!(ctx_lo & CONTEXT_PASIDE)) {
5784 ctx_lo |= CONTEXT_PASIDE;
5785 context[0].lo = ctx_lo;
5787 iommu->flush.flush_context(iommu,
5788 domain->iommu_did[iommu->seq_id],
5789 PCI_DEVID(info->bus, info->devfn),
5790 DMA_CCMD_MASK_NOBIT,
5791 DMA_CCMD_DEVICE_INVL);
5794 /* Enable PASID support in the device, if it wasn't already */
5795 if (!info->pasid_enabled)
5796 iommu_enable_dev_iotlb(info);
5801 spin_unlock(&iommu->lock);
5802 spin_unlock_irqrestore(&device_domain_lock, flags);
5807 static void intel_iommu_apply_resv_region(struct device *dev,
5808 struct iommu_domain *domain,
5809 struct iommu_resv_region *region)
5811 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5812 unsigned long start, end;
5814 start = IOVA_PFN(region->start);
5815 end = IOVA_PFN(region->start + region->length - 1);
5817 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
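/*
 * reserve_iova() above permanently carves the reserved region out of the
 * domain's IOVA allocator, so addresses inside an RMRR (or the MSI window)
 * are never handed out for DMA in that domain; the WARN fires only if the
 * reservation could not be made.
 */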
5820 #ifdef CONFIG_INTEL_IOMMU_SVM
5821 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5823 struct intel_iommu *iommu;
5826 if (iommu_dummy(dev)) {
5828 "No IOMMU translation for device; cannot enable SVM\n");
5832 iommu = device_to_iommu(dev, &bus, &devfn);
5834 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5840 #endif /* CONFIG_INTEL_IOMMU_SVM */
5842 static int intel_iommu_enable_auxd(struct device *dev)
5844 struct device_domain_info *info;
5845 struct intel_iommu *iommu;
5846 unsigned long flags;
5850 iommu = device_to_iommu(dev, &bus, &devfn);
5851 if (!iommu || dmar_disabled)
5854 if (!sm_supported(iommu) || !pasid_supported(iommu))
5857 ret = intel_iommu_enable_pasid(iommu, dev);
5861 spin_lock_irqsave(&device_domain_lock, flags);
5862 info = dev->archdata.iommu;
5863 info->auxd_enabled = 1;
5864 spin_unlock_irqrestore(&device_domain_lock, flags);
5869 static int intel_iommu_disable_auxd(struct device *dev)
5871 struct device_domain_info *info;
5872 unsigned long flags;
5874 spin_lock_irqsave(&device_domain_lock, flags);
5875 info = dev->archdata.iommu;
5876 if (!WARN_ON(!info))
5877 info->auxd_enabled = 0;
5878 spin_unlock_irqrestore(&device_domain_lock, flags);
5884 * A PCI express designated vendor specific extended capability is defined
5885 * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
5886 * for system software and tools to detect endpoint devices supporting
5887 * Intel Scalable I/O Virtualization without a host driver dependency.
5889 * Returns the address of the matching extended capability structure within
5890 * the device's PCI configuration space or 0 if the device does not support it.
5893 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5898 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5900 pci_read_config_word(pdev, pos + 4, &vendor);
5901 pci_read_config_word(pdev, pos + 8, &id);
5902 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5905 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
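/*
 * Layout assumed by the walk above: extended capability ID 0x23 is the
 * PCIe Designated Vendor-Specific Extended Capability (DVSEC); the 16-bit
 * DVSEC vendor ID sits at offset 0x4 from the capability header and the
 * 16-bit DVSEC ID at offset 0x8.  A match of PCI_VENDOR_ID_INTEL with
 * DVSEC ID 5 identifies the Scalable IOV capability, and the capability
 * offset is returned so callers only need to test for non-zero.
 */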
5912 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5914 if (feat == IOMMU_DEV_FEAT_AUX) {
5917 if (!dev_is_pci(dev) || dmar_disabled ||
5918 !scalable_mode_support() || !iommu_pasid_support())
5921 ret = pci_pasid_features(to_pci_dev(dev));
5925 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5932 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5934 if (feat == IOMMU_DEV_FEAT_AUX)
5935 return intel_iommu_enable_auxd(dev);
5941 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5943 if (feat == IOMMU_DEV_FEAT_AUX)
5944 return intel_iommu_disable_auxd(dev);
5950 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5952 struct device_domain_info *info = dev->archdata.iommu;
5954 if (feat == IOMMU_DEV_FEAT_AUX)
5955 return scalable_mode_support() && info && info->auxd_enabled;
5961 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5963 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5965 return dmar_domain->default_pasid > 0 ?
5966 dmar_domain->default_pasid : -EINVAL;
5969 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5972 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5975 const struct iommu_ops intel_iommu_ops = {
5976 .capable = intel_iommu_capable,
5977 .domain_alloc = intel_iommu_domain_alloc,
5978 .domain_free = intel_iommu_domain_free,
5979 .attach_dev = intel_iommu_attach_device,
5980 .detach_dev = intel_iommu_detach_device,
5981 .aux_attach_dev = intel_iommu_aux_attach_device,
5982 .aux_detach_dev = intel_iommu_aux_detach_device,
5983 .aux_get_pasid = intel_iommu_aux_get_pasid,
5984 .map = intel_iommu_map,
5985 .unmap = intel_iommu_unmap,
5986 .iova_to_phys = intel_iommu_iova_to_phys,
5987 .add_device = intel_iommu_add_device,
5988 .remove_device = intel_iommu_remove_device,
5989 .get_resv_regions = intel_iommu_get_resv_regions,
5990 .put_resv_regions = intel_iommu_put_resv_regions,
5991 .apply_resv_region = intel_iommu_apply_resv_region,
5992 .device_group = pci_device_group,
5993 .dev_has_feat = intel_iommu_dev_has_feat,
5994 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
5995 .dev_enable_feat = intel_iommu_dev_enable_feat,
5996 .dev_disable_feat = intel_iommu_dev_disable_feat,
5997 .is_attach_deferred = intel_iommu_is_attach_deferred,
5998 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6001 static void quirk_iommu_igfx(struct pci_dev *dev)
6003 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6007 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6008 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6009 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6010 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6011 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6012 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6013 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6014 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6016 /* Broadwell igfx malfunctions with dmar */
6017 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6018 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6019 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6020 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6021 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6022 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6023 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6024 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6025 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6026 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6027 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6028 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6029 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6030 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6031 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6032 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6033 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6034 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6036 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6037 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6038 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6039 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6040 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6042 static void quirk_iommu_rwbf(struct pci_dev *dev)
6045 * Mobile 4 Series Chipset neglects to set RWBF capability,
6046 * but needs it. Same seems to hold for the desktop versions.
6048 pci_info(dev, "Forcing write-buffer flush capability\n");
6052 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6053 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6054 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6055 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6056 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6057 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6058 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6061 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6062 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6063 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6064 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6065 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6066 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6067 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6068 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
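/*
 * The GGC_* masks above decode bits 11:8 of the graphics memory control
 * (GGC) word read by quirk_calpella_no_shadow_gtt() below: e.g.
 * (ggc & GGC_MEMORY_SIZE_MASK) == GGC_MEMORY_SIZE_2M_VT means the BIOS set
 * aside 2MiB of GTT stolen memory with the VT (shadow GTT) variant enabled,
 * while a value with GGC_MEMORY_VT_ENABLED clear means no shadow GTT exists
 * and graphics translation has to be disabled.
 */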
6070 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6074 if (pci_read_config_word(dev, GGC, &ggc))
6077 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6078 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6080 } else if (dmar_map_gfx) {
6081 /* we have to ensure the gfx device is idle before we flush */
6082 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6083 intel_iommu_strict = 1;
6086 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6087 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6088 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6089 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6091 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6092 ISOCH DMAR unit for the Azalia sound device, but not give it any
6093 TLB entries, which causes it to deadlock. Check for that. We do
6094 this in a function called from init_dmars(), instead of in a PCI
6095 quirk, because we don't want to print the obnoxious "BIOS broken"
6096 message if VT-d is actually disabled.
6098 static void __init check_tylersburg_isoch(void)
6100 struct pci_dev *pdev;
6101 uint32_t vtisochctrl;
6103 /* If there's no Azalia in the system anyway, forget it. */
6104 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6109 /* System Management Registers. Might be hidden, in which case
6110 we can't do the sanity check. But that's OK, because the
6111 known-broken BIOSes _don't_ actually hide it, so far. */
6112 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6116 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6123 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6124 if (vtisochctrl & 1)
6127 /* Drop all bits other than the number of TLB entries */
6128 vtisochctrl &= 0x1c;
6130 /* If we have the recommended number of TLB entries (16), fine. */
6131 if (vtisochctrl == 0x10)
6134 /* Zero TLB entries? You get to ride the short bus to school. */
6136 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6137 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6138 dmi_get_system_info(DMI_BIOS_VENDOR),
6139 dmi_get_system_info(DMI_BIOS_VERSION),
6140 dmi_get_system_info(DMI_PRODUCT_VERSION));
6141 iommu_identity_mapping |= IDENTMAP_AZALIA;
6145 pr_warn("Recommended TLB entries for ISOCH unit are 16; your BIOS set %d\n", vtisochctrl);