1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
89 * This bitmap is used to advertise the page sizes our hardware support
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
/* Convert an AGAW value to the corresponding page-table depth (AGAW 1 == 3 levels). */
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(unsigned long pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
/* Mask that clears the PFN bits translated below @level. */
static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}
/* Number of leaf pages covered by one entry at @level. */
static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}
/* Round @pfn up to the next boundary of an entry at @level. */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
/* DMA PFN of the first VT-d page backing struct page @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}
/* DMA PFN of the page containing kernel virtual address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic kernel if can't successfully enable VT-d
179 * (used when kernel is launched w/ TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
291 * This domain is a statically identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305 * been chosen to use a private domain. We should avoid to use the
306 * map/unmap/iova_to_phys APIs on it.
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
311 * When VT-d works in the scalable mode, it allows DMA translation to
312 * happen through either first level or second level page table. This
313 * bit marks that the DMA translation for the domain goes through the
314 * first level page table, otherwise, it goes through the second level.
316 #define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2)
319 * Domain represents a virtual machine which demands iommu nested
320 * translation mode support.
322 #define DOMAIN_FLAG_NESTING_MODE BIT(3)
324 #define for_each_domain_iommu(idx, domain) \
325 for (idx = 0; idx < g_num_of_iommus; idx++) \
326 if (domain->iommu_refcnt[idx])
328 struct dmar_rmrr_unit {
329 struct list_head list; /* list of rmrr units */
330 struct acpi_dmar_header *hdr; /* ACPI header */
331 u64 base_address; /* reserved base address*/
332 u64 end_address; /* reserved end address */
333 struct dmar_dev_scope *devices; /* target devices */
334 int devices_cnt; /* target device count */
337 struct dmar_atsr_unit {
338 struct list_head list; /* list of ATSR units */
339 struct acpi_dmar_header *hdr; /* ACPI header */
340 struct dmar_dev_scope *devices; /* target devices */
341 int devices_cnt; /* target device count */
342 u8 include_all:1; /* include all ports */
345 static LIST_HEAD(dmar_atsr_units);
346 static LIST_HEAD(dmar_rmrr_units);
348 #define for_each_rmrr_units(rmrr) \
349 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
351 /* bitmap for indexing intel_iommus */
352 static int g_num_of_iommus;
354 static void domain_exit(struct dmar_domain *domain);
355 static void domain_remove_dev_info(struct dmar_domain *domain);
356 static void dmar_remove_one_dev_info(struct device *dev);
357 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
358 static void domain_context_clear(struct intel_iommu *iommu,
360 static int domain_detach_iommu(struct dmar_domain *domain,
361 struct intel_iommu *iommu);
362 static bool device_is_rmrr_locked(struct device *dev);
363 static int intel_iommu_attach_device(struct iommu_domain *domain,
365 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
368 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
369 int dmar_disabled = 0;
371 int dmar_disabled = 1;
372 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
374 #ifdef INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
375 int intel_iommu_sm = 1;
378 #endif /* INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
380 int intel_iommu_enabled = 0;
381 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
383 static int dmar_map_gfx = 1;
384 static int dmar_forcedac;
385 static int intel_iommu_strict;
386 static int intel_iommu_superpage = 1;
387 static int iommu_identity_mapping;
388 static int intel_no_bounce;
390 #define IDENTMAP_GFX 2
391 #define IDENTMAP_AZALIA 4
393 int intel_iommu_gfx_mapped;
394 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
396 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
397 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
398 DEFINE_SPINLOCK(device_domain_lock);
399 static LIST_HEAD(device_domain_list);
401 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
402 to_pci_dev(d)->untrusted)
405 * Iterate over elements in device_domain_list and call the specified
406 * callback @fn against each element.
408 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
409 void *data), void *data)
413 struct device_domain_info *info;
415 spin_lock_irqsave(&device_domain_lock, flags);
416 list_for_each_entry(info, &device_domain_list, global) {
417 ret = fn(info, data);
419 spin_unlock_irqrestore(&device_domain_lock, flags);
423 spin_unlock_irqrestore(&device_domain_lock, flags);
428 const struct iommu_ops intel_iommu_ops;
430 static bool translation_pre_enabled(struct intel_iommu *iommu)
432 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
435 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
437 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
440 static void init_translation_status(struct intel_iommu *iommu)
444 gsts = readl(iommu->reg + DMAR_GSTS_REG);
445 if (gsts & DMA_GSTS_TES)
446 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
449 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
450 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
452 return container_of(dom, struct dmar_domain, domain);
455 static int __init intel_iommu_setup(char *str)
460 if (!strncmp(str, "on", 2)) {
462 pr_info("IOMMU enabled\n");
463 } else if (!strncmp(str, "off", 3)) {
465 no_platform_optin = 1;
466 pr_info("IOMMU disabled\n");
467 } else if (!strncmp(str, "igfx_off", 8)) {
469 pr_info("Disable GFX device mapping\n");
470 } else if (!strncmp(str, "forcedac", 8)) {
471 pr_info("Forcing DAC for PCI devices\n");
473 } else if (!strncmp(str, "strict", 6)) {
474 pr_info("Disable batched IOTLB flush\n");
475 intel_iommu_strict = 1;
476 } else if (!strncmp(str, "sp_off", 6)) {
477 pr_info("Disable supported super page\n");
478 intel_iommu_superpage = 0;
479 } else if (!strncmp(str, "sm_on", 5)) {
480 pr_info("Intel-IOMMU: scalable mode supported\n");
482 } else if (!strncmp(str, "tboot_noforce", 13)) {
484 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
485 intel_iommu_tboot_noforce = 1;
486 } else if (!strncmp(str, "nobounce", 8)) {
487 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
491 str += strcspn(str, ",");
497 __setup("intel_iommu=", intel_iommu_setup);
499 static struct kmem_cache *iommu_domain_cache;
500 static struct kmem_cache *iommu_devinfo_cache;
502 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
504 struct dmar_domain **domains;
507 domains = iommu->domains[idx];
511 return domains[did & 0xff];
514 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
515 struct dmar_domain *domain)
517 struct dmar_domain **domains;
520 if (!iommu->domains[idx]) {
521 size_t size = 256 * sizeof(struct dmar_domain *);
522 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
525 domains = iommu->domains[idx];
526 if (WARN_ON(!domains))
529 domains[did & 0xff] = domain;
532 void *alloc_pgtable_page(int node)
537 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
539 vaddr = page_address(page);
/* Free a page previously obtained from alloc_pgtable_page(). */
void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
548 static inline void *alloc_domain_mem(void)
550 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
553 static void free_domain_mem(void *vaddr)
555 kmem_cache_free(iommu_domain_cache, vaddr);
558 static inline void * alloc_devinfo_mem(void)
560 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
563 static inline void free_devinfo_mem(void *vaddr)
565 kmem_cache_free(iommu_devinfo_cache, vaddr);
568 static inline int domain_type_is_si(struct dmar_domain *domain)
570 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
573 static inline bool domain_use_first_level(struct dmar_domain *domain)
575 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
578 static inline int domain_pfn_supported(struct dmar_domain *domain,
581 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
583 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
586 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
591 sagaw = cap_sagaw(iommu->cap);
592 for (agaw = width_to_agaw(max_gaw);
594 if (test_bit(agaw, &sagaw))
602 * Calculate max SAGAW for each iommu.
604 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
606 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
610 * calculate agaw for each iommu.
611 * "SAGAW" may be different across iommus, use a default agaw, and
612 * get a supported less agaw for iommus that don't support the default agaw.
614 int iommu_calculate_agaw(struct intel_iommu *iommu)
616 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
619 /* This functionin only returns single iommu in a domain */
620 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
624 /* si_domain and vm domain should not get here. */
625 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
628 for_each_domain_iommu(iommu_id, domain)
631 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
634 return g_iommus[iommu_id];
637 static void domain_update_iommu_coherency(struct dmar_domain *domain)
639 struct dmar_drhd_unit *drhd;
640 struct intel_iommu *iommu;
644 domain->iommu_coherency = 1;
646 for_each_domain_iommu(i, domain) {
648 if (!ecap_coherent(g_iommus[i]->ecap)) {
649 domain->iommu_coherency = 0;
656 /* No hardware attached; use lowest common denominator */
658 for_each_active_iommu(iommu, drhd) {
659 if (!ecap_coherent(iommu->ecap)) {
660 domain->iommu_coherency = 0;
667 static int domain_update_iommu_snooping(struct intel_iommu *skip)
669 struct dmar_drhd_unit *drhd;
670 struct intel_iommu *iommu;
674 for_each_active_iommu(iommu, drhd) {
676 if (!ecap_sc_support(iommu->ecap)) {
687 static int domain_update_iommu_superpage(struct dmar_domain *domain,
688 struct intel_iommu *skip)
690 struct dmar_drhd_unit *drhd;
691 struct intel_iommu *iommu;
694 if (!intel_iommu_superpage) {
698 /* set iommu_superpage to the smallest common denominator */
700 for_each_active_iommu(iommu, drhd) {
702 if (domain && domain_use_first_level(domain)) {
703 if (!cap_fl1gp_support(iommu->cap))
706 mask &= cap_super_page_val(iommu->cap);
718 /* Some capabilities may be different across iommus */
719 static void domain_update_iommu_cap(struct dmar_domain *domain)
721 domain_update_iommu_coherency(domain);
722 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
723 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
726 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
729 struct root_entry *root = &iommu->root_entry[bus];
730 struct context_entry *context;
734 if (sm_supported(iommu)) {
742 context = phys_to_virt(*entry & VTD_PAGE_MASK);
744 unsigned long phy_addr;
748 context = alloc_pgtable_page(iommu->node);
752 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
753 phy_addr = virt_to_phys((void *)context);
754 *entry = phy_addr | 1;
755 __iommu_flush_cache(iommu, entry, sizeof(*entry));
757 return &context[devfn];
760 static int iommu_dummy(struct device *dev)
762 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
765 static bool attach_deferred(struct device *dev)
767 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
771 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
772 * sub-hierarchy of a candidate PCI-PCI bridge
773 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
774 * @bridge: the candidate PCI-PCI bridge
776 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
779 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
781 struct pci_dev *pdev, *pbridge;
783 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
786 pdev = to_pci_dev(dev);
787 pbridge = to_pci_dev(bridge);
789 if (pbridge->subordinate &&
790 pbridge->subordinate->number <= pdev->bus->number &&
791 pbridge->subordinate->busn_res.end >= pdev->bus->number)
797 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
799 struct dmar_drhd_unit *drhd = NULL;
800 struct intel_iommu *iommu;
802 struct pci_dev *pdev = NULL;
806 if (iommu_dummy(dev))
809 if (dev_is_pci(dev)) {
810 struct pci_dev *pf_pdev;
812 pdev = pci_real_dma_dev(to_pci_dev(dev));
814 /* VFs aren't listed in scope tables; we need to look up
815 * the PF instead to find the IOMMU. */
816 pf_pdev = pci_physfn(pdev);
818 segment = pci_domain_nr(pdev->bus);
819 } else if (has_acpi_companion(dev))
820 dev = &ACPI_COMPANION(dev)->dev;
823 for_each_active_iommu(iommu, drhd) {
824 if (pdev && segment != drhd->segment)
827 for_each_active_dev_scope(drhd->devices,
828 drhd->devices_cnt, i, tmp) {
830 /* For a VF use its original BDF# not that of the PF
831 * which we used for the IOMMU lookup. Strictly speaking
832 * we could do this for all PCI devices; we only need to
833 * get the BDF# from the scope table for ACPI matches. */
834 if (pdev && pdev->is_virtfn)
837 *bus = drhd->devices[i].bus;
838 *devfn = drhd->devices[i].devfn;
842 if (is_downstream_to_pci_bridge(dev, tmp))
846 if (pdev && drhd->include_all) {
848 *bus = pdev->bus->number;
849 *devfn = pdev->devfn;
860 static void domain_flush_cache(struct dmar_domain *domain,
861 void *addr, int size)
863 if (!domain->iommu_coherency)
864 clflush_cache_range(addr, size);
867 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
869 struct context_entry *context;
873 spin_lock_irqsave(&iommu->lock, flags);
874 context = iommu_context_addr(iommu, bus, devfn, 0);
876 ret = context_present(context);
877 spin_unlock_irqrestore(&iommu->lock, flags);
881 static void free_context_table(struct intel_iommu *iommu)
885 struct context_entry *context;
887 spin_lock_irqsave(&iommu->lock, flags);
888 if (!iommu->root_entry) {
891 for (i = 0; i < ROOT_ENTRY_NR; i++) {
892 context = iommu_context_addr(iommu, i, 0, 0);
894 free_pgtable_page(context);
896 if (!sm_supported(iommu))
899 context = iommu_context_addr(iommu, i, 0x80, 0);
901 free_pgtable_page(context);
904 free_pgtable_page(iommu->root_entry);
905 iommu->root_entry = NULL;
907 spin_unlock_irqrestore(&iommu->lock, flags);
910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911 unsigned long pfn, int *target_level)
913 struct dma_pte *parent, *pte;
914 int level = agaw_to_level(domain->agaw);
917 BUG_ON(!domain->pgd);
919 if (!domain_pfn_supported(domain, pfn))
920 /* Address beyond IOMMU's addressing capabilities. */
923 parent = domain->pgd;
928 offset = pfn_level_offset(pfn, level);
929 pte = &parent[offset];
930 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
932 if (level == *target_level)
935 if (!dma_pte_present(pte)) {
938 tmp_page = alloc_pgtable_page(domain->nid);
943 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
944 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
945 if (domain_use_first_level(domain))
946 pteval |= DMA_FL_PTE_XD;
947 if (cmpxchg64(&pte->val, 0ULL, pteval))
948 /* Someone else set it while we were thinking; use theirs. */
949 free_pgtable_page(tmp_page);
951 domain_flush_cache(domain, pte, sizeof(*pte));
956 parent = phys_to_virt(dma_pte_addr(pte));
961 *target_level = level;
966 /* return address's pte at specific level */
967 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
969 int level, int *large_page)
971 struct dma_pte *parent, *pte;
972 int total = agaw_to_level(domain->agaw);
975 parent = domain->pgd;
976 while (level <= total) {
977 offset = pfn_level_offset(pfn, total);
978 pte = &parent[offset];
982 if (!dma_pte_present(pte)) {
987 if (dma_pte_superpage(pte)) {
992 parent = phys_to_virt(dma_pte_addr(pte));
998 /* clear last level pte, a tlb flush should be followed */
999 static void dma_pte_clear_range(struct dmar_domain *domain,
1000 unsigned long start_pfn,
1001 unsigned long last_pfn)
1003 unsigned int large_page;
1004 struct dma_pte *first_pte, *pte;
1006 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1007 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1008 BUG_ON(start_pfn > last_pfn);
1010 /* we don't need lock here; nobody else touches the iova range */
1013 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1015 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1020 start_pfn += lvl_to_nr_pages(large_page);
1022 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1024 domain_flush_cache(domain, first_pte,
1025 (void *)pte - (void *)first_pte);
1027 } while (start_pfn && start_pfn <= last_pfn);
1030 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1031 int retain_level, struct dma_pte *pte,
1032 unsigned long pfn, unsigned long start_pfn,
1033 unsigned long last_pfn)
1035 pfn = max(start_pfn, pfn);
1036 pte = &pte[pfn_level_offset(pfn, level)];
1039 unsigned long level_pfn;
1040 struct dma_pte *level_pte;
1042 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1045 level_pfn = pfn & level_mask(level);
1046 level_pte = phys_to_virt(dma_pte_addr(pte));
1049 dma_pte_free_level(domain, level - 1, retain_level,
1050 level_pte, level_pfn, start_pfn,
1055 * Free the page table if we're below the level we want to
1056 * retain and the range covers the entire table.
1058 if (level < retain_level && !(start_pfn > level_pfn ||
1059 last_pfn < level_pfn + level_size(level) - 1)) {
1061 domain_flush_cache(domain, pte, sizeof(*pte));
1062 free_pgtable_page(level_pte);
1065 pfn += level_size(level);
1066 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1070 * clear last level (leaf) ptes and free page table pages below the
1071 * level we wish to keep intact.
1073 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1074 unsigned long start_pfn,
1075 unsigned long last_pfn,
1078 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1079 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1080 BUG_ON(start_pfn > last_pfn);
1082 dma_pte_clear_range(domain, start_pfn, last_pfn);
1084 /* We don't need lock here; nobody else touches the iova range */
1085 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1086 domain->pgd, 0, start_pfn, last_pfn);
1089 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1090 free_pgtable_page(domain->pgd);
1095 /* When a page at a given level is being unlinked from its parent, we don't
1096 need to *modify* it at all. All we need to do is make a list of all the
1097 pages which can be freed just as soon as we've flushed the IOTLB and we
1098 know the hardware page-walk will no longer touch them.
1099 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1101 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1102 int level, struct dma_pte *pte,
1103 struct page *freelist)
1107 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1108 pg->freelist = freelist;
1114 pte = page_address(pg);
1116 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117 freelist = dma_pte_list_pagetables(domain, level - 1,
1120 } while (!first_pte_in_page(pte));
1125 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1126 struct dma_pte *pte, unsigned long pfn,
1127 unsigned long start_pfn,
1128 unsigned long last_pfn,
1129 struct page *freelist)
1131 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1133 pfn = max(start_pfn, pfn);
1134 pte = &pte[pfn_level_offset(pfn, level)];
1137 unsigned long level_pfn;
1139 if (!dma_pte_present(pte))
1142 level_pfn = pfn & level_mask(level);
1144 /* If range covers entire pagetable, free it */
1145 if (start_pfn <= level_pfn &&
1146 last_pfn >= level_pfn + level_size(level) - 1) {
1147 /* These suborbinate page tables are going away entirely. Don't
1148 bother to clear them; we're just going to *free* them. */
1149 if (level > 1 && !dma_pte_superpage(pte))
1150 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1156 } else if (level > 1) {
1157 /* Recurse down into a level that isn't *entirely* obsolete */
1158 freelist = dma_pte_clear_level(domain, level - 1,
1159 phys_to_virt(dma_pte_addr(pte)),
1160 level_pfn, start_pfn, last_pfn,
1164 pfn += level_size(level);
1165 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1168 domain_flush_cache(domain, first_pte,
1169 (void *)++last_pte - (void *)first_pte);
1174 /* We can't just free the pages because the IOMMU may still be walking
1175 the page tables, and may have cached the intermediate levels. The
1176 pages can only be freed after the IOTLB flush has been done. */
1177 static struct page *domain_unmap(struct dmar_domain *domain,
1178 unsigned long start_pfn,
1179 unsigned long last_pfn)
1181 struct page *freelist;
1183 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1184 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1185 BUG_ON(start_pfn > last_pfn);
1187 /* we don't need lock here; nobody else touches the iova range */
1188 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1189 domain->pgd, 0, start_pfn, last_pfn, NULL);
1192 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1193 struct page *pgd_page = virt_to_page(domain->pgd);
1194 pgd_page->freelist = freelist;
1195 freelist = pgd_page;
1203 static void dma_free_pagelist(struct page *freelist)
1207 while ((pg = freelist)) {
1208 freelist = pg->freelist;
1209 free_pgtable_page(page_address(pg));
/* Deferred-free callback: @data is a freelist head stashed as an unsigned long. */
static void iova_entry_free(unsigned long data)
{
	struct page *freelist = (struct page *)data;

	dma_free_pagelist(freelist);
}
1220 /* iommu handling */
1221 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1223 struct root_entry *root;
1224 unsigned long flags;
1226 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1228 pr_err("Allocating root entry for %s failed\n",
1233 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1235 spin_lock_irqsave(&iommu->lock, flags);
1236 iommu->root_entry = root;
1237 spin_unlock_irqrestore(&iommu->lock, flags);
1242 static void iommu_set_root_entry(struct intel_iommu *iommu)
1248 addr = virt_to_phys(iommu->root_entry);
1249 if (sm_supported(iommu))
1250 addr |= DMA_RTADDR_SMT;
1252 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1253 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1255 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1257 /* Make sure hardware complete it */
1258 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1259 readl, (sts & DMA_GSTS_RTPS), sts);
1261 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1264 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1269 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1272 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1273 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1275 /* Make sure hardware complete it */
1276 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1277 readl, (!(val & DMA_GSTS_WBFS)), val);
1279 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1282 /* return value determine if we need a write buffer flush */
1283 static void __iommu_flush_context(struct intel_iommu *iommu,
1284 u16 did, u16 source_id, u8 function_mask,
1291 case DMA_CCMD_GLOBAL_INVL:
1292 val = DMA_CCMD_GLOBAL_INVL;
1294 case DMA_CCMD_DOMAIN_INVL:
1295 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1297 case DMA_CCMD_DEVICE_INVL:
1298 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1299 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1304 val |= DMA_CCMD_ICC;
1306 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1307 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1309 /* Make sure hardware complete it */
1310 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1311 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1313 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * __iommu_flush_iotlb - invalidate IOTLB entries through the
 * register-based interface (IVA/IOTLB registers located at the
 * ECAP-reported offset).  Supports global, domain-selective (DSI)
 * and page-selective (PSI, @addr/@size_order) granularity; afterwards
 * it checks the actual granularity the hardware performed (IAIG)
 * against the one requested (IIRG).
 * NOTE(review): dump is missing lines vs. upstream (switch header,
 * break;s, BUG() default case, closing braces).
 */
1316 /* return value determine if we need a write buffer flush */
1317 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1318 u64 addr, unsigned int size_order, u64 type)
1320 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1321 u64 val = 0, val_iva = 0;
1325 case DMA_TLB_GLOBAL_FLUSH:
1326 /* global flush doesn't need set IVA_REG */
1327 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1329 case DMA_TLB_DSI_FLUSH:
1330 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1332 case DMA_TLB_PSI_FLUSH:
1333 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1334 /* IH bit is passed in as part of address */
1335 val_iva = size_order | addr;
1340 /* Note: set drain read/write */
1343 * This is probably to be super secure.. Looks like we can
1344 * ignore it without any impact.
1346 if (cap_read_drain(iommu->cap))
1347 val |= DMA_TLB_READ_DRAIN;
1349 if (cap_write_drain(iommu->cap))
1350 val |= DMA_TLB_WRITE_DRAIN;
1352 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1353 /* Note: Only uses first TLB reg currently */
/* PSI writes the address first, then the command at offset + 8 */
1355 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1356 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1358 /* Make sure hardware complete it */
1359 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1360 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1362 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1364 /* check IOTLB invalidation granularity */
1365 if (DMA_TLB_IAIG(val) == 0)
1366 pr_err("Flush IOTLB failed\n");
1367 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1368 pr_debug("TLB flush request %Lx, actual %Lx\n",
1369 (unsigned long long)DMA_TLB_IIRG(type),
1370 (unsigned long long)DMA_TLB_IAIG(val));
/*
 * iommu_support_dev_iotlb - look up the device_domain_info for
 * (@iommu, @bus, @devfn) on @domain's device list and, per the visible
 * test below, only consider it when the device supports ATS.
 * Caller must hold device_domain_lock (asserted).
 * NOTE(review): the tail of this function (return statements) was
 * lost by the extraction; return semantics inferred, confirm upstream.
 */
1373 static struct device_domain_info *
1374 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1377 struct device_domain_info *info;
1379 assert_spin_locked(&device_domain_lock);
1384 list_for_each_entry(info, &domain->devices, link)
1385 if (info->iommu == iommu && info->bus == bus &&
1386 info->devfn == devfn) {
1387 if (info->ats_supported && info->dev)
/*
 * domain_update_iotlb - recompute domain->has_iotlb_device: true iff
 * at least one PCI device attached to @domain currently has ATS
 * enabled.  Used to short-circuit device-IOTLB flushes.
 * Caller must hold device_domain_lock (asserted).
 */
1395 static void domain_update_iotlb(struct dmar_domain *domain)
1397 struct device_domain_info *info;
1398 bool has_iotlb_device = false;
1400 assert_spin_locked(&device_domain_lock);
1402 list_for_each_entry(info, &domain->devices, link) {
1403 struct pci_dev *pdev;
/* non-PCI devices (or none at all) can never have a device IOTLB */
1405 if (!info->dev || !dev_is_pci(info->dev))
1408 pdev = to_pci_dev(info->dev);
1409 if (pdev->ats_enabled) {
1410 has_iotlb_device = true;
1415 domain->has_iotlb_device = has_iotlb_device;
/*
 * iommu_enable_dev_iotlb - enable per-device DMA features for @info's
 * PCI device: PFSID bookkeeping (when the IOMMU has DIT), then PASID,
 * PRI and finally ATS, updating domain->has_iotlb_device on success.
 * Caller must hold device_domain_lock (asserted).
 */
1418 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1420 struct pci_dev *pdev;
1422 assert_spin_locked(&device_domain_lock);
1424 if (!info || !dev_is_pci(info->dev))
1427 pdev = to_pci_dev(info->dev);
1428 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1429 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1430 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1431 * reserved, which should be set to 0.
1433 if (!ecap_dit(info->iommu->ecap))
1436 struct pci_dev *pf_pdev;
1438 /* pdev will be returned if device is not a vf */
1439 pf_pdev = pci_physfn(pdev);
1440 info->pfsid = pci_dev_id(pf_pdev);
1443 #ifdef CONFIG_INTEL_IOMMU_SVM
1444 /* The PCIe spec, in its wisdom, declares that the behaviour of
1445 the device if you enable PASID support after ATS support is
1446 undefined. So always enable PASID support on devices which
1447 have it, even if we can't yet know if we're ever going to
1449 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1450 info->pasid_enabled = 1;
/* PRI needs PRG-response-PASID support when PASID is already on */
1452 if (info->pri_supported &&
1453 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1454 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1455 info->pri_enabled = 1;
/* ATS only for trusted devices with page-aligned request support */
1457 if (!pdev->untrusted && info->ats_supported &&
1458 pci_ats_page_aligned(pdev) &&
1459 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1460 info->ats_enabled = 1;
1461 domain_update_iotlb(info->domain);
1462 info->ats_qdep = pci_ats_queue_depth(pdev);
/*
 * iommu_disable_dev_iotlb - undo iommu_enable_dev_iotlb(): disable
 * ATS (and refresh domain->has_iotlb_device), then PRI and PASID
 * when SVM support is compiled in.
 * Caller must hold device_domain_lock (asserted).
 */
1466 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1468 struct pci_dev *pdev;
1470 assert_spin_locked(&device_domain_lock);
1472 if (!dev_is_pci(info->dev))
1475 pdev = to_pci_dev(info->dev);
1477 if (info->ats_enabled) {
1478 pci_disable_ats(pdev);
1479 info->ats_enabled = 0;
1480 domain_update_iotlb(info->domain);
1482 #ifdef CONFIG_INTEL_IOMMU_SVM
1483 if (info->pri_enabled) {
1484 pci_disable_pri(pdev);
1485 info->pri_enabled = 0;
1487 if (info->pasid_enabled) {
1488 pci_disable_pasid(pdev);
1489 info->pasid_enabled = 0;
/*
 * iommu_flush_dev_iotlb - issue a device-IOTLB (ATS) invalidation for
 * @addr/@mask on every ATS-enabled device attached to @domain.
 * Skips all work when domain->has_iotlb_device is false.
 */
1494 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1495 u64 addr, unsigned mask)
1498 unsigned long flags;
1499 struct device_domain_info *info;
1501 if (!domain->has_iotlb_device)
1504 spin_lock_irqsave(&device_domain_lock, flags);
1505 list_for_each_entry(info, &domain->devices, link) {
1506 if (!info->ats_enabled)
/* source-id = bus:devfn of the requester */
1509 sid = info->bus << 8 | info->devfn;
1510 qdep = info->ats_qdep;
1511 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1514 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * domain_flush_piotlb - PASID-based IOTLB invalidation for first-level
 * translation: flush the domain's default (aux) PASID if set, and the
 * RID2PASID entry when the domain has attached devices.
 */
1517 static void domain_flush_piotlb(struct intel_iommu *iommu,
1518 struct dmar_domain *domain,
1519 u64 addr, unsigned long npages, bool ih)
1521 u16 did = domain->iommu_did[iommu->seq_id];
1523 if (domain->default_pasid)
1524 qi_flush_piotlb(iommu, did, domain->default_pasid,
1527 if (!list_empty(&domain->devices))
1528 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
/*
 * iommu_flush_iotlb_psi - page-selective IOTLB invalidation for
 * @pfn/@pages of @domain on @iommu.  First-level domains use the
 * PASID-based flush; otherwise fall back to a domain-selective flush
 * when PSI is unsupported or the mask exceeds the hardware maximum.
 * The device IOTLB is flushed too except for caching-mode map (@map)
 * operations, where non-present-to-present changes need no dev flush.
 */
1531 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1532 struct dmar_domain *domain,
1533 unsigned long pfn, unsigned int pages,
/* PSI requires a power-of-two page count, aligned base address */
1536 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1537 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1538 u16 did = domain->iommu_did[iommu->seq_id];
1545 if (domain_use_first_level(domain)) {
1546 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1549 * Fallback to domain selective flush if no PSI support or
1550 * the size is too big. PSI requires page size to be 2 ^ x,
1551 * and the base address is naturally aligned to the size.
1553 if (!cap_pgsel_inv(iommu->cap) ||
1554 mask > cap_max_amask_val(iommu->cap))
1555 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1558 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1563 * In caching mode, changes of pages from non-present to present require
1564 * flush. However, device IOTLB doesn't need to be flushed in this case.
1566 if (!cap_caching_mode(iommu->cap) || !map)
1567 iommu_flush_dev_iotlb(domain, addr, mask);
/*
 * __mapping_notify_one - flush after creating a new (non-present ->
 * present) mapping: a PSI flush is only needed in caching mode for
 * second-level domains; otherwise a write-buffer flush suffices.
 */
1570 /* Notification for newly created mappings */
1571 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1572 struct dmar_domain *domain,
1573 unsigned long pfn, unsigned int pages)
1576 * It's a non-present to present mapping. Only flush if caching mode
1579 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1580 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1582 iommu_flush_write_buffer(iommu);
/*
 * iommu_flush_iova - deferred-IOVA flush callback (registered with the
 * iova flush queue): domain-selective flush of every IOMMU serving the
 * domain, plus a full-range device-IOTLB flush when not in caching
 * mode.
 */
1585 static void iommu_flush_iova(struct iova_domain *iovad)
1587 struct dmar_domain *domain;
/* the iova_domain is embedded in dmar_domain; recover the container */
1590 domain = container_of(iovad, struct dmar_domain, iovad);
1592 for_each_domain_iommu(idx, domain) {
1593 struct intel_iommu *iommu = g_iommus[idx];
1594 u16 did = domain->iommu_did[iommu->seq_id];
1596 if (domain_use_first_level(domain))
1597 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1599 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1602 if (!cap_caching_mode(iommu->cap))
1603 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1604 0, MAX_AGAW_PFN_WIDTH);
/*
 * iommu_disable_protect_mem_regions - clear the Enable Protected
 * Memory bit in PMEN and wait for the Protected Region Status bit to
 * clear.  No-op if the IOMMU reports neither low nor high protected
 * memory regions.
 */
1608 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1611 unsigned long flags;
1613 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1616 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1617 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1618 pmen &= ~DMA_PMEN_EPM;
1619 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1621 /* wait for the protected region status bit to clear */
1622 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1623 readl, !(pmen & DMA_PMEN_PRS), pmen);
1625 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
/*
 * iommu_enable_translation - set the Translation Enable bit in GCMD
 * and poll GSTS until hardware reports translation enabled (TES).
 */
1628 static void iommu_enable_translation(struct intel_iommu *iommu)
1631 unsigned long flags;
1633 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1634 iommu->gcmd |= DMA_GCMD_TE;
1635 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1637 /* Make sure hardware complete it */
1638 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1639 readl, (sts & DMA_GSTS_TES), sts);
1641 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
/*
 * iommu_disable_translation - clear the Translation Enable bit in
 * GCMD and poll GSTS until the TES bit reads back clear.
 */
1644 static void iommu_disable_translation(struct intel_iommu *iommu)
1649 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1650 iommu->gcmd &= ~DMA_GCMD_TE;
1651 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1653 /* Make sure hardware complete it */
1654 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1655 readl, (!(sts & DMA_GSTS_TES)), sts);
1657 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * iommu_init_domains - allocate per-IOMMU domain bookkeeping:
 * the domain-id bitmap and a two-level (256-entry pages) array of
 * dmar_domain pointers.  Reserves domain-id 0 (caching-mode marker /
 * "not allocated") and, in scalable mode, FLPT_DEFAULT_DID.
 * NOTE(review): the error-return lines were lost by the extraction;
 * the visible error paths free what was allocated and NULL the fields.
 */
1660 static int iommu_init_domains(struct intel_iommu *iommu)
1662 u32 ndomains, nlongs;
1665 ndomains = cap_ndoms(iommu->cap);
1666 pr_debug("%s: Number of Domains supported <%d>\n",
1667 iommu->name, ndomains);
1668 nlongs = BITS_TO_LONGS(ndomains);
1670 spin_lock_init(&iommu->lock);
1672 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1673 if (!iommu->domain_ids) {
1674 pr_err("%s: Allocating domain id array failed\n",
/* first level: one pointer per 256-domain group */
1679 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1680 iommu->domains = kzalloc(size, GFP_KERNEL);
1682 if (iommu->domains) {
1683 size = 256 * sizeof(struct dmar_domain *);
1684 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1687 if (!iommu->domains || !iommu->domains[0]) {
1688 pr_err("%s: Allocating domain array failed\n",
1690 kfree(iommu->domain_ids);
1691 kfree(iommu->domains);
1692 iommu->domain_ids = NULL;
1693 iommu->domains = NULL;
1698 * If Caching mode is set, then invalid translations are tagged
1699 * with domain-id 0, hence we need to pre-allocate it. We also
1700 * use domain-id 0 as a marker for non-allocated domain-id, so
1701 * make sure it is not used for a real domain.
1703 set_bit(0, iommu->domain_ids);
1706 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1707 * entry for first-level or pass-through translation modes should
1708 * be programmed with a domain id different from those used for
1709 * second-level or nested translation. We reserve a domain id for
1712 if (sm_supported(iommu))
1713 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
/*
 * disable_dmar_iommu - detach every device served by @iommu from its
 * domain, then disable DMA translation if it is currently enabled.
 * Bails out early when domain bookkeeping was never allocated.
 */
1718 static void disable_dmar_iommu(struct intel_iommu *iommu)
1720 struct device_domain_info *info, *tmp;
1721 unsigned long flags;
1723 if (!iommu->domains || !iommu->domain_ids)
1726 spin_lock_irqsave(&device_domain_lock, flags);
1727 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1728 if (info->iommu != iommu)
1731 if (!info->dev || !info->domain)
1734 __dmar_remove_one_dev_info(info);
1736 spin_unlock_irqrestore(&device_domain_lock, flags);
1738 if (iommu->gcmd & DMA_GCMD_TE)
1739 iommu_disable_translation(iommu);
/*
 * free_dmar_iommu - release everything iommu_init_domains() allocated
 * (domain pointer pages, the top-level array and the id bitmap), drop
 * the g_iommus slot, free the context table and, with SVM, tear down
 * the page-request queue.
 */
1742 static void free_dmar_iommu(struct intel_iommu *iommu)
1744 if ((iommu->domains) && (iommu->domain_ids)) {
/* elems = number of allocated 256-entry pointer pages */
1745 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1748 for (i = 0; i < elems; i++)
1749 kfree(iommu->domains[i]);
1750 kfree(iommu->domains);
1751 kfree(iommu->domain_ids);
1752 iommu->domains = NULL;
1753 iommu->domain_ids = NULL;
1756 g_iommus[iommu->seq_id] = NULL;
1758 /* free context mapping */
1759 free_context_table(iommu);
1761 #ifdef CONFIG_INTEL_IOMMU_SVM
1762 if (pasid_supported(iommu)) {
1763 if (ecap_prs(iommu->ecap))
1764 intel_svm_finish_prq(iommu);
/*
 * first_level_by_default - decide once (cached in a static) whether
 * first-level translation should be the default: true only if every
 * active IOMMU is in scalable mode and supports first-level
 * translation (FLTS).
 */
1770 * Check and return whether first level is used by default for
1773 static bool first_level_by_default(void)
1775 struct dmar_drhd_unit *drhd;
1776 struct intel_iommu *iommu;
/* -1 = not yet computed; result is memoized on first call */
1777 static int first_level_support = -1;
1779 if (likely(first_level_support != -1))
1780 return first_level_support;
1782 first_level_support = 1;
1785 for_each_active_iommu(iommu, drhd) {
1786 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1787 first_level_support = 0;
1793 return first_level_support;
/*
 * alloc_domain - allocate and zero-initialize a dmar_domain with the
 * given @flags, tagging it for first-level translation when that is
 * the system default.  Returns the new domain (NULL-check on the
 * allocation is in a line lost by the extraction).
 */
1796 static struct dmar_domain *alloc_domain(int flags)
1798 struct dmar_domain *domain;
1800 domain = alloc_domain_mem();
1804 memset(domain, 0, sizeof(*domain));
1805 domain->nid = NUMA_NO_NODE;
1806 domain->flags = flags;
1807 if (first_level_by_default())
1808 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1809 domain->has_iotlb_device = false;
1810 INIT_LIST_HEAD(&domain->devices);
/*
 * domain_attach_iommu - take a reference of @domain on @iommu; on the
 * first attach, allocate a free domain-id from the IOMMU's bitmap,
 * record the domain in the id->domain table, and refresh the
 * domain-wide capability summary.  The refcounts are rolled back when
 * no free domain-id is available.
 * Caller must hold both device_domain_lock and iommu->lock (asserted).
 */
1815 /* Must be called with iommu->lock */
1816 static int domain_attach_iommu(struct dmar_domain *domain,
1817 struct intel_iommu *iommu)
1819 unsigned long ndomains;
1822 assert_spin_locked(&device_domain_lock);
1823 assert_spin_locked(&iommu->lock);
1825 domain->iommu_refcnt[iommu->seq_id] += 1;
1826 domain->iommu_count += 1;
1827 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1828 ndomains = cap_ndoms(iommu->cap);
1829 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1831 if (num >= ndomains) {
1832 pr_err("%s: No free domain ids\n", iommu->name);
1833 domain->iommu_refcnt[iommu->seq_id] -= 1;
1834 domain->iommu_count -= 1;
1838 set_bit(num, iommu->domain_ids);
1839 set_iommu_domain(iommu, num, domain);
1841 domain->iommu_did[iommu->seq_id] = num;
1842 domain->nid = iommu->node;
1844 domain_update_iommu_cap(domain);
/*
 * domain_detach_iommu - drop a reference of @domain on @iommu; on the
 * last detach, free the domain-id back to the bitmap, clear the
 * id->domain table entry and refresh the capability summary.
 * Returns the remaining total iommu_count.
 * Caller must hold both device_domain_lock and iommu->lock (asserted).
 */
1850 static int domain_detach_iommu(struct dmar_domain *domain,
1851 struct intel_iommu *iommu)
1855 assert_spin_locked(&device_domain_lock);
1856 assert_spin_locked(&iommu->lock);
1858 domain->iommu_refcnt[iommu->seq_id] -= 1;
1859 count = --domain->iommu_count;
1860 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1861 num = domain->iommu_did[iommu->seq_id];
1862 clear_bit(num, iommu->domain_ids);
1863 set_iommu_domain(iommu, num, NULL);
1865 domain_update_iommu_cap(domain);
1866 domain->iommu_did[iommu->seq_id] = 0;
/* Global IOVA range shared by all domains for addresses DMA must
 * never use; the lock_class_key gives its rbtree lock a distinct
 * lockdep class (it nests differently from per-domain iova locks). */
1872 static struct iova_domain reserved_iova_list;
1873 static struct lock_class_key reserved_rbtree_key;
/*
 * dmar_init_reserved_ranges - populate reserved_iova_list with the
 * IOAPIC MMIO window and every PCI memory BAR, so DMA mappings can
 * never alias MMIO (prevents peer-to-peer decode surprises).
 */
1875 static int dmar_init_reserved_ranges(void)
1877 struct pci_dev *pdev = NULL;
1881 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1883 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1884 &reserved_rbtree_key);
1886 /* IOAPIC ranges shouldn't be accessed by DMA */
1887 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1888 IOVA_PFN(IOAPIC_RANGE_END));
1890 pr_err("Reserve IOAPIC range failed\n");
1894 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1895 for_each_pci_dev(pdev) {
1898 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1899 r = &pdev->resource[i];
1900 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1902 iova = reserve_iova(&reserved_iova_list,
1906 pci_err(pdev, "Reserve iova for %pR failed\n", r);
/* Copy the global reserved IOVA ranges into @domain's allocator. */
1914 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1916 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
/*
 * guestwidth_to_adjustwidth - round a guest address width up to a
 * page-table-level boundary (12 bits of page offset + multiples of
 * 9 bits per level).  The remainder computation is visible here; the
 * rounding lines were lost by the extraction.
 */
1919 static inline int guestwidth_to_adjustwidth(int gaw)
1922 int r = (gaw - 12) % 9;
/*
 * domain_init - initialize @domain for use behind @iommu with the
 * requested @guest_width: set up the IOVA allocator (with an optional
 * deferred-flush queue in non-strict mode), reserve special ranges,
 * compute/validate the AGAW against hardware SAGAW support, inherit
 * coherency/snooping/superpage capabilities, and allocate the top
 * level page directory.
 */
1933 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1936 int adjust_width, agaw;
1937 unsigned long sagaw;
1940 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1942 if (!intel_iommu_strict) {
1943 ret = init_iova_flush_queue(&domain->iovad,
1944 iommu_flush_iova, iova_entry_free);
1946 pr_info("iova flush queue initialization failed\n");
1949 domain_reserve_special_ranges(domain);
1951 /* calculate AGAW */
/* clamp to the maximum guest address width the hardware handles */
1952 if (guest_width > cap_mgaw(iommu->cap))
1953 guest_width = cap_mgaw(iommu->cap);
1954 domain->gaw = guest_width;
1955 adjust_width = guestwidth_to_adjustwidth(guest_width);
1956 agaw = width_to_agaw(adjust_width);
1957 sagaw = cap_sagaw(iommu->cap);
1958 if (!test_bit(agaw, &sagaw)) {
1959 /* hardware doesn't support it, choose a bigger one */
1960 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1961 agaw = find_next_bit(&sagaw, 5, agaw);
1965 domain->agaw = agaw;
1967 if (ecap_coherent(iommu->ecap))
1968 domain->iommu_coherency = 1;
1970 domain->iommu_coherency = 0;
1972 if (ecap_sc_support(iommu->ecap))
1973 domain->iommu_snooping = 1;
1975 domain->iommu_snooping = 0;
1977 if (intel_iommu_superpage)
1978 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1980 domain->iommu_superpage = 0;
1982 domain->nid = iommu->node;
1984 /* always allocate the top pgd */
1985 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
/* make the new pgd visible to non-coherent IOMMU page walks */
1988 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
/*
 * domain_exit - tear down @domain: detach its devices, release its
 * IOVA allocator, unmap and free the whole page-table page list, and
 * finally free the domain structure itself.
 */
1992 static void domain_exit(struct dmar_domain *domain)
1995 /* Remove associated devices and clear attached or cached domains */
1996 domain_remove_dev_info(domain);
1999 put_iova_domain(&domain->iovad);
2002 struct page *freelist;
2004 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2005 dma_free_pagelist(freelist);
2008 free_domain_mem(domain);
/*
 * context_get_sm_pds - derive the PDTS encoding from the PASID
 * table's maximum PASID; the field encodes a directory of
 * 2^(X + 7) entries.  (The final return expression was lost by the
 * extraction.)
 */
2012 * Get the PASID directory size for scalable mode context entry.
2013 * Value of X in the PDTS field of a scalable mode context entry
2014 * indicates PASID directory with 2^(X + 7) entries.
2016 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2020 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2021 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
/*
 * context_set_sm_rid2pasid - program the 20-bit RID_PASID field in the
 * high qword of a scalable-mode context entry; bit 20 (set on the
 * second line) presumably enables the field -- confirm against the
 * VT-d spec.
 */
2029 * Set the RID_PASID field of a scalable mode context entry. The
2030 * IOMMU hardware will use the PASID value set in this field for
2031 * DMA translations of DMA requests without PASID.
2034 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2036 context->hi |= pasid & ((1 << 20) - 1);
2037 context->hi |= (1 << 20);
/* Set DTE (Device-TLB Enable, bit 2 of the low qword) in a
 * scalable-mode context entry. */
2041 * Set the DTE(Device-TLB Enable) field of a scalable mode context
2044 static inline void context_set_sm_dte(struct context_entry *context)
2046 context->lo |= (1 << 2);
/* Set PRE (Page Request Enable, bit 4 of the low qword) in a
 * scalable-mode context entry. */
2050 * Set the PRE(Page Request Enable) field of a scalable mode context
2053 static inline void context_set_sm_pre(struct context_entry *context)
2055 context->lo |= (1 << 4);
/* Shift the 3-bit PASID-directory-size value into the PDTS field
 * position (bits 11:9) of a context entry. */
2058 /* Convert value to context PASID directory size field coding. */
2059 #define context_pdts(pds) (((pds) & 0x7) << 9)
/*
 * domain_context_mapping_one - program the context entry for
 * (@bus, @devfn) on @iommu so the device translates through @domain.
 * Scalable mode installs the PASID directory pointer plus RID_PASID /
 * DTE / PRE bits; legacy mode installs the second-level pgd (with
 * top levels skipped when the IOMMU's agaw is smaller than the
 * domain's) or pass-through.  Pre-existing kdump-copied entries are
 * explicitly flushed; caching-mode hardware gets a context + DSI
 * flush, others just a write-buffer flush.
 * Locking: takes device_domain_lock then iommu->lock.
 * NOTE(review): several lines (returns, braces, else arms) were lost
 * by the extraction relative to upstream.
 */
2061 static int domain_context_mapping_one(struct dmar_domain *domain,
2062 struct intel_iommu *iommu,
2063 struct pasid_table *table,
2066 u16 did = domain->iommu_did[iommu->seq_id];
2067 int translation = CONTEXT_TT_MULTI_LEVEL;
2068 struct device_domain_info *info = NULL;
2069 struct context_entry *context;
2070 unsigned long flags;
2075 if (hw_pass_through && domain_type_is_si(domain))
2076 translation = CONTEXT_TT_PASS_THROUGH;
2078 pr_debug("Set context mapping for %02x:%02x.%d\n",
2079 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2081 BUG_ON(!domain->pgd);
2083 spin_lock_irqsave(&device_domain_lock, flags);
2084 spin_lock(&iommu->lock);
/* alloc=1: create the context table page if it doesn't exist yet */
2087 context = iommu_context_addr(iommu, bus, devfn, 1);
2092 if (context_present(context))
2096 * For kdump cases, old valid entries may be cached due to the
2097 * in-flight DMA and copied pgtable, but there is no unmapping
2098 * behaviour for them, thus we need an explicit cache flush for
2099 * the newly-mapped device. For kdump, at this point, the device
2100 * is supposed to finish reset at its driver probe stage, so no
2101 * in-flight DMA will exist, and we don't need to worry anymore
2104 if (context_copied(context)) {
2105 u16 did_old = context_domain_id(context);
2107 if (did_old < cap_ndoms(iommu->cap)) {
2108 iommu->flush.flush_context(iommu, did_old,
2109 (((u16)bus) << 8) | devfn,
2110 DMA_CCMD_MASK_NOBIT,
2111 DMA_CCMD_DEVICE_INVL);
2112 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2117 context_clear_entry(context);
2119 if (sm_supported(iommu)) {
2124 /* Setup the PASID DIR pointer: */
2125 pds = context_get_sm_pds(table);
2126 context->lo = (u64)virt_to_phys(table->table) |
2129 /* Setup the RID_PASID field: */
2130 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2133 * Setup the Device-TLB enable bit and Page request
2136 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2137 if (info && info->ats_supported)
2138 context_set_sm_dte(context);
2139 if (info && info->pri_supported)
2140 context_set_sm_pre(context);
2142 struct dma_pte *pgd = domain->pgd;
2145 context_set_domain_id(context, did);
2147 if (translation != CONTEXT_TT_PASS_THROUGH) {
2149 * Skip top levels of page tables for iommu which has
2150 * less agaw than default. Unnecessary for PT mode.
2152 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2154 pgd = phys_to_virt(dma_pte_addr(pgd));
2155 if (!dma_pte_present(pgd))
2159 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2160 if (info && info->ats_supported)
2161 translation = CONTEXT_TT_DEV_IOTLB;
2163 translation = CONTEXT_TT_MULTI_LEVEL;
2165 context_set_address_root(context, virt_to_phys(pgd));
2166 context_set_address_width(context, agaw);
2169 * In pass through mode, AW must be programmed to
2170 * indicate the largest AGAW value supported by
2171 * hardware. And ASR is ignored by hardware.
2173 context_set_address_width(context, iommu->msagaw);
2176 context_set_translation_type(context, translation);
2179 context_set_fault_enable(context);
2180 context_set_present(context);
2181 domain_flush_cache(domain, context, sizeof(*context));
2184 * It's a non-present to present mapping. If hardware doesn't cache
2185 * non-present entry we only need to flush the write-buffer. If the
2186 * _does_ cache non-present entries, then it does so in the special
2187 * domain #0, which we have to flush:
2189 if (cap_caching_mode(iommu->cap)) {
2190 iommu->flush.flush_context(iommu, 0,
2191 (((u16)bus) << 8) | devfn,
2192 DMA_CCMD_MASK_NOBIT,
2193 DMA_CCMD_DEVICE_INVL);
2194 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2196 iommu_flush_write_buffer(iommu);
2198 iommu_enable_dev_iotlb(info);
2203 spin_unlock(&iommu->lock);
2204 spin_unlock_irqrestore(&device_domain_lock, flags);
/* Bundle passed through pci_for_each_dma_alias() so the per-alias
 * callback can reach the domain, IOMMU and PASID table. */
2209 struct domain_context_mapping_data {
2210 struct dmar_domain *domain;
2211 struct intel_iommu *iommu;
2212 struct pasid_table *table;
/* pci_for_each_dma_alias() callback: map the context entry for one
 * DMA alias (bus from the alias' high byte, devfn from the low). */
2215 static int domain_context_mapping_cb(struct pci_dev *pdev,
2216 u16 alias, void *opaque)
2218 struct domain_context_mapping_data *data = opaque;
2220 return domain_context_mapping_one(data->domain, data->iommu,
2221 data->table, PCI_BUS_NUM(alias),
/*
 * domain_context_mapping - set up context entries for @dev in @domain:
 * a single entry for non-PCI devices, or one per DMA alias for PCI
 * devices (bridges, quirky multifunction parts, etc.).
 */
2226 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2228 struct domain_context_mapping_data data;
2229 struct pasid_table *table;
2230 struct intel_iommu *iommu;
2233 iommu = device_to_iommu(dev, &bus, &devfn);
2237 table = intel_pasid_get_table(dev);
2239 if (!dev_is_pci(dev))
2240 return domain_context_mapping_one(domain, iommu, table,
2243 data.domain = domain;
2247 return pci_for_each_dma_alias(to_pci_dev(dev),
2248 &domain_context_mapping_cb, &data);
/* pci_for_each_dma_alias() callback: return non-zero (stop) when an
 * alias is NOT mapped -- the caller inverts the final result. */
2251 static int domain_context_mapped_cb(struct pci_dev *pdev,
2252 u16 alias, void *opaque)
2254 struct intel_iommu *iommu = opaque;
2256 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
/*
 * domain_context_mapped - check whether @dev (and, for PCI, all of
 * its DMA aliases) already has a present context entry.
 */
2259 static int domain_context_mapped(struct device *dev)
2261 struct intel_iommu *iommu;
2264 iommu = device_to_iommu(dev, &bus, &devfn);
2268 if (!dev_is_pci(dev))
2269 return device_context_mapped(iommu, bus, devfn);
/* mapped iff no alias reported "not mapped" (callback stops early) */
2271 return !pci_for_each_dma_alias(to_pci_dev(dev),
2272 domain_context_mapped_cb, iommu);
/* Count of VTD_PAGE-sized pages spanned by [host_addr, host_addr +
 * size), rounded out to MM page granularity (hence PAGE_ALIGN on the
 * in-page offset plus size). */
2275 /* Returns a number of VTD pages, but aligned to MM page size */
2276 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2279 host_addr &= ~PAGE_MASK;
2280 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
/*
 * hardware_largepage_caps - largest superpage level usable for a
 * mapping of @pages pages at @iov_pfn -> @phy_pfn, limited by the
 * domain's hardware superpage support and by the alignment of both
 * addresses (checked together via their OR).
 * NOTE(review): the loop body's level/support updates and the return
 * were lost by the extraction.
 */
2283 /* Return largest possible superpage level for a given mapping */
2284 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2285 unsigned long iov_pfn,
2286 unsigned long phy_pfn,
2287 unsigned long pages)
2289 int support, level = 1;
2290 unsigned long pfnmerge;
2292 support = domain->iommu_superpage;
2294 /* To use a large page, the virtual *and* physical addresses
2295 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2296 of them will mean we have to use smaller pages. So just
2297 merge them and check both at once. */
2298 pfnmerge = iov_pfn | phy_pfn;
2300 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2301 pages >>= VTD_STRIDE_SHIFT;
2304 pfnmerge >>= VTD_STRIDE_SHIFT;
/*
 * __domain_mapping - write PTEs mapping @nr_pages starting at
 * @iov_pfn either from a scatterlist (@sg != NULL) or from a
 * contiguous physical range starting at @phys_pfn.  Uses superpages
 * when alignment and hardware support allow, freeing any old small
 * page tables that a new superpage would cover; flushes the CPU cache
 * for each filled PTE page.  PTEs are installed with cmpxchg so an
 * already-present entry is detected and reported rather than
 * silently overwritten.
 * NOTE(review): this dump has lost lines vs. upstream (returns,
 * braces, the !pte error path, sg = sg_next(sg) advance).
 */
2311 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2312 struct scatterlist *sg, unsigned long phys_pfn,
2313 unsigned long nr_pages, int prot)
2315 struct dma_pte *first_pte = NULL, *pte = NULL;
2316 phys_addr_t uninitialized_var(pteval);
2317 unsigned long sg_res = 0;
2318 unsigned int largepage_lvl = 0;
2319 unsigned long lvl_pages = 0;
2322 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
/* refuse mappings with neither read nor write permission */
2324 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2327 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2328 if (domain_use_first_level(domain))
2329 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
2333 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2336 while (nr_pages > 0) {
/* start of a new scatterlist element: derive dma addr and pteval */
2340 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2342 sg_res = aligned_nrpages(sg->offset, sg->length);
2343 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2344 sg->dma_length = sg->length;
2345 pteval = (sg_phys(sg) - pgoff) | attr;
2346 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2350 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2352 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2355 /* It is large page*/
2356 if (largepage_lvl > 1) {
2357 unsigned long nr_superpages, end_pfn;
2359 pteval |= DMA_PTE_LARGE_PAGE;
2360 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2362 nr_superpages = sg_res / lvl_pages;
2363 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2366 * Ensure that old small page tables are
2367 * removed to make room for superpage(s).
2368 * We're adding new large pages, so make sure
2369 * we don't remove their parent tables.
2371 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2374 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2378 /* We don't need lock here, nobody else
2379 * touches the iova range
2381 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
/* rate-limited diagnostic: PTE was unexpectedly already set */
2383 static int dumps = 5;
2384 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2385 iov_pfn, tmp, (unsigned long long)pteval);
2388 debug_dma_dump_mappings(NULL);
2393 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2395 BUG_ON(nr_pages < lvl_pages);
2396 BUG_ON(sg_res < lvl_pages);
/* advance all cursors by one (possibly super-) page */
2398 nr_pages -= lvl_pages;
2399 iov_pfn += lvl_pages;
2400 phys_pfn += lvl_pages;
2401 pteval += lvl_pages * VTD_PAGE_SIZE;
2402 sg_res -= lvl_pages;
2404 /* If the next PTE would be the first in a new page, then we
2405 need to flush the cache on the entries we've just written.
2406 And then we'll need to recalculate 'pte', so clear it and
2407 let it get set again in the if (!pte) block above.
2409 If we're done (!nr_pages) we need to flush the cache too.
2411 Also if we've been setting superpages, we may need to
2412 recalculate 'pte' and switch back to smaller pages for the
2413 end of the mapping, if the trailing size is not enough to
2414 use another superpage (i.e. sg_res < lvl_pages). */
2416 if (!nr_pages || first_pte_in_page(pte) ||
2417 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2418 domain_flush_cache(domain, first_pte,
2419 (void *)pte - (void *)first_pte);
2423 if (!sg_res && nr_pages)
/*
 * domain_mapping - perform the actual PTE writes via
 * __domain_mapping(), then notify every IOMMU serving the domain so
 * the new (non-present -> present) entries are flushed as needed.
 */
2429 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2430 struct scatterlist *sg, unsigned long phys_pfn,
2431 unsigned long nr_pages, int prot)
2434 struct intel_iommu *iommu;
2436 /* Do the real mapping first */
2437 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2441 for_each_domain_iommu(iommu_id, domain) {
2442 iommu = g_iommus[iommu_id];
2443 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
/* Convenience wrapper: map from a scatterlist (phys_pfn unused). */
2449 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2450 struct scatterlist *sg, unsigned long nr_pages,
2453 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
/* Convenience wrapper: map a contiguous pfn range (no scatterlist). */
2456 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2457 unsigned long phys_pfn, unsigned long nr_pages,
2460 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
/*
 * domain_context_clear_one - tear down the context entry for
 * (@bus, @devfn): clear it, flush the CPU cache line, then issue
 * device-selective context and IOTLB invalidations for the old
 * domain-id.  Silently returns if no context table page exists
 * (alloc=0 lookup).
 */
2463 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2465 unsigned long flags;
2466 struct context_entry *context;
2472 spin_lock_irqsave(&iommu->lock, flags);
2473 context = iommu_context_addr(iommu, bus, devfn, 0);
2475 spin_unlock_irqrestore(&iommu->lock, flags);
/* capture the old domain-id before clearing the entry */
2478 did_old = context_domain_id(context);
2479 context_clear_entry(context);
2480 __iommu_flush_cache(iommu, context, sizeof(*context));
2481 spin_unlock_irqrestore(&iommu->lock, flags);
2482 iommu->flush.flush_context(iommu,
2484 (((u16)bus) << 8) | devfn,
2485 DMA_CCMD_MASK_NOBIT,
2486 DMA_CCMD_DEVICE_INVL);
2487 iommu->flush.flush_iotlb(iommu,
/* Remove @info from both the domain's device list and the global
 * list, and break the device's archdata link back to it.
 * Caller must hold device_domain_lock (asserted). */
2494 static inline void unlink_domain_info(struct device_domain_info *info)
2496 assert_spin_locked(&device_domain_lock);
2497 list_del(&info->link);
2498 list_del(&info->global);
2500 info->dev->archdata.iommu = NULL;
/* Detach every device currently attached to @domain, under
 * device_domain_lock (safe iteration: entries are removed). */
2503 static void domain_remove_dev_info(struct dmar_domain *domain)
2505 struct device_domain_info *info, *tmp;
2506 unsigned long flags;
2508 spin_lock_irqsave(&device_domain_lock, flags);
2509 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2510 __dmar_remove_one_dev_info(info);
2511 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * find_domain - return the dmar_domain @dev is attached to, or
 * (per the early bail-out) nothing useful when attachment is
 * deferred or the device bypasses the IOMMU.  PCI devices are first
 * resolved to their real DMA device (quirk handling).
 * Lock-free read of dev->archdata.iommu, as noted below.
 */
2514 struct dmar_domain *find_domain(struct device *dev)
2516 struct device_domain_info *info;
2518 if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2521 if (dev_is_pci(dev))
2522 dev = &pci_real_dma_dev(to_pci_dev(dev))->dev;
2524 /* No lock here, assumes no domain exit in normal case */
2525 info = dev->archdata.iommu;
2527 return info->domain;
/* Perform a previously deferred attach: clear the deferred marker in
 * archdata.iommu, then attach @dev to its current IOMMU domain. */
2532 static void do_deferred_attach(struct device *dev)
2534 struct iommu_domain *domain;
2536 dev->archdata.iommu = NULL;
2537 domain = iommu_get_domain_for_dev(dev);
2539 intel_iommu_attach_device(domain, dev);
/* Linear search of the global device list for the entry matching
 * (@segment, @bus, @devfn).  Caller is expected to hold
 * device_domain_lock -- TODO confirm; no assert is visible here. */
2542 static inline struct device_domain_info *
2543 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2545 struct device_domain_info *info;
2547 list_for_each_entry(info, &device_domain_list, global)
2548 if (info->iommu->segment == segment && info->bus == bus &&
2549 info->devfn == devfn)
/*
 * domain_setup_first_level - program a first-level PASID entry for
 * @dev: skip pgd levels above the IOMMU's agaw, require a 4- or
 * 5-level table, and pass the 5-level flag through to
 * intel_pasid_setup_first_level() along with the domain-id.
 */
2555 static int domain_setup_first_level(struct intel_iommu *iommu,
2556 struct dmar_domain *domain,
2560 int flags = PASID_FLAG_SUPERVISOR_MODE;
2561 struct dma_pte *pgd = domain->pgd;
2565 * Skip top levels of page tables for iommu which has
2566 * less agaw than default. Unnecessary for PT mode.
2568 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2569 pgd = phys_to_virt(dma_pte_addr(pgd));
2570 if (!dma_pte_present(pgd))
/* first-level translation only defines 4- and 5-level tables */
2574 level = agaw_to_level(agaw);
2575 if (level != 4 && level != 5)
2578 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2580 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2581 domain->iommu_did[iommu->seq_id],
/*
 * dmar_insert_one_dev_info - allocate and register a
 * device_domain_info binding (@bus, @devfn, @dev) to @domain:
 * probe PCI ATS/PASID/PRI capabilities, handle the race where the
 * device was bound meanwhile (return the found domain instead),
 * attach the domain to @iommu, and for scalable mode allocate the
 * PASID table and set up the RID2PASID entry (pass-through,
 * first-level or second-level as appropriate).  Finally installs the
 * context mapping.  On any setup failure the binding is removed via
 * dmar_remove_one_dev_info().
 * NOTE(review): several lines (returns, braces) were lost by the
 * extraction relative to upstream.
 */
2585 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2588 struct dmar_domain *domain)
2590 struct dmar_domain *found = NULL;
2591 struct device_domain_info *info;
2592 unsigned long flags;
2595 info = alloc_devinfo_mem();
2600 info->devfn = devfn;
2601 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2602 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2605 info->domain = domain;
2606 info->iommu = iommu;
2607 info->pasid_table = NULL;
2608 info->auxd_enabled = 0;
2609 INIT_LIST_HEAD(&info->auxiliary_domains);
2611 if (dev && dev_is_pci(dev)) {
2612 struct pci_dev *pdev = to_pci_dev(info->dev);
/* ATS only for trusted devices covered by an ATSR unit */
2614 if (!pdev->untrusted &&
2615 !pci_ats_disabled() &&
2616 ecap_dev_iotlb_support(iommu->ecap) &&
2617 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2618 dmar_find_matched_atsr_unit(pdev))
2619 info->ats_supported = 1;
2621 if (sm_supported(iommu)) {
2622 if (pasid_supported(iommu)) {
2623 int features = pci_pasid_features(pdev);
/* low bit flags "supported"; upper bits carry the feature mask */
2625 info->pasid_supported = features | 1;
2628 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2629 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2630 info->pri_supported = 1;
2634 spin_lock_irqsave(&device_domain_lock, flags);
2636 found = find_domain(dev);
/* re-check by (segment, bus, devfn) in case dev had no archdata yet */
2639 struct device_domain_info *info2;
2640 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2642 found = info2->domain;
2648 spin_unlock_irqrestore(&device_domain_lock, flags);
2649 free_devinfo_mem(info);
2650 /* Caller must free the original domain */
2654 spin_lock(&iommu->lock);
2655 ret = domain_attach_iommu(domain, iommu);
2656 spin_unlock(&iommu->lock);
2659 spin_unlock_irqrestore(&device_domain_lock, flags);
2660 free_devinfo_mem(info);
2664 list_add(&info->link, &domain->devices);
2665 list_add(&info->global, &device_domain_list);
2667 dev->archdata.iommu = info;
2668 spin_unlock_irqrestore(&device_domain_lock, flags);
2670 /* PASID table is mandatory for a PCI device in scalable mode. */
2671 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2672 ret = intel_pasid_alloc_table(dev);
2674 dev_err(dev, "PASID table allocation failed\n");
2675 dmar_remove_one_dev_info(dev);
2679 /* Setup the PASID entry for requests without PASID: */
2680 spin_lock(&iommu->lock);
2681 if (hw_pass_through && domain_type_is_si(domain))
2682 ret = intel_pasid_setup_pass_through(iommu, domain,
2683 dev, PASID_RID2PASID);
2684 else if (domain_use_first_level(domain))
2685 ret = domain_setup_first_level(iommu, domain, dev,
2688 ret = intel_pasid_setup_second_level(iommu, domain,
2689 dev, PASID_RID2PASID);
2690 spin_unlock(&iommu->lock);
2692 dev_err(dev, "Setup RID2PASID failed\n");
2693 dmar_remove_one_dev_info(dev);
2698 if (dev && domain_context_mapping(domain, dev)) {
2699 dev_err(dev, "Domain context map failed\n");
2700 dmar_remove_one_dev_info(dev);
/*
 * pci_for_each_dma_alias() callback: store each alias RID into *opaque so
 * that after the walk *opaque holds the last (topmost) DMA alias.
 */
2707 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2709 *(u16 *)opaque = alias;
/*
 * find_or_alloc_domain - return the domain already used by @dev's topmost
 * PCI DMA alias, or allocate and initialize a fresh dmar_domain with
 * address width @gaw.
 *
 * NOTE(review): extract is missing interior lines (returns, error paths).
 */
2713 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2715 struct device_domain_info *info;
2716 struct dmar_domain *domain = NULL;
2717 struct intel_iommu *iommu;
2719 unsigned long flags;
2722 iommu = device_to_iommu(dev, &bus, &devfn);
/* A PCI device may alias to another RID; reuse that alias's domain. */
2726 if (dev_is_pci(dev)) {
2727 struct pci_dev *pdev = to_pci_dev(dev);
2729 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2731 spin_lock_irqsave(&device_domain_lock, flags);
2732 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2733 PCI_BUS_NUM(dma_alias),
2736 iommu = info->iommu;
2737 domain = info->domain;
2739 spin_unlock_irqrestore(&device_domain_lock, flags);
2741 /* DMA alias already has a domain, use it */
2746 /* Allocate and initialize new domain for the device */
2747 domain = alloc_domain(0);
2750 if (domain_init(domain, iommu, gaw)) {
2751 domain_exit(domain);
/*
 * set_domain_for_dev - attach @domain to @dev, and for PCI also to its
 * DMA alias when that alias has a different RID.
 *
 * dmar_insert_one_dev_info() returns the domain actually installed; a NULL
 * or mismatching return means the insert failed or another domain won the
 * race, which both insertion sites check for.
 */
2759 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2760 struct dmar_domain *domain)
2762 struct intel_iommu *iommu;
2763 struct dmar_domain *tmp;
2764 u16 req_id, dma_alias;
2767 iommu = device_to_iommu(dev, &bus, &devfn);
/* Requester id of the device itself. */
2771 req_id = ((u16)bus << 8) | devfn;
2773 if (dev_is_pci(dev)) {
2774 struct pci_dev *pdev = to_pci_dev(dev);
2776 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2778 /* register PCI DMA alias device */
2779 if (req_id != dma_alias) {
2780 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2781 dma_alias & 0xff, NULL, domain);
2783 if (!tmp || tmp != domain)
/* Now attach the device itself. */
2788 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2789 if (!tmp || tmp != domain)
/*
 * iommu_domain_identity_map - 1:1 map the physical range [start, end]
 * into @domain.
 *
 * Reserves the corresponding IOVA range so the allocator never hands it
 * out, clears any stale PTEs (the range may overlap already-mapped RAM),
 * then installs identity PFN mappings with read+write permission.
 */
2795 static int iommu_domain_identity_map(struct dmar_domain *domain,
2796 unsigned long long start,
2797 unsigned long long end)
2799 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2800 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2802 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2803 dma_to_mm_pfn(last_vpfn))) {
2804 pr_err("Reserving iova failed\n");
2808 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2810 * RMRR range might have overlap with physical memory range,
2813 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2815 return __domain_mapping(domain, first_vpfn, NULL,
2816 first_vpfn, last_vpfn - first_vpfn + 1,
2817 DMA_PTE_READ|DMA_PTE_WRITE);
/*
 * domain_prepare_identity_map - validate an RMRR-style range for @dev and
 * identity-map it into @domain.
 *
 * Skips the work for hardware pass-through on the si domain, and WARNs on
 * firmware bugs (reversed range, range wider than the domain's AGAW).
 * NOTE(review): the conditional guarding the first WARN is not visible in
 * this extract.
 */
2820 static int domain_prepare_identity_map(struct device *dev,
2821 struct dmar_domain *domain,
2822 unsigned long long start,
2823 unsigned long long end)
2825 /* For _hardware_ passthrough, don't bother. But for software
2826 passthrough, we do it anyway -- it may indicate a memory
2827 range which is reserved in E820, so which didn't get set
2828 up to start with in si_domain */
2829 if (domain == si_domain && hw_pass_through) {
2830 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2835 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
/* Firmware bug: RMRR range is reversed. */
2838 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2839 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2840 dmi_get_system_info(DMI_BIOS_VENDOR),
2841 dmi_get_system_info(DMI_BIOS_VERSION),
2842 dmi_get_system_info(DMI_PRODUCT_VERSION));
/* Firmware bug: RMRR end lies beyond the domain's addressable width. */
2846 if (end >> agaw_to_width(domain->agaw)) {
2847 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2848 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2849 agaw_to_width(domain->agaw),
2850 dmi_get_system_info(DMI_BIOS_VENDOR),
2851 dmi_get_system_info(DMI_BIOS_VERSION),
2852 dmi_get_system_info(DMI_PRODUCT_VERSION));
2856 return iommu_domain_identity_map(domain, start, end);
2859 static int md_domain_init(struct dmar_domain *domain, int guest_width);
/*
 * si_domain_init - allocate and populate the static identity (si) domain.
 *
 * Identity-maps every usable physical memory range of every online node,
 * plus all RMRR ranges, so devices attached to the si domain can reach both
 * normal RAM and their firmware-reserved regions.  @hw indicates hardware
 * pass-through; how it short-circuits the mapping is not visible in this
 * extract — TODO confirm against the full source.
 */
2861 static int __init si_domain_init(int hw)
2863 struct dmar_rmrr_unit *rmrr;
2867 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2871 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2872 domain_exit(si_domain);
/* Identity-map all usable memory, node by node. */
2879 for_each_online_node(nid) {
2880 unsigned long start_pfn, end_pfn;
2883 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2884 ret = iommu_domain_identity_map(si_domain,
2885 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2892 * Identity map the RMRRs so that devices with RMRRs could also use
2895 for_each_rmrr_units(rmrr) {
2896 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2898 unsigned long long start = rmrr->base_address;
2899 unsigned long long end = rmrr->end_address;
/* Skip firmware-broken RMRRs instead of mapping garbage. */
2901 if (WARN_ON(end < start ||
2902 end >> agaw_to_width(si_domain->agaw)))
2905 ret = iommu_domain_identity_map(si_domain, start, end);
/*
 * identity_mapping - return non-zero iff @dev is currently attached to the
 * static identity (si) domain.
 */
2914 static int identity_mapping(struct device *dev)
2916 struct device_domain_info *info;
2918 info = dev->archdata.iommu;
2920 return (info->domain == si_domain);
/*
 * domain_add_dev_info - attach @dev to @domain.
 *
 * Fails when dmar_insert_one_dev_info() installs a different domain than
 * requested (lost race or insertion failure).
 */
2925 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2927 struct dmar_domain *ndomain;
2928 struct intel_iommu *iommu;
2931 iommu = device_to_iommu(dev, &bus, &devfn);
2935 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2936 if (ndomain != domain)
/*
 * device_has_rmrr - true if any RMRR unit's device scope contains @dev
 * itself or a PCI bridge upstream of it.
 */
2942 static bool device_has_rmrr(struct device *dev)
2944 struct dmar_rmrr_unit *rmrr;
2949 for_each_rmrr_units(rmrr) {
2951 * Return TRUE if this RMRR contains the device that
2954 for_each_active_dev_scope(rmrr->devices,
2955 rmrr->devices_cnt, i, tmp)
2957 is_downstream_to_pci_bridge(dev, tmp)) {
2967 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2968 * is relaxable (ie. is allowed to be not enforced under some conditions)
2969 * @dev: device handle
2971 * We assume that PCI USB devices with RMRRs have them largely
2972 * for historical reasons and that the RMRR space is not actively used post
2973 * boot. This exclusion may change if vendors begin to abuse it.
2975 * The same exception is made for graphics devices, with the requirement that
2976 * any use of the RMRR regions will be torn down before assigning the device
2979 * Return: true if the RMRR is relaxable, false otherwise
2981 static bool device_rmrr_is_relaxable(struct device *dev)
2983 struct pci_dev *pdev;
/* Only PCI devices can be relaxable. */
2985 if (!dev_is_pci(dev))
2988 pdev = to_pci_dev(dev);
/* USB and graphics devices: RMRRs treated as legacy/boot-time only. */
2989 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2996 * There are a couple cases where we need to restrict the functionality of
2997 * devices associated with RMRRs. The first is when evaluating a device for
2998 * identity mapping because problems exist when devices are moved in and out
2999 * of domains and their respective RMRR information is lost. This means that
3000 * a device with associated RMRRs will never be in a "passthrough" domain.
3001 * The second is use of the device through the IOMMU API. This interface
3002 * expects to have full control of the IOVA space for the device. We cannot
3003 * satisfy both the requirement that RMRR access is maintained and have an
3004 * unencumbered IOVA space. We also have no ability to quiesce the device's
3005 * use of the RMRR space or even inform the IOMMU API user of the restriction.
3006 * We therefore prevent devices associated with an RMRR from participating in
3007 * the IOMMU API, which eliminates them from device assignment.
3009 * In both cases, devices which have relaxable RMRRs are not concerned by this
3010 * restriction. See device_rmrr_is_relaxable comment.
/* Locked = has an RMRR and that RMRR is not relaxable. */
3012 static bool device_is_rmrr_locked(struct device *dev)
3014 if (!device_has_rmrr(dev))
3017 if (device_rmrr_is_relaxable(dev))
3024 * Return the required default domain type for a specific device.
3026 * @dev: the device in query
3027 * @startup: true if this is during early boot
3030 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3031 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
3032 * - 0: both identity and dynamic domains work for this device
3034 static int device_def_domain_type(struct device *dev)
3036 if (dev_is_pci(dev)) {
3037 struct pci_dev *pdev = to_pci_dev(dev);
3040 * Prevent any device marked as untrusted from getting
3041 * placed into the statically identity mapping domain.
3043 if (pdev->untrusted)
3044 return IOMMU_DOMAIN_DMA;
/* Boot options can force identity mapping for Azalia audio and GFX. */
3046 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3047 return IOMMU_DOMAIN_IDENTITY;
3049 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3050 return IOMMU_DOMAIN_IDENTITY;
3053 * We want to start off with all devices in the 1:1 domain, and
3054 * take them out later if we find they can't access all of memory.
3056 * However, we can't do this for PCI devices behind bridges,
3057 * because all PCI devices behind the same bridge will end up
3058 * with the same source-id on their transactions.
3060 * Practically speaking, we can't change things around for these
3061 * devices at run-time, because we can't be sure there'll be no
3062 * DMA transactions in flight for any of their siblings.
3064 * So PCI devices (unless they're on the root bus) as well as
3065 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
3066 * the 1:1 domain, just in _case_ one of their siblings turns out
3067 * not to be able to map all of memory.
3069 if (!pci_is_pcie(pdev)) {
3070 if (!pci_is_root_bus(pdev->bus))
3071 return IOMMU_DOMAIN_DMA;
3072 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
3073 return IOMMU_DOMAIN_DMA;
3074 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
3075 return IOMMU_DOMAIN_DMA;
/*
 * intel_iommu_init_qi - bring the IOMMU's invalidation machinery to a sane
 * state: clear stale faults, disable any firmware-enabled queued
 * invalidation, then pick queued (QI) or register-based invalidation
 * callbacks depending on whether QI can be enabled.
 */
3081 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3084 * Start from the sane iommu hardware state.
3085 * If the queued invalidation is already initialized by us
3086 * (for example, while enabling interrupt-remapping) then
3087 * we got the things already rolling from a sane state.
3091 * Clear any previous faults.
3093 dmar_fault(-1, iommu);
3095 * Disable queued invalidation if supported and already enabled
3096 * before OS handover.
3098 dmar_disable_qi(iommu);
3101 if (dmar_enable_qi(iommu)) {
3103 * Queued Invalidate not enabled, use Register Based Invalidate
3105 iommu->flush.flush_context = __iommu_flush_context;
3106 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3107 pr_info("%s: Using Register based invalidation\n",
/* QI enabled successfully: use the queued-invalidation callbacks. */
3110 iommu->flush.flush_context = qi_flush_context;
3111 iommu->flush.flush_iotlb = qi_flush_iotlb;
3112 pr_info("%s: Using Queued invalidation\n", iommu->name);
/*
 * copy_context_table - copy one bus's context table(s) from the previous
 * (crashed) kernel's root entry into freshly allocated pages.
 *
 * @old_re: the old root entry for this bus; @tbl receives the new context
 * table pointer(s); @ext selects the extended (two-tables-per-bus) layout.
 * Preserves domain-id allocations (set_bit into iommu->domain_ids) and marks
 * each copied entry so later code can tell it came from the old kernel.
 *
 * NOTE(review): extract is missing interior lines (loop braces, error
 * handling, unmap paths); comments describe only what is visible.
 */
3116 static int copy_context_table(struct intel_iommu *iommu,
3117 struct root_entry *old_re,
3118 struct context_entry **tbl,
3121 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3122 struct context_entry *new_ce = NULL, ce;
3123 struct context_entry *old_ce = NULL;
3124 struct root_entry re;
3125 phys_addr_t old_ce_phys;
/* Extended layout keeps two context tables per bus. */
3127 tbl_idx = ext ? bus * 2 : bus;
3128 memcpy(&re, old_re, sizeof(re));
3130 for (devfn = 0; devfn < 256; devfn++) {
3131 /* First calculate the correct index */
3132 idx = (ext ? devfn * 2 : devfn) % 256;
3135 /* First save what we may have and clean up */
3137 tbl[tbl_idx] = new_ce;
3138 __iommu_flush_cache(iommu, new_ce,
/* Lower vs upper context-table pointer, depending on which half. */
3148 old_ce_phys = root_entry_lctp(&re);
3150 old_ce_phys = root_entry_uctp(&re);
3153 if (ext && devfn == 0) {
3154 /* No LCTP, try UCTP */
/* Map the old kernel's context table so we can read it. */
3163 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3168 new_ce = alloc_pgtable_page(iommu->node);
3175 /* Now copy the context entry */
3176 memcpy(&ce, old_ce + idx, sizeof(ce));
3178 if (!__context_present(&ce))
/* Keep the old kernel's domain id reserved in this kernel. */
3181 did = context_domain_id(&ce);
3182 if (did >= 0 && did < cap_ndoms(iommu->cap))
3183 set_bit(did, iommu->domain_ids);
3186 * We need a marker for copied context entries. This
3187 * marker needs to work for the old format as well as
3188 * for extended context entries.
3190 * Bit 67 of the context entry is used. In the old
3191 * format this bit is available to software, in the
3192 * extended format it is the PGE bit, but PGE is ignored
3193 * by HW if PASIDs are disabled (and thus still
3196 * So disable PASIDs first and then mark the entry
3197 * copied. This means that we don't copy PASID
3198 * translations from the old kernel, but this is fine as
3199 * faults there are not fatal.
3201 context_clear_pasid_enable(&ce);
3202 context_set_copied(&ce);
3207 tbl[tbl_idx + pos] = new_ce;
3209 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
/*
 * copy_translation_tables - in a kdump kernel, copy the previous kernel's
 * root/context tables into this kernel so in-flight DMA keeps working.
 *
 * Reads the old root-table address from DMAR_RTADDR_REG, copies every bus's
 * context table via copy_context_table(), then installs the copies into
 * this kernel's root_entry table under iommu->lock.  Bails out if the
 * old/new extended-root-table (RTT/ECS) setting would have to change.
 */
3218 static int copy_translation_tables(struct intel_iommu *iommu)
3220 struct context_entry **ctxt_tbls;
3221 struct root_entry *old_rt;
3222 phys_addr_t old_rt_phys;
3223 int ctxt_table_entries;
3224 unsigned long flags;
3229 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3230 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3231 new_ext = !!ecap_ecs(iommu->ecap);
3234 * The RTT bit can only be changed when translation is disabled,
3235 * but disabling translation means to open a window for data
3236 * corruption. So bail out and don't copy anything if we would
3237 * have to change the bit.
3242 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3246 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3250 /* This is too big for the stack - allocate it from slab */
3251 ctxt_table_entries = ext ? 512 : 256;
3253 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3257 for (bus = 0; bus < 256; bus++) {
3258 ret = copy_context_table(iommu, &old_rt[bus],
3259 ctxt_tbls, bus, ext);
3261 pr_err("%s: Failed to copy context table for bus %d\n",
3267 spin_lock_irqsave(&iommu->lock, flags);
3269 /* Context tables are copied, now write them to the root_entry table */
3270 for (bus = 0; bus < 256; bus++) {
3271 int idx = ext ? bus * 2 : bus;
/* Low half: physical address of the context table, present bit set. */
3274 if (ctxt_tbls[idx]) {
3275 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3276 iommu->root_entry[bus].lo = val;
/* High half only exists in the extended layout. */
3279 if (!ext || !ctxt_tbls[idx + 1])
3282 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3283 iommu->root_entry[bus].hi = val;
3286 spin_unlock_irqrestore(&iommu->lock, flags);
/* Make the new root table visible to hardware. */
3290 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
/*
 * init_dmars - one-time boot initialization of all DMAR units.
 *
 * Counts IOMMUs, allocates the global g_iommus array and per-IOMMU state
 * (domains, root entries), optionally copies translation tables from a
 * crashed kernel (kdump), initializes the static identity domain, and
 * finally enables invalidation, page-request queues, fault interrupts and
 * translation on every unit.
 *
 * NOTE(review): extract is missing interior lines (error labels, returns);
 * comments describe only the visible statements.
 */
3300 static int __init init_dmars(void)
3302 struct dmar_drhd_unit *drhd;
3303 struct intel_iommu *iommu;
3309 * initialize and program root entry to not present
3312 for_each_drhd_unit(drhd) {
3314 * lock not needed as this is only incremented in the single
3315 * threaded kernel __init code path all other access are read
3318 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3322 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3325 /* Preallocate enough resources for IOMMU hot-addition */
3326 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3327 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3329 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3332 pr_err("Allocating global iommu array failed\n");
/* Per-IOMMU setup pass. */
3337 for_each_iommu(iommu, drhd) {
3338 if (drhd->ignored) {
3339 iommu_disable_translation(iommu);
3344 * Find the max pasid size of all IOMMU's in the system.
3345 * We need to ensure the system pasid table is no bigger
3346 * than the smallest supported.
3348 if (pasid_supported(iommu)) {
3349 u32 temp = 2 << ecap_pss(iommu->ecap);
3351 intel_pasid_max_id = min_t(u32, temp,
3352 intel_pasid_max_id);
3355 g_iommus[iommu->seq_id] = iommu;
3357 intel_iommu_init_qi(iommu);
3359 ret = iommu_init_domains(iommu);
3363 init_translation_status(iommu);
/* Firmware left translation on but we're not kdump: turn it off. */
3365 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3366 iommu_disable_translation(iommu);
3367 clear_translation_pre_enabled(iommu);
3368 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3374 * we could share the same root & context tables
3375 * among all IOMMU's. Need to Split it later.
3377 ret = iommu_alloc_root_entry(iommu);
/* kdump path: try to inherit the old kernel's translation tables. */
3381 if (translation_pre_enabled(iommu)) {
3382 pr_info("Translation already enabled - trying to copy translation structures\n");
3384 ret = copy_translation_tables(iommu);
3387 * We found the IOMMU with translation
3388 * enabled - but failed to copy over the
3389 * old root-entry table. Try to proceed
3390 * by disabling translation now and
3391 * allocating a clean root-entry table.
3392 * This might cause DMAR faults, but
3393 * probably the dump will still succeed.
3395 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3397 iommu_disable_translation(iommu);
3398 clear_translation_pre_enabled(iommu);
3400 pr_info("Copied translation tables from previous kernel for %s\n",
/* Hardware pass-through only works if every unit supports it. */
3405 if (!ecap_pass_through(iommu->ecap))
3406 hw_pass_through = 0;
3407 intel_svm_check(iommu);
3411 * Now that qi is enabled on all iommus, set the root entry and flush
3412 * caches. This is required on some Intel X58 chipsets, otherwise the
3413 * flush_context function will loop forever and the boot hangs.
3415 for_each_active_iommu(iommu, drhd) {
3416 iommu_flush_write_buffer(iommu);
3417 iommu_set_root_entry(iommu);
3418 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3419 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3422 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3427 iommu_identity_mapping |= IDENTMAP_GFX;
3429 check_tylersburg_isoch();
3431 ret = si_domain_init(hw_pass_through);
3438 * global invalidate context cache
3439 * global invalidate iotlb
3440 * enable translation
3442 for_each_iommu(iommu, drhd) {
3443 if (drhd->ignored) {
3445 * we always have to disable PMRs or DMA may fail on
3449 iommu_disable_protect_mem_regions(iommu);
3453 iommu_flush_write_buffer(iommu);
3455 #ifdef CONFIG_INTEL_IOMMU_SVM
3456 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3458 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3459 * could cause possible lock race condition.
3461 up_write(&dmar_global_lock);
3462 ret = intel_svm_enable_prq(iommu);
3463 down_write(&dmar_global_lock);
3468 ret = dmar_set_interrupt(iommu);
/* Error path: tear down everything set up so far. */
3476 for_each_active_iommu(iommu, drhd) {
3477 disable_dmar_iommu(iommu);
3478 free_dmar_iommu(iommu);
3487 /* This takes a number of _MM_ pages, not VTD pages */
/*
 * intel_alloc_iova - allocate @nrpages of IOVA space for @dev in @domain,
 * constrained by @dma_mask.  Tries below 4GiB first (unless forcedac),
 * then retries over the full mask.  Returns the IOVA pfn, or 0 on failure
 * (per the !iova_pfn checks at the call sites).
 */
3488 static unsigned long intel_alloc_iova(struct device *dev,
3489 struct dmar_domain *domain,
3490 unsigned long nrpages, uint64_t dma_mask)
3492 unsigned long iova_pfn;
3495 * Restrict dma_mask to the width that the iommu can handle.
3496 * First-level translation restricts the input-address to a
3497 * canonical address (i.e., address bits 63:N have the same
3498 * value as address bit [N-1], where N is 48-bits with 4-level
3499 * paging and 57-bits with 5-level paging). Hence, skip bit
3502 if (domain_use_first_level(domain))
3503 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3506 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3509 /* Ensure we reserve the whole size-aligned region */
3510 nrpages = __roundup_pow_of_two(nrpages);
3512 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3514 * First try to allocate an io virtual address in
3515 * DMA_BIT_MASK(32) and if that fails then try allocating
3518 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3519 IOVA_PFN(DMA_BIT_MASK(32)), false);
/* Fall back to the full mask, flushing the rcache if necessary. */
3523 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3524 IOVA_PFN(dma_mask), true);
3525 if (unlikely(!iova_pfn)) {
3526 dev_err_once(dev, "Allocating %ld-page iova failed\n",
/*
 * get_private_domain_for_dev - give @dev its own DMA-mapping domain.
 *
 * Finds or allocates a domain, pre-maps any RMRR ranges that apply to the
 * device, and attaches the device via set_domain_for_dev().  On attach
 * failure the freshly allocated domain is destroyed.
 */
3534 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3536 struct dmar_domain *domain, *tmp;
3537 struct dmar_rmrr_unit *rmrr;
3538 struct device *i_dev;
3541 /* Device shouldn't be attached by any domains. */
3542 domain = find_domain(dev);
3546 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3550 /* We have a new domain - setup possible RMRRs for the device */
3552 for_each_rmrr_units(rmrr) {
3553 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3558 ret = domain_prepare_identity_map(dev, domain,
3562 dev_err(dev, "Mapping reserved region failed\n");
3567 tmp = set_domain_for_dev(dev, domain);
3568 if (!tmp || domain != tmp) {
3569 domain_exit(domain);
3575 dev_err(dev, "Allocating domain failed\n");
/* Mark the resulting domain as a DMA-API domain. */
3577 domain->domain.type = IOMMU_DOMAIN_DMA;
3582 /* Check if the dev needs to go through non-identity map and unmap process.*/
/*
 * Returns whether DMA for @dev must be translated by the IOMMU.  An
 * identity-mapped device whose DMA mask cannot cover all of memory is
 * migrated off the si domain to a private DMA domain here.
 * NOTE(review): the actual return statements are not visible in this
 * extract — confirm against the full source.
 */
3583 static bool iommu_need_mapping(struct device *dev)
3587 if (iommu_dummy(dev))
3590 if (unlikely(attach_deferred(dev)))
3591 do_deferred_attach(dev);
3593 ret = identity_mapping(dev);
/* Identity-mapped: check whether the device's mask reaches all memory. */
3595 u64 dma_mask = *dev->dma_mask;
3597 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3598 dma_mask = dev->coherent_dma_mask;
3600 if (dma_mask >= dma_direct_get_required_mask(dev))
3604 * 32 bit DMA is removed from si_domain and fall back to
3605 * non-identity mapping.
3607 dmar_remove_one_dev_info(dev);
3608 ret = iommu_request_dma_domain_for_dev(dev);
/* Framework refused: force a private DMA domain ourselves. */
3610 struct iommu_domain *domain;
3611 struct dmar_domain *dmar_domain;
3613 domain = iommu_get_domain_for_dev(dev);
3615 dmar_domain = to_dmar_domain(domain);
3616 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3618 dmar_remove_one_dev_info(dev);
3619 get_private_domain_for_dev(dev);
3622 dev_info(dev, "32bit DMA uses non-identity mapping\n");
/*
 * __intel_map_single - map @size bytes at @paddr for DMA from @dev.
 *
 * Allocates IOVA space under @dma_mask, installs PTEs with permissions
 * derived from @dir, and returns the resulting bus address (including the
 * sub-page offset of @paddr).  Returns DMA_MAPPING_ERROR on failure after
 * releasing any allocated IOVA.
 */
3628 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3629 size_t size, int dir, u64 dma_mask)
3631 struct dmar_domain *domain;
3632 phys_addr_t start_paddr;
3633 unsigned long iova_pfn;
3636 struct intel_iommu *iommu;
3637 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3639 BUG_ON(dir == DMA_NONE);
3641 domain = find_domain(dev);
3643 return DMA_MAPPING_ERROR;
3645 iommu = domain_get_iommu(domain);
3646 size = aligned_nrpages(paddr, size);
3648 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3653 * Check if DMAR supports zero-length reads on write only
3656 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3657 !cap_zlr(iommu->cap))
3658 prot |= DMA_PTE_READ;
3659 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3660 prot |= DMA_PTE_WRITE;
3662 * paddr - (paddr + size) might be partial page, we should map the whole
3663 * page. Note: if two part of one page are separately mapped, we
3664 * might have two guest_addr mapping to the same host paddr, but this
3665 * is not a big problem
3667 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3668 mm_to_dma_pfn(paddr_pfn), size, prot);
/* Bus address = IOVA page plus the original sub-page offset. */
3672 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3673 start_paddr += paddr & ~PAGE_MASK;
3675 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
/* Failure: release the IOVA we allocated and report the error. */
3681 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3682 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3683 size, (unsigned long long)paddr, dir);
3684 return DMA_MAPPING_ERROR;
/*
 * DMA-API map_page: go through the IOMMU when the device needs mapping,
 * otherwise fall back to the direct-mapping implementation.
 */
3687 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3688 unsigned long offset, size_t size,
3689 enum dma_data_direction dir,
3690 unsigned long attrs)
3692 if (iommu_need_mapping(dev))
3693 return __intel_map_single(dev, page_to_phys(page) + offset,
3694 size, dir, *dev->dma_mask);
3695 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
/*
 * DMA-API map_resource: IOMMU-map an MMIO physical address when required,
 * otherwise delegate to the direct-map path.
 */
3698 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3699 size_t size, enum dma_data_direction dir,
3700 unsigned long attrs)
3702 if (iommu_need_mapping(dev))
3703 return __intel_map_single(dev, phys_addr, size, dir,
3705 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
/*
 * intel_unmap - tear down the IOMMU mapping covering [dev_addr, +size).
 *
 * Clears the PTEs, then either flushes the IOTLB synchronously (strict
 * mode, untrusted devices, or no flush queue) and frees the page list
 * immediately, or defers the IOVA/pagelist release via the flush queue.
 */
3708 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3710 struct dmar_domain *domain;
3711 unsigned long start_pfn, last_pfn;
3712 unsigned long nrpages;
3713 unsigned long iova_pfn;
3714 struct intel_iommu *iommu;
3715 struct page *freelist;
3716 struct pci_dev *pdev = NULL;
3718 domain = find_domain(dev);
3721 iommu = domain_get_iommu(domain);
3723 iova_pfn = IOVA_PFN(dev_addr);
3725 nrpages = aligned_nrpages(dev_addr, size);
3726 start_pfn = mm_to_dma_pfn(iova_pfn);
3727 last_pfn = start_pfn + nrpages - 1;
3729 if (dev_is_pci(dev))
3730 pdev = to_pci_dev(dev);
/* Unlink the page tables; freelist holds pages to release post-flush. */
3732 freelist = domain_unmap(domain, start_pfn, last_pfn);
3733 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3734 !has_iova_flush_queue(&domain->iovad)) {
3735 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3736 nrpages, !freelist, 0);
3738 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3739 dma_free_pagelist(freelist);
3741 queue_iova(&domain->iovad, iova_pfn, nrpages,
3742 (unsigned long)freelist);
3744 * queue up the release of the unmap to save the 1/6th of the
3745 * cpu used up by the iotlb flush operation...
3749 trace_unmap_single(dev, dev_addr, size);
/*
 * DMA-API unmap_page counterpart of intel_map_page(): IOMMU unmap when
 * the device is IOMMU-mapped, direct-map unmap otherwise.
 */
3752 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3753 size_t size, enum dma_data_direction dir,
3754 unsigned long attrs)
3756 if (iommu_need_mapping(dev))
3757 intel_unmap(dev, dev_addr, size);
3759 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
/*
 * DMA-API unmap_resource: only IOMMU-mapped resources need tearing down;
 * the direct-map path has no unmap work for resources.
 */
3762 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3763 size_t size, enum dma_data_direction dir, unsigned long attrs)
3765 if (iommu_need_mapping(dev))
3766 intel_unmap(dev, dev_addr, size);
/*
 * intel_alloc_coherent - DMA-API coherent allocation through the IOMMU.
 *
 * Allocates zeroed pages (CMA when blocking is allowed, otherwise plain
 * alloc_pages), maps them bidirectionally under the coherent DMA mask, and
 * returns the kernel virtual address.  Falls back to dma_direct_alloc()
 * when the device does not need IOMMU mapping.
 */
3769 static void *intel_alloc_coherent(struct device *dev, size_t size,
3770 dma_addr_t *dma_handle, gfp_t flags,
3771 unsigned long attrs)
3773 struct page *page = NULL;
3776 if (!iommu_need_mapping(dev))
3777 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3779 size = PAGE_ALIGN(size);
3780 order = get_order(size);
3782 if (gfpflags_allow_blocking(flags)) {
3783 unsigned int count = size >> PAGE_SHIFT;
3785 page = dma_alloc_from_contiguous(dev, count, order,
3786 flags & __GFP_NOWARN);
/* CMA unavailable/failed: fall back to the page allocator. */
3790 page = alloc_pages(flags, order);
3793 memset(page_address(page), 0, size);
3795 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3797 dev->coherent_dma_mask);
3798 if (*dma_handle != DMA_MAPPING_ERROR)
3799 return page_address(page);
/* Mapping failed: return the pages to CMA or the page allocator. */
3800 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3801 __free_pages(page, order);
/*
 * intel_free_coherent - undo intel_alloc_coherent(): unmap the IOMMU
 * mapping and release the pages to CMA or the page allocator.  Direct-map
 * allocations are freed via dma_direct_free().
 */
3806 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3807 dma_addr_t dma_handle, unsigned long attrs)
3810 struct page *page = virt_to_page(vaddr);
3812 if (!iommu_need_mapping(dev))
3813 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3815 size = PAGE_ALIGN(size);
3816 order = get_order(size);
3818 intel_unmap(dev, dma_handle, size);
3819 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3820 __free_pages(page, order);
/*
 * intel_unmap_sg - unmap a scatterlist that was mapped as one contiguous
 * IOVA region: total the page count across entries and unmap from the
 * region's (page-aligned) start address in a single call.
 */
3823 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3824 int nelems, enum dma_data_direction dir,
3825 unsigned long attrs)
3827 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3828 unsigned long nrpages = 0;
3829 struct scatterlist *sg;
3832 if (!iommu_need_mapping(dev))
3833 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3835 for_each_sg(sglist, sg, nelems, i) {
3836 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3839 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3841 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
/*
 * intel_map_sg - map a scatterlist into one contiguous IOVA region.
 *
 * Sums the page count over all entries, allocates a single IOVA range, and
 * maps the whole list via domain_sg_mapping() with permissions derived
 * from @dir.  On failure the partially built page tables and the IOVA are
 * released.
 */
3844 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3845 enum dma_data_direction dir, unsigned long attrs)
3848 struct dmar_domain *domain;
3851 unsigned long iova_pfn;
3853 struct scatterlist *sg;
3854 unsigned long start_vpfn;
3855 struct intel_iommu *iommu;
3857 BUG_ON(dir == DMA_NONE);
3858 if (!iommu_need_mapping(dev))
3859 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3861 domain = find_domain(dev);
3865 iommu = domain_get_iommu(domain);
3867 for_each_sg(sglist, sg, nelems, i)
3868 size += aligned_nrpages(sg->offset, sg->length);
3870 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3873 sglist->dma_length = 0;
3878 * Check if DMAR supports zero-length reads on write only
3881 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3882 !cap_zlr(iommu->cap))
3883 prot |= DMA_PTE_READ;
3884 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3885 prot |= DMA_PTE_WRITE;
3887 start_vpfn = mm_to_dma_pfn(iova_pfn);
3889 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3890 if (unlikely(ret)) {
/* Failure: drop any partially built page tables and the IOVA. */
3891 dma_pte_free_pagetable(domain, start_vpfn,
3892 start_vpfn + size - 1,
3893 agaw_to_level(domain->agaw) + 1);
3894 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3898 for_each_sg(sglist, sg, nelems, i)
3899 trace_map_sg(dev, i + 1, nelems, sg);
/*
 * DMA-API get_required_mask: IOMMU-mapped devices can always work within
 * 32 bits of IOVA space; direct-mapped devices need the real memory mask.
 */
3904 static u64 intel_get_required_mask(struct device *dev)
3906 if (!iommu_need_mapping(dev))
3907 return dma_direct_get_required_mask(dev);
3908 return DMA_BIT_MASK(32);
/*
 * DMA-API operations for devices translated by the Intel IOMMU.  Each
 * callback internally falls back to the dma-direct implementation when
 * iommu_need_mapping() says the device is identity-mapped.
 */
3911 static const struct dma_map_ops intel_dma_ops = {
3912 .alloc = intel_alloc_coherent,
3913 .free = intel_free_coherent,
3914 .map_sg = intel_map_sg,
3915 .unmap_sg = intel_unmap_sg,
3916 .map_page = intel_map_page,
3917 .unmap_page = intel_unmap_page,
3918 .map_resource = intel_map_resource,
3919 .unmap_resource = intel_unmap_resource,
3920 .dma_supported = dma_direct_supported,
3921 .mmap = dma_common_mmap,
3922 .get_sgtable = dma_common_get_sgtable,
3923 .get_required_mask = intel_get_required_mask,
/*
 * bounce_sync_single - sync a possibly-bounced buffer for CPU or device.
 * Translates the IOVA back to its physical address; if that address lies
 * inside the swiotlb pool, the data is copied to/from the bounce buffer.
 */
3927 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3928 enum dma_data_direction dir, enum dma_sync_target target)
3930 struct dmar_domain *domain;
3931 phys_addr_t tlb_addr;
3933 domain = find_domain(dev);
3934 if (WARN_ON(!domain))
3937 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3938 if (is_swiotlb_buffer(tlb_addr))
3939 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
/*
 * bounce_map_single - IOMMU-map @size bytes at @paddr for an untrusted
 * device, bouncing through swiotlb when the buffer is not page-aligned.
 *
 * Page-granular IOMMU mappings would expose neighbouring data to an
 * untrusted device; sub-page buffers are therefore copied into a
 * VTD_PAGE_SIZE-aligned swiotlb slot first, and the padding around the
 * data is zeroed.  Returns the bus address or DMA_MAPPING_ERROR.
 */
3943 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3944 enum dma_data_direction dir, unsigned long attrs,
3947 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3948 struct dmar_domain *domain;
3949 struct intel_iommu *iommu;
3950 unsigned long iova_pfn;
3951 unsigned long nrpages;
3952 phys_addr_t tlb_addr;
3956 if (unlikely(attach_deferred(dev)))
3957 do_deferred_attach(dev);
3959 domain = find_domain(dev);
3961 if (WARN_ON(dir == DMA_NONE || !domain))
3962 return DMA_MAPPING_ERROR;
3964 iommu = domain_get_iommu(domain);
3965 if (WARN_ON(!iommu))
3966 return DMA_MAPPING_ERROR;
3968 nrpages = aligned_nrpages(0, size);
3969 iova_pfn = intel_alloc_iova(dev, domain,
3970 dma_to_mm_pfn(nrpages), dma_mask);
3972 return DMA_MAPPING_ERROR;
3975 * Check if DMAR supports zero-length reads on write only
3978 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3979 !cap_zlr(iommu->cap))
3980 prot |= DMA_PTE_READ;
3981 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3982 prot |= DMA_PTE_WRITE;
3985 * If both the physical buffer start address and size are
3986 * page aligned, we don't need to use a bounce page.
3988 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3989 tlb_addr = swiotlb_tbl_map_single(dev,
3990 __phys_to_dma(dev, io_tlb_start),
3991 paddr, size, aligned_size, dir, attrs);
3992 if (tlb_addr == DMA_MAPPING_ERROR) {
3995 /* Cleanup the padding area. */
3996 void *padding_start = phys_to_virt(tlb_addr);
3997 size_t padding_size = aligned_size;
3999 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
4000 (dir == DMA_TO_DEVICE ||
4001 dir == DMA_BIDIRECTIONAL)) {
4002 padding_start += size;
4003 padding_size -= size;
/* Zero the padding so the device can't see stale memory. */
4006 memset(padding_start, 0, padding_size);
/* Map the (possibly bounced) physical pages at the allocated IOVA. */
4012 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
4013 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
4017 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
4019 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
/* Failure: release the bounce slot (if any) and the IOVA. */
4022 if (is_swiotlb_buffer(tlb_addr))
4023 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4024 aligned_size, dir, attrs);
4026 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
4027 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
4028 size, (unsigned long long)paddr, dir);
4030 return DMA_MAPPING_ERROR;
/*
 * Tear down a single bounce-buffered DMA mapping: remove the IOMMU
 * translation and, if the backing physical address lies in the swiotlb
 * pool, release the bounce slot (copying data back per @dir/@attrs).
 */
4034 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
4035 enum dma_data_direction dir, unsigned long attrs)
4037 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
4038 struct dmar_domain *domain;
4039 phys_addr_t tlb_addr;
4041 domain = find_domain(dev);
4042 if (WARN_ON(!domain))
/* Translate the IOVA back to the (possibly bounce) physical address. */
4045 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
4046 if (WARN_ON(!tlb_addr))
/* Unmap from the IOMMU first, then free the swiotlb slot if one was used. */
4049 intel_unmap(dev, dev_addr, size);
4050 if (is_swiotlb_buffer(tlb_addr))
4051 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4052 aligned_size, dir, attrs);
4054 trace_bounce_unmap_single(dev, dev_addr, size);
/*
 * .map_page callback: convert page+offset to a physical address and
 * delegate to bounce_map_single() using the device's DMA mask.
 */
4058 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
4059 size_t size, enum dma_data_direction dir, unsigned long attrs)
4061 return bounce_map_single(dev, page_to_phys(page) + offset,
4062 size, dir, attrs, *dev->dma_mask);
/*
 * .map_resource callback: map an MMIO/physical resource address through
 * the bounce path, bounded by the device's DMA mask.
 */
4066 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
4067 enum dma_data_direction dir, unsigned long attrs)
4069 return bounce_map_single(dev, phys_addr, size,
4070 dir, attrs, *dev->dma_mask);
/* .unmap_page callback: thin wrapper around bounce_unmap_single(). */
4074 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4075 enum dma_data_direction dir, unsigned long attrs)
4077 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
/* .unmap_resource callback: identical to the page unmap path. */
4081 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4082 enum dma_data_direction dir, unsigned long attrs)
4084 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
/*
 * .unmap_sg callback: unmap each scatterlist entry individually via
 * bounce_unmap_page(). Also used as the error-cleanup path of
 * bounce_map_sg() with a partial element count.
 */
4088 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4089 enum dma_data_direction dir, unsigned long attrs)
4091 struct scatterlist *sg;
4094 for_each_sg(sglist, sg, nelems, i)
4095 bounce_unmap_page(dev, sg->dma_address,
4096 sg_dma_len(sg), dir, attrs);
/*
 * .map_sg callback: map each scatterlist entry through the bounce path.
 * On any failure, already-mapped entries (the first @i) are unwound with
 * bounce_unmap_sg(), skipping CPU sync since no DMA has occurred.
 */
4100 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4101 enum dma_data_direction dir, unsigned long attrs)
4104 struct scatterlist *sg;
4106 for_each_sg(sglist, sg, nelems, i) {
4107 sg->dma_address = bounce_map_page(dev, sg_page(sg),
4108 sg->offset, sg->length,
4110 if (sg->dma_address == DMA_MAPPING_ERROR)
4112 sg_dma_len(sg) = sg->length;
/* Trace each successfully mapped element (1-based index). */
4115 for_each_sg(sglist, sg, nelems, i)
4116 trace_bounce_map_sg(dev, i + 1, nelems, sg);
/* Error path: undo the partial mapping without syncing to the CPU. */
4121 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
/* .sync_single_for_cpu callback: sync bounce buffer toward the CPU. */
4126 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4127 size_t size, enum dma_data_direction dir)
4129 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
/* .sync_single_for_device callback: sync bounce buffer toward the device. */
4133 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4134 size_t size, enum dma_data_direction dir)
4136 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
/* .sync_sg_for_cpu callback: per-entry CPU-direction sync of a scatterlist. */
4140 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4141 int nelems, enum dma_data_direction dir)
4143 struct scatterlist *sg;
4146 for_each_sg(sglist, sg, nelems, i)
4147 bounce_sync_single(dev, sg_dma_address(sg),
4148 sg_dma_len(sg), dir, SYNC_FOR_CPU);
/* .sync_sg_for_device callback: per-entry device-direction sync. */
4152 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4153 int nelems, enum dma_data_direction dir)
4155 struct scatterlist *sg;
4158 for_each_sg(sglist, sg, nelems, i)
4159 bounce_sync_single(dev, sg_dma_address(sg),
4160 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
/*
 * DMA ops installed for untrusted devices: coherent allocations go
 * through the normal intel_* path, while streaming mappings use the
 * swiotlb-backed bounce_* implementations above so device-visible
 * memory is always page-aligned and isolated.
 */
4163 static const struct dma_map_ops bounce_dma_ops = {
4164 .alloc = intel_alloc_coherent,
4165 .free = intel_free_coherent,
4166 .map_sg = bounce_map_sg,
4167 .unmap_sg = bounce_unmap_sg,
4168 .map_page = bounce_map_page,
4169 .unmap_page = bounce_unmap_page,
4170 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4171 .sync_single_for_device = bounce_sync_single_for_device,
4172 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4173 .sync_sg_for_device = bounce_sync_sg_for_device,
4174 .map_resource = bounce_map_resource,
4175 .unmap_resource = bounce_unmap_resource,
4176 .dma_supported = dma_direct_supported,
/* Create the slab cache used for struct dmar_domain allocations. */
4179 static inline int iommu_domain_cache_init(void)
4183 iommu_domain_cache = kmem_cache_create("iommu_domain",
4184 sizeof(struct dmar_domain),
4189 if (!iommu_domain_cache) {
4190 pr_err("Couldn't create iommu_domain cache\n");
/* Create the slab cache used for struct device_domain_info allocations. */
4197 static inline int iommu_devinfo_cache_init(void)
4201 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4202 sizeof(struct device_domain_info),
4206 if (!iommu_devinfo_cache) {
4207 pr_err("Couldn't create devinfo cache\n");
/*
 * Set up all memory pools the driver needs: the global IOVA cache plus
 * the domain and devinfo slab caches. On devinfo failure the domain
 * cache is torn down again so no partial state is left behind.
 */
4214 static int __init iommu_init_mempool(void)
4217 ret = iova_cache_get();
4221 ret = iommu_domain_cache_init();
4225 ret = iommu_devinfo_cache_init();
/* Unwind: devinfo cache creation failed, release the domain cache. */
4229 kmem_cache_destroy(iommu_domain_cache);
/* Release the slab caches created by iommu_init_mempool(). */
4236 static void __init iommu_exit_mempool(void)
4238 kmem_cache_destroy(iommu_devinfo_cache);
4239 kmem_cache_destroy(iommu_domain_cache);
/*
 * PCI quirk for the SNB IOAT (QuickData) device: verify that the DRHD
 * the BIOS associated with it actually sits at the expected offset from
 * the chipset VTBAR; if not, the BIOS is lying, so taint the kernel and
 * mark the device with a dummy domain so it bypasses translation.
 */
4243 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4245 struct dmar_drhd_unit *drhd;
4249 /* We know that this device on this chipset has its own IOMMU.
4250 * If we find it under a different IOMMU, then the BIOS is lying
4251 * to us. Hope that the IOMMU for this device is actually
4252 * disabled, and it needs no translation...
/* Read the VT-d base address register from the host bridge (dev 0 fn 0). */
4254 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4256 /* "can't" happen */
4257 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4260 vtbar &= 0xffff0000;
4262 /* we know that the this iommu should be at offset 0xa000 from vtbar */
4263 drhd = dmar_find_matched_drhd_unit(pdev);
4264 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4265 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4266 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4267 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4270 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
/*
 * Decide which DRHD units can be bypassed: units with no active devices,
 * and (when dmar_map_gfx is clear) units that cover only graphics
 * devices, whose devices are then tagged with a dummy domain so DMA
 * remapping is skipped for them.
 */
4272 static void __init init_no_remapping_devices(void)
4274 struct dmar_drhd_unit *drhd;
/* Pass 1: ignore DRHDs whose device scope contains no devices at all. */
4278 for_each_drhd_unit(drhd) {
4279 if (!drhd->include_all) {
4280 for_each_active_dev_scope(drhd->devices,
4281 drhd->devices_cnt, i, dev)
4283 /* ignore DMAR unit if no devices exist */
4284 if (i == drhd->devices_cnt)
/* Pass 2: find DRHDs that cover only graphics devices. */
4289 for_each_active_drhd_unit(drhd) {
4290 if (drhd->include_all)
4293 for_each_active_dev_scope(drhd->devices,
4294 drhd->devices_cnt, i, dev)
4295 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4297 if (i < drhd->devices_cnt)
4300 /* This IOMMU has *only* gfx devices. Either bypass it or
4301 set the gfx_mapped flag, as appropriate */
4302 if (!dmar_map_gfx) {
4304 for_each_active_dev_scope(drhd->devices,
4305 drhd->devices_cnt, i, dev)
4306 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4311 #ifdef CONFIG_SUSPEND
/*
 * Re-program every IOMMU after resume: re-enable queued invalidation,
 * reinstall root entries, flush context and IOTLB caches globally, and
 * turn translation back on. Ignored units still get their protected
 * memory regions disabled so DMA keeps working for them.
 */
4312 static int init_iommu_hw(void)
4314 struct dmar_drhd_unit *drhd;
4315 struct intel_iommu *iommu = NULL;
4317 for_each_active_iommu(iommu, drhd)
4319 dmar_reenable_qi(iommu);
4321 for_each_iommu(iommu, drhd) {
4322 if (drhd->ignored) {
4324 * we always have to disable PMRs or DMA may fail on
4328 iommu_disable_protect_mem_regions(iommu);
4332 iommu_flush_write_buffer(iommu);
4334 iommu_set_root_entry(iommu);
/* Global flush: new root table invalidates all cached translations. */
4336 iommu->flush.flush_context(iommu, 0, 0, 0,
4337 DMA_CCMD_GLOBAL_INVL);
4338 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4339 iommu_enable_translation(iommu);
4340 iommu_disable_protect_mem_regions(iommu);
/* Globally invalidate context and IOTLB caches on every active IOMMU. */
4346 static void iommu_flush_all(void)
4348 struct dmar_drhd_unit *drhd;
4349 struct intel_iommu *iommu;
4351 for_each_active_iommu(iommu, drhd) {
4352 iommu->flush.flush_context(iommu, 0, 0, 0,
4353 DMA_CCMD_GLOBAL_INVL);
4354 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4355 DMA_TLB_GLOBAL_FLUSH);
/*
 * Syscore suspend hook: allocate a per-IOMMU save area, disable
 * translation, and save the fault-event registers (FECTL/FEDATA/
 * FEADDR/FEUADDR) so iommu_resume() can restore them. On allocation
 * failure the already-allocated save areas are freed again.
 */
4359 static int iommu_suspend(void)
4361 struct dmar_drhd_unit *drhd;
4362 struct intel_iommu *iommu = NULL;
4365 for_each_active_iommu(iommu, drhd) {
4366 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4368 if (!iommu->iommu_state)
4374 for_each_active_iommu(iommu, drhd) {
4375 iommu_disable_translation(iommu);
/* Register access must be serialized against the fault handler. */
4377 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4379 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4380 readl(iommu->reg + DMAR_FECTL_REG);
4381 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4382 readl(iommu->reg + DMAR_FEDATA_REG);
4383 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4384 readl(iommu->reg + DMAR_FEADDR_REG);
4385 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4386 readl(iommu->reg + DMAR_FEUADDR_REG);
4388 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* Error path: free any save areas allocated before the failure. */
4393 for_each_active_iommu(iommu, drhd)
4394 kfree(iommu->iommu_state);
/*
 * Syscore resume hook: re-initialize the IOMMU hardware and restore the
 * fault-event registers saved by iommu_suspend(), then free the save
 * areas. Under tboot a hardware init failure is fatal (panic); otherwise
 * it is reported with a WARN.
 */
4399 static void iommu_resume(void)
4401 struct dmar_drhd_unit *drhd;
4402 struct intel_iommu *iommu = NULL;
4405 if (init_iommu_hw()) {
4407 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4409 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4413 for_each_active_iommu(iommu, drhd) {
4415 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4417 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4418 iommu->reg + DMAR_FECTL_REG);
4419 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4420 iommu->reg + DMAR_FEDATA_REG);
4421 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4422 iommu->reg + DMAR_FEADDR_REG);
4423 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4424 iommu->reg + DMAR_FEUADDR_REG);
4426 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4429 for_each_active_iommu(iommu, drhd)
4430 kfree(iommu->iommu_state);
/* Suspend/resume callbacks registered with the syscore framework. */
4433 static struct syscore_ops iommu_syscore_ops = {
4434 .resume = iommu_resume,
4435 .suspend = iommu_suspend,
/* Hook the IOMMU into system suspend/resume. */
4438 static void __init init_iommu_pm_ops(void)
4440 register_syscore_ops(&iommu_syscore_ops);
/* No-op stub when suspend support is compiled out. */
4444 static inline void init_iommu_pm_ops(void) {}
4445 #endif /* CONFIG_SUSPEND */
/*
 * Validate a firmware-supplied RMRR: base and end+1 must be page
 * aligned, the range must be non-empty, and any arch-specific check
 * must pass. Non-zero return means the entry is bogus.
 */
4447 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4449 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4450 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4451 rmrr->end_address <= rmrr->base_address ||
4452 arch_rmrr_sanity_check(rmrr))
/*
 * Parse one ACPI RMRR structure into a dmar_rmrr_unit and add it to
 * dmar_rmrr_units. A failed sanity check logs a FW_BUG with BIOS/DMI
 * details and taints the kernel (the entry is still reported, since the
 * firmware data is what it is).
 */
4458 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4460 struct acpi_dmar_reserved_memory *rmrr;
4461 struct dmar_rmrr_unit *rmrru;
4463 rmrr = (struct acpi_dmar_reserved_memory *)header;
4464 if (rmrr_sanity_check(rmrr)) {
4466 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4467 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4468 rmrr->base_address, rmrr->end_address,
4469 dmi_get_system_info(DMI_BIOS_VENDOR),
4470 dmi_get_system_info(DMI_BIOS_VERSION),
4471 dmi_get_system_info(DMI_PRODUCT_VERSION),
4472 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4475 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4479 rmrru->hdr = header;
4481 rmrru->base_address = rmrr->base_address;
4482 rmrru->end_address = rmrr->end_address;
/* Device scope entries follow the RMRR header in the ACPI table. */
4484 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4485 ((void *)rmrr) + rmrr->header.length,
4486 &rmrru->devices_cnt);
4487 if (rmrru->devices_cnt && rmrru->devices == NULL)
4490 list_add(&rmrru->list, &dmar_rmrr_units);
/*
 * Look up an already-registered ATSR unit matching @atsr by segment,
 * length, and full byte-wise content. Returns NULL if none matches.
 */
4499 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4501 struct dmar_atsr_unit *atsru;
4502 struct acpi_dmar_atsr *tmp;
4504 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4506 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4507 if (atsr->segment != tmp->segment)
4509 if (atsr->header.length != tmp->header.length)
4511 if (memcmp(atsr, tmp, atsr->header.length) == 0)
/*
 * Parse one ACPI ATSR structure. Duplicates are detected via
 * dmar_find_atsr(); otherwise the header is copied into a freshly
 * allocated dmar_atsr_unit (the ACPI buffer may be freed by the caller)
 * and the unit is added to dmar_atsr_units under RCU.
 */
4518 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4520 struct acpi_dmar_atsr *atsr;
4521 struct dmar_atsr_unit *atsru;
/* After boot, only accept new ATSRs if the IOMMU is actually enabled. */
4523 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4526 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4527 atsru = dmar_find_atsr(atsr);
4531 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4536 * If memory is allocated from slab by ACPI _DSM method, we need to
4537 * copy the memory content because the memory buffer will be freed
4540 atsru->hdr = (void *)(atsru + 1);
4541 memcpy(atsru->hdr, hdr, hdr->length);
4542 atsru->include_all = atsr->flags & 0x1;
4543 if (!atsru->include_all) {
4544 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4545 (void *)atsr + atsr->header.length,
4546 &atsru->devices_cnt);
4547 if (atsru->devices_cnt && atsru->devices == NULL) {
4553 list_add_rcu(&atsru->list, &dmar_atsr_units);
/* Free an ATSR unit's device scope (and, presumably, the unit itself
 * on a dropped line — the visible body only frees the scope). */
4558 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4560 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
/*
 * Remove the registered ATSR unit matching @hdr from the list (RCU-safe
 * deletion) and free it.
 */
4564 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4566 struct acpi_dmar_atsr *atsr;
4567 struct dmar_atsr_unit *atsru;
4569 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4570 atsru = dmar_find_atsr(atsr);
4572 list_del_rcu(&atsru->list);
4574 intel_iommu_free_atsr(atsru);
/*
 * Check whether the ATSR described by @hdr can be removed: it must not
 * still have active devices in its scope.
 */
4580 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4584 struct acpi_dmar_atsr *atsr;
4585 struct dmar_atsr_unit *atsru;
4587 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4588 atsru = dmar_find_atsr(atsr);
4592 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4593 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
/*
 * Bring a hot-added DMAR unit online: verify it is feature-compatible
 * with the running configuration (pass-through, snooping, superpage),
 * disable any firmware-enabled translation, allocate domains and a root
 * table, then enable QI, interrupts, (optionally) the SVM page-request
 * queue, and finally translation itself. Errors unwind via
 * disable_dmar_iommu()/free_dmar_iommu().
 */
4601 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4604 struct intel_iommu *iommu = dmaru->iommu;
/* Already registered: nothing to do. */
4606 if (g_iommus[iommu->seq_id])
/* The new unit must not regress capabilities the system relies on. */
4609 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4610 pr_warn("%s: Doesn't support hardware pass through.\n",
4614 if (!ecap_sc_support(iommu->ecap) &&
4615 domain_update_iommu_snooping(iommu)) {
4616 pr_warn("%s: Doesn't support snooping.\n",
4620 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4621 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4622 pr_warn("%s: Doesn't support large page.\n",
4628 * Disable translation if already enabled prior to OS handover.
4630 if (iommu->gcmd & DMA_GCMD_TE)
4631 iommu_disable_translation(iommu);
4633 g_iommus[iommu->seq_id] = iommu;
4634 ret = iommu_init_domains(iommu);
4636 ret = iommu_alloc_root_entry(iommu);
4640 intel_svm_check(iommu);
4642 if (dmaru->ignored) {
4644 * we always have to disable PMRs or DMA may fail on this device
4647 iommu_disable_protect_mem_regions(iommu);
4651 intel_iommu_init_qi(iommu);
4652 iommu_flush_write_buffer(iommu);
4654 #ifdef CONFIG_INTEL_IOMMU_SVM
4655 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4656 ret = intel_svm_enable_prq(iommu);
4661 ret = dmar_set_interrupt(iommu);
4665 iommu_set_root_entry(iommu);
/* Flush everything before turning translation on with the new root. */
4666 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4667 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4668 iommu_enable_translation(iommu);
4670 iommu_disable_protect_mem_regions(iommu);
/* Error unwind. */
4674 disable_dmar_iommu(iommu);
4676 free_dmar_iommu(iommu);
/*
 * DMAR hotplug entry point: on insert, initialize the new unit via
 * intel_iommu_add(); on removal (or when the IOMMU is not enabled at
 * all), tear the unit down.
 */
4680 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4683 struct intel_iommu *iommu = dmaru->iommu;
4685 if (!intel_iommu_enabled)
4691 ret = intel_iommu_add(dmaru);
4693 disable_dmar_iommu(iommu);
4694 free_dmar_iommu(iommu);
/* Free all registered RMRR and ATSR units and their device scopes. */
4700 static void intel_iommu_free_dmars(void)
4702 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4703 struct dmar_atsr_unit *atsru, *atsr_n;
4705 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4706 list_del(&rmrru->list);
4707 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4711 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4712 list_del(&atsru->list);
4713 intel_iommu_free_atsr(atsru);
/*
 * Determine whether ATS is allowed for @dev: walk up the bus hierarchy
 * to the root port, then check whether that root port appears in any
 * ATSR's device scope (or whether an include_all ATSR covers the
 * segment). Integrated (root-bus) devices are allowed ATS directly.
 */
4717 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4720 struct pci_bus *bus;
4721 struct pci_dev *bridge = NULL;
4723 struct acpi_dmar_atsr *atsr;
4724 struct dmar_atsr_unit *atsru;
/* SR-IOV VFs share the PF's topology position. */
4726 dev = pci_physfn(dev);
4727 for (bus = dev->bus; bus; bus = bus->parent) {
4729 /* If it's an integrated device, allow ATS */
4732 /* Connected via non-PCIe: no ATS */
4733 if (!pci_is_pcie(bridge) ||
4734 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4736 /* If we found the root port, look it up in the ATSR */
4737 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4742 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4743 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4744 if (atsr->segment != pci_domain_nr(dev->bus))
4747 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4748 if (tmp == &bridge->dev)
4751 if (atsru->include_all)
/*
 * PCI bus notifier hook: keep the device-scope lists of all RMRR and
 * (non-include_all) ATSR units in sync as devices are added to or
 * removed from the bus.
 */
4761 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4764 struct dmar_rmrr_unit *rmrru;
4765 struct dmar_atsr_unit *atsru;
4766 struct acpi_dmar_atsr *atsr;
4767 struct acpi_dmar_reserved_memory *rmrr;
4769 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4772 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4773 rmrr = container_of(rmrru->hdr,
4774 struct acpi_dmar_reserved_memory, header);
4775 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4776 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4777 ((void *)rmrr) + rmrr->header.length,
4778 rmrr->segment, rmrru->devices,
4779 rmrru->devices_cnt);
4782 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4783 dmar_remove_dev_scope(info, rmrr->segment,
4784 rmrru->devices, rmrru->devices_cnt);
4788 list_for_each_entry(atsru, &dmar_atsr_units, list) {
/* include_all units match everything; no per-device scope to update. */
4789 if (atsru->include_all)
4792 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4793 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4794 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4795 (void *)atsr + atsr->header.length,
4796 atsr->segment, atsru->devices,
4797 atsru->devices_cnt);
4802 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4803 if (dmar_remove_dev_scope(info, atsr->segment,
4804 atsru->devices, atsru->devices_cnt))
/*
 * Memory hotplug notifier for the static identity (si) domain: build an
 * identity mapping for memory going online, and on offline/cancel, walk
 * the affected IOVA range, unmap it, flush the IOTLB on every IOMMU,
 * and free the page-table pages and IOVA nodes.
 */
4812 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4813 unsigned long val, void *v)
4815 struct memory_notify *mhp = v;
4816 unsigned long long start, end;
4817 unsigned long start_vpfn, last_vpfn;
4820 case MEM_GOING_ONLINE:
4821 start = mhp->start_pfn << PAGE_SHIFT;
4822 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4823 if (iommu_domain_identity_map(si_domain, start, end)) {
4824 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4831 case MEM_CANCEL_ONLINE:
4832 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4833 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4834 while (start_vpfn <= last_vpfn) {
4836 struct dmar_drhd_unit *drhd;
4837 struct intel_iommu *iommu;
4838 struct page *freelist;
4840 iova = find_iova(&si_domain->iovad, start_vpfn);
4842 pr_debug("Failed get IOVA for PFN %lx\n",
/* Trim the IOVA node to the hot-removed range before unmapping. */
4847 iova = split_and_remove_iova(&si_domain->iovad, iova,
4848 start_vpfn, last_vpfn);
4850 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4851 start_vpfn, last_vpfn);
4855 freelist = domain_unmap(si_domain, iova->pfn_lo,
/* Flush stale translations on every IOMMU before freeing the pages. */
4859 for_each_active_iommu(iommu, drhd)
4860 iommu_flush_iotlb_psi(iommu, si_domain,
4861 iova->pfn_lo, iova_size(iova),
4864 dma_free_pagelist(freelist);
4866 start_vpfn = iova->pfn_hi + 1;
4867 free_iova_mem(iova);
/* Notifier block registered for memory hotplug events. */
4875 static struct notifier_block intel_iommu_memory_nb = {
4876 .notifier_call = intel_iommu_memory_notifier,
/*
 * Release the per-CPU IOVA caches of every domain on every IOMMU for
 * the given (dead) CPU, so its cached IOVAs return to the global pool.
 */
4880 static void free_all_cpu_cached_iovas(unsigned int cpu)
4884 for (i = 0; i < g_num_of_iommus; i++) {
4885 struct intel_iommu *iommu = g_iommus[i];
4886 struct dmar_domain *domain;
4892 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4893 domain = get_iommu_domain(iommu, (u16)did);
4897 free_cpu_cached_iovas(cpu, &domain->iovad);
/* CPU hotplug (dead) callback: reclaim the CPU's cached IOVAs. */
4902 static int intel_iommu_cpu_dead(unsigned int cpu)
4904 free_all_cpu_cached_iovas(cpu);
/* Turn off DMA translation on every IOMMU (including ignored units). */
4908 static void intel_disable_iommus(void)
4910 struct intel_iommu *iommu = NULL;
4911 struct dmar_drhd_unit *drhd;
4913 for_each_iommu(iommu, drhd)
4914 iommu_disable_translation(iommu);
/*
 * Kernel shutdown hook: disable protected memory regions and switch all
 * IOMMUs off so a kexec'd kernel starts from a clean state. No-op when
 * the IOMMU was never enabled.
 */
4917 void intel_iommu_shutdown(void)
4919 struct dmar_drhd_unit *drhd;
4920 struct intel_iommu *iommu = NULL;
4922 if (no_iommu || dmar_disabled)
4925 down_write(&dmar_global_lock);
4927 /* Disable PMRs explicitly here. */
4928 for_each_iommu(iommu, drhd)
4929 iommu_disable_protect_mem_regions(iommu);
4931 /* Make sure the IOMMUs are switched off */
4932 intel_disable_iommus();
4934 up_write(&dmar_global_lock);
/* Map a sysfs struct device back to its containing intel_iommu. */
4937 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4939 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4941 return container_of(iommu_dev, struct intel_iommu, iommu);
/*
 * Read-only sysfs attributes exported per IOMMU under "intel-iommu":
 * hardware version, register base address, CAP/ECAP registers, and the
 * supported vs. currently-used domain counts.
 */
4944 static ssize_t intel_iommu_show_version(struct device *dev,
4945 struct device_attribute *attr,
4948 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4949 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4950 return sprintf(buf, "%d:%d\n",
4951 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4953 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4955 static ssize_t intel_iommu_show_address(struct device *dev,
4956 struct device_attribute *attr,
4959 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4960 return sprintf(buf, "%llx\n", iommu->reg_phys);
4962 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4964 static ssize_t intel_iommu_show_cap(struct device *dev,
4965 struct device_attribute *attr,
4968 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4969 return sprintf(buf, "%llx\n", iommu->cap);
4971 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4973 static ssize_t intel_iommu_show_ecap(struct device *dev,
4974 struct device_attribute *attr,
4977 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4978 return sprintf(buf, "%llx\n", iommu->ecap);
4980 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4982 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4983 struct device_attribute *attr,
4986 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4987 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4989 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4991 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4992 struct device_attribute *attr,
4995 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
/* Domains in use == set bits in the domain-ID allocation bitmap. */
4996 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4997 cap_ndoms(iommu->cap)));
4999 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
5001 static struct attribute *intel_iommu_attrs[] = {
5002 &dev_attr_version.attr,
5003 &dev_attr_address.attr,
5005 &dev_attr_ecap.attr,
5006 &dev_attr_domains_supported.attr,
5007 &dev_attr_domains_used.attr,
5011 static struct attribute_group intel_iommu_group = {
5012 .name = "intel-iommu",
5013 .attrs = intel_iommu_attrs,
5016 const struct attribute_group *intel_iommu_groups[] = {
/* True if any PCI device in the system is marked untrusted
 * (e.g. external/Thunderbolt-attached). */
5021 static inline bool has_untrusted_dev(void)
5023 struct pci_dev *pdev = NULL;
5025 for_each_pci_dev(pdev)
5026 if (pdev->untrusted)
/*
 * Honor the DMAR platform opt-in (kernel DMA protection): when the
 * firmware opted in and untrusted devices exist, force the IOMMU on
 * even if the user disabled it, defaulting to passthrough for trusted
 * devices in that case.
 */
5032 static int __init platform_optin_force_iommu(void)
5034 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
5037 if (no_iommu || dmar_disabled)
5038 pr_info("Intel-IOMMU force enabled due to platform opt in\n")
5041 * If Intel-IOMMU is disabled by default, we will apply identity
5042 * map for all devices except those marked as being untrusted.
5045 iommu_set_default_passthrough(false);
/*
 * Probe ACPI namespace devices listed in DRHD device scopes: for each
 * physical node of such an ACPI device that is not yet in an IOMMU
 * group, install the Intel IOMMU ops on its bus and probe it.
 */
5053 static int __init probe_acpi_namespace_devices(void)
5055 struct dmar_drhd_unit *drhd;
5056 /* To avoid a -Wunused-but-set-variable warning. */
5057 struct intel_iommu *iommu __maybe_unused;
5061 for_each_active_iommu(iommu, drhd) {
5062 for_each_active_dev_scope(drhd->devices,
5063 drhd->devices_cnt, i, dev) {
5064 struct acpi_device_physical_node *pn;
5065 struct iommu_group *group;
5066 struct acpi_device *adev;
/* Only ACPI-bus devices are of interest here. */
5068 if (dev->bus != &acpi_bus_type)
5071 adev = to_acpi_device(dev);
5072 mutex_lock(&adev->physical_node_lock);
5073 list_for_each_entry(pn,
5074 &adev->physical_node_list, node) {
/* Skip nodes already claimed by an IOMMU group. */
5075 group = iommu_group_get(pn->dev);
5077 iommu_group_put(group);
5081 pn->dev->bus->iommu_ops = &intel_iommu_ops;
5082 ret = iommu_probe_device(pn->dev);
5086 mutex_unlock(&adev->physical_node_lock);
/*
 * Main driver initialization: parse DMAR tables and device scopes,
 * honor tboot/platform-opt-in forcing, reserve special IOVA ranges,
 * program all DMARs, install DMA ops, register PM/sysfs/hotplug/memory
 * notifiers, probe ACPI namespace devices, and finally enable
 * translation on every non-ignored unit. Under tboot, early failures
 * panic; otherwise they unwind through the out_free_* labels.
 */
5096 int __init intel_iommu_init(void)
5099 struct dmar_drhd_unit *drhd;
5100 struct intel_iommu *iommu;
5103 * Intel IOMMU is required for a TXT/tboot launch or platform
5104 * opt in, so enforce that.
5106 force_on = tboot_force_iommu() || platform_optin_force_iommu();
5108 if (iommu_init_mempool()) {
5110 panic("tboot: Failed to initialize iommu memory\n");
5114 down_write(&dmar_global_lock);
5115 if (dmar_table_init()) {
5117 panic("tboot: Failed to initialize DMAR table\n");
5121 if (dmar_dev_scope_init() < 0) {
5123 panic("tboot: Failed to initialize DMAR device scope\n");
5127 up_write(&dmar_global_lock);
5130 * The bus notifier takes the dmar_global_lock, so lockdep will
5131 * complain later when we register it under the lock.
5133 dmar_register_bus_notifier();
5135 down_write(&dmar_global_lock);
5138 intel_iommu_debugfs_init();
5140 if (no_iommu || dmar_disabled) {
5142 * We exit the function here to ensure IOMMU's remapping and
5143 * mempool aren't setup, which means that the IOMMU's PMRs
5144 * won't be disabled via the call to init_dmars(). So disable
5145 * it explicitly here. The PMRs were setup by tboot prior to
5146 * calling SENTER, but the kernel is expected to reset/tear
5149 if (intel_iommu_tboot_noforce) {
5150 for_each_iommu(iommu, drhd)
5151 iommu_disable_protect_mem_regions(iommu);
5155 * Make sure the IOMMUs are switched off, even when we
5156 * boot into a kexec kernel and the previous kernel left
5159 intel_disable_iommus();
5163 if (list_empty(&dmar_rmrr_units))
5164 pr_info("No RMRR found\n");
5166 if (list_empty(&dmar_atsr_units))
5167 pr_info("No ATSR found\n");
5169 if (dmar_init_reserved_ranges()) {
5171 panic("tboot: Failed to reserve iommu ranges\n");
5172 goto out_free_reserved_range;
5176 intel_iommu_gfx_mapped = 1;
5178 init_no_remapping_devices();
5183 panic("tboot: Failed to initialize DMARs\n");
5184 pr_err("Initialization failed\n");
5185 goto out_free_reserved_range;
5187 up_write(&dmar_global_lock);
5189 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5191 * If the system has no untrusted device or the user has decided
5192 * to disable the bounce page mechanisms, we don't need swiotlb.
5193 * Mark this and the pre-allocated bounce pages will be released
5196 if (!has_untrusted_dev() || intel_no_bounce)
5199 dma_ops = &intel_dma_ops;
5201 init_iommu_pm_ops();
/* Register each IOMMU with the core IOMMU framework and sysfs. */
5203 down_read(&dmar_global_lock);
5204 for_each_active_iommu(iommu, drhd) {
5205 iommu_device_sysfs_add(&iommu->iommu, NULL,
5208 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5209 iommu_device_register(&iommu->iommu);
5211 up_read(&dmar_global_lock);
5213 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5214 if (si_domain && !hw_pass_through)
5215 register_memory_notifier(&intel_iommu_memory_nb);
5216 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5217 intel_iommu_cpu_dead);
5219 down_read(&dmar_global_lock);
5220 if (probe_acpi_namespace_devices())
5221 pr_warn("ACPI name space devices didn't probe correctly\n");
5223 /* Finally, we enable the DMA remapping hardware. */
5224 for_each_iommu(iommu, drhd) {
5225 if (!drhd->ignored && !translation_pre_enabled(iommu))
5226 iommu_enable_translation(iommu);
5228 iommu_disable_protect_mem_regions(iommu);
5230 up_read(&dmar_global_lock);
5232 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5234 intel_iommu_enabled = 1;
/* Error unwind. */
5238 out_free_reserved_range:
5239 put_iova_domain(&reserved_iova_list);
5241 intel_iommu_free_dmars();
5242 up_write(&dmar_global_lock);
5243 iommu_exit_mempool();
/* pci_for_each_dma_alias() callback: clear the context entry for one
 * bus/devfn alias on the IOMMU passed via @opaque. */
5247 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5249 struct intel_iommu *iommu = opaque;
5251 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5256 * NB - intel-iommu lacks any sort of reference counting for the users of
5257 * dependent devices. If multiple endpoints have intersecting dependent
5258 * devices, unbinding the driver from any one of them will possibly leave
5259 * the others unable to operate.
/* Clear context entries for @dev and every DMA alias it may use. */
5261 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5263 if (!iommu || !dev || !dev_is_pci(dev))
5266 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
/*
 * Detach a device from its domain and release its bookkeeping.
 * Caller must hold device_domain_lock. Tears down the PASID entry
 * (scalable mode), device IOTLB, context entries, and PASID table,
 * detaches the domain from the IOMMU, and frees a now-empty private
 * (LOSE_CHILDREN, non-identity) domain.
 */
5269 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5271 struct dmar_domain *domain;
5272 struct intel_iommu *iommu;
5273 unsigned long flags;
5275 assert_spin_locked(&device_domain_lock);
5280 iommu = info->iommu;
5281 domain = info->domain;
5284 if (dev_is_pci(info->dev) && sm_supported(iommu))
5285 intel_pasid_tear_down_entry(iommu, info->dev,
5288 iommu_disable_dev_iotlb(info);
5289 domain_context_clear(iommu, info->dev);
5290 intel_pasid_free_table(info->dev);
5293 unlink_domain_info(info);
5295 spin_lock_irqsave(&iommu->lock, flags);
5296 domain_detach_iommu(domain, iommu);
5297 spin_unlock_irqrestore(&iommu->lock, flags);
5299 /* free the private domain */
5300 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5301 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5302 list_empty(&domain->devices))
5303 domain_exit(info->domain);
5305 free_devinfo_mem(info);
/*
 * Locked wrapper around __dmar_remove_one_dev_info(): skips devices
 * whose archdata holds a deferred or dummy sentinel instead of a real
 * device_domain_info.
 */
5308 static void dmar_remove_one_dev_info(struct device *dev)
5310 struct device_domain_info *info;
5311 unsigned long flags;
5313 spin_lock_irqsave(&device_domain_lock, flags);
5314 info = dev->archdata.iommu;
5315 if (info && info != DEFER_DEVICE_DOMAIN_INFO
5316 && info != DUMMY_DEVICE_DOMAIN_INFO)
5317 __dmar_remove_one_dev_info(info);
5318 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * Initialize a dmar_domain created via the iommu-core domain_alloc
 * path: set up its IOVA allocator, reserve special ranges, derive the
 * AGAW from @guest_width, clear per-IOMMU capability state, and
 * allocate the top-level page directory.
 */
5321 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5325 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5326 domain_reserve_special_ranges(domain);
5328 /* calculate AGAW */
5329 domain->gaw = guest_width;
5330 adjust_width = guestwidth_to_adjustwidth(guest_width);
5331 domain->agaw = width_to_agaw(adjust_width);
/* Capability bits are recomputed when the domain attaches to an IOMMU. */
5333 domain->iommu_coherency = 0;
5334 domain->iommu_snooping = 0;
5335 domain->iommu_superpage = 0;
5336 domain->max_addr = 0;
5338 /* always allocate the top pgd */
5339 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5342 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
/*
 * iommu_ops.domain_alloc: DMA and unmanaged domains get a fresh
 * dmar_domain (DMA domains additionally get an IOVA flush queue unless
 * strict invalidation is requested); identity requests return the
 * shared si_domain.
 */
5346 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5348 struct dmar_domain *dmar_domain;
5349 struct iommu_domain *domain;
5353 case IOMMU_DOMAIN_DMA:
/* fallthrough: DMA and unmanaged share the allocation path. */
5355 case IOMMU_DOMAIN_UNMANAGED:
5356 dmar_domain = alloc_domain(0);
5358 pr_err("Can't allocate dmar_domain\n");
5361 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5362 pr_err("Domain initialization failed\n");
5363 domain_exit(dmar_domain);
5367 if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
5368 ret = init_iova_flush_queue(&dmar_domain->iovad,
5372 pr_info("iova flush queue initialization failed\n");
5375 domain_update_iommu_cap(dmar_domain);
5377 domain = &dmar_domain->domain;
5378 domain->geometry.aperture_start = 0;
5379 domain->geometry.aperture_end =
5380 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5381 domain->geometry.force_aperture = true;
5384 case IOMMU_DOMAIN_IDENTITY:
5385 return &si_domain->domain;
/* iommu_ops.domain_free: destroy the domain unless it is the shared
 * static-identity domain, which is never freed. */
5393 static void intel_iommu_domain_free(struct iommu_domain *domain)
5395 if (domain != &si_domain->domain)
5396 domain_exit(to_dmar_domain(domain));
5400 * Check whether a @domain could be attached to the @dev through the
5401 * aux-domain attach/detach APIs.
5404 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5406 struct device_domain_info *info = dev->archdata.iommu;
/* Aux attach requires the device to have opted in and the domain to be
 * an unmanaged one. */
5408 return info && info->auxd_enabled &&
5409 domain->type == IOMMU_DOMAIN_UNMANAGED;
/*
 * Record that @dev uses @domain as an auxiliary domain: bump the
 * refcount and link the domain into the device's auxiliary list.
 * Caller must hold device_domain_lock.
 */
5412 static void auxiliary_link_device(struct dmar_domain *domain,
5415 struct device_domain_info *info = dev->archdata.iommu;
5417 assert_spin_locked(&device_domain_lock);
5421 domain->auxd_refcnt++;
5422 list_add(&domain->auxd, &info->auxiliary_domains);
/*
 * Reverse of auxiliary_link_device(): unlink and drop the refcount;
 * when the last user goes away, release the domain's default PASID.
 * Caller must hold device_domain_lock.
 */
5425 static void auxiliary_unlink_device(struct dmar_domain *domain,
5428 struct device_domain_info *info = dev->archdata.iommu;
5430 assert_spin_locked(&device_domain_lock);
5434 list_del(&domain->auxd);
5435 domain->auxd_refcnt--;
5437 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5438 ioasid_free(domain->default_pasid);
/*
 * Attach @domain to @dev as an auxiliary domain: allocate the domain's
 * default PASID if it does not have one yet, attach the domain to the
 * device's IOMMU, and program a first- or second-level PASID entry for
 * that PASID. Locking order is device_domain_lock -> iommu->lock; the
 * error path detaches and frees a PASID that ended up unused.
 */
5441 static int aux_domain_add_dev(struct dmar_domain *domain,
5446 unsigned long flags;
5447 struct intel_iommu *iommu;
5449 iommu = device_to_iommu(dev, &bus, &devfn);
5453 if (domain->default_pasid <= 0) {
5456 /* No private data needed for the default pasid */
5457 pasid = ioasid_alloc(NULL, PASID_MIN,
5458 pci_max_pasids(to_pci_dev(dev)) - 1,
5460 if (pasid == INVALID_IOASID) {
5461 pr_err("Can't allocate default pasid\n");
5464 domain->default_pasid = pasid;
5467 spin_lock_irqsave(&device_domain_lock, flags);
5469 * iommu->lock must be held to attach domain to iommu and setup the
5470 * pasid entry for second level translation.
5472 spin_lock(&iommu->lock);
5473 ret = domain_attach_iommu(domain, iommu);
5477 /* Setup the PASID entry for mediated devices: */
5478 if (domain_use_first_level(domain))
5479 ret = domain_setup_first_level(iommu, domain, dev,
5480 domain->default_pasid);
5482 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5483 domain->default_pasid);
5486 spin_unlock(&iommu->lock);
5488 auxiliary_link_device(domain, dev);
5490 spin_unlock_irqrestore(&device_domain_lock, flags);
/* Error unwind: detach and drop an unused default PASID. */
5495 domain_detach_iommu(domain, iommu);
5497 spin_unlock(&iommu->lock);
5498 spin_unlock_irqrestore(&device_domain_lock, flags);
5499 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5500 ioasid_free(domain->default_pasid);
5505 static void aux_domain_remove_dev(struct dmar_domain *domain,
5508 struct device_domain_info *info;
5509 struct intel_iommu *iommu;
5510 unsigned long flags;
5512 if (!is_aux_domain(dev, &domain->domain))
5515 spin_lock_irqsave(&device_domain_lock, flags);
5516 info = dev->archdata.iommu;
5517 iommu = info->iommu;
5519 auxiliary_unlink_device(domain, dev);
5521 spin_lock(&iommu->lock);
5522 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5523 domain_detach_iommu(domain, iommu);
5524 spin_unlock(&iommu->lock);
5526 spin_unlock_irqrestore(&device_domain_lock, flags);
5529 static int prepare_domain_attach_device(struct iommu_domain *domain,
5532 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5533 struct intel_iommu *iommu;
5537 iommu = device_to_iommu(dev, &bus, &devfn);
5541 /* check if this iommu agaw is sufficient for max mapped address */
5542 addr_width = agaw_to_width(iommu->agaw);
5543 if (addr_width > cap_mgaw(iommu->cap))
5544 addr_width = cap_mgaw(iommu->cap);
5546 if (dmar_domain->max_addr > (1LL << addr_width)) {
5547 dev_err(dev, "%s: iommu width (%d) is not "
5548 "sufficient for the mapped address (%llx)\n",
5549 __func__, addr_width, dmar_domain->max_addr);
5552 dmar_domain->gaw = addr_width;
5555 * Knock out extra levels of page tables if necessary
5557 while (iommu->agaw < dmar_domain->agaw) {
5558 struct dma_pte *pte;
5560 pte = dmar_domain->pgd;
5561 if (dma_pte_present(pte)) {
5562 dmar_domain->pgd = (struct dma_pte *)
5563 phys_to_virt(dma_pte_addr(pte));
5564 free_pgtable_page(pte);
5566 dmar_domain->agaw--;
5572 static int intel_iommu_attach_device(struct iommu_domain *domain,
5577 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5578 device_is_rmrr_locked(dev)) {
5579 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5583 if (is_aux_domain(dev, domain))
5586 /* normally dev is not mapped */
5587 if (unlikely(domain_context_mapped(dev))) {
5588 struct dmar_domain *old_domain;
5590 old_domain = find_domain(dev);
5592 dmar_remove_one_dev_info(dev);
5595 ret = prepare_domain_attach_device(domain, dev);
5599 return domain_add_dev_info(to_dmar_domain(domain), dev);
5602 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5607 if (!is_aux_domain(dev, domain))
5610 ret = prepare_domain_attach_device(domain, dev);
5614 return aux_domain_add_dev(to_dmar_domain(domain), dev);
/* iommu_ops->detach_dev: drop the device's primary domain attachment. */
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(dev);
}
/* iommu_ops->aux_detach_dev: drop an auxiliary domain attachment. */
static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
					  struct device *dev)
{
	aux_domain_remove_dev(to_dmar_domain(domain), dev);
}
5629 static int intel_iommu_map(struct iommu_domain *domain,
5630 unsigned long iova, phys_addr_t hpa,
5631 size_t size, int iommu_prot, gfp_t gfp)
5633 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5638 if (iommu_prot & IOMMU_READ)
5639 prot |= DMA_PTE_READ;
5640 if (iommu_prot & IOMMU_WRITE)
5641 prot |= DMA_PTE_WRITE;
5642 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5643 prot |= DMA_PTE_SNP;
5645 max_addr = iova + size;
5646 if (dmar_domain->max_addr < max_addr) {
5649 /* check if minimum agaw is sufficient for mapped address */
5650 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5651 if (end < max_addr) {
5652 pr_err("%s: iommu width (%d) is not "
5653 "sufficient for the mapped address (%llx)\n",
5654 __func__, dmar_domain->gaw, max_addr);
5657 dmar_domain->max_addr = max_addr;
5659 /* Round up size to next multiple of PAGE_SIZE, if it and
5660 the low bits of hpa would take us onto the next page */
5661 size = aligned_nrpages(hpa, size);
5662 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5663 hpa >> VTD_PAGE_SHIFT, size, prot);
5667 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5668 unsigned long iova, size_t size,
5669 struct iommu_iotlb_gather *gather)
5671 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5672 struct page *freelist = NULL;
5673 unsigned long start_pfn, last_pfn;
5674 unsigned int npages;
5675 int iommu_id, level = 0;
5677 /* Cope with horrid API which requires us to unmap more than the
5678 size argument if it happens to be a large-page mapping. */
5679 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5681 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5682 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5684 start_pfn = iova >> VTD_PAGE_SHIFT;
5685 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5687 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5689 npages = last_pfn - start_pfn + 1;
5691 for_each_domain_iommu(iommu_id, dmar_domain)
5692 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5693 start_pfn, npages, !freelist, 0);
5695 dma_free_pagelist(freelist);
5697 if (dmar_domain->max_addr == iova + size)
5698 dmar_domain->max_addr = iova;
5703 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5706 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5707 struct dma_pte *pte;
5711 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5712 if (pte && dma_pte_present(pte))
5713 phys = dma_pte_addr(pte) +
5714 (iova & (BIT_MASK(level_to_offset_bits(level) +
5715 VTD_PAGE_SHIFT) - 1));
5720 static inline bool scalable_mode_support(void)
5722 struct dmar_drhd_unit *drhd;
5723 struct intel_iommu *iommu;
5727 for_each_active_iommu(iommu, drhd) {
5728 if (!sm_supported(iommu)) {
5738 static inline bool iommu_pasid_support(void)
5740 struct dmar_drhd_unit *drhd;
5741 struct intel_iommu *iommu;
5745 for_each_active_iommu(iommu, drhd) {
5746 if (!pasid_supported(iommu)) {
5756 static inline bool nested_mode_support(void)
5758 struct dmar_drhd_unit *drhd;
5759 struct intel_iommu *iommu;
5763 for_each_active_iommu(iommu, drhd) {
5764 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5774 static bool intel_iommu_capable(enum iommu_cap cap)
5776 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5777 return domain_update_iommu_snooping(NULL) == 1;
5778 if (cap == IOMMU_CAP_INTR_REMAP)
5779 return irq_remapping_enabled == 1;
5784 static int intel_iommu_add_device(struct device *dev)
5786 struct dmar_domain *dmar_domain;
5787 struct iommu_domain *domain;
5788 struct intel_iommu *iommu;
5789 struct iommu_group *group;
5793 iommu = device_to_iommu(dev, &bus, &devfn);
5797 iommu_device_link(&iommu->iommu, dev);
5799 if (translation_pre_enabled(iommu))
5800 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5802 group = iommu_group_get_for_dev(dev);
5804 if (IS_ERR(group)) {
5805 ret = PTR_ERR(group);
5809 iommu_group_put(group);
5811 domain = iommu_get_domain_for_dev(dev);
5812 dmar_domain = to_dmar_domain(domain);
5813 if (domain->type == IOMMU_DOMAIN_DMA) {
5814 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5815 ret = iommu_request_dm_for_dev(dev);
5817 dmar_remove_one_dev_info(dev);
5818 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5819 domain_add_dev_info(si_domain, dev);
5821 "Device uses a private identity domain.\n");
5825 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5826 ret = iommu_request_dma_domain_for_dev(dev);
5828 dmar_remove_one_dev_info(dev);
5829 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5830 if (!get_private_domain_for_dev(dev)) {
5832 "Failed to get a private domain.\n");
5838 "Device uses a private dma domain.\n");
5843 if (device_needs_bounce(dev)) {
5844 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5845 set_dma_ops(dev, &bounce_dma_ops);
5851 iommu_device_unlink(&iommu->iommu, dev);
5855 static void intel_iommu_remove_device(struct device *dev)
5857 struct intel_iommu *iommu;
5860 iommu = device_to_iommu(dev, &bus, &devfn);
5864 dmar_remove_one_dev_info(dev);
5866 iommu_group_remove_device(dev);
5868 iommu_device_unlink(&iommu->iommu, dev);
5870 if (device_needs_bounce(dev))
5871 set_dma_ops(dev, NULL);
5874 static void intel_iommu_get_resv_regions(struct device *device,
5875 struct list_head *head)
5877 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5878 struct iommu_resv_region *reg;
5879 struct dmar_rmrr_unit *rmrr;
5880 struct device *i_dev;
5883 down_read(&dmar_global_lock);
5884 for_each_rmrr_units(rmrr) {
5885 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5887 struct iommu_resv_region *resv;
5888 enum iommu_resv_type type;
5891 if (i_dev != device &&
5892 !is_downstream_to_pci_bridge(device, i_dev))
5895 length = rmrr->end_address - rmrr->base_address + 1;
5897 type = device_rmrr_is_relaxable(device) ?
5898 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5900 resv = iommu_alloc_resv_region(rmrr->base_address,
5901 length, prot, type);
5905 list_add_tail(&resv->list, head);
5908 up_read(&dmar_global_lock);
5910 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5911 if (dev_is_pci(device)) {
5912 struct pci_dev *pdev = to_pci_dev(device);
5914 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5915 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5916 IOMMU_RESV_DIRECT_RELAXABLE);
5918 list_add_tail(®->list, head);
5921 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5923 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5924 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5928 list_add_tail(®->list, head);
5931 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5933 struct device_domain_info *info;
5934 struct context_entry *context;
5935 struct dmar_domain *domain;
5936 unsigned long flags;
5940 domain = find_domain(dev);
5944 spin_lock_irqsave(&device_domain_lock, flags);
5945 spin_lock(&iommu->lock);
5948 info = dev->archdata.iommu;
5949 if (!info || !info->pasid_supported)
5952 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5953 if (WARN_ON(!context))
5956 ctx_lo = context[0].lo;
5958 if (!(ctx_lo & CONTEXT_PASIDE)) {
5959 ctx_lo |= CONTEXT_PASIDE;
5960 context[0].lo = ctx_lo;
5962 iommu->flush.flush_context(iommu,
5963 domain->iommu_did[iommu->seq_id],
5964 PCI_DEVID(info->bus, info->devfn),
5965 DMA_CCMD_MASK_NOBIT,
5966 DMA_CCMD_DEVICE_INVL);
5969 /* Enable PASID support in the device, if it wasn't already */
5970 if (!info->pasid_enabled)
5971 iommu_enable_dev_iotlb(info);
5976 spin_unlock(&iommu->lock);
5977 spin_unlock_irqrestore(&device_domain_lock, flags);
5982 static void intel_iommu_apply_resv_region(struct device *dev,
5983 struct iommu_domain *domain,
5984 struct iommu_resv_region *region)
5986 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5987 unsigned long start, end;
5989 start = IOVA_PFN(region->start);
5990 end = IOVA_PFN(region->start + region->length - 1);
5992 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
/*
 * iommu_ops->device_group: PCI devices use the PCI topology-aware
 * grouping; everything else gets its own generic group.
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	if (dev_is_pci(dev))
		return pci_device_group(dev);
	return generic_device_group(dev);
}
#ifdef CONFIG_INTEL_IOMMU_SVM
/*
 * Resolve the IOMMU serving @dev for SVM use; returns NULL (with a
 * diagnostic) when the device has no usable IOMMU translation.
 */
struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */
6024 static int intel_iommu_enable_auxd(struct device *dev)
6026 struct device_domain_info *info;
6027 struct intel_iommu *iommu;
6028 unsigned long flags;
6032 iommu = device_to_iommu(dev, &bus, &devfn);
6033 if (!iommu || dmar_disabled)
6036 if (!sm_supported(iommu) || !pasid_supported(iommu))
6039 ret = intel_iommu_enable_pasid(iommu, dev);
6043 spin_lock_irqsave(&device_domain_lock, flags);
6044 info = dev->archdata.iommu;
6045 info->auxd_enabled = 1;
6046 spin_unlock_irqrestore(&device_domain_lock, flags);
6051 static int intel_iommu_disable_auxd(struct device *dev)
6053 struct device_domain_info *info;
6054 unsigned long flags;
6056 spin_lock_irqsave(&device_domain_lock, flags);
6057 info = dev->archdata.iommu;
6058 if (!WARN_ON(!info))
6059 info->auxd_enabled = 0;
6060 spin_unlock_irqrestore(&device_domain_lock, flags);
6066 * A PCI express designated vendor specific extended capability is defined
6067 * in the section 3.7 of Intel scalable I/O virtualization technical spec
6068 * for system software and tools to detect endpoint devices supporting the
6069 * Intel scalable IO virtualization without host driver dependency.
6071 * Returns the address of the matching extended capability structure within
6072 * the device's PCI configuration space or 0 if the device does not support
6075 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6080 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
6082 pci_read_config_word(pdev, pos + 4, &vendor);
6083 pci_read_config_word(pdev, pos + 8, &id);
6084 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
6087 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
6094 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6096 if (feat == IOMMU_DEV_FEAT_AUX) {
6099 if (!dev_is_pci(dev) || dmar_disabled ||
6100 !scalable_mode_support() || !iommu_pasid_support())
6103 ret = pci_pasid_features(to_pci_dev(dev));
6107 return !!siov_find_pci_dvsec(to_pci_dev(dev));
6114 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6116 if (feat == IOMMU_DEV_FEAT_AUX)
6117 return intel_iommu_enable_auxd(dev);
6123 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6125 if (feat == IOMMU_DEV_FEAT_AUX)
6126 return intel_iommu_disable_auxd(dev);
6132 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6134 struct device_domain_info *info = dev->archdata.iommu;
6136 if (feat == IOMMU_DEV_FEAT_AUX)
6137 return scalable_mode_support() && info && info->auxd_enabled;
6143 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6145 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6147 return dmar_domain->default_pasid > 0 ?
6148 dmar_domain->default_pasid : -EINVAL;
6151 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6154 return attach_deferred(dev);
6158 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6159 enum iommu_attr attr, void *data)
6161 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6162 unsigned long flags;
6165 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6169 case DOMAIN_ATTR_NESTING:
6170 spin_lock_irqsave(&device_domain_lock, flags);
6171 if (nested_mode_support() &&
6172 list_empty(&dmar_domain->devices)) {
6173 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6174 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6178 spin_unlock_irqrestore(&device_domain_lock, flags);
6188 const struct iommu_ops intel_iommu_ops = {
6189 .capable = intel_iommu_capable,
6190 .domain_alloc = intel_iommu_domain_alloc,
6191 .domain_free = intel_iommu_domain_free,
6192 .domain_set_attr = intel_iommu_domain_set_attr,
6193 .attach_dev = intel_iommu_attach_device,
6194 .detach_dev = intel_iommu_detach_device,
6195 .aux_attach_dev = intel_iommu_aux_attach_device,
6196 .aux_detach_dev = intel_iommu_aux_detach_device,
6197 .aux_get_pasid = intel_iommu_aux_get_pasid,
6198 .map = intel_iommu_map,
6199 .unmap = intel_iommu_unmap,
6200 .iova_to_phys = intel_iommu_iova_to_phys,
6201 .add_device = intel_iommu_add_device,
6202 .remove_device = intel_iommu_remove_device,
6203 .get_resv_regions = intel_iommu_get_resv_regions,
6204 .put_resv_regions = generic_iommu_put_resv_regions,
6205 .apply_resv_region = intel_iommu_apply_resv_region,
6206 .device_group = intel_iommu_device_group,
6207 .dev_has_feat = intel_iommu_dev_has_feat,
6208 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6209 .dev_enable_feat = intel_iommu_dev_enable_feat,
6210 .dev_disable_feat = intel_iommu_dev_disable_feat,
6211 .is_attach_deferred = intel_iommu_is_attach_deferred,
6212 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6215 static void quirk_iommu_igfx(struct pci_dev *dev)
6217 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6221 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6223 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6224 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6225 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6226 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6228 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6230 /* Broadwell igfx malfunctions with dmar */
6231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6234 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6235 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6236 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6237 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6238 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6240 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6241 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6242 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6244 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6250 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6253 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6254 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6256 static void quirk_iommu_rwbf(struct pci_dev *dev)
6259 * Mobile 4 Series Chipset neglects to set RWBF capability,
6260 * but needs it. Same seems to hold for the desktop versions.
6262 pci_info(dev, "Forcing write-buffer flush capability\n");
6266 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6267 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6268 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6269 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6270 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6271 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6272 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
/*
 * Graphics Memory Control register (GGC) in PCI config space of the
 * IGD host bridge; the quirk below reads it to detect whether the BIOS
 * allocated shadow GTT space.  (The GGC offset define was missing.)
 */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
6284 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6288 if (pci_read_config_word(dev, GGC, &ggc))
6291 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6292 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6294 } else if (dmar_map_gfx) {
6295 /* we have to ensure the gfx device is idle before we flush */
6296 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6297 intel_iommu_strict = 1;
6300 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6301 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6302 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6303 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6305 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6306 ISOCH DMAR unit for the Azalia sound device, but not give it any
6307 TLB entries, which causes it to deadlock. Check for that. We do
6308 this in a function called from init_dmars(), instead of in a PCI
6309 quirk, because we don't want to print the obnoxious "BIOS broken"
6310 message if VT-d is actually disabled.
6312 static void __init check_tylersburg_isoch(void)
6314 struct pci_dev *pdev;
6315 uint32_t vtisochctrl;
6317 /* If there's no Azalia in the system anyway, forget it. */
6318 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6323 /* System Management Registers. Might be hidden, in which case
6324 we can't do the sanity check. But that's OK, because the
6325 known-broken BIOSes _don't_ actually hide it, so far. */
6326 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6330 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6337 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6338 if (vtisochctrl & 1)
6341 /* Drop all bits other than the number of TLB entries */
6342 vtisochctrl &= 0x1c;
6344 /* If we have the recommended number of TLB entries (16), fine. */
6345 if (vtisochctrl == 0x10)
6348 /* Zero TLB entries? You get to ride the short bus to school. */
6350 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6351 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6352 dmi_get_system_info(DMI_BIOS_VENDOR),
6353 dmi_get_system_info(DMI_BIOS_VERSION),
6354 dmi_get_system_info(DMI_PRODUCT_VERSION));
6355 iommu_identity_mapping |= IDENTMAP_AZALIA;
6359 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",