1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
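/*
 * For example, with gaw == 48 (a 4-level page table) __DOMAIN_MAX_PFN(48)
 * is (1ULL << 36) - 1 and DOMAIN_MAX_ADDR(48) is 0xfffffffff000, the last
 * 4KiB page below the 48-bit boundary. On 32-bit kernels the min_t() clamp
 * above caps DOMAIN_MAX_PFN at ULONG_MAX.
 */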
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
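/*
 * Each page-table level decodes LEVEL_STRIDE (9) bits of the PFN, i.e.
 * every table holds 512 eight-byte entries and fits in one 4KiB page.
 * A leaf entry at level 1 maps 4KiB, at level 2 maps 2MiB, and at level 3
 * maps 1GiB.
 */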
89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
92 * that we support.
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
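/*
 * ~0xFFFUL sets every bit from bit 12 upwards, so we advertise every
 * power-of-two size that is a multiple of 4KiB (4KiB, 8KiB, 16KiB, ...)
 * rather than only the 4KiB/2MiB/1GiB sizes the hardware really uses,
 * exactly as described above.
 */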
106 static inline int agaw_to_level(int agaw)
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(unsigned long pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline unsigned long level_mask(int level)
133 return -1UL << level_to_offset_bits(level);
136 static inline unsigned long level_size(int level)
138 return 1UL << level_to_offset_bits(level);
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
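/*
 * Worked example for the helpers above: for DMA pfn 0x12345,
 *	pfn_level_offset(0x12345, 1) == 0x145
 *	pfn_level_offset(0x12345, 2) == 0x091
 *	pfn_level_offset(0x12345, 3) == 0x000
 * and lvl_to_nr_pages(2) == 512, i.e. one level-2 entry spans 2MiB.
 */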
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
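/*
 * On x86 with 4KiB pages both PAGE_SHIFT and VTD_PAGE_SHIFT are 12, so the
 * mm<->dma pfn conversions above are currently identity operations; they
 * exist to keep the "VT-d pages are never larger than MM pages" assumption
 * stated above explicit in the arithmetic.
 */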
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
179 * (used when the kernel is launched with TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 return re->hi & VTD_PAGE_MASK;
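/*
 * In scalable mode a root entry carries two context-table pointers: the
 * low 64 bits (LCTP) cover devfns 0-127 and the high 64 bits (UCTP) cover
 * devfns 128-255, which is why free_context_table() below also walks the
 * 0x80 entry. Legacy mode only uses the low pointer.
 */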
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
291 * This domain is a static identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305 * been chosen to use a private domain. We should avoid using the
306 * map/unmap/iova_to_phys APIs on it.
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
311 * When VT-d works in the scalable mode, it allows DMA translation to
312 * happen through either first level or second level page table. This
313 * bit marks that the DMA translation for the domain goes through the
314 * first level page table, otherwise, it goes through the second level.
316 #define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2)
319 * Domain represents a virtual machine which demands iommu nested
320 * translation mode support.
322 #define DOMAIN_FLAG_NESTING_MODE BIT(3)
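/*
 * Walk the seq_ids of all IOMMUs that currently have at least one device
 * from @domain attached, i.e. those with a non-zero iommu_refcnt.
 */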
324 #define for_each_domain_iommu(idx, domain) \
325 for (idx = 0; idx < g_num_of_iommus; idx++) \
326 if (domain->iommu_refcnt[idx])
328 struct dmar_rmrr_unit {
329 struct list_head list; /* list of rmrr units */
330 struct acpi_dmar_header *hdr; /* ACPI header */
331 u64 base_address; /* reserved base address*/
332 u64 end_address; /* reserved end address */
333 struct dmar_dev_scope *devices; /* target devices */
334 int devices_cnt; /* target device count */
337 struct dmar_atsr_unit {
338 struct list_head list; /* list of ATSR units */
339 struct acpi_dmar_header *hdr; /* ACPI header */
340 struct dmar_dev_scope *devices; /* target devices */
341 int devices_cnt; /* target device count */
342 u8 include_all:1; /* include all ports */
345 static LIST_HEAD(dmar_atsr_units);
346 static LIST_HEAD(dmar_rmrr_units);
348 #define for_each_rmrr_units(rmrr) \
349 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
351 /* bitmap for indexing intel_iommus */
352 static int g_num_of_iommus;
354 static void domain_exit(struct dmar_domain *domain);
355 static void domain_remove_dev_info(struct dmar_domain *domain);
356 static void dmar_remove_one_dev_info(struct device *dev);
357 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
358 static int intel_iommu_attach_device(struct iommu_domain *domain,
360 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
363 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
364 int dmar_disabled = 0;
366 int dmar_disabled = 1;
367 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
369 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
370 int intel_iommu_sm = 1;
373 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
375 int intel_iommu_enabled = 0;
376 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
378 static int dmar_map_gfx = 1;
379 static int dmar_forcedac;
380 static int intel_iommu_strict;
381 static int intel_iommu_superpage = 1;
382 static int iommu_identity_mapping;
383 static int intel_no_bounce;
385 #define IDENTMAP_GFX 2
386 #define IDENTMAP_AZALIA 4
388 int intel_iommu_gfx_mapped;
389 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
391 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
392 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
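/*
 * Sentinel values stored in dev->archdata.iommu instead of a real
 * device_domain_info: DUMMY marks devices the IOMMU must ignore entirely
 * (see iommu_dummy()), DEFER marks devices whose domain attachment is
 * postponed until first use (see attach_deferred() and do_deferred_attach()).
 */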
393 DEFINE_SPINLOCK(device_domain_lock);
394 static LIST_HEAD(device_domain_list);
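/*
 * Bounce buffering is applied to untrusted PCI devices (those the PCI core
 * marks untrusted, e.g. behind an external-facing port) unless it is
 * switched off with intel_iommu=nobounce.
 */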
396 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
397 to_pci_dev(d)->untrusted)
400 * Iterate over elements in device_domain_list and call the specified
401 * callback @fn against each element.
403 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
404 void *data), void *data)
408 struct device_domain_info *info;
410 spin_lock_irqsave(&device_domain_lock, flags);
411 list_for_each_entry(info, &device_domain_list, global) {
412 ret = fn(info, data);
414 spin_unlock_irqrestore(&device_domain_lock, flags);
418 spin_unlock_irqrestore(&device_domain_lock, flags);
423 const struct iommu_ops intel_iommu_ops;
425 static bool translation_pre_enabled(struct intel_iommu *iommu)
427 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
430 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
432 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
435 static void init_translation_status(struct intel_iommu *iommu)
439 gsts = readl(iommu->reg + DMAR_GSTS_REG);
440 if (gsts & DMA_GSTS_TES)
441 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
444 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
445 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
447 return container_of(dom, struct dmar_domain, domain);
450 static int __init intel_iommu_setup(char *str)
455 if (!strncmp(str, "on", 2)) {
457 pr_info("IOMMU enabled\n");
458 } else if (!strncmp(str, "off", 3)) {
460 no_platform_optin = 1;
461 pr_info("IOMMU disabled\n");
462 } else if (!strncmp(str, "igfx_off", 8)) {
464 pr_info("Disable GFX device mapping\n");
465 } else if (!strncmp(str, "forcedac", 8)) {
466 pr_info("Forcing DAC for PCI devices\n");
468 } else if (!strncmp(str, "strict", 6)) {
469 pr_info("Disable batched IOTLB flush\n");
470 intel_iommu_strict = 1;
471 } else if (!strncmp(str, "sp_off", 6)) {
472 pr_info("Disable supported super page\n");
473 intel_iommu_superpage = 0;
474 } else if (!strncmp(str, "sm_on", 5)) {
475 pr_info("Intel-IOMMU: scalable mode supported\n");
477 } else if (!strncmp(str, "tboot_noforce", 13)) {
479 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
480 intel_iommu_tboot_noforce = 1;
481 } else if (!strncmp(str, "nobounce", 8)) {
482 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
486 str += strcspn(str, ",");
492 __setup("intel_iommu=", intel_iommu_setup);
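/*
 * Options are comma separated, so for example booting with
 * "intel_iommu=on,sm_on,strict" enables the IOMMU, scalable mode and
 * strict (non-batched) IOTLB flushing in one go.
 */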
494 static struct kmem_cache *iommu_domain_cache;
495 static struct kmem_cache *iommu_devinfo_cache;
497 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
499 struct dmar_domain **domains;
502 domains = iommu->domains[idx];
506 return domains[did & 0xff];
509 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
510 struct dmar_domain *domain)
512 struct dmar_domain **domains;
515 if (!iommu->domains[idx]) {
516 size_t size = 256 * sizeof(struct dmar_domain *);
517 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
520 domains = iommu->domains[idx];
521 if (WARN_ON(!domains))
524 domains[did & 0xff] = domain;
527 void *alloc_pgtable_page(int node)
532 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
534 vaddr = page_address(page);
538 void free_pgtable_page(void *vaddr)
540 free_page((unsigned long)vaddr);
543 static inline void *alloc_domain_mem(void)
545 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
548 static void free_domain_mem(void *vaddr)
550 kmem_cache_free(iommu_domain_cache, vaddr);
553 static inline void * alloc_devinfo_mem(void)
555 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
558 static inline void free_devinfo_mem(void *vaddr)
560 kmem_cache_free(iommu_devinfo_cache, vaddr);
563 static inline int domain_type_is_si(struct dmar_domain *domain)
565 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
568 static inline bool domain_use_first_level(struct dmar_domain *domain)
570 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
573 static inline int domain_pfn_supported(struct dmar_domain *domain,
576 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
578 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
581 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
586 sagaw = cap_sagaw(iommu->cap);
587 for (agaw = width_to_agaw(max_gaw);
589 if (test_bit(agaw, &sagaw))
597 * Calculate max SAGAW for each iommu.
599 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
601 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
605 * Calculate agaw for each iommu.
606 * "SAGAW" may be different across iommus; use a default agaw, and
607 * fall back to a smaller supported agaw for iommus that don't support the default.
609 int iommu_calculate_agaw(struct intel_iommu *iommu)
611 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
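/*
 * Example: DEFAULT_DOMAIN_ADDRESS_WIDTH is 57, so width_to_agaw(57) == 3
 * (a 5-level table). If this unit's SAGAW only has bit 2 set, the search
 * in __iommu_calculate_agaw() settles on agaw 2, i.e. a 4-level table
 * limited to agaw_to_width(2) == 48 bits.
 */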
614 /* This function only returns a single iommu in a domain */
615 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
619 /* si_domain and vm domain should not get here. */
620 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
623 for_each_domain_iommu(iommu_id, domain)
626 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
629 return g_iommus[iommu_id];
632 static void domain_update_iommu_coherency(struct dmar_domain *domain)
634 struct dmar_drhd_unit *drhd;
635 struct intel_iommu *iommu;
639 domain->iommu_coherency = 1;
641 for_each_domain_iommu(i, domain) {
643 if (!ecap_coherent(g_iommus[i]->ecap)) {
644 domain->iommu_coherency = 0;
651 /* No hardware attached; use lowest common denominator */
653 for_each_active_iommu(iommu, drhd) {
654 if (!ecap_coherent(iommu->ecap)) {
655 domain->iommu_coherency = 0;
662 static int domain_update_iommu_snooping(struct intel_iommu *skip)
664 struct dmar_drhd_unit *drhd;
665 struct intel_iommu *iommu;
669 for_each_active_iommu(iommu, drhd) {
671 if (!ecap_sc_support(iommu->ecap)) {
682 static int domain_update_iommu_superpage(struct dmar_domain *domain,
683 struct intel_iommu *skip)
685 struct dmar_drhd_unit *drhd;
686 struct intel_iommu *iommu;
689 if (!intel_iommu_superpage) {
693 /* set iommu_superpage to the smallest common denominator */
695 for_each_active_iommu(iommu, drhd) {
697 if (domain && domain_use_first_level(domain)) {
698 if (!cap_fl1gp_support(iommu->cap))
701 mask &= cap_super_page_val(iommu->cap);
713 /* Some capabilities may be different across iommus */
714 static void domain_update_iommu_cap(struct dmar_domain *domain)
716 domain_update_iommu_coherency(domain);
717 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
718 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
721 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
724 struct root_entry *root = &iommu->root_entry[bus];
725 struct context_entry *context;
729 if (sm_supported(iommu)) {
737 context = phys_to_virt(*entry & VTD_PAGE_MASK);
739 unsigned long phy_addr;
743 context = alloc_pgtable_page(iommu->node);
747 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
748 phy_addr = virt_to_phys((void *)context);
749 *entry = phy_addr | 1;
750 __iommu_flush_cache(iommu, entry, sizeof(*entry));
752 return &context[devfn];
755 static int iommu_dummy(struct device *dev)
757 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
760 static bool attach_deferred(struct device *dev)
762 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
766 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
767 * sub-hierarchy of a candidate PCI-PCI bridge
768 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
769 * @bridge: the candidate PCI-PCI bridge
771 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
774 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
776 struct pci_dev *pdev, *pbridge;
778 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
781 pdev = to_pci_dev(dev);
782 pbridge = to_pci_dev(bridge);
784 if (pbridge->subordinate &&
785 pbridge->subordinate->number <= pdev->bus->number &&
786 pbridge->subordinate->busn_res.end >= pdev->bus->number)
792 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
794 struct dmar_drhd_unit *drhd = NULL;
795 struct intel_iommu *iommu;
797 struct pci_dev *pdev = NULL;
801 if (iommu_dummy(dev))
804 if (dev_is_pci(dev)) {
805 struct pci_dev *pf_pdev;
807 pdev = pci_real_dma_dev(to_pci_dev(dev));
809 /* VFs aren't listed in scope tables; we need to look up
810 * the PF instead to find the IOMMU. */
811 pf_pdev = pci_physfn(pdev);
813 segment = pci_domain_nr(pdev->bus);
814 } else if (has_acpi_companion(dev))
815 dev = &ACPI_COMPANION(dev)->dev;
818 for_each_active_iommu(iommu, drhd) {
819 if (pdev && segment != drhd->segment)
822 for_each_active_dev_scope(drhd->devices,
823 drhd->devices_cnt, i, tmp) {
825 /* For a VF use its original BDF# not that of the PF
826 * which we used for the IOMMU lookup. Strictly speaking
827 * we could do this for all PCI devices; we only need to
828 * get the BDF# from the scope table for ACPI matches. */
829 if (pdev && pdev->is_virtfn)
832 *bus = drhd->devices[i].bus;
833 *devfn = drhd->devices[i].devfn;
837 if (is_downstream_to_pci_bridge(dev, tmp))
841 if (pdev && drhd->include_all) {
843 *bus = pdev->bus->number;
844 *devfn = pdev->devfn;
855 static void domain_flush_cache(struct dmar_domain *domain,
856 void *addr, int size)
858 if (!domain->iommu_coherency)
859 clflush_cache_range(addr, size);
862 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
864 struct context_entry *context;
868 spin_lock_irqsave(&iommu->lock, flags);
869 context = iommu_context_addr(iommu, bus, devfn, 0);
871 ret = context_present(context);
872 spin_unlock_irqrestore(&iommu->lock, flags);
876 static void free_context_table(struct intel_iommu *iommu)
880 struct context_entry *context;
882 spin_lock_irqsave(&iommu->lock, flags);
883 if (!iommu->root_entry) {
886 for (i = 0; i < ROOT_ENTRY_NR; i++) {
887 context = iommu_context_addr(iommu, i, 0, 0);
889 free_pgtable_page(context);
891 if (!sm_supported(iommu))
894 context = iommu_context_addr(iommu, i, 0x80, 0);
896 free_pgtable_page(context);
899 free_pgtable_page(iommu->root_entry);
900 iommu->root_entry = NULL;
902 spin_unlock_irqrestore(&iommu->lock, flags);
905 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
906 unsigned long pfn, int *target_level)
908 struct dma_pte *parent, *pte;
909 int level = agaw_to_level(domain->agaw);
912 BUG_ON(!domain->pgd);
914 if (!domain_pfn_supported(domain, pfn))
915 /* Address beyond IOMMU's addressing capabilities. */
918 parent = domain->pgd;
923 offset = pfn_level_offset(pfn, level);
924 pte = &parent[offset];
925 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
927 if (level == *target_level)
930 if (!dma_pte_present(pte)) {
933 tmp_page = alloc_pgtable_page(domain->nid);
938 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
939 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
940 if (domain_use_first_level(domain))
941 pteval |= DMA_FL_PTE_XD;
942 if (cmpxchg64(&pte->val, 0ULL, pteval))
943 /* Someone else set it while we were thinking; use theirs. */
944 free_pgtable_page(tmp_page);
946 domain_flush_cache(domain, pte, sizeof(*pte));
951 parent = phys_to_virt(dma_pte_addr(pte));
956 *target_level = level;
961 /* return address's pte at specific level */
962 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
964 int level, int *large_page)
966 struct dma_pte *parent, *pte;
967 int total = agaw_to_level(domain->agaw);
970 parent = domain->pgd;
971 while (level <= total) {
972 offset = pfn_level_offset(pfn, total);
973 pte = &parent[offset];
977 if (!dma_pte_present(pte)) {
982 if (dma_pte_superpage(pte)) {
987 parent = phys_to_virt(dma_pte_addr(pte));
993 /* clear last level pte, a tlb flush should be followed */
994 static void dma_pte_clear_range(struct dmar_domain *domain,
995 unsigned long start_pfn,
996 unsigned long last_pfn)
998 unsigned int large_page;
999 struct dma_pte *first_pte, *pte;
1001 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1002 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1003 BUG_ON(start_pfn > last_pfn);
1005 /* we don't need lock here; nobody else touches the iova range */
1008 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1010 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1015 start_pfn += lvl_to_nr_pages(large_page);
1017 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1019 domain_flush_cache(domain, first_pte,
1020 (void *)pte - (void *)first_pte);
1022 } while (start_pfn && start_pfn <= last_pfn);
1025 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1026 int retain_level, struct dma_pte *pte,
1027 unsigned long pfn, unsigned long start_pfn,
1028 unsigned long last_pfn)
1030 pfn = max(start_pfn, pfn);
1031 pte = &pte[pfn_level_offset(pfn, level)];
1034 unsigned long level_pfn;
1035 struct dma_pte *level_pte;
1037 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1040 level_pfn = pfn & level_mask(level);
1041 level_pte = phys_to_virt(dma_pte_addr(pte));
1044 dma_pte_free_level(domain, level - 1, retain_level,
1045 level_pte, level_pfn, start_pfn,
1050 * Free the page table if we're below the level we want to
1051 * retain and the range covers the entire table.
1053 if (level < retain_level && !(start_pfn > level_pfn ||
1054 last_pfn < level_pfn + level_size(level) - 1)) {
1056 domain_flush_cache(domain, pte, sizeof(*pte));
1057 free_pgtable_page(level_pte);
1060 pfn += level_size(level);
1061 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1065 * clear last level (leaf) ptes and free page table pages below the
1066 * level we wish to keep intact.
1068 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1069 unsigned long start_pfn,
1070 unsigned long last_pfn,
1073 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1074 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1075 BUG_ON(start_pfn > last_pfn);
1077 dma_pte_clear_range(domain, start_pfn, last_pfn);
1079 /* We don't need lock here; nobody else touches the iova range */
1080 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1081 domain->pgd, 0, start_pfn, last_pfn);
1084 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1085 free_pgtable_page(domain->pgd);
1090 /* When a page at a given level is being unlinked from its parent, we don't
1091 need to *modify* it at all. All we need to do is make a list of all the
1092 pages which can be freed just as soon as we've flushed the IOTLB and we
1093 know the hardware page-walk will no longer touch them.
1094 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1095 be freed. */
1096 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1097 int level, struct dma_pte *pte,
1098 struct page *freelist)
1102 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1103 pg->freelist = freelist;
1109 pte = page_address(pg);
1111 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1112 freelist = dma_pte_list_pagetables(domain, level - 1,
1115 } while (!first_pte_in_page(pte));
1120 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1121 struct dma_pte *pte, unsigned long pfn,
1122 unsigned long start_pfn,
1123 unsigned long last_pfn,
1124 struct page *freelist)
1126 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1128 pfn = max(start_pfn, pfn);
1129 pte = &pte[pfn_level_offset(pfn, level)];
1132 unsigned long level_pfn;
1134 if (!dma_pte_present(pte))
1137 level_pfn = pfn & level_mask(level);
1139 /* If range covers entire pagetable, free it */
1140 if (start_pfn <= level_pfn &&
1141 last_pfn >= level_pfn + level_size(level) - 1) {
1142 /* These subordinate page tables are going away entirely. Don't
1143 bother to clear them; we're just going to *free* them. */
1144 if (level > 1 && !dma_pte_superpage(pte))
1145 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1151 } else if (level > 1) {
1152 /* Recurse down into a level that isn't *entirely* obsolete */
1153 freelist = dma_pte_clear_level(domain, level - 1,
1154 phys_to_virt(dma_pte_addr(pte)),
1155 level_pfn, start_pfn, last_pfn,
1159 pfn += level_size(level);
1160 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1163 domain_flush_cache(domain, first_pte,
1164 (void *)++last_pte - (void *)first_pte);
1169 /* We can't just free the pages because the IOMMU may still be walking
1170 the page tables, and may have cached the intermediate levels. The
1171 pages can only be freed after the IOTLB flush has been done. */
1172 static struct page *domain_unmap(struct dmar_domain *domain,
1173 unsigned long start_pfn,
1174 unsigned long last_pfn)
1176 struct page *freelist;
1178 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1179 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1180 BUG_ON(start_pfn > last_pfn);
1182 /* we don't need lock here; nobody else touches the iova range */
1183 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1184 domain->pgd, 0, start_pfn, last_pfn, NULL);
1187 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1188 struct page *pgd_page = virt_to_page(domain->pgd);
1189 pgd_page->freelist = freelist;
1190 freelist = pgd_page;
1198 static void dma_free_pagelist(struct page *freelist)
1202 while ((pg = freelist)) {
1203 freelist = pg->freelist;
1204 free_pgtable_page(page_address(pg));
1208 static void iova_entry_free(unsigned long data)
1210 struct page *freelist = (struct page *)data;
1212 dma_free_pagelist(freelist);
1215 /* iommu handling */
1216 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1218 struct root_entry *root;
1219 unsigned long flags;
1221 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1223 pr_err("Allocating root entry for %s failed\n",
1228 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1230 spin_lock_irqsave(&iommu->lock, flags);
1231 iommu->root_entry = root;
1232 spin_unlock_irqrestore(&iommu->lock, flags);
1237 static void iommu_set_root_entry(struct intel_iommu *iommu)
1243 addr = virt_to_phys(iommu->root_entry);
1244 if (sm_supported(iommu))
1245 addr |= DMA_RTADDR_SMT;
1247 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1248 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1250 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1252 /* Make sure hardware complete it */
1253 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1254 readl, (sts & DMA_GSTS_RTPS), sts);
1256 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1259 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1264 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1267 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1268 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1270 /* Make sure hardware complete it */
1271 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1272 readl, (!(val & DMA_GSTS_WBFS)), val);
1274 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1277 /* return value determines whether we need a write buffer flush */
1278 static void __iommu_flush_context(struct intel_iommu *iommu,
1279 u16 did, u16 source_id, u8 function_mask,
1286 case DMA_CCMD_GLOBAL_INVL:
1287 val = DMA_CCMD_GLOBAL_INVL;
1289 case DMA_CCMD_DOMAIN_INVL:
1290 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1292 case DMA_CCMD_DEVICE_INVL:
1293 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1294 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1299 val |= DMA_CCMD_ICC;
1301 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1302 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1304 /* Make sure hardware complete it */
1305 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1306 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1308 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1311 /* return value determines whether we need a write buffer flush */
1312 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1313 u64 addr, unsigned int size_order, u64 type)
1315 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1316 u64 val = 0, val_iva = 0;
1320 case DMA_TLB_GLOBAL_FLUSH:
1321 /* global flush doesn't need to set IVA_REG */
1322 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1324 case DMA_TLB_DSI_FLUSH:
1325 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1327 case DMA_TLB_PSI_FLUSH:
1328 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1329 /* IH bit is passed in as part of address */
1330 val_iva = size_order | addr;
1335 /* Note: set drain read/write */
1338 * This is probably to be super secure.. Looks like we can
1339 * ignore it without any impact.
1341 if (cap_read_drain(iommu->cap))
1342 val |= DMA_TLB_READ_DRAIN;
1344 if (cap_write_drain(iommu->cap))
1345 val |= DMA_TLB_WRITE_DRAIN;
1347 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1348 /* Note: Only uses first TLB reg currently */
1350 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1351 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1353 /* Make sure hardware complete it */
1354 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1355 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1357 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1359 /* check IOTLB invalidation granularity */
1360 if (DMA_TLB_IAIG(val) == 0)
1361 pr_err("Flush IOTLB failed\n");
1362 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1363 pr_debug("TLB flush request %Lx, actual %Lx\n",
1364 (unsigned long long)DMA_TLB_IIRG(type),
1365 (unsigned long long)DMA_TLB_IAIG(val));
1368 static struct device_domain_info *
1369 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1372 struct device_domain_info *info;
1374 assert_spin_locked(&device_domain_lock);
1379 list_for_each_entry(info, &domain->devices, link)
1380 if (info->iommu == iommu && info->bus == bus &&
1381 info->devfn == devfn) {
1382 if (info->ats_supported && info->dev)
1390 static void domain_update_iotlb(struct dmar_domain *domain)
1392 struct device_domain_info *info;
1393 bool has_iotlb_device = false;
1395 assert_spin_locked(&device_domain_lock);
1397 list_for_each_entry(info, &domain->devices, link) {
1398 struct pci_dev *pdev;
1400 if (!info->dev || !dev_is_pci(info->dev))
1403 pdev = to_pci_dev(info->dev);
1404 if (pdev->ats_enabled) {
1405 has_iotlb_device = true;
1410 domain->has_iotlb_device = has_iotlb_device;
1413 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1415 struct pci_dev *pdev;
1417 assert_spin_locked(&device_domain_lock);
1419 if (!info || !dev_is_pci(info->dev))
1422 pdev = to_pci_dev(info->dev);
1423 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1424 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1425 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1426 * reserved, which should be set to 0.
1428 if (!ecap_dit(info->iommu->ecap))
1431 struct pci_dev *pf_pdev;
1433 /* pdev will be returned if device is not a vf */
1434 pf_pdev = pci_physfn(pdev);
1435 info->pfsid = pci_dev_id(pf_pdev);
1438 #ifdef CONFIG_INTEL_IOMMU_SVM
1439 /* The PCIe spec, in its wisdom, declares that the behaviour of
1440 the device if you enable PASID support after ATS support is
1441 undefined. So always enable PASID support on devices which
1442 have it, even if we can't yet know if we're ever going to
1443 use it. */
1444 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1445 info->pasid_enabled = 1;
1447 if (info->pri_supported &&
1448 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1449 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1450 info->pri_enabled = 1;
1452 if (!pdev->untrusted && info->ats_supported &&
1453 pci_ats_page_aligned(pdev) &&
1454 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1455 info->ats_enabled = 1;
1456 domain_update_iotlb(info->domain);
1457 info->ats_qdep = pci_ats_queue_depth(pdev);
1461 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1463 struct pci_dev *pdev;
1465 assert_spin_locked(&device_domain_lock);
1467 if (!dev_is_pci(info->dev))
1470 pdev = to_pci_dev(info->dev);
1472 if (info->ats_enabled) {
1473 pci_disable_ats(pdev);
1474 info->ats_enabled = 0;
1475 domain_update_iotlb(info->domain);
1477 #ifdef CONFIG_INTEL_IOMMU_SVM
1478 if (info->pri_enabled) {
1479 pci_disable_pri(pdev);
1480 info->pri_enabled = 0;
1482 if (info->pasid_enabled) {
1483 pci_disable_pasid(pdev);
1484 info->pasid_enabled = 0;
1489 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1490 u64 addr, unsigned mask)
1493 unsigned long flags;
1494 struct device_domain_info *info;
1496 if (!domain->has_iotlb_device)
1499 spin_lock_irqsave(&device_domain_lock, flags);
1500 list_for_each_entry(info, &domain->devices, link) {
1501 if (!info->ats_enabled)
1504 sid = info->bus << 8 | info->devfn;
1505 qdep = info->ats_qdep;
1506 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1509 spin_unlock_irqrestore(&device_domain_lock, flags);
1512 static void domain_flush_piotlb(struct intel_iommu *iommu,
1513 struct dmar_domain *domain,
1514 u64 addr, unsigned long npages, bool ih)
1516 u16 did = domain->iommu_did[iommu->seq_id];
1518 if (domain->default_pasid)
1519 qi_flush_piotlb(iommu, did, domain->default_pasid,
1522 if (!list_empty(&domain->devices))
1523 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1526 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1527 struct dmar_domain *domain,
1528 unsigned long pfn, unsigned int pages,
1531 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1532 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1533 u16 did = domain->iommu_did[iommu->seq_id];
1540 if (domain_use_first_level(domain)) {
1541 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1544 * Fallback to domain selective flush if no PSI support or
1545 * the size is too big. PSI requires page size to be 2 ^ x,
1546 * and the base address is naturally aligned to the size.
1548 if (!cap_pgsel_inv(iommu->cap) ||
1549 mask > cap_max_amask_val(iommu->cap))
1550 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1553 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1558 * In caching mode, changes of pages from non-present to present require
1559 * flush. However, device IOTLB doesn't need to be flushed in this case.
1561 if (!cap_caching_mode(iommu->cap) || !map)
1562 iommu_flush_dev_iotlb(domain, addr, mask);
1565 /* Notification for newly created mappings */
1566 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1567 struct dmar_domain *domain,
1568 unsigned long pfn, unsigned int pages)
1571 * It's a non-present to present mapping. Only flush if caching mode
1572 * and second level.
1574 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1575 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1577 iommu_flush_write_buffer(iommu);
1580 static void iommu_flush_iova(struct iova_domain *iovad)
1582 struct dmar_domain *domain;
1585 domain = container_of(iovad, struct dmar_domain, iovad);
1587 for_each_domain_iommu(idx, domain) {
1588 struct intel_iommu *iommu = g_iommus[idx];
1589 u16 did = domain->iommu_did[iommu->seq_id];
1591 if (domain_use_first_level(domain))
1592 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1594 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1597 if (!cap_caching_mode(iommu->cap))
1598 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1599 0, MAX_AGAW_PFN_WIDTH);
1603 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1606 unsigned long flags;
1608 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1611 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1612 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1613 pmen &= ~DMA_PMEN_EPM;
1614 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1616 /* wait for the protected region status bit to clear */
1617 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1618 readl, !(pmen & DMA_PMEN_PRS), pmen);
1620 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1623 static void iommu_enable_translation(struct intel_iommu *iommu)
1626 unsigned long flags;
1628 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1629 iommu->gcmd |= DMA_GCMD_TE;
1630 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1632 /* Make sure hardware complete it */
1633 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1634 readl, (sts & DMA_GSTS_TES), sts);
1636 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1639 static void iommu_disable_translation(struct intel_iommu *iommu)
1644 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1645 iommu->gcmd &= ~DMA_GCMD_TE;
1646 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1648 /* Make sure hardware complete it */
1649 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1650 readl, (!(sts & DMA_GSTS_TES)), sts);
1652 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1655 static int iommu_init_domains(struct intel_iommu *iommu)
1657 u32 ndomains, nlongs;
1660 ndomains = cap_ndoms(iommu->cap);
1661 pr_debug("%s: Number of Domains supported <%d>\n",
1662 iommu->name, ndomains);
1663 nlongs = BITS_TO_LONGS(ndomains);
1665 spin_lock_init(&iommu->lock);
1667 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1668 if (!iommu->domain_ids) {
1669 pr_err("%s: Allocating domain id array failed\n",
1674 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1675 iommu->domains = kzalloc(size, GFP_KERNEL);
1677 if (iommu->domains) {
1678 size = 256 * sizeof(struct dmar_domain *);
1679 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1682 if (!iommu->domains || !iommu->domains[0]) {
1683 pr_err("%s: Allocating domain array failed\n",
1685 kfree(iommu->domain_ids);
1686 kfree(iommu->domains);
1687 iommu->domain_ids = NULL;
1688 iommu->domains = NULL;
1693 * If Caching mode is set, then invalid translations are tagged
1694 * with domain-id 0, hence we need to pre-allocate it. We also
1695 * use domain-id 0 as a marker for non-allocated domain-id, so
1696 * make sure it is not used for a real domain.
1698 set_bit(0, iommu->domain_ids);
1701 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1702 * entry for first-level or pass-through translation modes should
1703 * be programmed with a domain id different from those used for
1704 * second-level or nested translation. We reserve a domain id for
1705 * this purpose.
1707 if (sm_supported(iommu))
1708 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1713 static void disable_dmar_iommu(struct intel_iommu *iommu)
1715 struct device_domain_info *info, *tmp;
1716 unsigned long flags;
1718 if (!iommu->domains || !iommu->domain_ids)
1721 spin_lock_irqsave(&device_domain_lock, flags);
1722 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1723 if (info->iommu != iommu)
1726 if (!info->dev || !info->domain)
1729 __dmar_remove_one_dev_info(info);
1731 spin_unlock_irqrestore(&device_domain_lock, flags);
1733 if (iommu->gcmd & DMA_GCMD_TE)
1734 iommu_disable_translation(iommu);
1737 static void free_dmar_iommu(struct intel_iommu *iommu)
1739 if ((iommu->domains) && (iommu->domain_ids)) {
1740 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1743 for (i = 0; i < elems; i++)
1744 kfree(iommu->domains[i]);
1745 kfree(iommu->domains);
1746 kfree(iommu->domain_ids);
1747 iommu->domains = NULL;
1748 iommu->domain_ids = NULL;
1751 g_iommus[iommu->seq_id] = NULL;
1753 /* free context mapping */
1754 free_context_table(iommu);
1756 #ifdef CONFIG_INTEL_IOMMU_SVM
1757 if (pasid_supported(iommu)) {
1758 if (ecap_prs(iommu->ecap))
1759 intel_svm_finish_prq(iommu);
1765 * Check and return whether first level is used by default for
1766 * DMA translation.
1768 static bool first_level_by_default(void)
1770 struct dmar_drhd_unit *drhd;
1771 struct intel_iommu *iommu;
1772 static int first_level_support = -1;
1774 if (likely(first_level_support != -1))
1775 return first_level_support;
1777 first_level_support = 1;
1780 for_each_active_iommu(iommu, drhd) {
1781 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1782 first_level_support = 0;
1788 return first_level_support;
1791 static struct dmar_domain *alloc_domain(int flags)
1793 struct dmar_domain *domain;
1795 domain = alloc_domain_mem();
1799 memset(domain, 0, sizeof(*domain));
1800 domain->nid = NUMA_NO_NODE;
1801 domain->flags = flags;
1802 if (first_level_by_default())
1803 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1804 domain->has_iotlb_device = false;
1805 INIT_LIST_HEAD(&domain->devices);
1810 /* Must be called with iommu->lock */
1811 static int domain_attach_iommu(struct dmar_domain *domain,
1812 struct intel_iommu *iommu)
1814 unsigned long ndomains;
1817 assert_spin_locked(&device_domain_lock);
1818 assert_spin_locked(&iommu->lock);
1820 domain->iommu_refcnt[iommu->seq_id] += 1;
1821 domain->iommu_count += 1;
1822 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1823 ndomains = cap_ndoms(iommu->cap);
1824 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1826 if (num >= ndomains) {
1827 pr_err("%s: No free domain ids\n", iommu->name);
1828 domain->iommu_refcnt[iommu->seq_id] -= 1;
1829 domain->iommu_count -= 1;
1833 set_bit(num, iommu->domain_ids);
1834 set_iommu_domain(iommu, num, domain);
1836 domain->iommu_did[iommu->seq_id] = num;
1837 domain->nid = iommu->node;
1839 domain_update_iommu_cap(domain);
1845 static int domain_detach_iommu(struct dmar_domain *domain,
1846 struct intel_iommu *iommu)
1850 assert_spin_locked(&device_domain_lock);
1851 assert_spin_locked(&iommu->lock);
1853 domain->iommu_refcnt[iommu->seq_id] -= 1;
1854 count = --domain->iommu_count;
1855 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1856 num = domain->iommu_did[iommu->seq_id];
1857 clear_bit(num, iommu->domain_ids);
1858 set_iommu_domain(iommu, num, NULL);
1860 domain_update_iommu_cap(domain);
1861 domain->iommu_did[iommu->seq_id] = 0;
1867 static struct iova_domain reserved_iova_list;
1868 static struct lock_class_key reserved_rbtree_key;
1870 static int dmar_init_reserved_ranges(void)
1872 struct pci_dev *pdev = NULL;
1876 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1878 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1879 &reserved_rbtree_key);
1881 /* IOAPIC ranges shouldn't be accessed by DMA */
1882 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1883 IOVA_PFN(IOAPIC_RANGE_END));
1885 pr_err("Reserve IOAPIC range failed\n");
1889 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1890 for_each_pci_dev(pdev) {
1893 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1894 r = &pdev->resource[i];
1895 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1897 iova = reserve_iova(&reserved_iova_list,
1901 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1909 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1911 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1914 static inline int guestwidth_to_adjustwidth(int gaw)
1917 int r = (gaw - 12) % 9;
1928 static void domain_exit(struct dmar_domain *domain)
1931 /* Remove associated devices and clear attached or cached domains */
1932 domain_remove_dev_info(domain);
1935 put_iova_domain(&domain->iovad);
1938 struct page *freelist;
1940 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1941 dma_free_pagelist(freelist);
1944 free_domain_mem(domain);
1948 * Get the PASID directory size for scalable mode context entry.
1949 * Value of X in the PDTS field of a scalable mode context entry
1950 * indicates PASID directory with 2^(X + 7) entries.
1952 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1956 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1957 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1965 * Set the RID_PASID field of a scalable mode context entry. The
1966 * IOMMU hardware will use the PASID value set in this field for
1967 * DMA translations of DMA requests without PASID.
1970 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1972 context->hi |= pasid & ((1 << 20) - 1);
1973 context->hi |= (1 << 20);
1977 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1978 * entry.
1980 static inline void context_set_sm_dte(struct context_entry *context)
1982 context->lo |= (1 << 2);
1986 * Set the PRE(Page Request Enable) field of a scalable mode context
1987 * entry.
1989 static inline void context_set_sm_pre(struct context_entry *context)
1991 context->lo |= (1 << 4);
1994 /* Convert value to context PASID directory size field coding. */
1995 #define context_pdts(pds) (((pds) & 0x7) << 9)
1997 static int domain_context_mapping_one(struct dmar_domain *domain,
1998 struct intel_iommu *iommu,
1999 struct pasid_table *table,
2002 u16 did = domain->iommu_did[iommu->seq_id];
2003 int translation = CONTEXT_TT_MULTI_LEVEL;
2004 struct device_domain_info *info = NULL;
2005 struct context_entry *context;
2006 unsigned long flags;
2011 if (hw_pass_through && domain_type_is_si(domain))
2012 translation = CONTEXT_TT_PASS_THROUGH;
2014 pr_debug("Set context mapping for %02x:%02x.%d\n",
2015 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2017 BUG_ON(!domain->pgd);
2019 spin_lock_irqsave(&device_domain_lock, flags);
2020 spin_lock(&iommu->lock);
2023 context = iommu_context_addr(iommu, bus, devfn, 1);
2028 if (context_present(context))
2032 * For kdump cases, old valid entries may be cached due to the
2033 * in-flight DMA and copied pgtable, but there is no unmapping
2034 * behaviour for them, thus we need an explicit cache flush for
2035 * the newly-mapped device. For kdump, at this point, the device
2036 * is supposed to finish reset at its driver probe stage, so no
2037 * in-flight DMA will exist, and we don't need to worry anymore
2038 * hereafter.
2040 if (context_copied(context)) {
2041 u16 did_old = context_domain_id(context);
2043 if (did_old < cap_ndoms(iommu->cap)) {
2044 iommu->flush.flush_context(iommu, did_old,
2045 (((u16)bus) << 8) | devfn,
2046 DMA_CCMD_MASK_NOBIT,
2047 DMA_CCMD_DEVICE_INVL);
2048 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2053 context_clear_entry(context);
2055 if (sm_supported(iommu)) {
2060 /* Setup the PASID DIR pointer: */
2061 pds = context_get_sm_pds(table);
2062 context->lo = (u64)virt_to_phys(table->table) |
2065 /* Setup the RID_PASID field: */
2066 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2069 * Setup the Device-TLB enable bit and Page request
2072 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2073 if (info && info->ats_supported)
2074 context_set_sm_dte(context);
2075 if (info && info->pri_supported)
2076 context_set_sm_pre(context);
2078 struct dma_pte *pgd = domain->pgd;
2081 context_set_domain_id(context, did);
2083 if (translation != CONTEXT_TT_PASS_THROUGH) {
2085 * Skip top levels of page tables for iommu which has
2086 * less agaw than default. Unnecessary for PT mode.
2088 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2090 pgd = phys_to_virt(dma_pte_addr(pgd));
2091 if (!dma_pte_present(pgd))
2095 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2096 if (info && info->ats_supported)
2097 translation = CONTEXT_TT_DEV_IOTLB;
2099 translation = CONTEXT_TT_MULTI_LEVEL;
2101 context_set_address_root(context, virt_to_phys(pgd));
2102 context_set_address_width(context, agaw);
2105 * In pass through mode, AW must be programmed to
2106 * indicate the largest AGAW value supported by
2107 * hardware. And ASR is ignored by hardware.
2109 context_set_address_width(context, iommu->msagaw);
2112 context_set_translation_type(context, translation);
2115 context_set_fault_enable(context);
2116 context_set_present(context);
2117 domain_flush_cache(domain, context, sizeof(*context));
2120 * It's a non-present to present mapping. If hardware doesn't cache
2121 * non-present entry we only need to flush the write-buffer. If it
2122 * _does_ cache non-present entries, then it does so in the special
2123 * domain #0, which we have to flush:
2125 if (cap_caching_mode(iommu->cap)) {
2126 iommu->flush.flush_context(iommu, 0,
2127 (((u16)bus) << 8) | devfn,
2128 DMA_CCMD_MASK_NOBIT,
2129 DMA_CCMD_DEVICE_INVL);
2130 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2132 iommu_flush_write_buffer(iommu);
2134 iommu_enable_dev_iotlb(info);
2139 spin_unlock(&iommu->lock);
2140 spin_unlock_irqrestore(&device_domain_lock, flags);
2145 struct domain_context_mapping_data {
2146 struct dmar_domain *domain;
2147 struct intel_iommu *iommu;
2148 struct pasid_table *table;
2151 static int domain_context_mapping_cb(struct pci_dev *pdev,
2152 u16 alias, void *opaque)
2154 struct domain_context_mapping_data *data = opaque;
2156 return domain_context_mapping_one(data->domain, data->iommu,
2157 data->table, PCI_BUS_NUM(alias),
2162 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2164 struct domain_context_mapping_data data;
2165 struct pasid_table *table;
2166 struct intel_iommu *iommu;
2169 iommu = device_to_iommu(dev, &bus, &devfn);
2173 table = intel_pasid_get_table(dev);
2175 if (!dev_is_pci(dev))
2176 return domain_context_mapping_one(domain, iommu, table,
2179 data.domain = domain;
2183 return pci_for_each_dma_alias(to_pci_dev(dev),
2184 &domain_context_mapping_cb, &data);
2187 static int domain_context_mapped_cb(struct pci_dev *pdev,
2188 u16 alias, void *opaque)
2190 struct intel_iommu *iommu = opaque;
2192 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2195 static int domain_context_mapped(struct device *dev)
2197 struct intel_iommu *iommu;
2200 iommu = device_to_iommu(dev, &bus, &devfn);
2204 if (!dev_is_pci(dev))
2205 return device_context_mapped(iommu, bus, devfn);
2207 return !pci_for_each_dma_alias(to_pci_dev(dev),
2208 domain_context_mapped_cb, iommu);
2211 /* Returns a number of VTD pages, but aligned to MM page size */
2212 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2215 host_addr &= ~PAGE_MASK;
2216 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
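/*
 * Example: a buffer starting at page offset 0x234 with size 8KiB gives
 * PAGE_ALIGN(0x234 + 0x2000) == 0x3000, i.e. 3 VT-d pages, covering the
 * partial first and last pages of the transfer.
 */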
2219 /* Return largest possible superpage level for a given mapping */
2220 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2221 unsigned long iov_pfn,
2222 unsigned long phy_pfn,
2223 unsigned long pages)
2225 int support, level = 1;
2226 unsigned long pfnmerge;
2228 support = domain->iommu_superpage;
2230 /* To use a large page, the virtual *and* physical addresses
2231 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2232 of them will mean we have to use smaller pages. So just
2233 merge them and check both at once. */
2234 pfnmerge = iov_pfn | phy_pfn;
2236 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2237 pages >>= VTD_STRIDE_SHIFT;
2240 pfnmerge >>= VTD_STRIDE_SHIFT;
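/*
 * Example: if both iov_pfn and phy_pfn have their low 9 bits clear (2MiB
 * aligned) and at least 512 pages are being mapped, hardware_largepage_caps()
 * picks level 2 and __domain_mapping() below sets DMA_PTE_LARGE_PAGE to use
 * a 2MiB superpage; any misalignment in either address forces 4KiB mappings.
 */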
2247 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2248 struct scatterlist *sg, unsigned long phys_pfn,
2249 unsigned long nr_pages, int prot)
2251 struct dma_pte *first_pte = NULL, *pte = NULL;
2252 phys_addr_t uninitialized_var(pteval);
2253 unsigned long sg_res = 0;
2254 unsigned int largepage_lvl = 0;
2255 unsigned long lvl_pages = 0;
2258 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2260 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2263 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2264 if (domain_use_first_level(domain))
2265 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
2269 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2272 while (nr_pages > 0) {
2276 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2278 sg_res = aligned_nrpages(sg->offset, sg->length);
2279 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2280 sg->dma_length = sg->length;
2281 pteval = (sg_phys(sg) - pgoff) | attr;
2282 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2286 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2288 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2291 /* It is a large page */
2292 if (largepage_lvl > 1) {
2293 unsigned long nr_superpages, end_pfn;
2295 pteval |= DMA_PTE_LARGE_PAGE;
2296 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2298 nr_superpages = sg_res / lvl_pages;
2299 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2302 * Ensure that old small page tables are
2303 * removed to make room for superpage(s).
2304 * We're adding new large pages, so make sure
2305 * we don't remove their parent tables.
2307 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2310 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2314 /* We don't need lock here, nobody else
2315 * touches the iova range
2317 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2319 static int dumps = 5;
2320 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2321 iov_pfn, tmp, (unsigned long long)pteval);
2324 debug_dma_dump_mappings(NULL);
2329 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2331 BUG_ON(nr_pages < lvl_pages);
2332 BUG_ON(sg_res < lvl_pages);
2334 nr_pages -= lvl_pages;
2335 iov_pfn += lvl_pages;
2336 phys_pfn += lvl_pages;
2337 pteval += lvl_pages * VTD_PAGE_SIZE;
2338 sg_res -= lvl_pages;
2340 /* If the next PTE would be the first in a new page, then we
2341 need to flush the cache on the entries we've just written.
2342 And then we'll need to recalculate 'pte', so clear it and
2343 let it get set again in the if (!pte) block above.
2345 If we're done (!nr_pages) we need to flush the cache too.
2347 Also if we've been setting superpages, we may need to
2348 recalculate 'pte' and switch back to smaller pages for the
2349 end of the mapping, if the trailing size is not enough to
2350 use another superpage (i.e. sg_res < lvl_pages). */
2352 if (!nr_pages || first_pte_in_page(pte) ||
2353 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2354 domain_flush_cache(domain, first_pte,
2355 (void *)pte - (void *)first_pte);
2359 if (!sg_res && nr_pages)
2365 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2366 struct scatterlist *sg, unsigned long phys_pfn,
2367 unsigned long nr_pages, int prot)
2370 struct intel_iommu *iommu;
2372 /* Do the real mapping first */
2373 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2377 for_each_domain_iommu(iommu_id, domain) {
2378 iommu = g_iommus[iommu_id];
2379 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2385 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2386 struct scatterlist *sg, unsigned long nr_pages,
2389 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2392 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2393 unsigned long phys_pfn, unsigned long nr_pages,
2396 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
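/*
 * Clear the context entry for one (bus, devfn) and invalidate the context
 * cache and IOTLB for the domain ID that the entry used to carry.
 */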
2399 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2401 unsigned long flags;
2402 struct context_entry *context;
2408 spin_lock_irqsave(&iommu->lock, flags);
2409 context = iommu_context_addr(iommu, bus, devfn, 0);
2411 spin_unlock_irqrestore(&iommu->lock, flags);
2414 did_old = context_domain_id(context);
2415 context_clear_entry(context);
2416 __iommu_flush_cache(iommu, context, sizeof(*context));
2417 spin_unlock_irqrestore(&iommu->lock, flags);
2418 iommu->flush.flush_context(iommu,
2420 (((u16)bus) << 8) | devfn,
2421 DMA_CCMD_MASK_NOBIT,
2422 DMA_CCMD_DEVICE_INVL);
2423 iommu->flush.flush_iotlb(iommu,
2430 static inline void unlink_domain_info(struct device_domain_info *info)
2432 assert_spin_locked(&device_domain_lock);
2433 list_del(&info->link);
2434 list_del(&info->global);
2436 info->dev->archdata.iommu = NULL;
2439 static void domain_remove_dev_info(struct dmar_domain *domain)
2441 struct device_domain_info *info, *tmp;
2442 unsigned long flags;
2444 spin_lock_irqsave(&device_domain_lock, flags);
2445 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2446 __dmar_remove_one_dev_info(info);
2447 spin_unlock_irqrestore(&device_domain_lock, flags);
2450 struct dmar_domain *find_domain(struct device *dev)
2452 struct device_domain_info *info;
2454 if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2457 if (dev_is_pci(dev))
2458 dev = &pci_real_dma_dev(to_pci_dev(dev))->dev;
2460 /* No lock here, assumes no domain exit in normal case */
2461 info = dev->archdata.iommu;
2463 return info->domain;
2468 static void do_deferred_attach(struct device *dev)
2470 struct iommu_domain *domain;
2472 dev->archdata.iommu = NULL;
2473 domain = iommu_get_domain_for_dev(dev);
2475 intel_iommu_attach_device(domain, dev);
2478 static inline struct device_domain_info *
2479 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2481 struct device_domain_info *info;
2483 list_for_each_entry(info, &device_domain_list, global)
2484 if (info->iommu->segment == segment && info->bus == bus &&
2485 info->devfn == devfn)
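/*
 * Set up a first-level (scalable mode) PASID entry that points at the
 * domain's page table, skipping any top levels that exceed the IOMMU's AGAW
 * and selecting 4- or 5-level first-level paging as appropriate.
 */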
2491 static int domain_setup_first_level(struct intel_iommu *iommu,
2492 struct dmar_domain *domain,
2496 int flags = PASID_FLAG_SUPERVISOR_MODE;
2497 struct dma_pte *pgd = domain->pgd;
2501 /* Skip top levels of page tables for IOMMUs which have
2502 * less agaw than the default. Unnecessary for PT mode. */
2504 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2505 pgd = phys_to_virt(dma_pte_addr(pgd));
2506 if (!dma_pte_present(pgd))
2510 level = agaw_to_level(agaw);
2511 if (level != 4 && level != 5)
2514 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2516 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2517 domain->iommu_did[iommu->seq_id],
2521 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2524 struct dmar_domain *domain)
2526 struct dmar_domain *found = NULL;
2527 struct device_domain_info *info;
2528 unsigned long flags;
2531 info = alloc_devinfo_mem();
2536 info->devfn = devfn;
2537 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2538 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2541 info->domain = domain;
2542 info->iommu = iommu;
2543 info->pasid_table = NULL;
2544 info->auxd_enabled = 0;
2545 INIT_LIST_HEAD(&info->auxiliary_domains);
2547 if (dev && dev_is_pci(dev)) {
2548 struct pci_dev *pdev = to_pci_dev(info->dev);
2550 if (!pdev->untrusted &&
2551 !pci_ats_disabled() &&
2552 ecap_dev_iotlb_support(iommu->ecap) &&
2553 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2554 dmar_find_matched_atsr_unit(pdev))
2555 info->ats_supported = 1;
2557 if (sm_supported(iommu)) {
2558 if (pasid_supported(iommu)) {
2559 int features = pci_pasid_features(pdev);
2561 info->pasid_supported = features | 1;
2564 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2565 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2566 info->pri_supported = 1;
2570 spin_lock_irqsave(&device_domain_lock, flags);
2572 found = find_domain(dev);
2575 struct device_domain_info *info2;
2576 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2578 found = info2->domain;
2584 spin_unlock_irqrestore(&device_domain_lock, flags);
2585 free_devinfo_mem(info);
2586 /* Caller must free the original domain */
2590 spin_lock(&iommu->lock);
2591 ret = domain_attach_iommu(domain, iommu);
2592 spin_unlock(&iommu->lock);
2595 spin_unlock_irqrestore(&device_domain_lock, flags);
2596 free_devinfo_mem(info);
2600 list_add(&info->link, &domain->devices);
2601 list_add(&info->global, &device_domain_list);
2603 dev->archdata.iommu = info;
2604 spin_unlock_irqrestore(&device_domain_lock, flags);
2606 /* PASID table is mandatory for a PCI device in scalable mode. */
2607 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2608 ret = intel_pasid_alloc_table(dev);
2610 dev_err(dev, "PASID table allocation failed\n");
2611 dmar_remove_one_dev_info(dev);
2615 /* Set up the PASID entry for requests without PASID: */
2616 spin_lock(&iommu->lock);
2617 if (hw_pass_through && domain_type_is_si(domain))
2618 ret = intel_pasid_setup_pass_through(iommu, domain,
2619 dev, PASID_RID2PASID);
2620 else if (domain_use_first_level(domain))
2621 ret = domain_setup_first_level(iommu, domain, dev,
2624 ret = intel_pasid_setup_second_level(iommu, domain,
2625 dev, PASID_RID2PASID);
2626 spin_unlock(&iommu->lock);
2628 dev_err(dev, "Setup RID2PASID failed\n");
2629 dmar_remove_one_dev_info(dev);
2634 if (dev && domain_context_mapping(domain, dev)) {
2635 dev_err(dev, "Domain context map failed\n");
2636 dmar_remove_one_dev_info(dev);
2643 static int iommu_domain_identity_map(struct dmar_domain *domain,
2644 unsigned long long start,
2645 unsigned long long end)
2647 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2648 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2650 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2651 dma_to_mm_pfn(last_vpfn))) {
2652 pr_err("Reserving iova failed\n");
2656 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2658 /* RMRR range might overlap with the physical memory range; clear any existing mappings first. */
2661 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2663 return __domain_mapping(domain, first_vpfn, NULL,
2664 first_vpfn, last_vpfn - first_vpfn + 1,
2665 DMA_PTE_READ|DMA_PTE_WRITE);
2668 static int md_domain_init(struct dmar_domain *domain, int guest_width);
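/*
 * Build the static identity (si) domain. With hardware pass-through nothing
 * needs to be mapped; otherwise every online memory range and every RMRR is
 * identity mapped so that devices attached to si_domain keep working.
 */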
2670 static int __init si_domain_init(int hw)
2672 struct dmar_rmrr_unit *rmrr;
2676 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2680 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2681 domain_exit(si_domain);
2688 for_each_online_node(nid) {
2689 unsigned long start_pfn, end_pfn;
2692 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2693 ret = iommu_domain_identity_map(si_domain,
2694 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2701 /* Identity map the RMRRs so that devices with RMRRs can also use the si_domain. */
2704 for_each_rmrr_units(rmrr) {
2705 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2707 unsigned long long start = rmrr->base_address;
2708 unsigned long long end = rmrr->end_address;
2710 if (WARN_ON(end < start ||
2711 end >> agaw_to_width(si_domain->agaw)))
2714 ret = iommu_domain_identity_map(si_domain, start, end);
2723 static int identity_mapping(struct device *dev)
2725 struct device_domain_info *info;
2727 info = dev->archdata.iommu;
2729 return (info->domain == si_domain);
2734 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2736 struct dmar_domain *ndomain;
2737 struct intel_iommu *iommu;
2740 iommu = device_to_iommu(dev, &bus, &devfn);
2744 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2745 if (ndomain != domain)
2751 static bool device_has_rmrr(struct device *dev)
2753 struct dmar_rmrr_unit *rmrr;
2758 for_each_rmrr_units(rmrr) {
2760 /* Return TRUE if this RMRR contains the device that we are looking for. */
2763 for_each_active_dev_scope(rmrr->devices,
2764 rmrr->devices_cnt, i, tmp)
2766 is_downstream_to_pci_bridge(dev, tmp)) {
2776 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2777 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2778 * @dev: device handle
2780 * We assume that PCI USB devices with RMRRs have them largely
2781 * for historical reasons and that the RMRR space is not actively used post
2782 * boot. This exclusion may change if vendors begin to abuse it.
2784 * The same exception is made for graphics devices, with the requirement that
2785 * any use of the RMRR regions will be torn down before assigning the device to a guest.
2788 * Return: true if the RMRR is relaxable, false otherwise
2790 static bool device_rmrr_is_relaxable(struct device *dev)
2792 struct pci_dev *pdev;
2794 if (!dev_is_pci(dev))
2797 pdev = to_pci_dev(dev);
2798 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2805 * There are a couple cases where we need to restrict the functionality of
2806 * devices associated with RMRRs. The first is when evaluating a device for
2807 * identity mapping because problems exist when devices are moved in and out
2808 * of domains and their respective RMRR information is lost. This means that
2809 * a device with associated RMRRs will never be in a "passthrough" domain.
2810 * The second is use of the device through the IOMMU API. This interface
2811 * expects to have full control of the IOVA space for the device. We cannot
2812 * satisfy both the requirement that RMRR access is maintained and have an
2813 * unencumbered IOVA space. We also have no ability to quiesce the device's
2814 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2815 * We therefore prevent devices associated with an RMRR from participating in
2816 * the IOMMU API, which eliminates them from device assignment.
2818 * In both cases, devices which have relaxable RMRRs are not concerned by this
2819 * restriction. See device_rmrr_is_relaxable comment.
2821 static bool device_is_rmrr_locked(struct device *dev)
2823 if (!device_has_rmrr(dev))
2826 if (device_rmrr_is_relaxable(dev))
2833 * Return the required default domain type for a specific device.
2835 * @dev: the device in query
2839 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2840 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2841 * - 0: both identity and dynamic domains work for this device
2843 static int device_def_domain_type(struct device *dev)
2845 if (dev_is_pci(dev)) {
2846 struct pci_dev *pdev = to_pci_dev(dev);
2849 * Prevent any device marked as untrusted from getting
2850 * placed into the statically identity mapping domain.
2852 if (pdev->untrusted)
2853 return IOMMU_DOMAIN_DMA;
2855 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2856 return IOMMU_DOMAIN_IDENTITY;
2858 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2859 return IOMMU_DOMAIN_IDENTITY;
2862 * We want to start off with all devices in the 1:1 domain, and
2863 * take them out later if we find they can't access all of memory.
2865 * However, we can't do this for PCI devices behind bridges,
2866 * because all PCI devices behind the same bridge will end up
2867 * with the same source-id on their transactions.
2869 * Practically speaking, we can't change things around for these
2870 * devices at run-time, because we can't be sure there'll be no
2871 * DMA transactions in flight for any of their siblings.
2873 * So PCI devices (unless they're on the root bus) as well as
2874 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2875 * the 1:1 domain, just in _case_ one of their siblings turns out
2876 * not to be able to map all of memory.
2878 if (!pci_is_pcie(pdev)) {
2879 if (!pci_is_root_bus(pdev->bus))
2880 return IOMMU_DOMAIN_DMA;
2881 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2882 return IOMMU_DOMAIN_DMA;
2883 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2884 return IOMMU_DOMAIN_DMA;
2890 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2893 /* Start from a sane iommu hardware state.
2894 * If queued invalidation was already initialized by us
2895 * (for example, while enabling interrupt remapping) then
2896 * things are already rolling from a sane state. */
2900 * Clear any previous faults.
2902 dmar_fault(-1, iommu);
2904 * Disable queued invalidation if supported and already enabled
2905 * before OS handover.
2907 dmar_disable_qi(iommu);
2910 if (dmar_enable_qi(iommu)) {
2912 /* Queued Invalidation is not enabled; use Register Based Invalidation. */
2914 iommu->flush.flush_context = __iommu_flush_context;
2915 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2916 pr_info("%s: Using Register based invalidation\n",
2919 iommu->flush.flush_context = qi_flush_context;
2920 iommu->flush.flush_iotlb = qi_flush_iotlb;
2921 pr_info("%s: Using Queued invalidation\n", iommu->name);
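/*
 * Copy the context table for one bus from the previous kernel (found via the
 * old root entry) into freshly allocated pages, marking each copied entry so
 * that it can be recognized and replaced later.
 */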
2925 static int copy_context_table(struct intel_iommu *iommu,
2926 struct root_entry *old_re,
2927 struct context_entry **tbl,
2930 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2931 struct context_entry *new_ce = NULL, ce;
2932 struct context_entry *old_ce = NULL;
2933 struct root_entry re;
2934 phys_addr_t old_ce_phys;
2936 tbl_idx = ext ? bus * 2 : bus;
2937 memcpy(&re, old_re, sizeof(re));
2939 for (devfn = 0; devfn < 256; devfn++) {
2940 /* First calculate the correct index */
2941 idx = (ext ? devfn * 2 : devfn) % 256;
2944 /* First save what we may have and clean up */
2946 tbl[tbl_idx] = new_ce;
2947 __iommu_flush_cache(iommu, new_ce,
2957 old_ce_phys = root_entry_lctp(&re);
2959 old_ce_phys = root_entry_uctp(&re);
2962 if (ext && devfn == 0) {
2963 /* No LCTP, try UCTP */
2972 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2977 new_ce = alloc_pgtable_page(iommu->node);
2984 /* Now copy the context entry */
2985 memcpy(&ce, old_ce + idx, sizeof(ce));
2987 if (!__context_present(&ce))
2990 did = context_domain_id(&ce);
2991 if (did >= 0 && did < cap_ndoms(iommu->cap))
2992 set_bit(did, iommu->domain_ids);
2995 * We need a marker for copied context entries. This
2996 * marker needs to work for the old format as well as
2997 * for extended context entries.
2999 * Bit 67 of the context entry is used. In the old
3000 * format this bit is available to software, in the
3001 * extended format it is the PGE bit, but PGE is ignored
3002 * by HW if PASIDs are disabled (and thus still available).
3005 * So disable PASIDs first and then mark the entry
3006 * copied. This means that we don't copy PASID
3007 * translations from the old kernel, but this is fine as
3008 * faults there are not fatal.
3010 context_clear_pasid_enable(&ce);
3011 context_set_copied(&ce);
3016 tbl[tbl_idx + pos] = new_ce;
3018 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
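/*
 * Copy the root and context tables that the previous (e.g. crashed) kernel
 * left behind so that DMA set up before the kdump kernel booted keeps
 * working until devices are reinitialized.
 */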
3027 static int copy_translation_tables(struct intel_iommu *iommu)
3029 struct context_entry **ctxt_tbls;
3030 struct root_entry *old_rt;
3031 phys_addr_t old_rt_phys;
3032 int ctxt_table_entries;
3033 unsigned long flags;
3038 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3039 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3040 new_ext = !!ecap_ecs(iommu->ecap);
3043 * The RTT bit can only be changed when translation is disabled,
3044 * but disabling translation would open a window for data
3045 * corruption. So bail out and don't copy anything if we would
3046 * have to change the bit.
3051 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3055 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3059 /* This is too big for the stack - allocate it from slab */
3060 ctxt_table_entries = ext ? 512 : 256;
3062 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3066 for (bus = 0; bus < 256; bus++) {
3067 ret = copy_context_table(iommu, &old_rt[bus],
3068 ctxt_tbls, bus, ext);
3070 pr_err("%s: Failed to copy context table for bus %d\n",
3076 spin_lock_irqsave(&iommu->lock, flags);
3078 /* Context tables are copied, now write them to the root_entry table */
3079 for (bus = 0; bus < 256; bus++) {
3080 int idx = ext ? bus * 2 : bus;
3083 if (ctxt_tbls[idx]) {
3084 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3085 iommu->root_entry[bus].lo = val;
3088 if (!ext || !ctxt_tbls[idx + 1])
3091 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3092 iommu->root_entry[bus].hi = val;
3095 spin_unlock_irqrestore(&iommu->lock, flags);
3099 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3109 static int __init init_dmars(void)
3111 struct dmar_drhd_unit *drhd;
3112 struct intel_iommu *iommu;
3118 * initialize and program root entry to not present
3121 for_each_drhd_unit(drhd) {
3123 /* Lock not needed as this is only incremented in the single-
3124 * threaded kernel __init code path; all other accesses are read-only. */
3127 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3131 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3134 /* Preallocate enough resources for IOMMU hot-addition */
3135 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3136 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3138 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3141 pr_err("Allocating global iommu array failed\n");
3146 for_each_iommu(iommu, drhd) {
3147 if (drhd->ignored) {
3148 iommu_disable_translation(iommu);
3153 /* Find the max pasid size of all IOMMUs in the system.
3154 * We need to ensure the system pasid table is no bigger
3155 * than the smallest supported size. */
3157 if (pasid_supported(iommu)) {
3158 u32 temp = 2 << ecap_pss(iommu->ecap);
3160 intel_pasid_max_id = min_t(u32, temp,
3161 intel_pasid_max_id);
3164 g_iommus[iommu->seq_id] = iommu;
3166 intel_iommu_init_qi(iommu);
3168 ret = iommu_init_domains(iommu);
3172 init_translation_status(iommu);
3174 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3175 iommu_disable_translation(iommu);
3176 clear_translation_pre_enabled(iommu);
3177 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3183 /* TBD: we could share the same root & context tables
3184 * among all IOMMUs; this needs to be split out later. */
3186 ret = iommu_alloc_root_entry(iommu);
3190 if (translation_pre_enabled(iommu)) {
3191 pr_info("Translation already enabled - trying to copy translation structures\n");
3193 ret = copy_translation_tables(iommu);
3196 * We found the IOMMU with translation
3197 * enabled - but failed to copy over the
3198 * old root-entry table. Try to proceed
3199 * by disabling translation now and
3200 * allocating a clean root-entry table.
3201 * This might cause DMAR faults, but
3202 * probably the dump will still succeed.
3204 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3206 iommu_disable_translation(iommu);
3207 clear_translation_pre_enabled(iommu);
3209 pr_info("Copied translation tables from previous kernel for %s\n",
3214 if (!ecap_pass_through(iommu->ecap))
3215 hw_pass_through = 0;
3216 intel_svm_check(iommu);
3220 * Now that qi is enabled on all iommus, set the root entry and flush
3221 * caches. This is required on some Intel X58 chipsets, otherwise the
3222 * flush_context function will loop forever and the boot hangs.
3224 for_each_active_iommu(iommu, drhd) {
3225 iommu_flush_write_buffer(iommu);
3226 iommu_set_root_entry(iommu);
3227 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3228 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3231 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3236 iommu_identity_mapping |= IDENTMAP_GFX;
3238 check_tylersburg_isoch();
3240 ret = si_domain_init(hw_pass_through);
3247 * global invalidate context cache
3248 * global invalidate iotlb
3249 * enable translation
3251 for_each_iommu(iommu, drhd) {
3252 if (drhd->ignored) {
3254 /* We always have to disable PMRs or DMA may fail on this IOMMU. */
3258 iommu_disable_protect_mem_regions(iommu);
3262 iommu_flush_write_buffer(iommu);
3264 #ifdef CONFIG_INTEL_IOMMU_SVM
3265 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3267 /* Calling dmar_alloc_hwirq() with dmar_global_lock held
3268 * could cause a lock race, so drop the lock around it. */
3270 up_write(&dmar_global_lock);
3271 ret = intel_svm_enable_prq(iommu);
3272 down_write(&dmar_global_lock);
3277 ret = dmar_set_interrupt(iommu);
3285 for_each_active_iommu(iommu, drhd) {
3286 disable_dmar_iommu(iommu);
3287 free_dmar_iommu(iommu);
3296 /* This takes a number of _MM_ pages, not VTD pages */
3297 static unsigned long intel_alloc_iova(struct device *dev,
3298 struct dmar_domain *domain,
3299 unsigned long nrpages, uint64_t dma_mask)
3301 unsigned long iova_pfn;
3304 /* Restrict dma_mask to the width that the iommu can handle.
3305 * First-level translation restricts the input-address to a
3306 * canonical address (i.e., address bits 63:N have the same
3307 * value as address bit [N-1], where N is 48-bits with 4-level
3308 * paging and 57-bits with 5-level paging). Hence, skip bit [N-1]. */
3311 if (domain_use_first_level(domain))
3312 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3315 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3318 /* Ensure we reserve the whole size-aligned region */
3319 nrpages = __roundup_pow_of_two(nrpages);
3321 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3323 /* First try to allocate an io virtual address in
3324 * DMA_BIT_MASK(32) and if that fails then try allocating from the higher range. */
3327 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3328 IOVA_PFN(DMA_BIT_MASK(32)), false);
3332 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3333 IOVA_PFN(dma_mask), true);
3334 if (unlikely(!iova_pfn)) {
3335 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3343 /* Check if the dev needs to go through the non-identity map and unmap process. */
3344 static bool iommu_need_mapping(struct device *dev)
3346 if (iommu_dummy(dev))
3349 if (unlikely(attach_deferred(dev)))
3350 do_deferred_attach(dev);
3352 return !identity_mapping(dev);
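/*
 * Map a physically contiguous buffer for DMA: allocate an IOVA range that
 * fits the device's DMA mask, install the page table entries and return the
 * resulting bus address (including the sub-page offset of @paddr).
 */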
3355 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3356 size_t size, int dir, u64 dma_mask)
3358 struct dmar_domain *domain;
3359 phys_addr_t start_paddr;
3360 unsigned long iova_pfn;
3363 struct intel_iommu *iommu;
3364 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3366 BUG_ON(dir == DMA_NONE);
3368 domain = find_domain(dev);
3370 return DMA_MAPPING_ERROR;
3372 iommu = domain_get_iommu(domain);
3373 size = aligned_nrpages(paddr, size);
3375 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3380 /* Check if DMAR supports zero-length reads on write-only mappings. */
3383 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3384 !cap_zlr(iommu->cap))
3385 prot |= DMA_PTE_READ;
3386 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3387 prot |= DMA_PTE_WRITE;
3389 /* paddr - (paddr + size) might span a partial page; we should map the
3390 * whole page. Note: if two parts of one page are mapped separately, we
3391 * might have two guest addresses mapping to the same host paddr, but this
3392 * is not a big problem. */
3394 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3395 mm_to_dma_pfn(paddr_pfn), size, prot);
3399 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3400 start_paddr += paddr & ~PAGE_MASK;
3402 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3408 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3409 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3410 size, (unsigned long long)paddr, dir);
3411 return DMA_MAPPING_ERROR;
3414 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3415 unsigned long offset, size_t size,
3416 enum dma_data_direction dir,
3417 unsigned long attrs)
3419 if (iommu_need_mapping(dev))
3420 return __intel_map_single(dev, page_to_phys(page) + offset,
3421 size, dir, *dev->dma_mask);
3422 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3425 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3426 size_t size, enum dma_data_direction dir,
3427 unsigned long attrs)
3429 if (iommu_need_mapping(dev))
3430 return __intel_map_single(dev, phys_addr, size, dir,
3432 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
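/*
 * Tear down a DMA mapping: unmap the page table entries and either flush the
 * IOTLB immediately (strict mode, untrusted devices, or no flush queue) or
 * defer the flush by queueing the IOVA range and its freelist.
 */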
3435 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3437 struct dmar_domain *domain;
3438 unsigned long start_pfn, last_pfn;
3439 unsigned long nrpages;
3440 unsigned long iova_pfn;
3441 struct intel_iommu *iommu;
3442 struct page *freelist;
3443 struct pci_dev *pdev = NULL;
3445 domain = find_domain(dev);
3448 iommu = domain_get_iommu(domain);
3450 iova_pfn = IOVA_PFN(dev_addr);
3452 nrpages = aligned_nrpages(dev_addr, size);
3453 start_pfn = mm_to_dma_pfn(iova_pfn);
3454 last_pfn = start_pfn + nrpages - 1;
3456 if (dev_is_pci(dev))
3457 pdev = to_pci_dev(dev);
3459 freelist = domain_unmap(domain, start_pfn, last_pfn);
3460 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3461 !has_iova_flush_queue(&domain->iovad)) {
3462 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3463 nrpages, !freelist, 0);
3465 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3466 dma_free_pagelist(freelist);
3468 queue_iova(&domain->iovad, iova_pfn, nrpages,
3469 (unsigned long)freelist);
3471 * queue up the release of the unmap to save the 1/6th of the
3472 * cpu used up by the iotlb flush operation...
3476 trace_unmap_single(dev, dev_addr, size);
3479 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3480 size_t size, enum dma_data_direction dir,
3481 unsigned long attrs)
3483 if (iommu_need_mapping(dev))
3484 intel_unmap(dev, dev_addr, size);
3486 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3489 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3490 size_t size, enum dma_data_direction dir, unsigned long attrs)
3492 if (iommu_need_mapping(dev))
3493 intel_unmap(dev, dev_addr, size);
3496 static void *intel_alloc_coherent(struct device *dev, size_t size,
3497 dma_addr_t *dma_handle, gfp_t flags,
3498 unsigned long attrs)
3500 struct page *page = NULL;
3503 if (!iommu_need_mapping(dev))
3504 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3506 size = PAGE_ALIGN(size);
3507 order = get_order(size);
3509 if (gfpflags_allow_blocking(flags)) {
3510 unsigned int count = size >> PAGE_SHIFT;
3512 page = dma_alloc_from_contiguous(dev, count, order,
3513 flags & __GFP_NOWARN);
3517 page = alloc_pages(flags, order);
3520 memset(page_address(page), 0, size);
3522 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3524 dev->coherent_dma_mask);
3525 if (*dma_handle != DMA_MAPPING_ERROR)
3526 return page_address(page);
3527 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3528 __free_pages(page, order);
3533 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3534 dma_addr_t dma_handle, unsigned long attrs)
3537 struct page *page = virt_to_page(vaddr);
3539 if (!iommu_need_mapping(dev))
3540 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3542 size = PAGE_ALIGN(size);
3543 order = get_order(size);
3545 intel_unmap(dev, dma_handle, size);
3546 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3547 __free_pages(page, order);
3550 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3551 int nelems, enum dma_data_direction dir,
3552 unsigned long attrs)
3554 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3555 unsigned long nrpages = 0;
3556 struct scatterlist *sg;
3559 if (!iommu_need_mapping(dev))
3560 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3562 for_each_sg(sglist, sg, nelems, i) {
3563 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3566 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3568 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3571 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3572 enum dma_data_direction dir, unsigned long attrs)
3575 struct dmar_domain *domain;
3578 unsigned long iova_pfn;
3580 struct scatterlist *sg;
3581 unsigned long start_vpfn;
3582 struct intel_iommu *iommu;
3584 BUG_ON(dir == DMA_NONE);
3585 if (!iommu_need_mapping(dev))
3586 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3588 domain = find_domain(dev);
3592 iommu = domain_get_iommu(domain);
3594 for_each_sg(sglist, sg, nelems, i)
3595 size += aligned_nrpages(sg->offset, sg->length);
3597 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3600 sglist->dma_length = 0;
3605 /* Check if DMAR supports zero-length reads on write-only mappings. */
3608 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3609 !cap_zlr(iommu->cap))
3610 prot |= DMA_PTE_READ;
3611 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3612 prot |= DMA_PTE_WRITE;
3614 start_vpfn = mm_to_dma_pfn(iova_pfn);
3616 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3617 if (unlikely(ret)) {
3618 dma_pte_free_pagetable(domain, start_vpfn,
3619 start_vpfn + size - 1,
3620 agaw_to_level(domain->agaw) + 1);
3621 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3625 for_each_sg(sglist, sg, nelems, i)
3626 trace_map_sg(dev, i + 1, nelems, sg);
3631 static u64 intel_get_required_mask(struct device *dev)
3633 if (!iommu_need_mapping(dev))
3634 return dma_direct_get_required_mask(dev);
3635 return DMA_BIT_MASK(32);
3638 static const struct dma_map_ops intel_dma_ops = {
3639 .alloc = intel_alloc_coherent,
3640 .free = intel_free_coherent,
3641 .map_sg = intel_map_sg,
3642 .unmap_sg = intel_unmap_sg,
3643 .map_page = intel_map_page,
3644 .unmap_page = intel_unmap_page,
3645 .map_resource = intel_map_resource,
3646 .unmap_resource = intel_unmap_resource,
3647 .dma_supported = dma_direct_supported,
3648 .mmap = dma_common_mmap,
3649 .get_sgtable = dma_common_get_sgtable,
3650 .get_required_mask = intel_get_required_mask,
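/*
 * Bounce-buffer DMA helpers, used for untrusted devices: buffers that are not
 * aligned to an IOMMU page are first copied into swiotlb slots so a device
 * can never see unrelated data sharing the same IOMMU page.
 */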
3654 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3655 enum dma_data_direction dir, enum dma_sync_target target)
3657 struct dmar_domain *domain;
3658 phys_addr_t tlb_addr;
3660 domain = find_domain(dev);
3661 if (WARN_ON(!domain))
3664 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3665 if (is_swiotlb_buffer(tlb_addr))
3666 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3670 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3671 enum dma_data_direction dir, unsigned long attrs,
3674 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3675 struct dmar_domain *domain;
3676 struct intel_iommu *iommu;
3677 unsigned long iova_pfn;
3678 unsigned long nrpages;
3679 phys_addr_t tlb_addr;
3683 if (unlikely(attach_deferred(dev)))
3684 do_deferred_attach(dev);
3686 domain = find_domain(dev);
3688 if (WARN_ON(dir == DMA_NONE || !domain))
3689 return DMA_MAPPING_ERROR;
3691 iommu = domain_get_iommu(domain);
3692 if (WARN_ON(!iommu))
3693 return DMA_MAPPING_ERROR;
3695 nrpages = aligned_nrpages(0, size);
3696 iova_pfn = intel_alloc_iova(dev, domain,
3697 dma_to_mm_pfn(nrpages), dma_mask);
3699 return DMA_MAPPING_ERROR;
3702 /* Check if DMAR supports zero-length reads on write-only mappings. */
3705 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3706 !cap_zlr(iommu->cap))
3707 prot |= DMA_PTE_READ;
3708 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3709 prot |= DMA_PTE_WRITE;
3712 * If both the physical buffer start address and size are
3713 * page aligned, we don't need to use a bounce page.
3715 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3716 tlb_addr = swiotlb_tbl_map_single(dev,
3717 __phys_to_dma(dev, io_tlb_start),
3718 paddr, size, aligned_size, dir, attrs);
3719 if (tlb_addr == DMA_MAPPING_ERROR) {
3722 /* Cleanup the padding area. */
3723 void *padding_start = phys_to_virt(tlb_addr);
3724 size_t padding_size = aligned_size;
3726 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3727 (dir == DMA_TO_DEVICE ||
3728 dir == DMA_BIDIRECTIONAL)) {
3729 padding_start += size;
3730 padding_size -= size;
3733 memset(padding_start, 0, padding_size);
3739 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3740 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3744 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3746 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3749 if (is_swiotlb_buffer(tlb_addr))
3750 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3751 aligned_size, dir, attrs);
3753 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3754 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3755 size, (unsigned long long)paddr, dir);
3757 return DMA_MAPPING_ERROR;
3761 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3762 enum dma_data_direction dir, unsigned long attrs)
3764 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3765 struct dmar_domain *domain;
3766 phys_addr_t tlb_addr;
3768 domain = find_domain(dev);
3769 if (WARN_ON(!domain))
3772 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3773 if (WARN_ON(!tlb_addr))
3776 intel_unmap(dev, dev_addr, size);
3777 if (is_swiotlb_buffer(tlb_addr))
3778 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3779 aligned_size, dir, attrs);
3781 trace_bounce_unmap_single(dev, dev_addr, size);
3785 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3786 size_t size, enum dma_data_direction dir, unsigned long attrs)
3788 return bounce_map_single(dev, page_to_phys(page) + offset,
3789 size, dir, attrs, *dev->dma_mask);
3793 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3794 enum dma_data_direction dir, unsigned long attrs)
3796 return bounce_map_single(dev, phys_addr, size,
3797 dir, attrs, *dev->dma_mask);
3801 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3802 enum dma_data_direction dir, unsigned long attrs)
3804 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3808 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3809 enum dma_data_direction dir, unsigned long attrs)
3811 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3815 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3816 enum dma_data_direction dir, unsigned long attrs)
3818 struct scatterlist *sg;
3821 for_each_sg(sglist, sg, nelems, i)
3822 bounce_unmap_page(dev, sg->dma_address,
3823 sg_dma_len(sg), dir, attrs);
3827 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3828 enum dma_data_direction dir, unsigned long attrs)
3831 struct scatterlist *sg;
3833 for_each_sg(sglist, sg, nelems, i) {
3834 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3835 sg->offset, sg->length,
3837 if (sg->dma_address == DMA_MAPPING_ERROR)
3839 sg_dma_len(sg) = sg->length;
3842 for_each_sg(sglist, sg, nelems, i)
3843 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3848 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3853 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3854 size_t size, enum dma_data_direction dir)
3856 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3860 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3861 size_t size, enum dma_data_direction dir)
3863 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3867 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3868 int nelems, enum dma_data_direction dir)
3870 struct scatterlist *sg;
3873 for_each_sg(sglist, sg, nelems, i)
3874 bounce_sync_single(dev, sg_dma_address(sg),
3875 sg_dma_len(sg), dir, SYNC_FOR_CPU);
3879 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3880 int nelems, enum dma_data_direction dir)
3882 struct scatterlist *sg;
3885 for_each_sg(sglist, sg, nelems, i)
3886 bounce_sync_single(dev, sg_dma_address(sg),
3887 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3890 static const struct dma_map_ops bounce_dma_ops = {
3891 .alloc = intel_alloc_coherent,
3892 .free = intel_free_coherent,
3893 .map_sg = bounce_map_sg,
3894 .unmap_sg = bounce_unmap_sg,
3895 .map_page = bounce_map_page,
3896 .unmap_page = bounce_unmap_page,
3897 .sync_single_for_cpu = bounce_sync_single_for_cpu,
3898 .sync_single_for_device = bounce_sync_single_for_device,
3899 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
3900 .sync_sg_for_device = bounce_sync_sg_for_device,
3901 .map_resource = bounce_map_resource,
3902 .unmap_resource = bounce_unmap_resource,
3903 .dma_supported = dma_direct_supported,
3906 static inline int iommu_domain_cache_init(void)
3910 iommu_domain_cache = kmem_cache_create("iommu_domain",
3911 sizeof(struct dmar_domain),
3916 if (!iommu_domain_cache) {
3917 pr_err("Couldn't create iommu_domain cache\n");
3924 static inline int iommu_devinfo_cache_init(void)
3928 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3929 sizeof(struct device_domain_info),
3933 if (!iommu_devinfo_cache) {
3934 pr_err("Couldn't create devinfo cache\n");
3941 static int __init iommu_init_mempool(void)
3944 ret = iova_cache_get();
3948 ret = iommu_domain_cache_init();
3952 ret = iommu_devinfo_cache_init();
3956 kmem_cache_destroy(iommu_domain_cache);
3963 static void __init iommu_exit_mempool(void)
3965 kmem_cache_destroy(iommu_devinfo_cache);
3966 kmem_cache_destroy(iommu_domain_cache);
3970 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3972 struct dmar_drhd_unit *drhd;
3976 /* We know that this device on this chipset has its own IOMMU.
3977 * If we find it under a different IOMMU, then the BIOS is lying
3978 * to us. Hope that the IOMMU for this device is actually
3979 * disabled, and it needs no translation...
3981 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3983 /* "can't" happen */
3984 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3987 vtbar &= 0xffff0000;
3989 /* we know that this iommu should be at offset 0xa000 from vtbar */
3990 drhd = dmar_find_matched_drhd_unit(pdev);
3991 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
3992 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
3993 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3994 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3997 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3999 static void __init init_no_remapping_devices(void)
4001 struct dmar_drhd_unit *drhd;
4005 for_each_drhd_unit(drhd) {
4006 if (!drhd->include_all) {
4007 for_each_active_dev_scope(drhd->devices,
4008 drhd->devices_cnt, i, dev)
4010 /* ignore DMAR unit if no devices exist */
4011 if (i == drhd->devices_cnt)
4016 for_each_active_drhd_unit(drhd) {
4017 if (drhd->include_all)
4020 for_each_active_dev_scope(drhd->devices,
4021 drhd->devices_cnt, i, dev)
4022 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4024 if (i < drhd->devices_cnt)
4027 /* This IOMMU has *only* gfx devices. Either bypass it or
4028 set the gfx_mapped flag, as appropriate */
4029 if (!dmar_map_gfx) {
4031 for_each_active_dev_scope(drhd->devices,
4032 drhd->devices_cnt, i, dev)
4033 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4038 #ifdef CONFIG_SUSPEND
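/*
 * System suspend/resume support: on resume, re-enable queued invalidation,
 * reprogram the root entries, perform global context/IOTLB invalidations and
 * turn translation back on for every active IOMMU.
 */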
4039 static int init_iommu_hw(void)
4041 struct dmar_drhd_unit *drhd;
4042 struct intel_iommu *iommu = NULL;
4044 for_each_active_iommu(iommu, drhd)
4046 dmar_reenable_qi(iommu);
4048 for_each_iommu(iommu, drhd) {
4049 if (drhd->ignored) {
4051 /* We always have to disable PMRs or DMA may fail on this IOMMU. */
4055 iommu_disable_protect_mem_regions(iommu);
4059 iommu_flush_write_buffer(iommu);
4061 iommu_set_root_entry(iommu);
4063 iommu->flush.flush_context(iommu, 0, 0, 0,
4064 DMA_CCMD_GLOBAL_INVL);
4065 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4066 iommu_enable_translation(iommu);
4067 iommu_disable_protect_mem_regions(iommu);
4073 static void iommu_flush_all(void)
4075 struct dmar_drhd_unit *drhd;
4076 struct intel_iommu *iommu;
4078 for_each_active_iommu(iommu, drhd) {
4079 iommu->flush.flush_context(iommu, 0, 0, 0,
4080 DMA_CCMD_GLOBAL_INVL);
4081 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4082 DMA_TLB_GLOBAL_FLUSH);
4086 static int iommu_suspend(void)
4088 struct dmar_drhd_unit *drhd;
4089 struct intel_iommu *iommu = NULL;
4092 for_each_active_iommu(iommu, drhd) {
4093 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4095 if (!iommu->iommu_state)
4101 for_each_active_iommu(iommu, drhd) {
4102 iommu_disable_translation(iommu);
4104 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4106 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4107 readl(iommu->reg + DMAR_FECTL_REG);
4108 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4109 readl(iommu->reg + DMAR_FEDATA_REG);
4110 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4111 readl(iommu->reg + DMAR_FEADDR_REG);
4112 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4113 readl(iommu->reg + DMAR_FEUADDR_REG);
4115 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4120 for_each_active_iommu(iommu, drhd)
4121 kfree(iommu->iommu_state);
4126 static void iommu_resume(void)
4128 struct dmar_drhd_unit *drhd;
4129 struct intel_iommu *iommu = NULL;
4132 if (init_iommu_hw()) {
4134 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4136 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4140 for_each_active_iommu(iommu, drhd) {
4142 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4144 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4145 iommu->reg + DMAR_FECTL_REG);
4146 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4147 iommu->reg + DMAR_FEDATA_REG);
4148 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4149 iommu->reg + DMAR_FEADDR_REG);
4150 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4151 iommu->reg + DMAR_FEUADDR_REG);
4153 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4156 for_each_active_iommu(iommu, drhd)
4157 kfree(iommu->iommu_state);
4160 static struct syscore_ops iommu_syscore_ops = {
4161 .resume = iommu_resume,
4162 .suspend = iommu_suspend,
4165 static void __init init_iommu_pm_ops(void)
4167 register_syscore_ops(&iommu_syscore_ops);
4171 static inline void init_iommu_pm_ops(void) {}
4172 #endif /* CONFIG_PM */
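/*
 * An RMRR reported by the BIOS must describe a non-empty, page-aligned
 * region and pass the architecture-specific check; anything else is treated
 * as a firmware bug.
 */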
4174 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4176 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4177 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4178 rmrr->end_address <= rmrr->base_address ||
4179 arch_rmrr_sanity_check(rmrr))
4185 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4187 struct acpi_dmar_reserved_memory *rmrr;
4188 struct dmar_rmrr_unit *rmrru;
4190 rmrr = (struct acpi_dmar_reserved_memory *)header;
4191 if (rmrr_sanity_check(rmrr)) {
4193 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4194 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4195 rmrr->base_address, rmrr->end_address,
4196 dmi_get_system_info(DMI_BIOS_VENDOR),
4197 dmi_get_system_info(DMI_BIOS_VERSION),
4198 dmi_get_system_info(DMI_PRODUCT_VERSION));
4199 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4202 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4206 rmrru->hdr = header;
4208 rmrru->base_address = rmrr->base_address;
4209 rmrru->end_address = rmrr->end_address;
4211 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4212 ((void *)rmrr) + rmrr->header.length,
4213 &rmrru->devices_cnt);
4214 if (rmrru->devices_cnt && rmrru->devices == NULL)
4217 list_add(&rmrru->list, &dmar_rmrr_units);
4226 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4228 struct dmar_atsr_unit *atsru;
4229 struct acpi_dmar_atsr *tmp;
4231 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4233 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4234 if (atsr->segment != tmp->segment)
4236 if (atsr->header.length != tmp->header.length)
4238 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4245 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4247 struct acpi_dmar_atsr *atsr;
4248 struct dmar_atsr_unit *atsru;
4250 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4253 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4254 atsru = dmar_find_atsr(atsr);
4258 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4263 /* If memory is allocated from slab by the ACPI _DSM method, we need to
4264 * copy the memory content because the memory buffer will be freed on exit. */
4267 atsru->hdr = (void *)(atsru + 1);
4268 memcpy(atsru->hdr, hdr, hdr->length);
4269 atsru->include_all = atsr->flags & 0x1;
4270 if (!atsru->include_all) {
4271 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4272 (void *)atsr + atsr->header.length,
4273 &atsru->devices_cnt);
4274 if (atsru->devices_cnt && atsru->devices == NULL) {
4280 list_add_rcu(&atsru->list, &dmar_atsr_units);
4285 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4287 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4291 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4293 struct acpi_dmar_atsr *atsr;
4294 struct dmar_atsr_unit *atsru;
4296 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4297 atsru = dmar_find_atsr(atsr);
4299 list_del_rcu(&atsru->list);
4301 intel_iommu_free_atsr(atsru);
4307 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4311 struct acpi_dmar_atsr *atsr;
4312 struct dmar_atsr_unit *atsru;
4314 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4315 atsru = dmar_find_atsr(atsr);
4319 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4320 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4328 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4331 struct intel_iommu *iommu = dmaru->iommu;
4333 if (g_iommus[iommu->seq_id])
4336 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4337 pr_warn("%s: Doesn't support hardware pass through.\n",
4341 if (!ecap_sc_support(iommu->ecap) &&
4342 domain_update_iommu_snooping(iommu)) {
4343 pr_warn("%s: Doesn't support snooping.\n",
4347 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4348 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4349 pr_warn("%s: Doesn't support large page.\n",
4355 * Disable translation if already enabled prior to OS handover.
4357 if (iommu->gcmd & DMA_GCMD_TE)
4358 iommu_disable_translation(iommu);
4360 g_iommus[iommu->seq_id] = iommu;
4361 ret = iommu_init_domains(iommu);
4363 ret = iommu_alloc_root_entry(iommu);
4367 intel_svm_check(iommu);
4369 if (dmaru->ignored) {
4371 /* We always have to disable PMRs or DMA may fail on this device. */
4374 iommu_disable_protect_mem_regions(iommu);
4378 intel_iommu_init_qi(iommu);
4379 iommu_flush_write_buffer(iommu);
4381 #ifdef CONFIG_INTEL_IOMMU_SVM
4382 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4383 ret = intel_svm_enable_prq(iommu);
4388 ret = dmar_set_interrupt(iommu);
4392 iommu_set_root_entry(iommu);
4393 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4394 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4395 iommu_enable_translation(iommu);
4397 iommu_disable_protect_mem_regions(iommu);
4401 disable_dmar_iommu(iommu);
4403 free_dmar_iommu(iommu);
4407 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4410 struct intel_iommu *iommu = dmaru->iommu;
4412 if (!intel_iommu_enabled)
4418 ret = intel_iommu_add(dmaru);
4420 disable_dmar_iommu(iommu);
4421 free_dmar_iommu(iommu);
4427 static void intel_iommu_free_dmars(void)
4429 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4430 struct dmar_atsr_unit *atsru, *atsr_n;
4432 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4433 list_del(&rmrru->list);
4434 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4438 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4439 list_del(&atsru->list);
4440 intel_iommu_free_atsr(atsru);
4444 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4447 struct pci_bus *bus;
4448 struct pci_dev *bridge = NULL;
4450 struct acpi_dmar_atsr *atsr;
4451 struct dmar_atsr_unit *atsru;
4453 dev = pci_physfn(dev);
4454 for (bus = dev->bus; bus; bus = bus->parent) {
4456 /* If it's an integrated device, allow ATS */
4459 /* Connected via non-PCIe: no ATS */
4460 if (!pci_is_pcie(bridge) ||
4461 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4463 /* If we found the root port, look it up in the ATSR */
4464 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4469 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4470 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4471 if (atsr->segment != pci_domain_nr(dev->bus))
4474 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4475 if (tmp == &bridge->dev)
4478 if (atsru->include_all)
4488 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4491 struct dmar_rmrr_unit *rmrru;
4492 struct dmar_atsr_unit *atsru;
4493 struct acpi_dmar_atsr *atsr;
4494 struct acpi_dmar_reserved_memory *rmrr;
4496 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4499 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4500 rmrr = container_of(rmrru->hdr,
4501 struct acpi_dmar_reserved_memory, header);
4502 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4503 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4504 ((void *)rmrr) + rmrr->header.length,
4505 rmrr->segment, rmrru->devices,
4506 rmrru->devices_cnt);
4509 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4510 dmar_remove_dev_scope(info, rmrr->segment,
4511 rmrru->devices, rmrru->devices_cnt);
4515 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4516 if (atsru->include_all)
4519 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4520 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4521 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4522 (void *)atsr + atsr->header.length,
4523 atsr->segment, atsru->devices,
4524 atsru->devices_cnt);
4529 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4530 if (dmar_remove_dev_scope(info, atsr->segment,
4531 atsru->devices, atsru->devices_cnt))
4539 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4540 unsigned long val, void *v)
4542 struct memory_notify *mhp = v;
4543 unsigned long long start, end;
4544 unsigned long start_vpfn, last_vpfn;
4547 case MEM_GOING_ONLINE:
4548 start = mhp->start_pfn << PAGE_SHIFT;
4549 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4550 if (iommu_domain_identity_map(si_domain, start, end)) {
4551 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4558 case MEM_CANCEL_ONLINE:
4559 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4560 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4561 while (start_vpfn <= last_vpfn) {
4563 struct dmar_drhd_unit *drhd;
4564 struct intel_iommu *iommu;
4565 struct page *freelist;
4567 iova = find_iova(&si_domain->iovad, start_vpfn);
4569 pr_debug("Failed get IOVA for PFN %lx\n",
4574 iova = split_and_remove_iova(&si_domain->iovad, iova,
4575 start_vpfn, last_vpfn);
4577 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4578 start_vpfn, last_vpfn);
4582 freelist = domain_unmap(si_domain, iova->pfn_lo,
4586 for_each_active_iommu(iommu, drhd)
4587 iommu_flush_iotlb_psi(iommu, si_domain,
4588 iova->pfn_lo, iova_size(iova),
4591 dma_free_pagelist(freelist);
4593 start_vpfn = iova->pfn_hi + 1;
4594 free_iova_mem(iova);
4602 static struct notifier_block intel_iommu_memory_nb = {
4603 .notifier_call = intel_iommu_memory_notifier,
4607 static void free_all_cpu_cached_iovas(unsigned int cpu)
4611 for (i = 0; i < g_num_of_iommus; i++) {
4612 struct intel_iommu *iommu = g_iommus[i];
4613 struct dmar_domain *domain;
4619 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4620 domain = get_iommu_domain(iommu, (u16)did);
4624 free_cpu_cached_iovas(cpu, &domain->iovad);
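/* CPU hotplug callback: drop the per-CPU IOVA caches of a CPU that died. */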
4629 static int intel_iommu_cpu_dead(unsigned int cpu)
4631 free_all_cpu_cached_iovas(cpu);
4635 static void intel_disable_iommus(void)
4637 struct intel_iommu *iommu = NULL;
4638 struct dmar_drhd_unit *drhd;
4640 for_each_iommu(iommu, drhd)
4641 iommu_disable_translation(iommu);
4644 void intel_iommu_shutdown(void)
4646 struct dmar_drhd_unit *drhd;
4647 struct intel_iommu *iommu = NULL;
4649 if (no_iommu || dmar_disabled)
4652 down_write(&dmar_global_lock);
4654 /* Disable PMRs explicitly here. */
4655 for_each_iommu(iommu, drhd)
4656 iommu_disable_protect_mem_regions(iommu);
4658 /* Make sure the IOMMUs are switched off */
4659 intel_disable_iommus();
4661 up_write(&dmar_global_lock);
4664 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4666 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4668 return container_of(iommu_dev, struct intel_iommu, iommu);
4671 static ssize_t intel_iommu_show_version(struct device *dev,
4672 struct device_attribute *attr,
4675 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4676 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4677 return sprintf(buf, "%d:%d\n",
4678 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4680 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4682 static ssize_t intel_iommu_show_address(struct device *dev,
4683 struct device_attribute *attr,
4686 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4687 return sprintf(buf, "%llx\n", iommu->reg_phys);
4689 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4691 static ssize_t intel_iommu_show_cap(struct device *dev,
4692 struct device_attribute *attr,
4695 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4696 return sprintf(buf, "%llx\n", iommu->cap);
4698 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4700 static ssize_t intel_iommu_show_ecap(struct device *dev,
4701 struct device_attribute *attr,
4704 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4705 return sprintf(buf, "%llx\n", iommu->ecap);
4707 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4709 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4710 struct device_attribute *attr,
4713 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4714 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4716 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4718 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4719 struct device_attribute *attr,
4722 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4723 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4724 cap_ndoms(iommu->cap)));
4726 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4728 static struct attribute *intel_iommu_attrs[] = {
4729 &dev_attr_version.attr,
4730 &dev_attr_address.attr,
4732 &dev_attr_ecap.attr,
4733 &dev_attr_domains_supported.attr,
4734 &dev_attr_domains_used.attr,
4738 static struct attribute_group intel_iommu_group = {
4739 .name = "intel-iommu",
4740 .attrs = intel_iommu_attrs,
4743 const struct attribute_group *intel_iommu_groups[] = {
4748 static inline bool has_untrusted_dev(void)
4750 struct pci_dev *pdev = NULL;
4752 for_each_pci_dev(pdev)
4753 if (pdev->untrusted)
4759 static int __init platform_optin_force_iommu(void)
4761 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4764 if (no_iommu || dmar_disabled)
4765 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4768 * If Intel-IOMMU is disabled by default, we will apply identity
4769 * map for all devices except those marked as being untrusted.
4772 iommu_set_default_passthrough(false);
4780 static int __init probe_acpi_namespace_devices(void)
4782 struct dmar_drhd_unit *drhd;
4783 /* To avoid a -Wunused-but-set-variable warning. */
4784 struct intel_iommu *iommu __maybe_unused;
4788 for_each_active_iommu(iommu, drhd) {
4789 for_each_active_dev_scope(drhd->devices,
4790 drhd->devices_cnt, i, dev) {
4791 struct acpi_device_physical_node *pn;
4792 struct iommu_group *group;
4793 struct acpi_device *adev;
4795 if (dev->bus != &acpi_bus_type)
4798 adev = to_acpi_device(dev);
4799 mutex_lock(&adev->physical_node_lock);
4800 list_for_each_entry(pn,
4801 &adev->physical_node_list, node) {
4802 group = iommu_group_get(pn->dev);
4804 iommu_group_put(group);
4808 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4809 ret = iommu_probe_device(pn->dev);
4813 mutex_unlock(&adev->physical_node_lock);
4823 int __init intel_iommu_init(void)
4824 {
4825 int ret = -ENODEV;
4826 struct dmar_drhd_unit *drhd;
4827 struct intel_iommu *iommu;
4829 /*
4830 * Intel IOMMU is required for a TXT/tboot launch or platform
4831 * opt in, so enforce that.
4832 */
4833 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4835 if (iommu_init_mempool()) {
4836 if (force_on)
4837 panic("tboot: Failed to initialize iommu memory\n");
4838 return -ENOMEM;
4839 }
4841 down_write(&dmar_global_lock);
4842 if (dmar_table_init()) {
4843 if (force_on)
4844 panic("tboot: Failed to initialize DMAR table\n");
4845 goto out_free_dmar;
4846 }
4848 if (dmar_dev_scope_init() < 0) {
4849 if (force_on)
4850 panic("tboot: Failed to initialize DMAR device scope\n");
4851 goto out_free_dmar;
4852 }
4854 up_write(&dmar_global_lock);
4856 /*
4857 * The bus notifier takes the dmar_global_lock, so lockdep will
4858 * complain later when we register it under the lock.
4859 */
4860 dmar_register_bus_notifier();
4862 down_write(&dmar_global_lock);
4864 if (!no_iommu)
4865 intel_iommu_debugfs_init();
4867 if (no_iommu || dmar_disabled) {
4868 /*
4869 * We exit the function here to ensure IOMMU's remapping and
4870 * mempool aren't setup, which means that the IOMMU's PMRs
4871 * won't be disabled via the call to init_dmars(). So disable
4872 * it explicitly here. The PMRs were setup by tboot prior to
4873 * calling SENTER, but the kernel is expected to reset/tear
4874 * down the PMRs.
4875 */
4876 if (intel_iommu_tboot_noforce) {
4877 for_each_iommu(iommu, drhd)
4878 iommu_disable_protect_mem_regions(iommu);
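/*
 * Editor's sketch (illustrative only, not part of the original source): at
 * register level, disabling the protected memory regions amounts to clearing
 * the Enable Protected Memory bit and waiting for the status bit to clear,
 * roughly:
 *
 *	u32 pmen = readl(iommu->reg + DMAR_PMEN_REG);
 *	writel(pmen & ~DMA_PMEN_EPM, iommu->reg + DMAR_PMEN_REG);
 *	... then poll DMAR_PMEN_REG until DMA_PMEN_PRS is clear ...
 *
 * The real helper additionally checks cap_plmr()/cap_phmr() and holds
 * iommu->register_lock while touching the register.
 */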
4881 /*
4882 * Make sure the IOMMUs are switched off, even when we
4883 * boot into a kexec kernel and the previous kernel left
4884 * them enabled
4885 */
4886 intel_disable_iommus();
4890 if (list_empty(&dmar_rmrr_units))
4891 pr_info("No RMRR found\n");
4893 if (list_empty(&dmar_atsr_units))
4894 pr_info("No ATSR found\n");
4896 if (dmar_init_reserved_ranges()) {
4897 if (force_on)
4898 panic("tboot: Failed to reserve iommu ranges\n");
4899 goto out_free_reserved_range;
4900 }
4902 if (dmar_map_gfx)
4903 intel_iommu_gfx_mapped = 1;
4905 init_no_remapping_devices();
4907 ret = init_dmars();
4908 if (ret) {
4909 if (force_on)
4910 panic("tboot: Failed to initialize DMARs\n");
4911 pr_err("Initialization failed\n");
4912 goto out_free_reserved_range;
4913 }
4914 up_write(&dmar_global_lock);
4916 dma_ops = &intel_dma_ops;
4918 init_iommu_pm_ops();
4920 down_read(&dmar_global_lock);
4921 for_each_active_iommu(iommu, drhd) {
4922 iommu_device_sysfs_add(&iommu->iommu, NULL,
4923 intel_iommu_groups,
4924 "%s", iommu->name);
4925 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4926 iommu_device_register(&iommu->iommu);
4927 }
4928 up_read(&dmar_global_lock);
4930 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4931 if (si_domain && !hw_pass_through)
4932 register_memory_notifier(&intel_iommu_memory_nb);
4933 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4934 intel_iommu_cpu_dead);
4936 down_read(&dmar_global_lock);
4937 if (probe_acpi_namespace_devices())
4938 pr_warn("ACPI name space devices didn't probe correctly\n");
4940 /* Finally, we enable the DMA remapping hardware. */
4941 for_each_iommu(iommu, drhd) {
4942 if (!drhd->ignored && !translation_pre_enabled(iommu))
4943 iommu_enable_translation(iommu);
4945 iommu_disable_protect_mem_regions(iommu);
4946 }
4947 up_read(&dmar_global_lock);
4949 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4951 intel_iommu_enabled = 1;
4953 return 0;
4955 out_free_reserved_range:
4956 put_iova_domain(&reserved_iova_list);
4957 out_free_dmar:
4958 intel_iommu_free_dmars();
4959 up_write(&dmar_global_lock);
4960 iommu_exit_mempool();
4961 return ret;
4962 }
4964 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4965 {
4966 struct intel_iommu *iommu = opaque;
4968 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4969 return 0;
4970 }
4972 /*
4973 * NB - intel-iommu lacks any sort of reference counting for the users of
4974 * dependent devices. If multiple endpoints have intersecting dependent
4975 * devices, unbinding the driver from any one of them will possibly leave
4976 * the others unable to operate.
4977 */
4978 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4979 {
4980 if (!iommu || !dev || !dev_is_pci(dev))
4981 return;
4983 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4984 }
4986 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4987 {
4988 struct dmar_domain *domain;
4989 struct intel_iommu *iommu;
4990 unsigned long flags;
4992 assert_spin_locked(&device_domain_lock);
4994 if (WARN_ON(!info))
4995 return;
4997 iommu = info->iommu;
4998 domain = info->domain;
5000 if (info->dev) {
5001 if (dev_is_pci(info->dev) && sm_supported(iommu))
5002 intel_pasid_tear_down_entry(iommu, info->dev,
5003 PASID_RID2PASID);
5005 iommu_disable_dev_iotlb(info);
5006 domain_context_clear(iommu, info->dev);
5007 intel_pasid_free_table(info->dev);
5008 }
5010 unlink_domain_info(info);
5012 spin_lock_irqsave(&iommu->lock, flags);
5013 domain_detach_iommu(domain, iommu);
5014 spin_unlock_irqrestore(&iommu->lock, flags);
5016 free_devinfo_mem(info);
5017 }
5019 static void dmar_remove_one_dev_info(struct device *dev)
5020 {
5021 struct device_domain_info *info;
5022 unsigned long flags;
5024 spin_lock_irqsave(&device_domain_lock, flags);
5025 info = dev->archdata.iommu;
5026 if (info && info != DEFER_DEVICE_DOMAIN_INFO
5027 && info != DUMMY_DEVICE_DOMAIN_INFO)
5028 __dmar_remove_one_dev_info(info);
5029 spin_unlock_irqrestore(&device_domain_lock, flags);
5030 }
5032 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5033 {
5034 int adjust_width;
5036 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5037 domain_reserve_special_ranges(domain);
5039 /* calculate AGAW */
5040 domain->gaw = guest_width;
5041 adjust_width = guestwidth_to_adjustwidth(guest_width);
5042 domain->agaw = width_to_agaw(adjust_width);
5044 domain->iommu_coherency = 0;
5045 domain->iommu_snooping = 0;
5046 domain->iommu_superpage = 0;
5047 domain->max_addr = 0;
5049 /* always allocate the top pgd */
5050 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5051 if (!domain->pgd)
5052 return -ENOMEM;
5053 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5054 return 0;
5055 }
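/*
 * Editor's worked example for the AGAW calculation above (assumed input):
 * guest_width = 48 gives adjust_width = 48 (already a multiple of the 9-bit
 * level stride above the 12-bit page offset), so width_to_agaw() returns 2,
 * i.e. a 4-level page table covering a 48-bit address space.
 */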
5057 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5058 {
5059 struct dmar_domain *dmar_domain;
5060 struct iommu_domain *domain;
5061 int ret;
5063 switch (type) {
5064 case IOMMU_DOMAIN_DMA:
5065 /* fallthrough */
5066 case IOMMU_DOMAIN_UNMANAGED:
5067 dmar_domain = alloc_domain(0);
5068 if (!dmar_domain) {
5069 pr_err("Can't allocate dmar_domain\n");
5070 return NULL;
5071 }
5072 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5073 pr_err("Domain initialization failed\n");
5074 domain_exit(dmar_domain);
5075 return NULL;
5076 }
5078 if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
5079 ret = init_iova_flush_queue(&dmar_domain->iovad,
5080 iommu_flush_iova,
5081 iova_entry_free);
5082 if (ret)
5083 pr_info("iova flush queue initialization failed\n");
5084 }
5086 domain_update_iommu_cap(dmar_domain);
5088 domain = &dmar_domain->domain;
5089 domain->geometry.aperture_start = 0;
5090 domain->geometry.aperture_end =
5091 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5092 domain->geometry.force_aperture = true;
5094 return domain;
5095 case IOMMU_DOMAIN_IDENTITY:
5096 return &si_domain->domain;
5097 default:
5098 return NULL;
5099 }
5101 return NULL;
5102 }
5104 static void intel_iommu_domain_free(struct iommu_domain *domain)
5105 {
5106 if (domain != &si_domain->domain)
5107 domain_exit(to_dmar_domain(domain));
5108 }
5110 /*
5111 * Check whether a @domain could be attached to the @dev through the
5112 * aux-domain attach/detach APIs.
5113 */
5114 static inline bool
5115 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5116 {
5117 struct device_domain_info *info = dev->archdata.iommu;
5119 return info && info->auxd_enabled &&
5120 domain->type == IOMMU_DOMAIN_UNMANAGED;
5121 }
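/*
 * Editor's sketch (not part of this driver): the intended calling sequence
 * for the aux-domain paths below, using the generic wrappers from
 * include/linux/iommu.h. The function name and error handling here are
 * illustrative assumptions.
 */
static int example_aux_domain_attach(struct iommu_domain *domain,
				     struct device *dev)
{
	int ret;

	/* Opt the device into auxiliary domain (PASID based) support. */
	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
	if (ret)
		return ret;

	/* Attach the unmanaged domain as an auxiliary domain of @dev. */
	ret = iommu_aux_attach_device(domain, dev);
	if (ret)
		return ret;

	/* Returns the PASID tagging DMA that belongs to this aux domain. */
	return iommu_aux_get_pasid(domain, dev);
}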
5123 static void auxiliary_link_device(struct dmar_domain *domain,
5126 struct device_domain_info *info = dev->archdata.iommu;
5128 assert_spin_locked(&device_domain_lock);
5132 domain->auxd_refcnt++;
5133 list_add(&domain->auxd, &info->auxiliary_domains);
5136 static void auxiliary_unlink_device(struct dmar_domain *domain,
5139 struct device_domain_info *info = dev->archdata.iommu;
5141 assert_spin_locked(&device_domain_lock);
5145 list_del(&domain->auxd);
5146 domain->auxd_refcnt--;
5148 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5149 ioasid_free(domain->default_pasid);
5152 static int aux_domain_add_dev(struct dmar_domain *domain,
5157 unsigned long flags;
5158 struct intel_iommu *iommu;
5160 iommu = device_to_iommu(dev, &bus, &devfn);
5164 if (domain->default_pasid <= 0) {
5167 /* No private data needed for the default pasid */
5168 pasid = ioasid_alloc(NULL, PASID_MIN,
5169 pci_max_pasids(to_pci_dev(dev)) - 1,
5171 if (pasid == INVALID_IOASID) {
5172 pr_err("Can't allocate default pasid\n");
5175 domain->default_pasid = pasid;
5178 spin_lock_irqsave(&device_domain_lock, flags);
5180 * iommu->lock must be held to attach domain to iommu and setup the
5181 * pasid entry for second level translation.
5183 spin_lock(&iommu->lock);
5184 ret = domain_attach_iommu(domain, iommu);
5188 /* Setup the PASID entry for mediated devices: */
5189 if (domain_use_first_level(domain))
5190 ret = domain_setup_first_level(iommu, domain, dev,
5191 domain->default_pasid);
5193 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5194 domain->default_pasid);
5197 spin_unlock(&iommu->lock);
5199 auxiliary_link_device(domain, dev);
5201 spin_unlock_irqrestore(&device_domain_lock, flags);
5206 domain_detach_iommu(domain, iommu);
5208 spin_unlock(&iommu->lock);
5209 spin_unlock_irqrestore(&device_domain_lock, flags);
5210 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5211 ioasid_free(domain->default_pasid);
5216 static void aux_domain_remove_dev(struct dmar_domain *domain,
5219 struct device_domain_info *info;
5220 struct intel_iommu *iommu;
5221 unsigned long flags;
5223 if (!is_aux_domain(dev, &domain->domain))
5226 spin_lock_irqsave(&device_domain_lock, flags);
5227 info = dev->archdata.iommu;
5228 iommu = info->iommu;
5230 auxiliary_unlink_device(domain, dev);
5232 spin_lock(&iommu->lock);
5233 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5234 domain_detach_iommu(domain, iommu);
5235 spin_unlock(&iommu->lock);
5237 spin_unlock_irqrestore(&device_domain_lock, flags);
5240 static int prepare_domain_attach_device(struct iommu_domain *domain,
5243 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5244 struct intel_iommu *iommu;
5248 iommu = device_to_iommu(dev, &bus, &devfn);
5252 /* check if this iommu agaw is sufficient for max mapped address */
5253 addr_width = agaw_to_width(iommu->agaw);
5254 if (addr_width > cap_mgaw(iommu->cap))
5255 addr_width = cap_mgaw(iommu->cap);
5257 if (dmar_domain->max_addr > (1LL << addr_width)) {
5258 dev_err(dev, "%s: iommu width (%d) is not "
5259 "sufficient for the mapped address (%llx)\n",
5260 __func__, addr_width, dmar_domain->max_addr);
5263 dmar_domain->gaw = addr_width;
5266 * Knock out extra levels of page tables if necessary
5268 while (iommu->agaw < dmar_domain->agaw) {
5269 struct dma_pte *pte;
5271 pte = dmar_domain->pgd;
5272 if (dma_pte_present(pte)) {
5273 dmar_domain->pgd = (struct dma_pte *)
5274 phys_to_virt(dma_pte_addr(pte));
5275 free_pgtable_page(pte);
5277 dmar_domain->agaw--;
5283 static int intel_iommu_attach_device(struct iommu_domain *domain,
5288 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5289 device_is_rmrr_locked(dev)) {
5290 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5294 if (is_aux_domain(dev, domain))
5297 /* normally dev is not mapped */
5298 if (unlikely(domain_context_mapped(dev))) {
5299 struct dmar_domain *old_domain;
5301 old_domain = find_domain(dev);
5303 dmar_remove_one_dev_info(dev);
5306 ret = prepare_domain_attach_device(domain, dev);
5310 return domain_add_dev_info(to_dmar_domain(domain), dev);
5313 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5318 if (!is_aux_domain(dev, domain))
5321 ret = prepare_domain_attach_device(domain, dev);
5325 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5328 static void intel_iommu_detach_device(struct iommu_domain *domain,
5331 dmar_remove_one_dev_info(dev);
5334 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5337 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5340 static int intel_iommu_map(struct iommu_domain *domain,
5341 unsigned long iova, phys_addr_t hpa,
5342 size_t size, int iommu_prot, gfp_t gfp)
5344 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5349 if (iommu_prot & IOMMU_READ)
5350 prot |= DMA_PTE_READ;
5351 if (iommu_prot & IOMMU_WRITE)
5352 prot |= DMA_PTE_WRITE;
5353 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5354 prot |= DMA_PTE_SNP;
5356 max_addr = iova + size;
5357 if (dmar_domain->max_addr < max_addr) {
5360 /* check if minimum agaw is sufficient for mapped address */
5361 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5362 if (end < max_addr) {
5363 pr_err("%s: iommu width (%d) is not "
5364 "sufficient for the mapped address (%llx)\n",
5365 __func__, dmar_domain->gaw, max_addr);
5368 dmar_domain->max_addr = max_addr;
5370 /* Round up size to next multiple of PAGE_SIZE, if it and
5371 the low bits of hpa would take us onto the next page */
5372 size = aligned_nrpages(hpa, size);
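/*
 * Editor's example (assumed values): with hpa = 0x1ffc and size = 0x8 the
 * range crosses a page boundary, so aligned_nrpages() yields 2 pages even
 * though size is much smaller than VTD_PAGE_SIZE.
 */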
5373 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5374 hpa >> VTD_PAGE_SHIFT, size, prot);
5378 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5379 unsigned long iova, size_t size,
5380 struct iommu_iotlb_gather *gather)
5381 {
5382 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5383 struct page *freelist = NULL;
5384 unsigned long start_pfn, last_pfn;
5385 unsigned int npages;
5386 int iommu_id, level = 0;
5388 /* Cope with horrid API which requires us to unmap more than the
5389 size argument if it happens to be a large-page mapping. */
5390 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5392 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5393 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
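/*
 * Editor's example: if the IOVA is covered by a 2MiB superpage, the PTE
 * lookup above returns level 2, level_to_offset_bits(2) is 9, and size is
 * widened to VTD_PAGE_SIZE << 9 = 2MiB regardless of the size the caller
 * passed in.
 */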
5395 start_pfn = iova >> VTD_PAGE_SHIFT;
5396 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5398 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5400 npages = last_pfn - start_pfn + 1;
5402 for_each_domain_iommu(iommu_id, dmar_domain)
5403 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5404 start_pfn, npages, !freelist, 0);
5406 dma_free_pagelist(freelist);
5408 if (dmar_domain->max_addr == iova + size)
5409 dmar_domain->max_addr = iova;
5411 return size;
5412 }
5414 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5415 dma_addr_t iova)
5416 {
5417 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5418 struct dma_pte *pte;
5419 int level = 0;
5420 u64 phys = 0;
5422 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5423 if (pte && dma_pte_present(pte))
5424 phys = dma_pte_addr(pte) +
5425 (iova & (BIT_MASK(level_to_offset_bits(level) +
5426 VTD_PAGE_SHIFT) - 1));
5428 return phys;
5429 }
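/*
 * Editor's example: for an IOVA mapped by a 2MiB superpage the lookup
 * returns level 2, so BIT_MASK(9 + 12) - 1 keeps the low 21 bits of the
 * IOVA, which are added to the page frame address taken from the PTE.
 */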
5431 static inline bool scalable_mode_support(void)
5433 struct dmar_drhd_unit *drhd;
5434 struct intel_iommu *iommu;
5438 for_each_active_iommu(iommu, drhd) {
5439 if (!sm_supported(iommu)) {
5449 static inline bool iommu_pasid_support(void)
5451 struct dmar_drhd_unit *drhd;
5452 struct intel_iommu *iommu;
5456 for_each_active_iommu(iommu, drhd) {
5457 if (!pasid_supported(iommu)) {
5467 static inline bool nested_mode_support(void)
5469 struct dmar_drhd_unit *drhd;
5470 struct intel_iommu *iommu;
5474 for_each_active_iommu(iommu, drhd) {
5475 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5485 static bool intel_iommu_capable(enum iommu_cap cap)
5487 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5488 return domain_update_iommu_snooping(NULL) == 1;
5489 if (cap == IOMMU_CAP_INTR_REMAP)
5490 return irq_remapping_enabled == 1;
5495 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5497 struct intel_iommu *iommu;
5500 iommu = device_to_iommu(dev, &bus, &devfn);
5502 return ERR_PTR(-ENODEV);
5504 if (translation_pre_enabled(iommu))
5505 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5507 if (device_needs_bounce(dev)) {
5508 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5509 set_dma_ops(dev, &bounce_dma_ops);
5512 return &iommu->iommu;
5515 static void intel_iommu_release_device(struct device *dev)
5517 struct intel_iommu *iommu;
5520 iommu = device_to_iommu(dev, &bus, &devfn);
5524 dmar_remove_one_dev_info(dev);
5526 if (device_needs_bounce(dev))
5527 set_dma_ops(dev, NULL);
5530 static void intel_iommu_get_resv_regions(struct device *device,
5531 struct list_head *head)
5533 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5534 struct iommu_resv_region *reg;
5535 struct dmar_rmrr_unit *rmrr;
5536 struct device *i_dev;
5539 down_read(&dmar_global_lock);
5540 for_each_rmrr_units(rmrr) {
5541 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5543 struct iommu_resv_region *resv;
5544 enum iommu_resv_type type;
5547 if (i_dev != device &&
5548 !is_downstream_to_pci_bridge(device, i_dev))
5551 length = rmrr->end_address - rmrr->base_address + 1;
5553 type = device_rmrr_is_relaxable(device) ?
5554 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5556 resv = iommu_alloc_resv_region(rmrr->base_address,
5557 length, prot, type);
5561 list_add_tail(&resv->list, head);
5564 up_read(&dmar_global_lock);
5566 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5567 if (dev_is_pci(device)) {
5568 struct pci_dev *pdev = to_pci_dev(device);
5570 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5571 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5572 IOMMU_RESV_DIRECT_RELAXABLE);
5574 list_add_tail(&reg->list, head);
5577 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5579 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5580 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5581 0, IOMMU_RESV_MSI);
5582 if (!reg)
5583 return;
5584 list_add_tail(&reg->list, head);
5585 }
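/*
 * Editor's sketch (not part of this driver): how a consumer such as the
 * IOMMU core or VFIO would walk the regions reported above through the
 * generic helpers. The function name is an illustrative assumption.
 */
static void example_dump_resv_regions(struct device *dev)
{
	struct iommu_resv_region *region;
	LIST_HEAD(resv_regions);

	iommu_get_resv_regions(dev, &resv_regions);
	list_for_each_entry(region, &resv_regions, list)
		dev_info(dev, "reserved: start %pa length %zu type %d\n",
			 &region->start, region->length, region->type);
	iommu_put_resv_regions(dev, &resv_regions);
}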
5587 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5589 struct device_domain_info *info;
5590 struct context_entry *context;
5591 struct dmar_domain *domain;
5592 unsigned long flags;
5596 domain = find_domain(dev);
5600 spin_lock_irqsave(&device_domain_lock, flags);
5601 spin_lock(&iommu->lock);
5604 info = dev->archdata.iommu;
5605 if (!info || !info->pasid_supported)
5608 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5609 if (WARN_ON(!context))
5612 ctx_lo = context[0].lo;
5614 if (!(ctx_lo & CONTEXT_PASIDE)) {
5615 ctx_lo |= CONTEXT_PASIDE;
5616 context[0].lo = ctx_lo;
5618 iommu->flush.flush_context(iommu,
5619 domain->iommu_did[iommu->seq_id],
5620 PCI_DEVID(info->bus, info->devfn),
5621 DMA_CCMD_MASK_NOBIT,
5622 DMA_CCMD_DEVICE_INVL);
5625 /* Enable PASID support in the device, if it wasn't already */
5626 if (!info->pasid_enabled)
5627 iommu_enable_dev_iotlb(info);
5632 spin_unlock(&iommu->lock);
5633 spin_unlock_irqrestore(&device_domain_lock, flags);
5638 static void intel_iommu_apply_resv_region(struct device *dev,
5639 struct iommu_domain *domain,
5640 struct iommu_resv_region *region)
5642 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5643 unsigned long start, end;
5645 start = IOVA_PFN(region->start);
5646 end = IOVA_PFN(region->start + region->length - 1);
5648 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5651 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5653 if (dev_is_pci(dev))
5654 return pci_device_group(dev);
5655 return generic_device_group(dev);
5658 #ifdef CONFIG_INTEL_IOMMU_SVM
5659 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5661 struct intel_iommu *iommu;
5664 if (iommu_dummy(dev)) {
5666 "No IOMMU translation for device; cannot enable SVM\n");
5670 iommu = device_to_iommu(dev, &bus, &devfn);
5672 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5678 #endif /* CONFIG_INTEL_IOMMU_SVM */
5680 static int intel_iommu_enable_auxd(struct device *dev)
5682 struct device_domain_info *info;
5683 struct intel_iommu *iommu;
5684 unsigned long flags;
5688 iommu = device_to_iommu(dev, &bus, &devfn);
5689 if (!iommu || dmar_disabled)
5692 if (!sm_supported(iommu) || !pasid_supported(iommu))
5695 ret = intel_iommu_enable_pasid(iommu, dev);
5699 spin_lock_irqsave(&device_domain_lock, flags);
5700 info = dev->archdata.iommu;
5701 info->auxd_enabled = 1;
5702 spin_unlock_irqrestore(&device_domain_lock, flags);
5707 static int intel_iommu_disable_auxd(struct device *dev)
5709 struct device_domain_info *info;
5710 unsigned long flags;
5712 spin_lock_irqsave(&device_domain_lock, flags);
5713 info = dev->archdata.iommu;
5714 if (!WARN_ON(!info))
5715 info->auxd_enabled = 0;
5716 spin_unlock_irqrestore(&device_domain_lock, flags);
5721 /*
5722 * A PCI express designated vendor specific extended capability is defined
5723 * in section 3.7 of the Intel scalable I/O virtualization technical spec
5724 * for system software and tools to detect endpoint devices supporting the
5725 * Intel scalable IO virtualization without host driver dependency.
5726 *
5727 * Returns the address of the matching extended capability structure within
5728 * the device's PCI configuration space or 0 if the device does not support
5729 * it.
5730 */
5731 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5732 {
5733 int pos;
5734 u16 vendor, id;
5736 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5737 while (pos) {
5738 pci_read_config_word(pdev, pos + 4, &vendor);
5739 pci_read_config_word(pdev, pos + 8, &id);
5740 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5741 return pos;
5743 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5744 }
5746 return 0;
5747 }
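/*
 * Editor's note: a DVSEC capability (extended capability ID 0x23) carries
 * the vendor ID at offset 4 and the DVSEC ID at offset 8, which is what the
 * reads above decode; DVSEC ID 5 is the Scalable IOV capability. A parent
 * driver could gate mdev support on it, illustratively:
 *
 *	if (siov_find_pci_dvsec(pdev))
 *		... device advertises Scalable IOV support ...
 */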
5750 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5752 if (feat == IOMMU_DEV_FEAT_AUX) {
5755 if (!dev_is_pci(dev) || dmar_disabled ||
5756 !scalable_mode_support() || !iommu_pasid_support())
5759 ret = pci_pasid_features(to_pci_dev(dev));
5763 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5770 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5772 if (feat == IOMMU_DEV_FEAT_AUX)
5773 return intel_iommu_enable_auxd(dev);
5779 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5781 if (feat == IOMMU_DEV_FEAT_AUX)
5782 return intel_iommu_disable_auxd(dev);
5788 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5790 struct device_domain_info *info = dev->archdata.iommu;
5792 if (feat == IOMMU_DEV_FEAT_AUX)
5793 return scalable_mode_support() && info && info->auxd_enabled;
5799 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5801 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5803 return dmar_domain->default_pasid > 0 ?
5804 dmar_domain->default_pasid : -EINVAL;
5807 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5810 return attach_deferred(dev);
5814 intel_iommu_domain_set_attr(struct iommu_domain *domain,
5815 enum iommu_attr attr, void *data)
5817 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5818 unsigned long flags;
5821 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
5825 case DOMAIN_ATTR_NESTING:
5826 spin_lock_irqsave(&device_domain_lock, flags);
5827 if (nested_mode_support() &&
5828 list_empty(&dmar_domain->devices)) {
5829 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5830 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5834 spin_unlock_irqrestore(&device_domain_lock, flags);
5844 const struct iommu_ops intel_iommu_ops = {
5845 .capable = intel_iommu_capable,
5846 .domain_alloc = intel_iommu_domain_alloc,
5847 .domain_free = intel_iommu_domain_free,
5848 .domain_set_attr = intel_iommu_domain_set_attr,
5849 .attach_dev = intel_iommu_attach_device,
5850 .detach_dev = intel_iommu_detach_device,
5851 .aux_attach_dev = intel_iommu_aux_attach_device,
5852 .aux_detach_dev = intel_iommu_aux_detach_device,
5853 .aux_get_pasid = intel_iommu_aux_get_pasid,
5854 .map = intel_iommu_map,
5855 .unmap = intel_iommu_unmap,
5856 .iova_to_phys = intel_iommu_iova_to_phys,
5857 .probe_device = intel_iommu_probe_device,
5858 .release_device = intel_iommu_release_device,
5859 .get_resv_regions = intel_iommu_get_resv_regions,
5860 .put_resv_regions = generic_iommu_put_resv_regions,
5861 .apply_resv_region = intel_iommu_apply_resv_region,
5862 .device_group = intel_iommu_device_group,
5863 .dev_has_feat = intel_iommu_dev_has_feat,
5864 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
5865 .dev_enable_feat = intel_iommu_dev_enable_feat,
5866 .dev_disable_feat = intel_iommu_dev_disable_feat,
5867 .is_attach_deferred = intel_iommu_is_attach_deferred,
5868 .def_domain_type = device_def_domain_type,
5869 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
5870 };
5872 static void quirk_iommu_igfx(struct pci_dev *dev)
5873 {
5874 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5875 dmar_map_gfx = 0;
5876 }
5878 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5879 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5880 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5881 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5882 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5883 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5884 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5885 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5887 /* Broadwell igfx malfunctions with dmar */
5888 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5889 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5890 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5891 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5892 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5893 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5894 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5895 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5896 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5897 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5898 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5899 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5900 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5901 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5902 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5903 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5904 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5905 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5906 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5907 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5908 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5909 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5910 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5911 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5913 static void quirk_iommu_rwbf(struct pci_dev *dev)
5914 {
5915 /*
5916 * Mobile 4 Series Chipset neglects to set RWBF capability,
5917 * but needs it. Same seems to hold for the desktop versions.
5918 */
5919 pci_info(dev, "Forcing write-buffer flush capability\n");
5920 rwbf_quirk = 1;
5921 }
5923 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5924 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5925 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5926 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5927 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5928 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5929 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5932 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
5933 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
5934 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
5935 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
5936 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
5937 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
5938 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
5939 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
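/*
 * Editor's example, derived from the masks above: a GGC value whose bits
 * 11:8 read 0x3 (GGC_MEMORY_SIZE_2M) has the VT-enable bit clear, so the
 * quirk below turns off IOMMU translation for the graphics device, while
 * 0x9 (GGC_MEMORY_SIZE_2M_VT) would leave it enabled.
 */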
5941 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5942 {
5943 unsigned short ggc;
5945 if (pci_read_config_word(dev, GGC, &ggc))
5946 return;
5948 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5949 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5950 dmar_map_gfx = 0;
5951 } else if (dmar_map_gfx) {
5952 /* we have to ensure the gfx device is idle before we flush */
5953 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5954 intel_iommu_strict = 1;
5955 }
5956 }
5957 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5958 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5959 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5960 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5962 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5963 ISOCH DMAR unit for the Azalia sound device, but not give it any
5964 TLB entries, which causes it to deadlock. Check for that. We do
5965 this in a function called from init_dmars(), instead of in a PCI
5966 quirk, because we don't want to print the obnoxious "BIOS broken"
5967 message if VT-d is actually disabled.
5968 */
5969 static void __init check_tylersburg_isoch(void)
5970 {
5971 struct pci_dev *pdev;
5972 uint32_t vtisochctrl;
5974 /* If there's no Azalia in the system anyway, forget it. */
5975 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5980 /* System Management Registers. Might be hidden, in which case
5981 we can't do the sanity check. But that's OK, because the
5982 known-broken BIOSes _don't_ actually hide it, so far. */
5983 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5987 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5992 pci_dev_put(pdev);
5994 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5995 if (vtisochctrl & 1)
5996 return;
5998 /* Drop all bits other than the number of TLB entries */
5999 vtisochctrl &= 0x1c;
6001 /* If we have the recommended number of TLB entries (16), fine. */
6002 if (vtisochctrl == 0x10)
6003 return;
6005 /* Zero TLB entries? You get to ride the short bus to school. */
6006 if (!vtisochctrl) {
6007 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6008 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6009 dmi_get_system_info(DMI_BIOS_VENDOR),
6010 dmi_get_system_info(DMI_BIOS_VERSION),
6011 dmi_get_system_info(DMI_PRODUCT_VERSION));
6012 iommu_identity_mapping |= IDENTMAP_AZALIA;
6013 return;
6014 }
6016 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",