1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
48 #include "irq_remapping.h"
49 #include "intel-pasid.h"
51 #define ROOT_SIZE VTD_PAGE_SIZE
52 #define CONTEXT_SIZE VTD_PAGE_SIZE
54 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
55 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
56 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
57 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
59 #define IOAPIC_RANGE_START (0xfee00000)
60 #define IOAPIC_RANGE_END (0xfeefffff)
61 #define IOVA_START_ADDR (0x1000)
63 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
65 #define MAX_AGAW_WIDTH 64
66 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
68 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
69 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
71 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
72 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
73 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
74 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
75 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
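/*
 * For illustration, with a 48-bit guest address width and the usual 4KiB
 * VT-d page (VTD_PAGE_SHIFT == 12): __DOMAIN_MAX_PFN(48) is 2^36 - 1 and
 * DOMAIN_MAX_ADDR(48) is (2^36 - 1) << 12, i.e. the last byte below the
 * 256TiB boundary. On 32-bit kernels the min_t() above clamps the PFN to
 * ULONG_MAX so PFNs still fit in an unsigned long.
 */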
77 /* IO virtual address start page frame number */
78 #define IOVA_START_PFN (1)
80 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
82 /* page table handling */
83 #define LEVEL_STRIDE (9)
84 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
87 * This bitmap is used to advertise the page sizes our hardware supports
88 * to the IOMMU core, which will then use this information to split
89 * physically contiguous memory regions it is mapping into page sizes
90 * that we support.
92 * Traditionally the IOMMU core just handed us the mappings directly,
93 * after making sure the size is an order of a 4KiB page and that the
94 * mapping has natural alignment.
96 * To retain this behavior, we currently advertise that we support
97 * all page sizes that are an order of 4KiB.
99 * If at some point we'd like to utilize the IOMMU core's new behavior,
100 * we could change this to advertise the real page sizes we support.
102 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
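/*
 * ~0xFFFUL clears bits 0-11 and sets every bit from 12 upward, so the
 * bitmap advertises 4KiB, 8KiB, 16KiB, ... - every power-of-two size that
 * is a multiple of 4KiB - which is exactly the "all page sizes that are
 * an order of 4KiB" behaviour described above.
 */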
104 static inline int agaw_to_level(int agaw)
109 static inline int agaw_to_width(int agaw)
111 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 static inline int width_to_agaw(int width)
116 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 static inline unsigned int level_to_offset_bits(int level)
121 return (level - 1) * LEVEL_STRIDE;
124 static inline int pfn_level_offset(unsigned long pfn, int level)
126 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 static inline unsigned long level_mask(int level)
131 return -1UL << level_to_offset_bits(level);
134 static inline unsigned long level_size(int level)
136 return 1UL << level_to_offset_bits(level);
139 static inline unsigned long align_to_level(unsigned long pfn, int level)
141 return (pfn + level_size(level) - 1) & level_mask(level);
144 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
146 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
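/*
 * A quick worked example for the helpers above, assuming the usual 9-bit
 * stride and agaw_to_level() mapping agaw N to an (N + 2)-level table:
 * agaw 1/2/3 give 39/48/57-bit widths, level_to_offset_bits(2) is 9,
 * level_size(2) is 512, and lvl_to_nr_pages(2) is 512, i.e. one level-2
 * (2MiB) entry spans 512 4KiB VT-d pages.
 */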
149 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
150 are never going to work. */
151 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
153 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
158 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 static inline unsigned long page_to_dma_pfn(struct page *pg)
162 return mm_to_dma_pfn(page_to_pfn(pg));
164 static inline unsigned long virt_to_dma_pfn(void *p)
166 return page_to_dma_pfn(virt_to_page(p));
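/*
 * Example: with 4KiB kernel pages (PAGE_SHIFT == 12) the MM and DMA PFN
 * spaces coincide and both conversions are identities. With 64KiB kernel
 * pages (PAGE_SHIFT == 16) one MM page covers 16 VT-d pages, so
 * mm_to_dma_pfn(1) == 16 and dma_to_mm_pfn(16) == 1, which only works
 * because VT-d pages are never larger than MM pages.
 */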
169 /* global iommu list, set NULL for ignored DMAR units */
170 static struct intel_iommu **g_iommus;
172 static void __init check_tylersburg_isoch(void);
173 static int rwbf_quirk;
176 * set to 1 to panic the kernel if VT-d can't be enabled successfully
177 * (used when the kernel is launched w/ TXT)
179 static int force_on = 0;
180 int intel_iommu_tboot_noforce;
181 static int no_platform_optin;
183 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 static phys_addr_t root_entry_lctp(struct root_entry *re)
194 return re->lo & VTD_PAGE_MASK;
198 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 static phys_addr_t root_entry_uctp(struct root_entry *re)
206 return re->hi & VTD_PAGE_MASK;
209 static inline void context_clear_pasid_enable(struct context_entry *context)
211 context->lo &= ~(1ULL << 11);
214 static inline bool context_pasid_enabled(struct context_entry *context)
216 return !!(context->lo & (1ULL << 11));
219 static inline void context_set_copied(struct context_entry *context)
221 context->hi |= (1ull << 3);
224 static inline bool context_copied(struct context_entry *context)
226 return !!(context->hi & (1ULL << 3));
229 static inline bool __context_present(struct context_entry *context)
231 return (context->lo & 1);
234 bool context_present(struct context_entry *context)
236 return context_pasid_enabled(context) ?
237 __context_present(context) :
238 __context_present(context) && !context_copied(context);
241 static inline void context_set_present(struct context_entry *context)
246 static inline void context_set_fault_enable(struct context_entry *context)
248 context->lo &= (((u64)-1) << 2) | 1;
251 static inline void context_set_translation_type(struct context_entry *context,
254 context->lo &= (((u64)-1) << 4) | 3;
255 context->lo |= (value & 3) << 2;
258 static inline void context_set_address_root(struct context_entry *context,
261 context->lo &= ~VTD_PAGE_MASK;
262 context->lo |= value & VTD_PAGE_MASK;
265 static inline void context_set_address_width(struct context_entry *context,
268 context->hi |= value & 7;
271 static inline void context_set_domain_id(struct context_entry *context,
274 context->hi |= (value & ((1 << 16) - 1)) << 8;
277 static inline int context_domain_id(struct context_entry *c)
279 return((c->hi >> 8) & 0xffff);
282 static inline void context_clear_entry(struct context_entry *context)
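/*
 * Field layout implied by the legacy context-entry helpers above: in the
 * low qword, bit 0 is present, bit 1 is fault processing disable (cleared
 * by context_set_fault_enable()), bits 3:2 are the translation type and
 * bits 63:12 the address root; in the high qword, bits 2:0 are the
 * address width and bits 23:8 the domain id.
 */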
289 * This domain is a statically identity mapping domain.
290 * 1. This domain creates a static 1:1 mapping to all usable memory.
291 * 2. It maps to each iommu if successful.
292 * 3. Each iommu maps to this domain if successful.
294 static struct dmar_domain *si_domain;
295 static int hw_pass_through = 1;
297 /* si_domain contains multiple devices */
298 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
301 * This is a DMA domain allocated through the iommu domain allocation
302 * interface. But one or more devices belonging to this domain have
303 * been chosen to use a private domain. We should avoid using the
304 * map/unmap/iova_to_phys APIs on it.
306 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
308 #define for_each_domain_iommu(idx, domain) \
309 for (idx = 0; idx < g_num_of_iommus; idx++) \
310 if (domain->iommu_refcnt[idx])
312 struct dmar_rmrr_unit {
313 struct list_head list; /* list of rmrr units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 u64 base_address; /* reserved base address*/
316 u64 end_address; /* reserved end address */
317 struct dmar_dev_scope *devices; /* target devices */
318 int devices_cnt; /* target device count */
321 struct dmar_atsr_unit {
322 struct list_head list; /* list of ATSR units */
323 struct acpi_dmar_header *hdr; /* ACPI header */
324 struct dmar_dev_scope *devices; /* target devices */
325 int devices_cnt; /* target device count */
326 u8 include_all:1; /* include all ports */
329 static LIST_HEAD(dmar_atsr_units);
330 static LIST_HEAD(dmar_rmrr_units);
332 #define for_each_rmrr_units(rmrr) \
333 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
335 /* bitmap for indexing intel_iommus */
336 static int g_num_of_iommus;
338 static void domain_exit(struct dmar_domain *domain);
339 static void domain_remove_dev_info(struct dmar_domain *domain);
340 static void dmar_remove_one_dev_info(struct device *dev);
341 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
342 static int domain_detach_iommu(struct dmar_domain *domain,
343 struct intel_iommu *iommu);
344 static bool device_is_rmrr_locked(struct device *dev);
345 static int intel_iommu_attach_device(struct iommu_domain *domain,
348 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
349 int dmar_disabled = 0;
351 int dmar_disabled = 1;
352 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
355 int intel_iommu_enabled = 0;
356 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
358 static int dmar_map_gfx = 1;
359 static int dmar_forcedac;
360 static int intel_iommu_strict;
361 static int intel_iommu_superpage = 1;
362 static int iommu_identity_mapping;
363 static int intel_no_bounce;
365 #define IDENTMAP_ALL 1
366 #define IDENTMAP_GFX 2
367 #define IDENTMAP_AZALIA 4
369 int intel_iommu_gfx_mapped;
370 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
372 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
373 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
374 static DEFINE_SPINLOCK(device_domain_lock);
375 static LIST_HEAD(device_domain_list);
377 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
378 to_pci_dev(d)->untrusted)
381 * Iterate over elements in device_domain_list and call the specified
382 * callback @fn against each element.
384 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
385 void *data), void *data)
389 struct device_domain_info *info;
391 spin_lock_irqsave(&device_domain_lock, flags);
392 list_for_each_entry(info, &device_domain_list, global) {
393 ret = fn(info, data);
395 spin_unlock_irqrestore(&device_domain_lock, flags);
399 spin_unlock_irqrestore(&device_domain_lock, flags);
404 const struct iommu_ops intel_iommu_ops;
406 static bool translation_pre_enabled(struct intel_iommu *iommu)
408 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
411 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
413 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
416 static void init_translation_status(struct intel_iommu *iommu)
420 gsts = readl(iommu->reg + DMAR_GSTS_REG);
421 if (gsts & DMA_GSTS_TES)
422 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
425 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
426 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
428 return container_of(dom, struct dmar_domain, domain);
431 static int __init intel_iommu_setup(char *str)
436 if (!strncmp(str, "on", 2)) {
438 pr_info("IOMMU enabled\n");
439 } else if (!strncmp(str, "off", 3)) {
441 no_platform_optin = 1;
442 pr_info("IOMMU disabled\n");
443 } else if (!strncmp(str, "igfx_off", 8)) {
445 pr_info("Disable GFX device mapping\n");
446 } else if (!strncmp(str, "forcedac", 8)) {
447 pr_info("Forcing DAC for PCI devices\n");
449 } else if (!strncmp(str, "strict", 6)) {
450 pr_info("Disable batched IOTLB flush\n");
451 intel_iommu_strict = 1;
452 } else if (!strncmp(str, "sp_off", 6)) {
453 pr_info("Disable supported super page\n");
454 intel_iommu_superpage = 0;
455 } else if (!strncmp(str, "sm_on", 5)) {
456 pr_info("Intel-IOMMU: scalable mode supported\n");
458 } else if (!strncmp(str, "tboot_noforce", 13)) {
460 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
461 intel_iommu_tboot_noforce = 1;
462 } else if (!strncmp(str, "nobounce", 8)) {
463 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
467 str += strcspn(str, ",");
473 __setup("intel_iommu=", intel_iommu_setup);
475 static struct kmem_cache *iommu_domain_cache;
476 static struct kmem_cache *iommu_devinfo_cache;
478 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
480 struct dmar_domain **domains;
483 domains = iommu->domains[idx];
487 return domains[did & 0xff];
490 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
491 struct dmar_domain *domain)
493 struct dmar_domain **domains;
496 if (!iommu->domains[idx]) {
497 size_t size = 256 * sizeof(struct dmar_domain *);
498 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
501 domains = iommu->domains[idx];
502 if (WARN_ON(!domains))
505 domains[did & 0xff] = domain;
508 void *alloc_pgtable_page(int node)
513 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515 vaddr = page_address(page);
519 void free_pgtable_page(void *vaddr)
521 free_page((unsigned long)vaddr);
524 static inline void *alloc_domain_mem(void)
526 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
529 static void free_domain_mem(void *vaddr)
531 kmem_cache_free(iommu_domain_cache, vaddr);
534 static inline void * alloc_devinfo_mem(void)
536 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
539 static inline void free_devinfo_mem(void *vaddr)
541 kmem_cache_free(iommu_devinfo_cache, vaddr);
544 static inline int domain_type_is_si(struct dmar_domain *domain)
546 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
549 static inline int domain_pfn_supported(struct dmar_domain *domain,
552 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
554 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
557 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
562 sagaw = cap_sagaw(iommu->cap);
563 for (agaw = width_to_agaw(max_gaw);
565 if (test_bit(agaw, &sagaw))
573 * Calculate max SAGAW for each iommu.
575 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
577 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
581 * calculate agaw for each iommu.
582 * "SAGAW" may be different across iommus, use a default agaw, and
583 * fall back to a smaller supported agaw for iommus that don't support the default agaw.
585 int iommu_calculate_agaw(struct intel_iommu *iommu)
587 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
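/*
 * Example: with DEFAULT_DOMAIN_ADDRESS_WIDTH == 57 the search in
 * __iommu_calculate_agaw() starts at width_to_agaw(57) == 3 (a 5-level
 * table) and walks downward through the SAGAW bits; an IOMMU that only
 * advertises 4-level support therefore ends up with agaw 2, i.e. a
 * 48-bit adjusted address width.
 */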
590 /* This function only returns a single iommu in a domain */
591 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
595 /* si_domain and vm domain should not get here. */
596 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
599 for_each_domain_iommu(iommu_id, domain)
602 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
605 return g_iommus[iommu_id];
608 static void domain_update_iommu_coherency(struct dmar_domain *domain)
610 struct dmar_drhd_unit *drhd;
611 struct intel_iommu *iommu;
615 domain->iommu_coherency = 1;
617 for_each_domain_iommu(i, domain) {
619 if (!ecap_coherent(g_iommus[i]->ecap)) {
620 domain->iommu_coherency = 0;
627 /* No hardware attached; use lowest common denominator */
629 for_each_active_iommu(iommu, drhd) {
630 if (!ecap_coherent(iommu->ecap)) {
631 domain->iommu_coherency = 0;
638 static int domain_update_iommu_snooping(struct intel_iommu *skip)
640 struct dmar_drhd_unit *drhd;
641 struct intel_iommu *iommu;
645 for_each_active_iommu(iommu, drhd) {
647 if (!ecap_sc_support(iommu->ecap)) {
658 static int domain_update_iommu_superpage(struct intel_iommu *skip)
660 struct dmar_drhd_unit *drhd;
661 struct intel_iommu *iommu;
664 if (!intel_iommu_superpage) {
668 /* set iommu_superpage to the smallest common denominator */
670 for_each_active_iommu(iommu, drhd) {
672 mask &= cap_super_page_val(iommu->cap);
682 /* Some capabilities may be different across iommus */
683 static void domain_update_iommu_cap(struct dmar_domain *domain)
685 domain_update_iommu_coherency(domain);
686 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
687 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
690 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
693 struct root_entry *root = &iommu->root_entry[bus];
694 struct context_entry *context;
698 if (sm_supported(iommu)) {
706 context = phys_to_virt(*entry & VTD_PAGE_MASK);
708 unsigned long phy_addr;
712 context = alloc_pgtable_page(iommu->node);
716 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
717 phy_addr = virt_to_phys((void *)context);
718 *entry = phy_addr | 1;
719 __iommu_flush_cache(iommu, entry, sizeof(*entry));
721 return &context[devfn];
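/*
 * In legacy mode a root entry points to a single 256-entry context table.
 * In scalable mode the entry carries two pointers - the lower and upper
 * context tables returned by root_entry_lctp() and root_entry_uctp()
 * above - each covering 128 device functions, which is why
 * free_context_table() below also looks up devfn 0x80 when scalable mode
 * is supported.
 */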
724 static int iommu_dummy(struct device *dev)
726 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
730 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
731 * sub-hierarchy of a candidate PCI-PCI bridge
732 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
733 * @bridge: the candidate PCI-PCI bridge
735 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
738 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
740 struct pci_dev *pdev, *pbridge;
742 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
745 pdev = to_pci_dev(dev);
746 pbridge = to_pci_dev(bridge);
748 if (pbridge->subordinate &&
749 pbridge->subordinate->number <= pdev->bus->number &&
750 pbridge->subordinate->busn_res.end >= pdev->bus->number)
756 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
758 struct dmar_drhd_unit *drhd = NULL;
759 struct intel_iommu *iommu;
761 struct pci_dev *pdev = NULL;
765 if (iommu_dummy(dev))
768 if (dev_is_pci(dev)) {
769 struct pci_dev *pf_pdev;
771 pdev = to_pci_dev(dev);
774 /* VMD child devices currently cannot be handled individually */
775 if (is_vmd(pdev->bus))
779 /* VFs aren't listed in scope tables; we need to look up
780 * the PF instead to find the IOMMU. */
781 pf_pdev = pci_physfn(pdev);
783 segment = pci_domain_nr(pdev->bus);
784 } else if (has_acpi_companion(dev))
785 dev = &ACPI_COMPANION(dev)->dev;
788 for_each_active_iommu(iommu, drhd) {
789 if (pdev && segment != drhd->segment)
792 for_each_active_dev_scope(drhd->devices,
793 drhd->devices_cnt, i, tmp) {
795 /* For a VF use its original BDF# not that of the PF
796 * which we used for the IOMMU lookup. Strictly speaking
797 * we could do this for all PCI devices; we only need to
798 * get the BDF# from the scope table for ACPI matches. */
799 if (pdev && pdev->is_virtfn)
802 *bus = drhd->devices[i].bus;
803 *devfn = drhd->devices[i].devfn;
807 if (is_downstream_to_pci_bridge(dev, tmp))
811 if (pdev && drhd->include_all) {
813 *bus = pdev->bus->number;
814 *devfn = pdev->devfn;
825 static void domain_flush_cache(struct dmar_domain *domain,
826 void *addr, int size)
828 if (!domain->iommu_coherency)
829 clflush_cache_range(addr, size);
832 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
834 struct context_entry *context;
838 spin_lock_irqsave(&iommu->lock, flags);
839 context = iommu_context_addr(iommu, bus, devfn, 0);
841 ret = context_present(context);
842 spin_unlock_irqrestore(&iommu->lock, flags);
846 static void free_context_table(struct intel_iommu *iommu)
850 struct context_entry *context;
852 spin_lock_irqsave(&iommu->lock, flags);
853 if (!iommu->root_entry) {
856 for (i = 0; i < ROOT_ENTRY_NR; i++) {
857 context = iommu_context_addr(iommu, i, 0, 0);
859 free_pgtable_page(context);
861 if (!sm_supported(iommu))
864 context = iommu_context_addr(iommu, i, 0x80, 0);
866 free_pgtable_page(context);
869 free_pgtable_page(iommu->root_entry);
870 iommu->root_entry = NULL;
872 spin_unlock_irqrestore(&iommu->lock, flags);
875 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
876 unsigned long pfn, int *target_level)
878 struct dma_pte *parent, *pte;
879 int level = agaw_to_level(domain->agaw);
882 BUG_ON(!domain->pgd);
884 if (!domain_pfn_supported(domain, pfn))
885 /* Address beyond IOMMU's addressing capabilities. */
888 parent = domain->pgd;
893 offset = pfn_level_offset(pfn, level);
894 pte = &parent[offset];
895 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
897 if (level == *target_level)
900 if (!dma_pte_present(pte)) {
903 tmp_page = alloc_pgtable_page(domain->nid);
908 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
909 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
910 if (cmpxchg64(&pte->val, 0ULL, pteval))
911 /* Someone else set it while we were thinking; use theirs. */
912 free_pgtable_page(tmp_page);
914 domain_flush_cache(domain, pte, sizeof(*pte));
919 parent = phys_to_virt(dma_pte_addr(pte));
924 *target_level = level;
929 /* return address's pte at specific level */
930 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
932 int level, int *large_page)
934 struct dma_pte *parent, *pte;
935 int total = agaw_to_level(domain->agaw);
938 parent = domain->pgd;
939 while (level <= total) {
940 offset = pfn_level_offset(pfn, total);
941 pte = &parent[offset];
945 if (!dma_pte_present(pte)) {
950 if (dma_pte_superpage(pte)) {
955 parent = phys_to_virt(dma_pte_addr(pte));
961 /* clear last level pte; a tlb flush should follow */
962 static void dma_pte_clear_range(struct dmar_domain *domain,
963 unsigned long start_pfn,
964 unsigned long last_pfn)
966 unsigned int large_page;
967 struct dma_pte *first_pte, *pte;
969 BUG_ON(!domain_pfn_supported(domain, start_pfn));
970 BUG_ON(!domain_pfn_supported(domain, last_pfn));
971 BUG_ON(start_pfn > last_pfn);
973 /* we don't need lock here; nobody else touches the iova range */
976 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
978 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
983 start_pfn += lvl_to_nr_pages(large_page);
985 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
987 domain_flush_cache(domain, first_pte,
988 (void *)pte - (void *)first_pte);
990 } while (start_pfn && start_pfn <= last_pfn);
993 static void dma_pte_free_level(struct dmar_domain *domain, int level,
994 int retain_level, struct dma_pte *pte,
995 unsigned long pfn, unsigned long start_pfn,
996 unsigned long last_pfn)
998 pfn = max(start_pfn, pfn);
999 pte = &pte[pfn_level_offset(pfn, level)];
1002 unsigned long level_pfn;
1003 struct dma_pte *level_pte;
1005 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1008 level_pfn = pfn & level_mask(level);
1009 level_pte = phys_to_virt(dma_pte_addr(pte));
1012 dma_pte_free_level(domain, level - 1, retain_level,
1013 level_pte, level_pfn, start_pfn,
1018 * Free the page table if we're below the level we want to
1019 * retain and the range covers the entire table.
1021 if (level < retain_level && !(start_pfn > level_pfn ||
1022 last_pfn < level_pfn + level_size(level) - 1)) {
1024 domain_flush_cache(domain, pte, sizeof(*pte));
1025 free_pgtable_page(level_pte);
1028 pfn += level_size(level);
1029 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1033 * clear last level (leaf) ptes and free page table pages below the
1034 * level we wish to keep intact.
1036 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1037 unsigned long start_pfn,
1038 unsigned long last_pfn,
1041 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1042 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1043 BUG_ON(start_pfn > last_pfn);
1045 dma_pte_clear_range(domain, start_pfn, last_pfn);
1047 /* We don't need lock here; nobody else touches the iova range */
1048 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1049 domain->pgd, 0, start_pfn, last_pfn);
1052 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1053 free_pgtable_page(domain->pgd);
1058 /* When a page at a given level is being unlinked from its parent, we don't
1059 need to *modify* it at all. All we need to do is make a list of all the
1060 pages which can be freed just as soon as we've flushed the IOTLB and we
1061 know the hardware page-walk will no longer touch them.
1062 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1063 be freed. */
1064 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1065 int level, struct dma_pte *pte,
1066 struct page *freelist)
1070 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1071 pg->freelist = freelist;
1077 pte = page_address(pg);
1079 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1080 freelist = dma_pte_list_pagetables(domain, level - 1,
1083 } while (!first_pte_in_page(pte));
1088 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1089 struct dma_pte *pte, unsigned long pfn,
1090 unsigned long start_pfn,
1091 unsigned long last_pfn,
1092 struct page *freelist)
1094 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1096 pfn = max(start_pfn, pfn);
1097 pte = &pte[pfn_level_offset(pfn, level)];
1100 unsigned long level_pfn;
1102 if (!dma_pte_present(pte))
1105 level_pfn = pfn & level_mask(level);
1107 /* If range covers entire pagetable, free it */
1108 if (start_pfn <= level_pfn &&
1109 last_pfn >= level_pfn + level_size(level) - 1) {
1110 /* These subordinate page tables are going away entirely. Don't
1111 bother to clear them; we're just going to *free* them. */
1112 if (level > 1 && !dma_pte_superpage(pte))
1113 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1119 } else if (level > 1) {
1120 /* Recurse down into a level that isn't *entirely* obsolete */
1121 freelist = dma_pte_clear_level(domain, level - 1,
1122 phys_to_virt(dma_pte_addr(pte)),
1123 level_pfn, start_pfn, last_pfn,
1127 pfn += level_size(level);
1128 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1131 domain_flush_cache(domain, first_pte,
1132 (void *)++last_pte - (void *)first_pte);
1137 /* We can't just free the pages because the IOMMU may still be walking
1138 the page tables, and may have cached the intermediate levels. The
1139 pages can only be freed after the IOTLB flush has been done. */
1140 static struct page *domain_unmap(struct dmar_domain *domain,
1141 unsigned long start_pfn,
1142 unsigned long last_pfn)
1144 struct page *freelist;
1146 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1147 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1148 BUG_ON(start_pfn > last_pfn);
1150 /* we don't need lock here; nobody else touches the iova range */
1151 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1152 domain->pgd, 0, start_pfn, last_pfn, NULL);
1155 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1156 struct page *pgd_page = virt_to_page(domain->pgd);
1157 pgd_page->freelist = freelist;
1158 freelist = pgd_page;
1166 static void dma_free_pagelist(struct page *freelist)
1170 while ((pg = freelist)) {
1171 freelist = pg->freelist;
1172 free_pgtable_page(page_address(pg));
1176 static void iova_entry_free(unsigned long data)
1178 struct page *freelist = (struct page *)data;
1180 dma_free_pagelist(freelist);
1183 /* iommu handling */
1184 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1186 struct root_entry *root;
1187 unsigned long flags;
1189 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1191 pr_err("Allocating root entry for %s failed\n",
1196 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1198 spin_lock_irqsave(&iommu->lock, flags);
1199 iommu->root_entry = root;
1200 spin_unlock_irqrestore(&iommu->lock, flags);
1205 static void iommu_set_root_entry(struct intel_iommu *iommu)
1211 addr = virt_to_phys(iommu->root_entry);
1212 if (sm_supported(iommu))
1213 addr |= DMA_RTADDR_SMT;
1215 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1216 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1218 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1220 /* Make sure hardware completes it */
1221 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1222 readl, (sts & DMA_GSTS_RTPS), sts);
1224 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1227 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1232 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1235 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1236 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1238 /* Make sure hardware completes it */
1239 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240 readl, (!(val & DMA_GSTS_WBFS)), val);
1242 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1245 /* return value determines if we need a write buffer flush */
1246 static void __iommu_flush_context(struct intel_iommu *iommu,
1247 u16 did, u16 source_id, u8 function_mask,
1254 case DMA_CCMD_GLOBAL_INVL:
1255 val = DMA_CCMD_GLOBAL_INVL;
1257 case DMA_CCMD_DOMAIN_INVL:
1258 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1260 case DMA_CCMD_DEVICE_INVL:
1261 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1262 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1267 val |= DMA_CCMD_ICC;
1269 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1270 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1272 /* Make sure hardware completes it */
1273 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1274 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1276 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1279 /* return value determines if we need a write buffer flush */
1280 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1281 u64 addr, unsigned int size_order, u64 type)
1283 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1284 u64 val = 0, val_iva = 0;
1288 case DMA_TLB_GLOBAL_FLUSH:
1289 /* global flush doesn't need set IVA_REG */
1290 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1292 case DMA_TLB_DSI_FLUSH:
1293 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1295 case DMA_TLB_PSI_FLUSH:
1296 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1297 /* IH bit is passed in as part of address */
1298 val_iva = size_order | addr;
1303 /* Note: set drain read/write */
1306 * This is probably meant to be extra safe. It looks like we can
1307 * ignore it without any impact.
1309 if (cap_read_drain(iommu->cap))
1310 val |= DMA_TLB_READ_DRAIN;
1312 if (cap_write_drain(iommu->cap))
1313 val |= DMA_TLB_WRITE_DRAIN;
1315 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1316 /* Note: Only uses first TLB reg currently */
1318 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1319 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1321 /* Make sure hardware completes it */
1322 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1323 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1325 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1327 /* check IOTLB invalidation granularity */
1328 if (DMA_TLB_IAIG(val) == 0)
1329 pr_err("Flush IOTLB failed\n");
1330 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1331 pr_debug("TLB flush request %Lx, actual %Lx\n",
1332 (unsigned long long)DMA_TLB_IIRG(type),
1333 (unsigned long long)DMA_TLB_IAIG(val));
1336 static struct device_domain_info *
1337 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1340 struct device_domain_info *info;
1342 assert_spin_locked(&device_domain_lock);
1347 list_for_each_entry(info, &domain->devices, link)
1348 if (info->iommu == iommu && info->bus == bus &&
1349 info->devfn == devfn) {
1350 if (info->ats_supported && info->dev)
1358 static void domain_update_iotlb(struct dmar_domain *domain)
1360 struct device_domain_info *info;
1361 bool has_iotlb_device = false;
1363 assert_spin_locked(&device_domain_lock);
1365 list_for_each_entry(info, &domain->devices, link) {
1366 struct pci_dev *pdev;
1368 if (!info->dev || !dev_is_pci(info->dev))
1371 pdev = to_pci_dev(info->dev);
1372 if (pdev->ats_enabled) {
1373 has_iotlb_device = true;
1378 domain->has_iotlb_device = has_iotlb_device;
1381 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1383 struct pci_dev *pdev;
1385 assert_spin_locked(&device_domain_lock);
1387 if (!info || !dev_is_pci(info->dev))
1390 pdev = to_pci_dev(info->dev);
1391 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1392 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1393 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1394 * reserved, which should be set to 0.
1396 if (!ecap_dit(info->iommu->ecap))
1399 struct pci_dev *pf_pdev;
1401 /* pdev will be returned if the device is not a VF */
1402 pf_pdev = pci_physfn(pdev);
1403 info->pfsid = pci_dev_id(pf_pdev);
1406 #ifdef CONFIG_INTEL_IOMMU_SVM
1407 /* The PCIe spec, in its wisdom, declares that the behaviour of
1408 the device if you enable PASID support after ATS support is
1409 undefined. So always enable PASID support on devices which
1410 have it, even if we can't yet know if we're ever going to
1411 use it. */
1412 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1413 info->pasid_enabled = 1;
1415 if (info->pri_supported &&
1416 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1417 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1418 info->pri_enabled = 1;
1420 if (!pdev->untrusted && info->ats_supported &&
1421 pci_ats_page_aligned(pdev) &&
1422 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1423 info->ats_enabled = 1;
1424 domain_update_iotlb(info->domain);
1425 info->ats_qdep = pci_ats_queue_depth(pdev);
1429 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1431 struct pci_dev *pdev;
1433 assert_spin_locked(&device_domain_lock);
1435 if (!dev_is_pci(info->dev))
1438 pdev = to_pci_dev(info->dev);
1440 if (info->ats_enabled) {
1441 pci_disable_ats(pdev);
1442 info->ats_enabled = 0;
1443 domain_update_iotlb(info->domain);
1445 #ifdef CONFIG_INTEL_IOMMU_SVM
1446 if (info->pri_enabled) {
1447 pci_disable_pri(pdev);
1448 info->pri_enabled = 0;
1450 if (info->pasid_enabled) {
1451 pci_disable_pasid(pdev);
1452 info->pasid_enabled = 0;
1457 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1458 u64 addr, unsigned mask)
1461 unsigned long flags;
1462 struct device_domain_info *info;
1464 if (!domain->has_iotlb_device)
1467 spin_lock_irqsave(&device_domain_lock, flags);
1468 list_for_each_entry(info, &domain->devices, link) {
1469 if (!info->ats_enabled)
1472 sid = info->bus << 8 | info->devfn;
1473 qdep = info->ats_qdep;
1474 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1477 spin_unlock_irqrestore(&device_domain_lock, flags);
1480 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1481 struct dmar_domain *domain,
1482 unsigned long pfn, unsigned int pages,
1485 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1486 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1487 u16 did = domain->iommu_did[iommu->seq_id];
1494 * Fall back to domain selective flush if no PSI support or the size is
1495 * too big.
1496 * PSI requires page size to be 2 ^ x, and the base address is naturally
1497 * aligned to the size
1499 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1500 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1503 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1507 * In caching mode, changes of pages from non-present to present require
1508 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1510 if (!cap_caching_mode(iommu->cap) || !map)
1511 iommu_flush_dev_iotlb(domain, addr, mask);
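/*
 * The PSI mask above is the log2 of the flushed size rounded up to a
 * power of two: flushing 5 pages yields mask == 3, i.e. an 8-page,
 * naturally aligned invalidation, so the domain-selective fallback also
 * kicks in when this rounded mask exceeds cap_max_amask_val().
 */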
1514 /* Notification for newly created mappings */
1515 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1516 struct dmar_domain *domain,
1517 unsigned long pfn, unsigned int pages)
1519 /* It's a non-present to present mapping. Only flush if caching mode */
1520 if (cap_caching_mode(iommu->cap))
1521 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1523 iommu_flush_write_buffer(iommu);
1526 static void iommu_flush_iova(struct iova_domain *iovad)
1528 struct dmar_domain *domain;
1531 domain = container_of(iovad, struct dmar_domain, iovad);
1533 for_each_domain_iommu(idx, domain) {
1534 struct intel_iommu *iommu = g_iommus[idx];
1535 u16 did = domain->iommu_did[iommu->seq_id];
1537 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1539 if (!cap_caching_mode(iommu->cap))
1540 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1541 0, MAX_AGAW_PFN_WIDTH);
1545 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1548 unsigned long flags;
1550 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1553 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1554 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1555 pmen &= ~DMA_PMEN_EPM;
1556 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1558 /* wait for the protected region status bit to clear */
1559 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1560 readl, !(pmen & DMA_PMEN_PRS), pmen);
1562 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1565 static void iommu_enable_translation(struct intel_iommu *iommu)
1568 unsigned long flags;
1570 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1571 iommu->gcmd |= DMA_GCMD_TE;
1572 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1574 /* Make sure hardware completes it */
1575 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1576 readl, (sts & DMA_GSTS_TES), sts);
1578 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1581 static void iommu_disable_translation(struct intel_iommu *iommu)
1586 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1587 iommu->gcmd &= ~DMA_GCMD_TE;
1588 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1590 /* Make sure hardware completes it */
1591 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1592 readl, (!(sts & DMA_GSTS_TES)), sts);
1594 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1597 static int iommu_init_domains(struct intel_iommu *iommu)
1599 u32 ndomains, nlongs;
1602 ndomains = cap_ndoms(iommu->cap);
1603 pr_debug("%s: Number of Domains supported <%d>\n",
1604 iommu->name, ndomains);
1605 nlongs = BITS_TO_LONGS(ndomains);
1607 spin_lock_init(&iommu->lock);
1609 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1610 if (!iommu->domain_ids) {
1611 pr_err("%s: Allocating domain id array failed\n",
1616 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1617 iommu->domains = kzalloc(size, GFP_KERNEL);
1619 if (iommu->domains) {
1620 size = 256 * sizeof(struct dmar_domain *);
1621 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1624 if (!iommu->domains || !iommu->domains[0]) {
1625 pr_err("%s: Allocating domain array failed\n",
1627 kfree(iommu->domain_ids);
1628 kfree(iommu->domains);
1629 iommu->domain_ids = NULL;
1630 iommu->domains = NULL;
1635 * If Caching mode is set, then invalid translations are tagged
1636 * with domain-id 0, hence we need to pre-allocate it. We also
1637 * use domain-id 0 as a marker for non-allocated domain-id, so
1638 * make sure it is not used for a real domain.
1640 set_bit(0, iommu->domain_ids);
1643 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1644 * entry for first-level or pass-through translation modes should
1645 * be programmed with a domain id different from those used for
1646 * second-level or nested translation. We reserve a domain id for
1647 * this purpose.
1649 if (sm_supported(iommu))
1650 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1655 static void disable_dmar_iommu(struct intel_iommu *iommu)
1657 struct device_domain_info *info, *tmp;
1658 unsigned long flags;
1660 if (!iommu->domains || !iommu->domain_ids)
1663 spin_lock_irqsave(&device_domain_lock, flags);
1664 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1665 if (info->iommu != iommu)
1668 if (!info->dev || !info->domain)
1671 __dmar_remove_one_dev_info(info);
1673 spin_unlock_irqrestore(&device_domain_lock, flags);
1675 if (iommu->gcmd & DMA_GCMD_TE)
1676 iommu_disable_translation(iommu);
1679 static void free_dmar_iommu(struct intel_iommu *iommu)
1681 if ((iommu->domains) && (iommu->domain_ids)) {
1682 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1685 for (i = 0; i < elems; i++)
1686 kfree(iommu->domains[i]);
1687 kfree(iommu->domains);
1688 kfree(iommu->domain_ids);
1689 iommu->domains = NULL;
1690 iommu->domain_ids = NULL;
1693 g_iommus[iommu->seq_id] = NULL;
1695 /* free context mapping */
1696 free_context_table(iommu);
1698 #ifdef CONFIG_INTEL_IOMMU_SVM
1699 if (pasid_supported(iommu)) {
1700 if (ecap_prs(iommu->ecap))
1701 intel_svm_finish_prq(iommu);
1706 static struct dmar_domain *alloc_domain(int flags)
1708 struct dmar_domain *domain;
1710 domain = alloc_domain_mem();
1714 memset(domain, 0, sizeof(*domain));
1715 domain->nid = NUMA_NO_NODE;
1716 domain->flags = flags;
1717 domain->has_iotlb_device = false;
1718 INIT_LIST_HEAD(&domain->devices);
1723 /* Must be called with iommu->lock */
1724 static int domain_attach_iommu(struct dmar_domain *domain,
1725 struct intel_iommu *iommu)
1727 unsigned long ndomains;
1730 assert_spin_locked(&device_domain_lock);
1731 assert_spin_locked(&iommu->lock);
1733 domain->iommu_refcnt[iommu->seq_id] += 1;
1734 domain->iommu_count += 1;
1735 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1736 ndomains = cap_ndoms(iommu->cap);
1737 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1739 if (num >= ndomains) {
1740 pr_err("%s: No free domain ids\n", iommu->name);
1741 domain->iommu_refcnt[iommu->seq_id] -= 1;
1742 domain->iommu_count -= 1;
1746 set_bit(num, iommu->domain_ids);
1747 set_iommu_domain(iommu, num, domain);
1749 domain->iommu_did[iommu->seq_id] = num;
1750 domain->nid = iommu->node;
1752 domain_update_iommu_cap(domain);
1758 static int domain_detach_iommu(struct dmar_domain *domain,
1759 struct intel_iommu *iommu)
1763 assert_spin_locked(&device_domain_lock);
1764 assert_spin_locked(&iommu->lock);
1766 domain->iommu_refcnt[iommu->seq_id] -= 1;
1767 count = --domain->iommu_count;
1768 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1769 num = domain->iommu_did[iommu->seq_id];
1770 clear_bit(num, iommu->domain_ids);
1771 set_iommu_domain(iommu, num, NULL);
1773 domain_update_iommu_cap(domain);
1774 domain->iommu_did[iommu->seq_id] = 0;
1780 static struct iova_domain reserved_iova_list;
1781 static struct lock_class_key reserved_rbtree_key;
1783 static int dmar_init_reserved_ranges(void)
1785 struct pci_dev *pdev = NULL;
1789 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1791 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1792 &reserved_rbtree_key);
1794 /* IOAPIC ranges shouldn't be accessed by DMA */
1795 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1796 IOVA_PFN(IOAPIC_RANGE_END));
1798 pr_err("Reserve IOAPIC range failed\n");
1802 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1803 for_each_pci_dev(pdev) {
1806 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1807 r = &pdev->resource[i];
1808 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1810 iova = reserve_iova(&reserved_iova_list,
1814 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1822 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1824 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1827 static inline int guestwidth_to_adjustwidth(int gaw)
1830 int r = (gaw - 12) % 9;
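/*
 * The remainder computed above drives the rounding: the adjusted width
 * is the guest width rounded up so that (width - 12) is a multiple of
 * the 9-bit level stride, e.g. a 48-bit guest width stays 48 while a
 * 50-bit one would round up to 57.
 */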
1841 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1844 int adjust_width, agaw;
1845 unsigned long sagaw;
1848 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1850 err = init_iova_flush_queue(&domain->iovad,
1851 iommu_flush_iova, iova_entry_free);
1855 domain_reserve_special_ranges(domain);
1857 /* calculate AGAW */
1858 if (guest_width > cap_mgaw(iommu->cap))
1859 guest_width = cap_mgaw(iommu->cap);
1860 domain->gaw = guest_width;
1861 adjust_width = guestwidth_to_adjustwidth(guest_width);
1862 agaw = width_to_agaw(adjust_width);
1863 sagaw = cap_sagaw(iommu->cap);
1864 if (!test_bit(agaw, &sagaw)) {
1865 /* hardware doesn't support it, choose a bigger one */
1866 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1867 agaw = find_next_bit(&sagaw, 5, agaw);
1871 domain->agaw = agaw;
1873 if (ecap_coherent(iommu->ecap))
1874 domain->iommu_coherency = 1;
1876 domain->iommu_coherency = 0;
1878 if (ecap_sc_support(iommu->ecap))
1879 domain->iommu_snooping = 1;
1881 domain->iommu_snooping = 0;
1883 if (intel_iommu_superpage)
1884 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1886 domain->iommu_superpage = 0;
1888 domain->nid = iommu->node;
1890 /* always allocate the top pgd */
1891 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1894 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1898 static void domain_exit(struct dmar_domain *domain)
1901 /* Remove associated devices and clear attached or cached domains */
1902 domain_remove_dev_info(domain);
1905 put_iova_domain(&domain->iovad);
1908 struct page *freelist;
1910 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1911 dma_free_pagelist(freelist);
1914 free_domain_mem(domain);
1918 * Get the PASID directory size for scalable mode context entry.
1919 * Value of X in the PDTS field of a scalable mode context entry
1920 * indicates PASID directory with 2^(X + 7) entries.
1922 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1926 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1927 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
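/*
 * PDTS coding examples for the 2^(X + 7) rule above: X == 0 describes a
 * 128-entry PASID directory, X == 3 describes 1024 entries, and X == 13
 * covers the full 2^20 PASID space.
 */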
1935 * Set the RID_PASID field of a scalable mode context entry. The
1936 * IOMMU hardware will use the PASID value set in this field for
1937 * DMA translations of DMA requests without PASID.
1940 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1942 context->hi |= pasid & ((1 << 20) - 1);
1943 context->hi |= (1 << 20);
1947 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1950 static inline void context_set_sm_dte(struct context_entry *context)
1952 context->lo |= (1 << 2);
1956 * Set the PRE(Page Request Enable) field of a scalable mode context
1959 static inline void context_set_sm_pre(struct context_entry *context)
1961 context->lo |= (1 << 4);
1964 /* Convert value to context PASID directory size field coding. */
1965 #define context_pdts(pds) (((pds) & 0x7) << 9)
1967 static int domain_context_mapping_one(struct dmar_domain *domain,
1968 struct intel_iommu *iommu,
1969 struct pasid_table *table,
1972 u16 did = domain->iommu_did[iommu->seq_id];
1973 int translation = CONTEXT_TT_MULTI_LEVEL;
1974 struct device_domain_info *info = NULL;
1975 struct context_entry *context;
1976 unsigned long flags;
1981 if (hw_pass_through && domain_type_is_si(domain))
1982 translation = CONTEXT_TT_PASS_THROUGH;
1984 pr_debug("Set context mapping for %02x:%02x.%d\n",
1985 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1987 BUG_ON(!domain->pgd);
1989 spin_lock_irqsave(&device_domain_lock, flags);
1990 spin_lock(&iommu->lock);
1993 context = iommu_context_addr(iommu, bus, devfn, 1);
1998 if (context_present(context))
2002 * For kdump cases, old valid entries may be cached due to the
2003 * in-flight DMA and copied pgtable, but there is no unmapping
2004 * behaviour for them, thus we need an explicit cache flush for
2005 * the newly-mapped device. For kdump, at this point, the device
2006 * is supposed to finish reset at its driver probe stage, so no
2007 * in-flight DMA will exist, and we don't need to worry anymore
2008 * hereafter.
2010 if (context_copied(context)) {
2011 u16 did_old = context_domain_id(context);
2013 if (did_old < cap_ndoms(iommu->cap)) {
2014 iommu->flush.flush_context(iommu, did_old,
2015 (((u16)bus) << 8) | devfn,
2016 DMA_CCMD_MASK_NOBIT,
2017 DMA_CCMD_DEVICE_INVL);
2018 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2023 context_clear_entry(context);
2025 if (sm_supported(iommu)) {
2030 /* Setup the PASID DIR pointer: */
2031 pds = context_get_sm_pds(table);
2032 context->lo = (u64)virt_to_phys(table->table) |
2035 /* Setup the RID_PASID field: */
2036 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2039 * Setup the Device-TLB enable bit and Page request
2042 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2043 if (info && info->ats_supported)
2044 context_set_sm_dte(context);
2045 if (info && info->pri_supported)
2046 context_set_sm_pre(context);
2048 struct dma_pte *pgd = domain->pgd;
2051 context_set_domain_id(context, did);
2053 if (translation != CONTEXT_TT_PASS_THROUGH) {
2055 * Skip top levels of page tables for an iommu which has
2056 * less agaw than the default. Unnecessary for PT mode.
2058 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2060 pgd = phys_to_virt(dma_pte_addr(pgd));
2061 if (!dma_pte_present(pgd))
2065 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2066 if (info && info->ats_supported)
2067 translation = CONTEXT_TT_DEV_IOTLB;
2069 translation = CONTEXT_TT_MULTI_LEVEL;
2071 context_set_address_root(context, virt_to_phys(pgd));
2072 context_set_address_width(context, agaw);
2075 * In pass-through mode, AW must be programmed to
2076 * indicate the largest AGAW value supported by
2077 * hardware, and ASR is ignored by hardware.
2079 context_set_address_width(context, iommu->msagaw);
2082 context_set_translation_type(context, translation);
2085 context_set_fault_enable(context);
2086 context_set_present(context);
2087 domain_flush_cache(domain, context, sizeof(*context));
2090 * It's a non-present to present mapping. If hardware doesn't cache
2091 * non-present entries we only need to flush the write-buffer. If it
2092 * _does_ cache non-present entries, then it does so in the special
2093 * domain #0, which we have to flush:
2095 if (cap_caching_mode(iommu->cap)) {
2096 iommu->flush.flush_context(iommu, 0,
2097 (((u16)bus) << 8) | devfn,
2098 DMA_CCMD_MASK_NOBIT,
2099 DMA_CCMD_DEVICE_INVL);
2100 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2102 iommu_flush_write_buffer(iommu);
2104 iommu_enable_dev_iotlb(info);
2109 spin_unlock(&iommu->lock);
2110 spin_unlock_irqrestore(&device_domain_lock, flags);
2116 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2118 struct pasid_table *table;
2119 struct intel_iommu *iommu;
2122 iommu = device_to_iommu(dev, &bus, &devfn);
2126 table = intel_pasid_get_table(dev);
2127 return domain_context_mapping_one(domain, iommu, table, bus, devfn);
2130 static int domain_context_mapped_cb(struct pci_dev *pdev,
2131 u16 alias, void *opaque)
2133 struct intel_iommu *iommu = opaque;
2135 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2138 static int domain_context_mapped(struct device *dev)
2140 struct intel_iommu *iommu;
2143 iommu = device_to_iommu(dev, &bus, &devfn);
2147 if (!dev_is_pci(dev))
2148 return device_context_mapped(iommu, bus, devfn);
2150 return !pci_for_each_dma_alias(to_pci_dev(dev),
2151 domain_context_mapped_cb, iommu);
2154 /* Returns a number of VTD pages, but aligned to MM page size */
2155 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2158 host_addr &= ~PAGE_MASK;
2159 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
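/*
 * Example with 4KiB pages: host_addr == 0x1234 and size == 0x2000 leaves
 * an in-page offset of 0x234, PAGE_ALIGN(0x2234) == 0x3000, so three
 * VT-d pages are needed even though the raw length is only two pages.
 */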
2162 /* Return largest possible superpage level for a given mapping */
2163 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2164 unsigned long iov_pfn,
2165 unsigned long phy_pfn,
2166 unsigned long pages)
2168 int support, level = 1;
2169 unsigned long pfnmerge;
2171 support = domain->iommu_superpage;
2173 /* To use a large page, the virtual *and* physical addresses
2174 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2175 of them will mean we have to use smaller pages. So just
2176 merge them and check both at once. */
2177 pfnmerge = iov_pfn | phy_pfn;
2179 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2180 pages >>= VTD_STRIDE_SHIFT;
2183 pfnmerge >>= VTD_STRIDE_SHIFT;
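/*
 * Example: an IOMMU advertising 2MiB and 1GiB superpages has
 * iommu_superpage == 2; if both iov_pfn and phy_pfn are 512-page (2MiB)
 * aligned and the request covers at least 512 pages, the loop above
 * settles on level 2, while a stray low bit in either PFN keeps the
 * mapping at 4KiB granularity.
 */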
2190 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2191 struct scatterlist *sg, unsigned long phys_pfn,
2192 unsigned long nr_pages, int prot)
2194 struct dma_pte *first_pte = NULL, *pte = NULL;
2195 phys_addr_t uninitialized_var(pteval);
2196 unsigned long sg_res = 0;
2197 unsigned int largepage_lvl = 0;
2198 unsigned long lvl_pages = 0;
2200 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2202 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2205 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2209 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2212 while (nr_pages > 0) {
2216 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2218 sg_res = aligned_nrpages(sg->offset, sg->length);
2219 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2220 sg->dma_length = sg->length;
2221 pteval = (sg_phys(sg) - pgoff) | prot;
2222 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2226 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2228 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2231 /* It is a large page */
2232 if (largepage_lvl > 1) {
2233 unsigned long nr_superpages, end_pfn;
2235 pteval |= DMA_PTE_LARGE_PAGE;
2236 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2238 nr_superpages = sg_res / lvl_pages;
2239 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2242 * Ensure that old small page tables are
2243 * removed to make room for superpage(s).
2244 * We're adding new large pages, so make sure
2245 * we don't remove their parent tables.
2247 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2250 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2254 /* We don't need lock here, nobody else
2255 * touches the iova range
2257 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2259 static int dumps = 5;
2260 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2261 iov_pfn, tmp, (unsigned long long)pteval);
2264 debug_dma_dump_mappings(NULL);
2269 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2271 BUG_ON(nr_pages < lvl_pages);
2272 BUG_ON(sg_res < lvl_pages);
2274 nr_pages -= lvl_pages;
2275 iov_pfn += lvl_pages;
2276 phys_pfn += lvl_pages;
2277 pteval += lvl_pages * VTD_PAGE_SIZE;
2278 sg_res -= lvl_pages;
2280 /* If the next PTE would be the first in a new page, then we
2281 need to flush the cache on the entries we've just written.
2282 And then we'll need to recalculate 'pte', so clear it and
2283 let it get set again in the if (!pte) block above.
2285 If we're done (!nr_pages) we need to flush the cache too.
2287 Also if we've been setting superpages, we may need to
2288 recalculate 'pte' and switch back to smaller pages for the
2289 end of the mapping, if the trailing size is not enough to
2290 use another superpage (i.e. sg_res < lvl_pages). */
2292 if (!nr_pages || first_pte_in_page(pte) ||
2293 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2294 domain_flush_cache(domain, first_pte,
2295 (void *)pte - (void *)first_pte);
2299 if (!sg_res && nr_pages)
2305 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2306 struct scatterlist *sg, unsigned long phys_pfn,
2307 unsigned long nr_pages, int prot)
2310 struct intel_iommu *iommu;
2312 /* Do the real mapping first */
2313 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2317 for_each_domain_iommu(iommu_id, domain) {
2318 iommu = g_iommus[iommu_id];
2319 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2325 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2326 struct scatterlist *sg, unsigned long nr_pages,
2329 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2332 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2333 unsigned long phys_pfn, unsigned long nr_pages,
2336 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2339 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2341 unsigned long flags;
2342 struct context_entry *context;
2348 spin_lock_irqsave(&iommu->lock, flags);
2349 context = iommu_context_addr(iommu, bus, devfn, 0);
2351 spin_unlock_irqrestore(&iommu->lock, flags);
2354 did_old = context_domain_id(context);
2355 context_clear_entry(context);
2356 __iommu_flush_cache(iommu, context, sizeof(*context));
2357 spin_unlock_irqrestore(&iommu->lock, flags);
2358 iommu->flush.flush_context(iommu,
2360 (((u16)bus) << 8) | devfn,
2361 DMA_CCMD_MASK_NOBIT,
2362 DMA_CCMD_DEVICE_INVL);
2363 iommu->flush.flush_iotlb(iommu,
2370 static inline void unlink_domain_info(struct device_domain_info *info)
2372 assert_spin_locked(&device_domain_lock);
2373 list_del(&info->link);
2374 list_del(&info->global);
2376 info->dev->archdata.iommu = NULL;
2379 static void domain_remove_dev_info(struct dmar_domain *domain)
2381 struct device_domain_info *info, *tmp;
2382 unsigned long flags;
2384 spin_lock_irqsave(&device_domain_lock, flags);
2385 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2386 __dmar_remove_one_dev_info(info);
2387 spin_unlock_irqrestore(&device_domain_lock, flags);
2392 * Note: we use struct device->archdata.iommu to store the info
2394 static struct dmar_domain *find_domain(struct device *dev)
2396 struct device_domain_info *info;
2398 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2399 struct iommu_domain *domain;
2401 dev->archdata.iommu = NULL;
2402 domain = iommu_get_domain_for_dev(dev);
2404 intel_iommu_attach_device(domain, dev);
2407 /* No lock here, assumes no domain exit in normal case */
2408 info = dev->archdata.iommu;
2411 return info->domain;
2415 static inline struct device_domain_info *
2416 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2418 struct device_domain_info *info;
2420 list_for_each_entry(info, &device_domain_list, global)
2421 if (info->iommu->segment == segment && info->bus == bus &&
2422 info->devfn == devfn)
2428 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2431 struct dmar_domain *domain)
2433 struct dmar_domain *found = NULL;
2434 struct device_domain_info *info;
2435 unsigned long flags;
2438 info = alloc_devinfo_mem();
2443 info->devfn = devfn;
2444 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2445 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2448 info->domain = domain;
2449 info->iommu = iommu;
2450 info->pasid_table = NULL;
2451 info->auxd_enabled = 0;
2452 INIT_LIST_HEAD(&info->auxiliary_domains);
2454 if (dev && dev_is_pci(dev)) {
2455 struct pci_dev *pdev = to_pci_dev(info->dev);
2457 if (!pdev->untrusted &&
2458 !pci_ats_disabled() &&
2459 ecap_dev_iotlb_support(iommu->ecap) &&
2460 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2461 dmar_find_matched_atsr_unit(pdev))
2462 info->ats_supported = 1;
2464 if (sm_supported(iommu)) {
2465 if (pasid_supported(iommu)) {
2466 int features = pci_pasid_features(pdev);
2468 info->pasid_supported = features | 1;
2471 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2472 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2473 info->pri_supported = 1;
2477 spin_lock_irqsave(&device_domain_lock, flags);
2479 found = find_domain(dev);
2482 struct device_domain_info *info2;
2483 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2485 found = info2->domain;
2491 spin_unlock_irqrestore(&device_domain_lock, flags);
2492 free_devinfo_mem(info);
2493 /* Caller must free the original domain */
2497 spin_lock(&iommu->lock);
2498 ret = domain_attach_iommu(domain, iommu);
2499 spin_unlock(&iommu->lock);
2502 spin_unlock_irqrestore(&device_domain_lock, flags);
2503 free_devinfo_mem(info);
2507 list_add(&info->link, &domain->devices);
2508 list_add(&info->global, &device_domain_list);
2510 dev->archdata.iommu = info;
2511 spin_unlock_irqrestore(&device_domain_lock, flags);
2513 /* PASID table is mandatory for a PCI device in scalable mode. */
2514 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2515 ret = intel_pasid_alloc_table(dev);
2517 dev_err(dev, "PASID table allocation failed\n");
2518 dmar_remove_one_dev_info(dev);
2522 /* Setup the PASID entry for requests without PASID: */
2523 spin_lock(&iommu->lock);
2524 if (hw_pass_through && domain_type_is_si(domain))
2525 ret = intel_pasid_setup_pass_through(iommu, domain,
2526 dev, PASID_RID2PASID);
2528 ret = intel_pasid_setup_second_level(iommu, domain,
2529 dev, PASID_RID2PASID);
2530 spin_unlock(&iommu->lock);
2532 dev_err(dev, "Setup RID2PASID failed\n");
2533 dmar_remove_one_dev_info(dev);
2538 if (dev && domain_context_mapping(domain, dev)) {
2539 dev_err(dev, "Domain context map failed\n");
2540 dmar_remove_one_dev_info(dev);
2547 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2549 *(u16 *)opaque = alias;
2553 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2555 struct device_domain_info *info;
2556 struct dmar_domain *domain = NULL;
2557 struct intel_iommu *iommu;
2559 unsigned long flags;
2562 iommu = device_to_iommu(dev, &bus, &devfn);
2566 if (dev_is_pci(dev)) {
2567 struct pci_dev *pdev = to_pci_dev(dev);
2569 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2571 spin_lock_irqsave(&device_domain_lock, flags);
2572 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2573 PCI_BUS_NUM(dma_alias),
2576 iommu = info->iommu;
2577 domain = info->domain;
2579 spin_unlock_irqrestore(&device_domain_lock, flags);
2581 /* DMA alias already has a domain, use it */
2586 /* Allocate and initialize new domain for the device */
2587 domain = alloc_domain(0);
2590 if (domain_init(domain, iommu, gaw)) {
2591 domain_exit(domain);
2599 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2600 struct dmar_domain *domain)
2602 struct intel_iommu *iommu;
2603 struct dmar_domain *tmp;
2604 u16 req_id, dma_alias;
2607 iommu = device_to_iommu(dev, &bus, &devfn);
2611 req_id = ((u16)bus << 8) | devfn;
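/* Requester ID: bus number in the upper byte, devfn in the lower byte */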
2613 if (dev_is_pci(dev)) {
2614 struct pci_dev *pdev = to_pci_dev(dev);
2616 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2618 /* register PCI DMA alias device */
2619 if (req_id != dma_alias) {
2620 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2621 dma_alias & 0xff, NULL, domain);
2623 if (!tmp || tmp != domain)
2628 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2629 if (!tmp || tmp != domain)
2635 static int iommu_domain_identity_map(struct dmar_domain *domain,
2636 unsigned long long start,
2637 unsigned long long end)
2639 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2640 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
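/*
 * This is an identity mapping, so the IOVA PFN equals the physical PFN;
 * that is why first_vpfn is passed below as both the IOVA and the
 * physical start of the range.
 */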
2642 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2643 dma_to_mm_pfn(last_vpfn))) {
2644 pr_err("Reserving iova failed\n");
2648 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2650 * RMRR range might overlap with a physical memory range, so clear it first
2653 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2655 return __domain_mapping(domain, first_vpfn, NULL,
2656 first_vpfn, last_vpfn - first_vpfn + 1,
2657 DMA_PTE_READ|DMA_PTE_WRITE);
2660 static int domain_prepare_identity_map(struct device *dev,
2661 struct dmar_domain *domain,
2662 unsigned long long start,
2663 unsigned long long end)
2665 /* For _hardware_ passthrough, don't bother. But for software
2666 passthrough, we do it anyway -- it may indicate a memory
2667 range which is reserved in E820, and so didn't get set
2668 up to start with in si_domain */
2669 if (domain == si_domain && hw_pass_through) {
2670 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2675 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2678 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2679 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2680 dmi_get_system_info(DMI_BIOS_VENDOR),
2681 dmi_get_system_info(DMI_BIOS_VERSION),
2682 dmi_get_system_info(DMI_PRODUCT_VERSION));
2686 if (end >> agaw_to_width(domain->agaw)) {
2687 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2688 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2689 agaw_to_width(domain->agaw),
2690 dmi_get_system_info(DMI_BIOS_VENDOR),
2691 dmi_get_system_info(DMI_BIOS_VERSION),
2692 dmi_get_system_info(DMI_PRODUCT_VERSION));
2696 return iommu_domain_identity_map(domain, start, end);
2699 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2701 static int __init si_domain_init(int hw)
2703 struct dmar_rmrr_unit *rmrr;
2707 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2711 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2712 domain_exit(si_domain);
2719 for_each_online_node(nid) {
2720 unsigned long start_pfn, end_pfn;
2723 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2724 ret = iommu_domain_identity_map(si_domain,
2725 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2732 * Normally we use DMA domains for devices which have RMRRs. But we
2733 * lose this requirement for graphics and USB devices. Identity map
2734 * the RMRRs for graphics and USB devices so that they can use the si_domain.
2737 for_each_rmrr_units(rmrr) {
2738 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2740 unsigned long long start = rmrr->base_address;
2741 unsigned long long end = rmrr->end_address;
2743 if (device_is_rmrr_locked(dev))
2746 if (WARN_ON(end < start ||
2747 end >> agaw_to_width(si_domain->agaw)))
2750 ret = iommu_domain_identity_map(si_domain, start, end);
2759 static int identity_mapping(struct device *dev)
2761 struct device_domain_info *info;
2763 info = dev->archdata.iommu;
2764 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2765 return (info->domain == si_domain);
2770 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2772 struct dmar_domain *ndomain;
2773 struct intel_iommu *iommu;
2776 iommu = device_to_iommu(dev, &bus, &devfn);
2780 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2781 if (ndomain != domain)
2787 static bool device_has_rmrr(struct device *dev)
2789 struct dmar_rmrr_unit *rmrr;
2794 for_each_rmrr_units(rmrr) {
2796 * Return TRUE if this RMRR contains the device that is passed in.
2799 for_each_active_dev_scope(rmrr->devices,
2800 rmrr->devices_cnt, i, tmp)
2802 is_downstream_to_pci_bridge(dev, tmp)) {
2812 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2813 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2814 * @dev: device handle
2816 * We assume that PCI USB devices with RMRRs have them largely
2817 * for historical reasons and that the RMRR space is not actively used post
2818 * boot. This exclusion may change if vendors begin to abuse it.
2820 * The same exception is made for graphics devices, with the requirement that
2821 * any use of the RMRR regions will be torn down before assigning the device to a guest.
2824 * Return: true if the RMRR is relaxable, false otherwise
2826 static bool device_rmrr_is_relaxable(struct device *dev)
2828 struct pci_dev *pdev;
2830 if (!dev_is_pci(dev))
2833 pdev = to_pci_dev(dev);
2834 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2841 * There are a couple cases where we need to restrict the functionality of
2842 * devices associated with RMRRs. The first is when evaluating a device for
2843 * identity mapping because problems exist when devices are moved in and out
2844 * of domains and their respective RMRR information is lost. This means that
2845 * a device with associated RMRRs will never be in a "passthrough" domain.
2846 * The second is use of the device through the IOMMU API. This interface
2847 * expects to have full control of the IOVA space for the device. We cannot
2848 * satisfy both the requirement that RMRR access is maintained and have an
2849 * unencumbered IOVA space. We also have no ability to quiesce the device's
2850 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2851 * We therefore prevent devices associated with an RMRR from participating in
2852 * the IOMMU API, which eliminates them from device assignment.
2854 * In both cases, devices which have relaxable RMRRs are not concerned by this
2855 * restriction. See device_rmrr_is_relaxable comment.
2857 static bool device_is_rmrr_locked(struct device *dev)
2859 if (!device_has_rmrr(dev))
2862 if (device_rmrr_is_relaxable(dev))
2869 * Return the required default domain type for a specific device.
2871 * @dev: the device in question
2875 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2876 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2877 * - 0: both identity and dynamic domains work for this device
2879 static int device_def_domain_type(struct device *dev)
2881 if (dev_is_pci(dev)) {
2882 struct pci_dev *pdev = to_pci_dev(dev);
2884 if (device_is_rmrr_locked(dev))
2885 return IOMMU_DOMAIN_DMA;
2888 * Prevent any device marked as untrusted from getting
2889 * placed into the static identity mapping domain.
2891 if (pdev->untrusted)
2892 return IOMMU_DOMAIN_DMA;
2894 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2895 return IOMMU_DOMAIN_IDENTITY;
2897 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2898 return IOMMU_DOMAIN_IDENTITY;
2901 * We want to start off with all devices in the 1:1 domain, and
2902 * take them out later if we find they can't access all of memory.
2904 * However, we can't do this for PCI devices behind bridges,
2905 * because all PCI devices behind the same bridge will end up
2906 * with the same source-id on their transactions.
2908 * Practically speaking, we can't change things around for these
2909 * devices at run-time, because we can't be sure there'll be no
2910 * DMA transactions in flight for any of their siblings.
2912 * So PCI devices (unless they're on the root bus) as well as
2913 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2914 * the 1:1 domain, just in _case_ one of their siblings turns out
2915 * not to be able to map all of memory.
2917 if (!pci_is_pcie(pdev)) {
2918 if (!pci_is_root_bus(pdev->bus))
2919 return IOMMU_DOMAIN_DMA;
2920 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2921 return IOMMU_DOMAIN_DMA;
2922 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2923 return IOMMU_DOMAIN_DMA;
2925 if (device_has_rmrr(dev))
2926 return IOMMU_DOMAIN_DMA;
2929 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2930 IOMMU_DOMAIN_IDENTITY : 0;
2933 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2936 * Start from the sane iommu hardware state.
2937 * If the queued invalidation is already initialized by us
2938 * (for example, while enabling interrupt-remapping) then
2939 * things are already rolling from a sane state.
2943 * Clear any previous faults.
2945 dmar_fault(-1, iommu);
2947 * Disable queued invalidation if supported and already enabled
2948 * before OS handover.
2950 dmar_disable_qi(iommu);
2953 if (dmar_enable_qi(iommu)) {
2955 * Queued Invalidate not enabled, use Register Based Invalidate
2957 iommu->flush.flush_context = __iommu_flush_context;
2958 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2959 pr_info("%s: Using Register based invalidation\n",
2962 iommu->flush.flush_context = qi_flush_context;
2963 iommu->flush.flush_iotlb = qi_flush_iotlb;
2964 pr_info("%s: Using Queued invalidation\n", iommu->name);
2968 static int copy_context_table(struct intel_iommu *iommu,
2969 struct root_entry *old_re,
2970 struct context_entry **tbl,
2973 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2974 struct context_entry *new_ce = NULL, ce;
2975 struct context_entry *old_ce = NULL;
2976 struct root_entry re;
2977 phys_addr_t old_ce_phys;
2979 tbl_idx = ext ? bus * 2 : bus;
2980 memcpy(&re, old_re, sizeof(re));
2982 for (devfn = 0; devfn < 256; devfn++) {
2983 /* First calculate the correct index */
2984 idx = (ext ? devfn * 2 : devfn) % 256;
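/*
 * An extended context entry is twice the size of a legacy one, so in
 * extended mode each context-table page covers only 128 devfns; the
 * devfn * 2 index (wrapped at 256) plus 'pos' selects the second table
 * for devfns 128-255.
 */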
2987 /* First save what we may have and clean up */
2989 tbl[tbl_idx] = new_ce;
2990 __iommu_flush_cache(iommu, new_ce,
3000 old_ce_phys = root_entry_lctp(&re);
3002 old_ce_phys = root_entry_uctp(&re);
3005 if (ext && devfn == 0) {
3006 /* No LCTP, try UCTP */
3015 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3020 new_ce = alloc_pgtable_page(iommu->node);
3027 /* Now copy the context entry */
3028 memcpy(&ce, old_ce + idx, sizeof(ce));
3030 if (!__context_present(&ce))
3033 did = context_domain_id(&ce);
3034 if (did >= 0 && did < cap_ndoms(iommu->cap))
3035 set_bit(did, iommu->domain_ids);
3038 * We need a marker for copied context entries. This
3039 * marker needs to work for the old format as well as
3040 * for extended context entries.
3042 * Bit 67 of the context entry is used. In the old
3043 * format this bit is available to software, in the
3044 * extended format it is the PGE bit, but PGE is ignored
3045 * by HW if PASIDs are disabled (and thus still available).
3048 * So disable PASIDs first and then mark the entry
3049 * copied. This means that we don't copy PASID
3050 * translations from the old kernel, but this is fine as
3051 * faults there are not fatal.
3053 context_clear_pasid_enable(&ce);
3054 context_set_copied(&ce);
3059 tbl[tbl_idx + pos] = new_ce;
3061 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3070 static int copy_translation_tables(struct intel_iommu *iommu)
3072 struct context_entry **ctxt_tbls;
3073 struct root_entry *old_rt;
3074 phys_addr_t old_rt_phys;
3075 int ctxt_table_entries;
3076 unsigned long flags;
3081 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3082 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3083 new_ext = !!ecap_ecs(iommu->ecap);
3086 * The RTT bit can only be changed when translation is disabled,
3087 * but disabling translation means to open a window for data
3088 * corruption. So bail out and don't copy anything if we would
3089 * have to change the bit.
3094 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3098 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3102 /* This is too big for the stack - allocate it from slab */
3103 ctxt_table_entries = ext ? 512 : 256;
3105 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3109 for (bus = 0; bus < 256; bus++) {
3110 ret = copy_context_table(iommu, &old_rt[bus],
3111 ctxt_tbls, bus, ext);
3113 pr_err("%s: Failed to copy context table for bus %d\n",
3119 spin_lock_irqsave(&iommu->lock, flags);
3121 /* Context tables are copied, now write them to the root_entry table */
3122 for (bus = 0; bus < 256; bus++) {
3123 int idx = ext ? bus * 2 : bus;
3126 if (ctxt_tbls[idx]) {
3127 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3128 iommu->root_entry[bus].lo = val;
3131 if (!ext || !ctxt_tbls[idx + 1])
3134 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3135 iommu->root_entry[bus].hi = val;
3138 spin_unlock_irqrestore(&iommu->lock, flags);
3142 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3152 static int __init init_dmars(void)
3154 struct dmar_drhd_unit *drhd;
3155 struct intel_iommu *iommu;
3161 * initialize and program root entry to not present
3164 for_each_drhd_unit(drhd) {
3166 * lock not needed as this is only incremented in the single
3167 * threaded kernel __init code path; all other accesses are read only
3170 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3174 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3177 /* Preallocate enough resources for IOMMU hot-addition */
3178 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3179 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3181 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3184 pr_err("Allocating global iommu array failed\n");
3189 for_each_iommu(iommu, drhd) {
3190 if (drhd->ignored) {
3191 iommu_disable_translation(iommu);
3196 * Find the max PASID size of all IOMMUs in the system.
3197 * We need to ensure the system pasid table is no bigger
3198 * than the smallest supported.
3200 if (pasid_supported(iommu)) {
3201 u32 temp = 2 << ecap_pss(iommu->ecap);
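/* 2 << pss is 2^(pss + 1) PASIDs; the ECAP PSS field reports the supported PASID width minus one */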
3203 intel_pasid_max_id = min_t(u32, temp,
3204 intel_pasid_max_id);
3207 g_iommus[iommu->seq_id] = iommu;
3209 intel_iommu_init_qi(iommu);
3211 ret = iommu_init_domains(iommu);
3215 init_translation_status(iommu);
3217 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3218 iommu_disable_translation(iommu);
3219 clear_translation_pre_enabled(iommu);
3220 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3226 * we could share the same root & context tables
3227 * among all IOMMUs. Need to split it later.
3229 ret = iommu_alloc_root_entry(iommu);
3233 if (translation_pre_enabled(iommu)) {
3234 pr_info("Translation already enabled - trying to copy translation structures\n");
3236 ret = copy_translation_tables(iommu);
3239 * We found the IOMMU with translation
3240 * enabled - but failed to copy over the
3241 * old root-entry table. Try to proceed
3242 * by disabling translation now and
3243 * allocating a clean root-entry table.
3244 * This might cause DMAR faults, but
3245 * probably the dump will still succeed.
3247 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3249 iommu_disable_translation(iommu);
3250 clear_translation_pre_enabled(iommu);
3252 pr_info("Copied translation tables from previous kernel for %s\n",
3257 if (!ecap_pass_through(iommu->ecap))
3258 hw_pass_through = 0;
3259 #ifdef CONFIG_INTEL_IOMMU_SVM
3260 if (pasid_supported(iommu))
3261 intel_svm_init(iommu);
3266 * Now that qi is enabled on all iommus, set the root entry and flush
3267 * caches. This is required on some Intel X58 chipsets, otherwise the
3268 * flush_context function will loop forever and the boot hangs.
3270 for_each_active_iommu(iommu, drhd) {
3271 iommu_flush_write_buffer(iommu);
3272 iommu_set_root_entry(iommu);
3273 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3274 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3277 if (iommu_pass_through)
3278 iommu_identity_mapping |= IDENTMAP_ALL;
3280 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3285 iommu_identity_mapping |= IDENTMAP_GFX;
3287 check_tylersburg_isoch();
3289 ret = si_domain_init(hw_pass_through);
3296 * global invalidate context cache
3297 * global invalidate iotlb
3298 * enable translation
3300 for_each_iommu(iommu, drhd) {
3301 if (drhd->ignored) {
3303 * we always have to disable PMRs or DMA may fail on this device
3307 iommu_disable_protect_mem_regions(iommu);
3311 iommu_flush_write_buffer(iommu);
3313 #ifdef CONFIG_INTEL_IOMMU_SVM
3314 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3316 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3317 * could cause a lock race condition, so drop the lock around it.
3319 up_write(&dmar_global_lock);
3320 ret = intel_svm_enable_prq(iommu);
3321 down_write(&dmar_global_lock);
3326 ret = dmar_set_interrupt(iommu);
3334 for_each_active_iommu(iommu, drhd) {
3335 disable_dmar_iommu(iommu);
3336 free_dmar_iommu(iommu);
3345 /* This takes a number of _MM_ pages, not VTD pages */
3346 static unsigned long intel_alloc_iova(struct device *dev,
3347 struct dmar_domain *domain,
3348 unsigned long nrpages, uint64_t dma_mask)
3350 unsigned long iova_pfn;
3352 /* Restrict dma_mask to the width that the iommu can handle */
3353 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3354 /* Ensure we reserve the whole size-aligned region */
3355 nrpages = __roundup_pow_of_two(nrpages);
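/* e.g. a 3-page request is rounded up to 4 here so that the size-aligned IOVA allocation covers the whole region */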
3357 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3359 * First try to allocate an io virtual address in
3360 * DMA_BIT_MASK(32) and if that fails then try allocating from the higher range
3363 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3364 IOVA_PFN(DMA_BIT_MASK(32)), false);
3368 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3369 IOVA_PFN(dma_mask), true);
3370 if (unlikely(!iova_pfn)) {
3371 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3378 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3380 struct dmar_domain *domain, *tmp;
3381 struct dmar_rmrr_unit *rmrr;
3382 struct device *i_dev;
3385 /* Device shouldn't be attached to any domain yet. */
3386 domain = find_domain(dev);
3390 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3394 /* We have a new domain - set up possible RMRRs for the device */
3396 for_each_rmrr_units(rmrr) {
3397 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3402 ret = domain_prepare_identity_map(dev, domain,
3406 dev_err(dev, "Mapping reserved region failed\n");
3411 tmp = set_domain_for_dev(dev, domain);
3412 if (!tmp || domain != tmp) {
3413 domain_exit(domain);
3419 dev_err(dev, "Allocating domain failed\n");
3421 domain->domain.type = IOMMU_DOMAIN_DMA;
3426 /* Check if the dev needs to go through the non-identity map and unmap process. */
3427 static bool iommu_need_mapping(struct device *dev)
3431 if (iommu_dummy(dev))
3434 ret = identity_mapping(dev);
3436 u64 dma_mask = *dev->dma_mask;
3438 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3439 dma_mask = dev->coherent_dma_mask;
3441 if (dma_mask >= dma_get_required_mask(dev))
3445 * The 32-bit DMA device is removed from si_domain and falls back to
3446 * non-identity mapping.
3448 dmar_remove_one_dev_info(dev);
3449 ret = iommu_request_dma_domain_for_dev(dev);
3451 struct iommu_domain *domain;
3452 struct dmar_domain *dmar_domain;
3454 domain = iommu_get_domain_for_dev(dev);
3456 dmar_domain = to_dmar_domain(domain);
3457 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3459 dmar_remove_one_dev_info(dev);
3460 get_private_domain_for_dev(dev);
3463 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3469 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3470 size_t size, int dir, u64 dma_mask)
3472 struct dmar_domain *domain;
3473 phys_addr_t start_paddr;
3474 unsigned long iova_pfn;
3477 struct intel_iommu *iommu;
3478 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3480 BUG_ON(dir == DMA_NONE);
3482 domain = find_domain(dev);
3484 return DMA_MAPPING_ERROR;
3486 iommu = domain_get_iommu(domain);
3487 size = aligned_nrpages(paddr, size);
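/* 'size' now holds the number of VT-d pages covering the buffer, including any sub-page offset */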
3489 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3494 * Check if DMAR supports zero-length reads on write-only mappings
3497 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3498 !cap_zlr(iommu->cap))
3499 prot |= DMA_PTE_READ;
3500 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3501 prot |= DMA_PTE_WRITE;
3503 * paddr to (paddr + size) might span a partial page, so we should map the whole
3504 * page. Note: if two parts of one page are separately mapped, we
3505 * might have two guest_addr mappings to the same host paddr, but this
3506 * is not a big problem
3508 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3509 mm_to_dma_pfn(paddr_pfn), size, prot);
3513 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3514 start_paddr += paddr & ~PAGE_MASK;
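/* Re-add the sub-page offset so the returned DMA address has the same offset within the page as paddr */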
3519 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3520 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3521 size, (unsigned long long)paddr, dir);
3522 return DMA_MAPPING_ERROR;
3525 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3526 unsigned long offset, size_t size,
3527 enum dma_data_direction dir,
3528 unsigned long attrs)
3530 if (iommu_need_mapping(dev))
3531 return __intel_map_single(dev, page_to_phys(page) + offset,
3532 size, dir, *dev->dma_mask);
3533 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3536 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3537 size_t size, enum dma_data_direction dir,
3538 unsigned long attrs)
3540 if (iommu_need_mapping(dev))
3541 return __intel_map_single(dev, phys_addr, size, dir,
3543 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3546 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3548 struct dmar_domain *domain;
3549 unsigned long start_pfn, last_pfn;
3550 unsigned long nrpages;
3551 unsigned long iova_pfn;
3552 struct intel_iommu *iommu;
3553 struct page *freelist;
3554 struct pci_dev *pdev = NULL;
3556 domain = find_domain(dev);
3559 iommu = domain_get_iommu(domain);
3561 iova_pfn = IOVA_PFN(dev_addr);
3563 nrpages = aligned_nrpages(dev_addr, size);
3564 start_pfn = mm_to_dma_pfn(iova_pfn);
3565 last_pfn = start_pfn + nrpages - 1;
3567 if (dev_is_pci(dev))
3568 pdev = to_pci_dev(dev);
3570 dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3572 freelist = domain_unmap(domain, start_pfn, last_pfn);
3574 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3575 !has_iova_flush_queue(&domain->iovad)) {
3576 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3577 nrpages, !freelist, 0);
3579 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3580 dma_free_pagelist(freelist);
3582 queue_iova(&domain->iovad, iova_pfn, nrpages,
3583 (unsigned long)freelist);
3585 * queue up the release of the unmap to save the 1/6th of the
3586 * cpu used up by the iotlb flush operation...
3591 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3592 size_t size, enum dma_data_direction dir,
3593 unsigned long attrs)
3595 if (iommu_need_mapping(dev))
3596 intel_unmap(dev, dev_addr, size);
3598 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3601 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3602 size_t size, enum dma_data_direction dir, unsigned long attrs)
3604 if (iommu_need_mapping(dev))
3605 intel_unmap(dev, dev_addr, size);
3608 static void *intel_alloc_coherent(struct device *dev, size_t size,
3609 dma_addr_t *dma_handle, gfp_t flags,
3610 unsigned long attrs)
3612 struct page *page = NULL;
3615 if (!iommu_need_mapping(dev))
3616 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3618 size = PAGE_ALIGN(size);
3619 order = get_order(size);
3621 if (gfpflags_allow_blocking(flags)) {
3622 unsigned int count = size >> PAGE_SHIFT;
3624 page = dma_alloc_from_contiguous(dev, count, order,
3625 flags & __GFP_NOWARN);
3629 page = alloc_pages(flags, order);
3632 memset(page_address(page), 0, size);
3634 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3636 dev->coherent_dma_mask);
3637 if (*dma_handle != DMA_MAPPING_ERROR)
3638 return page_address(page);
3639 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3640 __free_pages(page, order);
3645 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3646 dma_addr_t dma_handle, unsigned long attrs)
3649 struct page *page = virt_to_page(vaddr);
3651 if (!iommu_need_mapping(dev))
3652 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3654 size = PAGE_ALIGN(size);
3655 order = get_order(size);
3657 intel_unmap(dev, dma_handle, size);
3658 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3659 __free_pages(page, order);
3662 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3663 int nelems, enum dma_data_direction dir,
3664 unsigned long attrs)
3666 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
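/*
 * intel_map_sg() below allocates a single contiguous IOVA range for the
 * whole list, so the page-aligned DMA address of the first element marks
 * the start of the region to unmap.
 */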
3667 unsigned long nrpages = 0;
3668 struct scatterlist *sg;
3671 if (!iommu_need_mapping(dev))
3672 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3674 for_each_sg(sglist, sg, nelems, i) {
3675 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3678 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3681 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3682 enum dma_data_direction dir, unsigned long attrs)
3685 struct dmar_domain *domain;
3688 unsigned long iova_pfn;
3690 struct scatterlist *sg;
3691 unsigned long start_vpfn;
3692 struct intel_iommu *iommu;
3694 BUG_ON(dir == DMA_NONE);
3695 if (!iommu_need_mapping(dev))
3696 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3698 domain = find_domain(dev);
3702 iommu = domain_get_iommu(domain);
3704 for_each_sg(sglist, sg, nelems, i)
3705 size += aligned_nrpages(sg->offset, sg->length);
3707 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3710 sglist->dma_length = 0;
3715 * Check if DMAR supports zero-length reads on write-only mappings
3718 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3719 !cap_zlr(iommu->cap))
3720 prot |= DMA_PTE_READ;
3721 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3722 prot |= DMA_PTE_WRITE;
3724 start_vpfn = mm_to_dma_pfn(iova_pfn);
3726 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3727 if (unlikely(ret)) {
3728 dma_pte_free_pagetable(domain, start_vpfn,
3729 start_vpfn + size - 1,
3730 agaw_to_level(domain->agaw) + 1);
3731 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3738 static const struct dma_map_ops intel_dma_ops = {
3739 .alloc = intel_alloc_coherent,
3740 .free = intel_free_coherent,
3741 .map_sg = intel_map_sg,
3742 .unmap_sg = intel_unmap_sg,
3743 .map_page = intel_map_page,
3744 .unmap_page = intel_unmap_page,
3745 .map_resource = intel_map_resource,
3746 .unmap_resource = intel_unmap_resource,
3747 .dma_supported = dma_direct_supported,
3750 static inline int iommu_domain_cache_init(void)
3754 iommu_domain_cache = kmem_cache_create("iommu_domain",
3755 sizeof(struct dmar_domain),
3760 if (!iommu_domain_cache) {
3761 pr_err("Couldn't create iommu_domain cache\n");
3768 static inline int iommu_devinfo_cache_init(void)
3772 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3773 sizeof(struct device_domain_info),
3777 if (!iommu_devinfo_cache) {
3778 pr_err("Couldn't create devinfo cache\n");
3785 static int __init iommu_init_mempool(void)
3788 ret = iova_cache_get();
3792 ret = iommu_domain_cache_init();
3796 ret = iommu_devinfo_cache_init();
3800 kmem_cache_destroy(iommu_domain_cache);
3807 static void __init iommu_exit_mempool(void)
3809 kmem_cache_destroy(iommu_devinfo_cache);
3810 kmem_cache_destroy(iommu_domain_cache);
3814 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3816 struct dmar_drhd_unit *drhd;
3820 /* We know that this device on this chipset has its own IOMMU.
3821 * If we find it under a different IOMMU, then the BIOS is lying
3822 * to us. Hope that the IOMMU for this device is actually
3823 * disabled, and it needs no translation...
3825 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3827 /* "can't" happen */
3828 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3831 vtbar &= 0xffff0000;
3833 /* we know that this iommu should be at offset 0xa000 from vtbar */
3834 drhd = dmar_find_matched_drhd_unit(pdev);
3835 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3836 TAINT_FIRMWARE_WORKAROUND,
3837 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3838 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3840 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3842 static void __init init_no_remapping_devices(void)
3844 struct dmar_drhd_unit *drhd;
3848 for_each_drhd_unit(drhd) {
3849 if (!drhd->include_all) {
3850 for_each_active_dev_scope(drhd->devices,
3851 drhd->devices_cnt, i, dev)
3853 /* ignore DMAR unit if no devices exist */
3854 if (i == drhd->devices_cnt)
3859 for_each_active_drhd_unit(drhd) {
3860 if (drhd->include_all)
3863 for_each_active_dev_scope(drhd->devices,
3864 drhd->devices_cnt, i, dev)
3865 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3867 if (i < drhd->devices_cnt)
3870 /* This IOMMU has *only* gfx devices. Either bypass it or
3871 set the gfx_mapped flag, as appropriate */
3872 if (!dmar_map_gfx) {
3874 for_each_active_dev_scope(drhd->devices,
3875 drhd->devices_cnt, i, dev)
3876 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3881 #ifdef CONFIG_SUSPEND
3882 static int init_iommu_hw(void)
3884 struct dmar_drhd_unit *drhd;
3885 struct intel_iommu *iommu = NULL;
3887 for_each_active_iommu(iommu, drhd)
3889 dmar_reenable_qi(iommu);
3891 for_each_iommu(iommu, drhd) {
3892 if (drhd->ignored) {
3894 * we always have to disable PMRs or DMA may fail on this device
3898 iommu_disable_protect_mem_regions(iommu);
3902 iommu_flush_write_buffer(iommu);
3904 iommu_set_root_entry(iommu);
3906 iommu->flush.flush_context(iommu, 0, 0, 0,
3907 DMA_CCMD_GLOBAL_INVL);
3908 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3909 iommu_enable_translation(iommu);
3910 iommu_disable_protect_mem_regions(iommu);
3916 static void iommu_flush_all(void)
3918 struct dmar_drhd_unit *drhd;
3919 struct intel_iommu *iommu;
3921 for_each_active_iommu(iommu, drhd) {
3922 iommu->flush.flush_context(iommu, 0, 0, 0,
3923 DMA_CCMD_GLOBAL_INVL);
3924 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3925 DMA_TLB_GLOBAL_FLUSH);
3929 static int iommu_suspend(void)
3931 struct dmar_drhd_unit *drhd;
3932 struct intel_iommu *iommu = NULL;
3935 for_each_active_iommu(iommu, drhd) {
3936 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3938 if (!iommu->iommu_state)
3944 for_each_active_iommu(iommu, drhd) {
3945 iommu_disable_translation(iommu);
3947 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3949 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3950 readl(iommu->reg + DMAR_FECTL_REG);
3951 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3952 readl(iommu->reg + DMAR_FEDATA_REG);
3953 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3954 readl(iommu->reg + DMAR_FEADDR_REG);
3955 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3956 readl(iommu->reg + DMAR_FEUADDR_REG);
3958 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3963 for_each_active_iommu(iommu, drhd)
3964 kfree(iommu->iommu_state);
3969 static void iommu_resume(void)
3971 struct dmar_drhd_unit *drhd;
3972 struct intel_iommu *iommu = NULL;
3975 if (init_iommu_hw()) {
3977 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3979 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3983 for_each_active_iommu(iommu, drhd) {
3985 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3987 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3988 iommu->reg + DMAR_FECTL_REG);
3989 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3990 iommu->reg + DMAR_FEDATA_REG);
3991 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3992 iommu->reg + DMAR_FEADDR_REG);
3993 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3994 iommu->reg + DMAR_FEUADDR_REG);
3996 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3999 for_each_active_iommu(iommu, drhd)
4000 kfree(iommu->iommu_state);
4003 static struct syscore_ops iommu_syscore_ops = {
4004 .resume = iommu_resume,
4005 .suspend = iommu_suspend,
4008 static void __init init_iommu_pm_ops(void)
4010 register_syscore_ops(&iommu_syscore_ops);
4014 static inline void init_iommu_pm_ops(void) {}
4015 #endif /* CONFIG_PM */
4017 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4019 struct acpi_dmar_reserved_memory *rmrr;
4020 struct dmar_rmrr_unit *rmrru;
4022 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4026 rmrru->hdr = header;
4027 rmrr = (struct acpi_dmar_reserved_memory *)header;
4028 rmrru->base_address = rmrr->base_address;
4029 rmrru->end_address = rmrr->end_address;
4031 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4032 ((void *)rmrr) + rmrr->header.length,
4033 &rmrru->devices_cnt);
4034 if (rmrru->devices_cnt && rmrru->devices == NULL)
4037 list_add(&rmrru->list, &dmar_rmrr_units);
4046 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4048 struct dmar_atsr_unit *atsru;
4049 struct acpi_dmar_atsr *tmp;
4051 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4052 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4053 if (atsr->segment != tmp->segment)
4055 if (atsr->header.length != tmp->header.length)
4057 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4064 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4066 struct acpi_dmar_atsr *atsr;
4067 struct dmar_atsr_unit *atsru;
4069 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4072 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4073 atsru = dmar_find_atsr(atsr);
4077 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4082 * If memory is allocated from slab by ACPI _DSM method, we need to
4083 * copy the memory content because the memory buffer will be freed on exit.
4086 atsru->hdr = (void *)(atsru + 1);
4087 memcpy(atsru->hdr, hdr, hdr->length);
4088 atsru->include_all = atsr->flags & 0x1;
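/* Bit 0 of the ATSR flags is ALL_PORTS: the unit then applies to every root port on this segment */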
4089 if (!atsru->include_all) {
4090 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4091 (void *)atsr + atsr->header.length,
4092 &atsru->devices_cnt);
4093 if (atsru->devices_cnt && atsru->devices == NULL) {
4099 list_add_rcu(&atsru->list, &dmar_atsr_units);
4104 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4106 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4110 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4112 struct acpi_dmar_atsr *atsr;
4113 struct dmar_atsr_unit *atsru;
4115 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4116 atsru = dmar_find_atsr(atsr);
4118 list_del_rcu(&atsru->list);
4120 intel_iommu_free_atsr(atsru);
4126 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4130 struct acpi_dmar_atsr *atsr;
4131 struct dmar_atsr_unit *atsru;
4133 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4134 atsru = dmar_find_atsr(atsr);
4138 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4139 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4147 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4150 struct intel_iommu *iommu = dmaru->iommu;
4152 if (g_iommus[iommu->seq_id])
4155 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4156 pr_warn("%s: Doesn't support hardware pass through.\n",
4160 if (!ecap_sc_support(iommu->ecap) &&
4161 domain_update_iommu_snooping(iommu)) {
4162 pr_warn("%s: Doesn't support snooping.\n",
4166 sp = domain_update_iommu_superpage(iommu) - 1;
4167 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4168 pr_warn("%s: Doesn't support large page.\n",
4174 * Disable translation if already enabled prior to OS handover.
4176 if (iommu->gcmd & DMA_GCMD_TE)
4177 iommu_disable_translation(iommu);
4179 g_iommus[iommu->seq_id] = iommu;
4180 ret = iommu_init_domains(iommu);
4182 ret = iommu_alloc_root_entry(iommu);
4186 #ifdef CONFIG_INTEL_IOMMU_SVM
4187 if (pasid_supported(iommu))
4188 intel_svm_init(iommu);
4191 if (dmaru->ignored) {
4193 * we always have to disable PMRs or DMA may fail on this device
4196 iommu_disable_protect_mem_regions(iommu);
4200 intel_iommu_init_qi(iommu);
4201 iommu_flush_write_buffer(iommu);
4203 #ifdef CONFIG_INTEL_IOMMU_SVM
4204 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4205 ret = intel_svm_enable_prq(iommu);
4210 ret = dmar_set_interrupt(iommu);
4214 iommu_set_root_entry(iommu);
4215 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4216 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4217 iommu_enable_translation(iommu);
4219 iommu_disable_protect_mem_regions(iommu);
4223 disable_dmar_iommu(iommu);
4225 free_dmar_iommu(iommu);
4229 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4232 struct intel_iommu *iommu = dmaru->iommu;
4234 if (!intel_iommu_enabled)
4240 ret = intel_iommu_add(dmaru);
4242 disable_dmar_iommu(iommu);
4243 free_dmar_iommu(iommu);
4249 static void intel_iommu_free_dmars(void)
4251 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4252 struct dmar_atsr_unit *atsru, *atsr_n;
4254 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4255 list_del(&rmrru->list);
4256 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4260 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4261 list_del(&atsru->list);
4262 intel_iommu_free_atsr(atsru);
4266 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4269 struct pci_bus *bus;
4270 struct pci_dev *bridge = NULL;
4272 struct acpi_dmar_atsr *atsr;
4273 struct dmar_atsr_unit *atsru;
4275 dev = pci_physfn(dev);
4276 for (bus = dev->bus; bus; bus = bus->parent) {
4278 /* If it's an integrated device, allow ATS */
4281 /* Connected via non-PCIe: no ATS */
4282 if (!pci_is_pcie(bridge) ||
4283 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4285 /* If we found the root port, look it up in the ATSR */
4286 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4291 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4292 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4293 if (atsr->segment != pci_domain_nr(dev->bus))
4296 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4297 if (tmp == &bridge->dev)
4300 if (atsru->include_all)
4310 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4313 struct dmar_rmrr_unit *rmrru;
4314 struct dmar_atsr_unit *atsru;
4315 struct acpi_dmar_atsr *atsr;
4316 struct acpi_dmar_reserved_memory *rmrr;
4318 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4321 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4322 rmrr = container_of(rmrru->hdr,
4323 struct acpi_dmar_reserved_memory, header);
4324 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4325 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4326 ((void *)rmrr) + rmrr->header.length,
4327 rmrr->segment, rmrru->devices,
4328 rmrru->devices_cnt);
4331 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4332 dmar_remove_dev_scope(info, rmrr->segment,
4333 rmrru->devices, rmrru->devices_cnt);
4337 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4338 if (atsru->include_all)
4341 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4342 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4343 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4344 (void *)atsr + atsr->header.length,
4345 atsr->segment, atsru->devices,
4346 atsru->devices_cnt);
4351 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4352 if (dmar_remove_dev_scope(info, atsr->segment,
4353 atsru->devices, atsru->devices_cnt))
4361 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4362 unsigned long val, void *v)
4364 struct memory_notify *mhp = v;
4365 unsigned long long start, end;
4366 unsigned long start_vpfn, last_vpfn;
4369 case MEM_GOING_ONLINE:
4370 start = mhp->start_pfn << PAGE_SHIFT;
4371 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
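/* Extend the static identity map over the newly onlined range so identity-mapped devices can DMA to it */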
4372 if (iommu_domain_identity_map(si_domain, start, end)) {
4373 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4380 case MEM_CANCEL_ONLINE:
4381 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4382 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4383 while (start_vpfn <= last_vpfn) {
4385 struct dmar_drhd_unit *drhd;
4386 struct intel_iommu *iommu;
4387 struct page *freelist;
4389 iova = find_iova(&si_domain->iovad, start_vpfn);
4391 pr_debug("Failed get IOVA for PFN %lx\n",
4396 iova = split_and_remove_iova(&si_domain->iovad, iova,
4397 start_vpfn, last_vpfn);
4399 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4400 start_vpfn, last_vpfn);
4404 freelist = domain_unmap(si_domain, iova->pfn_lo,
4408 for_each_active_iommu(iommu, drhd)
4409 iommu_flush_iotlb_psi(iommu, si_domain,
4410 iova->pfn_lo, iova_size(iova),
4413 dma_free_pagelist(freelist);
4415 start_vpfn = iova->pfn_hi + 1;
4416 free_iova_mem(iova);
4424 static struct notifier_block intel_iommu_memory_nb = {
4425 .notifier_call = intel_iommu_memory_notifier,
4429 static void free_all_cpu_cached_iovas(unsigned int cpu)
4433 for (i = 0; i < g_num_of_iommus; i++) {
4434 struct intel_iommu *iommu = g_iommus[i];
4435 struct dmar_domain *domain;
4441 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4442 domain = get_iommu_domain(iommu, (u16)did);
4446 free_cpu_cached_iovas(cpu, &domain->iovad);
4451 static int intel_iommu_cpu_dead(unsigned int cpu)
4453 free_all_cpu_cached_iovas(cpu);
4457 static void intel_disable_iommus(void)
4459 struct intel_iommu *iommu = NULL;
4460 struct dmar_drhd_unit *drhd;
4462 for_each_iommu(iommu, drhd)
4463 iommu_disable_translation(iommu);
4466 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4468 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4470 return container_of(iommu_dev, struct intel_iommu, iommu);
4473 static ssize_t intel_iommu_show_version(struct device *dev,
4474 struct device_attribute *attr,
4477 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4478 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4479 return sprintf(buf, "%d:%d\n",
4480 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4482 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4484 static ssize_t intel_iommu_show_address(struct device *dev,
4485 struct device_attribute *attr,
4488 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4489 return sprintf(buf, "%llx\n", iommu->reg_phys);
4491 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4493 static ssize_t intel_iommu_show_cap(struct device *dev,
4494 struct device_attribute *attr,
4497 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4498 return sprintf(buf, "%llx\n", iommu->cap);
4500 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4502 static ssize_t intel_iommu_show_ecap(struct device *dev,
4503 struct device_attribute *attr,
4506 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4507 return sprintf(buf, "%llx\n", iommu->ecap);
4509 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4511 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4512 struct device_attribute *attr,
4515 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4516 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4518 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4520 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4521 struct device_attribute *attr,
4524 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4525 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4526 cap_ndoms(iommu->cap)));
4528 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4530 static struct attribute *intel_iommu_attrs[] = {
4531 &dev_attr_version.attr,
4532 &dev_attr_address.attr,
4534 &dev_attr_ecap.attr,
4535 &dev_attr_domains_supported.attr,
4536 &dev_attr_domains_used.attr,
4540 static struct attribute_group intel_iommu_group = {
4541 .name = "intel-iommu",
4542 .attrs = intel_iommu_attrs,
4545 const struct attribute_group *intel_iommu_groups[] = {
4550 static inline bool has_untrusted_dev(void)
4552 struct pci_dev *pdev = NULL;
4554 for_each_pci_dev(pdev)
4555 if (pdev->untrusted)
4561 static int __init platform_optin_force_iommu(void)
4563 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4566 if (no_iommu || dmar_disabled)
4567 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4570 * If Intel-IOMMU is disabled by default, we will apply identity
4571 * map for all devices except those marked as being untrusted.
4574 iommu_identity_mapping |= IDENTMAP_ALL;
4582 static int __init probe_acpi_namespace_devices(void)
4584 struct dmar_drhd_unit *drhd;
4585 /* To avoid a -Wunused-but-set-variable warning. */
4586 struct intel_iommu *iommu __maybe_unused;
4590 for_each_active_iommu(iommu, drhd) {
4591 for_each_active_dev_scope(drhd->devices,
4592 drhd->devices_cnt, i, dev) {
4593 struct acpi_device_physical_node *pn;
4594 struct iommu_group *group;
4595 struct acpi_device *adev;
4597 if (dev->bus != &acpi_bus_type)
4600 adev = to_acpi_device(dev);
4601 mutex_lock(&adev->physical_node_lock);
4602 list_for_each_entry(pn,
4603 &adev->physical_node_list, node) {
4604 group = iommu_group_get(pn->dev);
4606 iommu_group_put(group);
4610 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4611 ret = iommu_probe_device(pn->dev);
4615 mutex_unlock(&adev->physical_node_lock);
4625 int __init intel_iommu_init(void)
4628 struct dmar_drhd_unit *drhd;
4629 struct intel_iommu *iommu;
4632 * Intel IOMMU is required for a TXT/tboot launch or platform
4633 * opt in, so enforce that.
4635 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4637 if (iommu_init_mempool()) {
4639 panic("tboot: Failed to initialize iommu memory\n");
4643 down_write(&dmar_global_lock);
4644 if (dmar_table_init()) {
4646 panic("tboot: Failed to initialize DMAR table\n");
4650 if (dmar_dev_scope_init() < 0) {
4652 panic("tboot: Failed to initialize DMAR device scope\n");
4656 up_write(&dmar_global_lock);
4659 * The bus notifier takes the dmar_global_lock, so lockdep will
4660 * complain later when we register it under the lock.
4662 dmar_register_bus_notifier();
4664 down_write(&dmar_global_lock);
4666 if (no_iommu || dmar_disabled) {
4668 * We exit the function here to ensure IOMMU's remapping and
4669 * mempool aren't setup, which means that the IOMMU's PMRs
4670 * won't be disabled via the call to init_dmars(). So disable
4671 * it explicitly here. The PMRs were setup by tboot prior to
4672 * calling SENTER, but the kernel is expected to reset/tear down the PMRs.
4675 if (intel_iommu_tboot_noforce) {
4676 for_each_iommu(iommu, drhd)
4677 iommu_disable_protect_mem_regions(iommu);
4681 * Make sure the IOMMUs are switched off, even when we
4682 * boot into a kexec kernel and the previous kernel left them enabled.
4685 intel_disable_iommus();
4689 if (list_empty(&dmar_rmrr_units))
4690 pr_info("No RMRR found\n");
4692 if (list_empty(&dmar_atsr_units))
4693 pr_info("No ATSR found\n");
4695 if (dmar_init_reserved_ranges()) {
4697 panic("tboot: Failed to reserve iommu ranges\n");
4698 goto out_free_reserved_range;
4702 intel_iommu_gfx_mapped = 1;
4704 init_no_remapping_devices();
4709 panic("tboot: Failed to initialize DMARs\n");
4710 pr_err("Initialization failed\n");
4711 goto out_free_reserved_range;
4713 up_write(&dmar_global_lock);
4715 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4717 * If the system has no untrusted device or the user has decided
4718 * to disable the bounce page mechanisms, we don't need swiotlb.
4719 * Mark this so that the pre-allocated bounce pages are released later.
4722 if (!has_untrusted_dev() || intel_no_bounce)
4725 dma_ops = &intel_dma_ops;
4727 init_iommu_pm_ops();
4729 for_each_active_iommu(iommu, drhd) {
4730 iommu_device_sysfs_add(&iommu->iommu, NULL,
4733 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4734 iommu_device_register(&iommu->iommu);
4737 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4738 if (si_domain && !hw_pass_through)
4739 register_memory_notifier(&intel_iommu_memory_nb);
4740 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4741 intel_iommu_cpu_dead);
4743 down_read(&dmar_global_lock);
4744 if (probe_acpi_namespace_devices())
4745 pr_warn("ACPI name space devices didn't probe correctly\n");
4746 up_read(&dmar_global_lock);
4748 /* Finally, we enable the DMA remapping hardware. */
4749 for_each_iommu(iommu, drhd) {
4750 if (!drhd->ignored && !translation_pre_enabled(iommu))
4751 iommu_enable_translation(iommu);
4753 iommu_disable_protect_mem_regions(iommu);
4755 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4757 intel_iommu_enabled = 1;
4758 intel_iommu_debugfs_init();
4762 out_free_reserved_range:
4763 put_iova_domain(&reserved_iova_list);
4765 intel_iommu_free_dmars();
4766 up_write(&dmar_global_lock);
4767 iommu_exit_mempool();
4771 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4773 struct dmar_domain *domain;
4774 struct intel_iommu *iommu;
4775 unsigned long flags;
4777 assert_spin_locked(&device_domain_lock);
4782 iommu = info->iommu;
4783 domain = info->domain;
4786 if (dev_is_pci(info->dev) && sm_supported(iommu))
4787 intel_pasid_tear_down_entry(iommu, info->dev,
4790 iommu_disable_dev_iotlb(info);
4791 domain_context_clear_one(iommu, info->bus, info->devfn);
4792 intel_pasid_free_table(info->dev);
4795 unlink_domain_info(info);
4797 spin_lock_irqsave(&iommu->lock, flags);
4798 domain_detach_iommu(domain, iommu);
4799 spin_unlock_irqrestore(&iommu->lock, flags);
4801 /* free the private domain */
4802 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
4803 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
4804 list_empty(&domain->devices))
4805 domain_exit(info->domain);
4807 free_devinfo_mem(info);
4810 static void dmar_remove_one_dev_info(struct device *dev)
4812 struct device_domain_info *info;
4813 unsigned long flags;
4815 spin_lock_irqsave(&device_domain_lock, flags);
4816 info = dev->archdata.iommu;
4818 __dmar_remove_one_dev_info(info);
4819 spin_unlock_irqrestore(&device_domain_lock, flags);
4822 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4826 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4827 domain_reserve_special_ranges(domain);
4829 /* calculate AGAW */
4830 domain->gaw = guest_width;
4831 adjust_width = guestwidth_to_adjustwidth(guest_width);
4832 domain->agaw = width_to_agaw(adjust_width);
4834 domain->iommu_coherency = 0;
4835 domain->iommu_snooping = 0;
4836 domain->iommu_superpage = 0;
4837 domain->max_addr = 0;
4839 /* always allocate the top pgd */
4840 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4843 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4847 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4848 {
4849 struct dmar_domain *dmar_domain;
4850 struct iommu_domain *domain;
4852 switch (type) {
4853 case IOMMU_DOMAIN_DMA:
4854 /* fallthrough */
4855 case IOMMU_DOMAIN_UNMANAGED:
4856 dmar_domain = alloc_domain(0);
4857 if (!dmar_domain) {
4858 pr_err("Can't allocate dmar_domain\n");
4859 return NULL;
4860 }
4861 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4862 pr_err("Domain initialization failed\n");
4863 domain_exit(dmar_domain);
4864 return NULL;
4865 }
4867 if (type == IOMMU_DOMAIN_DMA &&
4868 init_iova_flush_queue(&dmar_domain->iovad,
4869 iommu_flush_iova, iova_entry_free)) {
4870 pr_warn("iova flush queue initialization failed\n");
4871 intel_iommu_strict = 1;
4872 }
4874 domain_update_iommu_cap(dmar_domain);
4876 domain = &dmar_domain->domain;
4877 domain->geometry.aperture_start = 0;
4878 domain->geometry.aperture_end =
4879 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4880 domain->geometry.force_aperture = true;
4882 break;
4883 case IOMMU_DOMAIN_IDENTITY:
4884 return &si_domain->domain;
4885 default:
4886 return NULL;
4887 }
4889 return domain;
4890 }
4892 static void intel_iommu_domain_free(struct iommu_domain *domain)
4894 if (domain != &si_domain->domain)
4895 domain_exit(to_dmar_domain(domain));
4898 /*
4899 * Check whether a @domain could be attached to the @dev through the
4900 * aux-domain attach/detach APIs.
4901 */
4902 static bool
4903 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4904 {
4905 struct device_domain_info *info = dev->archdata.iommu;
4907 return info && info->auxd_enabled &&
4908 domain->type == IOMMU_DOMAIN_UNMANAGED;
4909 }
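/*
 * Record @domain as an auxiliary domain of @dev and bump the domain's
 * auxd_refcnt. Caller must hold device_domain_lock.
 */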
4911 static void auxiliary_link_device(struct dmar_domain *domain,
4914 struct device_domain_info *info = dev->archdata.iommu;
4916 assert_spin_locked(&device_domain_lock);
4920 domain->auxd_refcnt++;
4921 list_add(&domain->auxd, &info->auxiliary_domains);
4924 static void auxiliary_unlink_device(struct dmar_domain *domain,
4927 struct device_domain_info *info = dev->archdata.iommu;
4929 assert_spin_locked(&device_domain_lock);
4933 list_del(&domain->auxd);
4934 domain->auxd_refcnt--;
4936 if (!domain->auxd_refcnt && domain->default_pasid > 0)
4937 intel_pasid_free_id(domain->default_pasid);
4938 }
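/*
 * Attach @domain to @dev as an auxiliary domain: allocate the domain's
 * default PASID if it doesn't have one yet, attach the domain to the
 * device's IOMMU, install a second-level PASID entry for it, and link
 * the domain into the device's auxiliary domain list.
 */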
4940 static int aux_domain_add_dev(struct dmar_domain *domain,
4945 unsigned long flags;
4946 struct intel_iommu *iommu;
4948 iommu = device_to_iommu(dev, &bus, &devfn);
4952 if (domain->default_pasid <= 0) {
4955 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
4956 pci_max_pasids(to_pci_dev(dev)),
4957 GFP_KERNEL);
4958 if (pasid <= 0) {
4959 pr_err("Can't allocate default pasid\n");
4960 return -ENODEV;
4961 }
4962 domain->default_pasid = pasid;
4963 }
4965 spin_lock_irqsave(&device_domain_lock, flags);
4967 * iommu->lock must be held to attach domain to iommu and setup the
4968 * pasid entry for second level translation.
4970 spin_lock(&iommu->lock);
4971 ret = domain_attach_iommu(domain, iommu);
4972 if (ret)
4973 goto attach_failed;
4975 /* Setup the PASID entry for mediated devices: */
4976 ret = intel_pasid_setup_second_level(iommu, domain, dev,
4977 domain->default_pasid);
4978 if (ret)
4979 goto table_failed;
4980 spin_unlock(&iommu->lock);
4982 auxiliary_link_device(domain, dev);
4984 spin_unlock_irqrestore(&device_domain_lock, flags);
4986 return 0;
4988 table_failed:
4989 domain_detach_iommu(domain, iommu);
4990 attach_failed:
4991 spin_unlock(&iommu->lock);
4992 spin_unlock_irqrestore(&device_domain_lock, flags);
4993 if (!domain->auxd_refcnt && domain->default_pasid > 0)
4994 intel_pasid_free_id(domain->default_pasid);
4996 return ret;
4997 }
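/*
 * Undo aux_domain_add_dev(): unlink the auxiliary domain from the
 * device, tear down the PASID entry for the domain's default PASID and
 * detach the domain from the IOMMU.
 */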
4999 static void aux_domain_remove_dev(struct dmar_domain *domain,
5002 struct device_domain_info *info;
5003 struct intel_iommu *iommu;
5004 unsigned long flags;
5006 if (!is_aux_domain(dev, &domain->domain))
5009 spin_lock_irqsave(&device_domain_lock, flags);
5010 info = dev->archdata.iommu;
5011 iommu = info->iommu;
5013 auxiliary_unlink_device(domain, dev);
5015 spin_lock(&iommu->lock);
5016 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5017 domain_detach_iommu(domain, iommu);
5018 spin_unlock(&iommu->lock);
5020 spin_unlock_irqrestore(&device_domain_lock, flags);
5021 }
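/*
 * Common checks before attaching @dev to @domain: make sure the IOMMU's
 * address width covers the domain's highest mapped address, then trim
 * extra page-table levels if the IOMMU supports a smaller agaw than the
 * domain currently uses.
 */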
5023 static int prepare_domain_attach_device(struct iommu_domain *domain,
5026 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5027 struct intel_iommu *iommu;
5031 iommu = device_to_iommu(dev, &bus, &devfn);
5035 /* check if this iommu agaw is sufficient for max mapped address */
5036 addr_width = agaw_to_width(iommu->agaw);
5037 if (addr_width > cap_mgaw(iommu->cap))
5038 addr_width = cap_mgaw(iommu->cap);
5040 if (dmar_domain->max_addr > (1LL << addr_width)) {
5041 dev_err(dev, "%s: iommu width (%d) is not "
5042 "sufficient for the mapped address (%llx)\n",
5043 __func__, addr_width, dmar_domain->max_addr);
5044 return -EFAULT;
5045 }
5046 dmar_domain->gaw = addr_width;
5048 /*
5049 * Knock out extra levels of page tables if necessary
5050 */
5051 while (iommu->agaw < dmar_domain->agaw) {
5052 struct dma_pte *pte;
5054 pte = dmar_domain->pgd;
5055 if (dma_pte_present(pte)) {
5056 dmar_domain->pgd = (struct dma_pte *)
5057 phys_to_virt(dma_pte_addr(pte));
5058 free_pgtable_page(pte);
5059 }
5060 dmar_domain->agaw--;
5061 }
5063 return 0;
5064 }
5066 static int intel_iommu_attach_device(struct iommu_domain *domain,
5071 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5072 device_is_rmrr_locked(dev)) {
5073 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5077 if (is_aux_domain(dev, domain))
5080 /* normally dev is not mapped */
5081 if (unlikely(domain_context_mapped(dev))) {
5082 struct dmar_domain *old_domain;
5084 old_domain = find_domain(dev);
5086 dmar_remove_one_dev_info(dev);
5089 ret = prepare_domain_attach_device(domain, dev);
5093 return domain_add_dev_info(to_dmar_domain(domain), dev);
5096 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5101 if (!is_aux_domain(dev, domain))
5104 ret = prepare_domain_attach_device(domain, dev);
5108 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5111 static void intel_iommu_detach_device(struct iommu_domain *domain,
5114 dmar_remove_one_dev_info(dev);
5117 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5120 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5121 }
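/*
 * iommu_ops->map: convert IOMMU_READ/WRITE/CACHE flags into DMA PTE
 * bits, grow the domain's max_addr (checking it still fits the guest
 * address width) and install the mapping with domain_pfn_mapping().
 */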
5123 static int intel_iommu_map(struct iommu_domain *domain,
5124 unsigned long iova, phys_addr_t hpa,
5125 size_t size, int iommu_prot)
5127 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5132 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5135 if (iommu_prot & IOMMU_READ)
5136 prot |= DMA_PTE_READ;
5137 if (iommu_prot & IOMMU_WRITE)
5138 prot |= DMA_PTE_WRITE;
5139 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5140 prot |= DMA_PTE_SNP;
5142 max_addr = iova + size;
5143 if (dmar_domain->max_addr < max_addr) {
5146 /* check if minimum agaw is sufficient for mapped address */
5147 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5148 if (end < max_addr) {
5149 pr_err("%s: iommu width (%d) is not "
5150 "sufficient for the mapped address (%llx)\n",
5151 __func__, dmar_domain->gaw, max_addr);
5152 return -EFAULT;
5153 }
5154 dmar_domain->max_addr = max_addr;
5156 /* Round up size to next multiple of PAGE_SIZE, if it and
5157 the low bits of hpa would take us onto the next page */
5158 size = aligned_nrpages(hpa, size);
5159 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5160 hpa >> VTD_PAGE_SHIFT, size, prot);
5162 return ret;
5163 }
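/*
 * iommu_ops->unmap: may unmap more than @size if the IOVA hits a
 * large-page mapping. Unmaps the page-table range, flushes the IOTLB on
 * every IOMMU in the domain and frees the collected page-table pages.
 */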
5164 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5165 unsigned long iova, size_t size)
5167 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5168 struct page *freelist = NULL;
5169 unsigned long start_pfn, last_pfn;
5170 unsigned int npages;
5171 int iommu_id, level = 0;
5173 /* Cope with horrid API which requires us to unmap more than the
5174 size argument if it happens to be a large-page mapping. */
5175 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5176 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5179 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5180 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5182 start_pfn = iova >> VTD_PAGE_SHIFT;
5183 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5185 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5187 npages = last_pfn - start_pfn + 1;
5189 for_each_domain_iommu(iommu_id, dmar_domain)
5190 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5191 start_pfn, npages, !freelist, 0);
5193 dma_free_pagelist(freelist);
5195 if (dmar_domain->max_addr == iova + size)
5196 dmar_domain->max_addr = iova;
5198 return size;
5199 }
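/*
 * iommu_ops->iova_to_phys: walk the page table and return the physical
 * address backing @iova, or 0 if nothing is mapped there.
 */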
5201 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5204 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5205 struct dma_pte *pte;
5209 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5212 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5213 if (pte)
5214 phys = dma_pte_addr(pte);
5216 return phys;
5217 }
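/* True only if every active IOMMU in the system supports scalable mode. */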
5219 static inline bool scalable_mode_support(void)
5221 struct dmar_drhd_unit *drhd;
5222 struct intel_iommu *iommu;
5226 for_each_active_iommu(iommu, drhd) {
5227 if (!sm_supported(iommu)) {
5237 static inline bool iommu_pasid_support(void)
5239 struct dmar_drhd_unit *drhd;
5240 struct intel_iommu *iommu;
5244 for_each_active_iommu(iommu, drhd) {
5245 if (!pasid_supported(iommu)) {
5255 static bool intel_iommu_capable(enum iommu_cap cap)
5257 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5258 return domain_update_iommu_snooping(NULL) == 1;
5259 if (cap == IOMMU_CAP_INTR_REMAP)
5260 return irq_remapping_enabled == 1;
5262 return false;
5263 }
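/*
 * iommu_ops->add_device: link the device to its IOMMU and IOMMU group,
 * then check the group's default domain against device_def_domain_type();
 * if they disagree, request an identity or DMA default domain, falling
 * back to a private domain when the group default can't be changed.
 */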
5265 static int intel_iommu_add_device(struct device *dev)
5267 struct dmar_domain *dmar_domain;
5268 struct iommu_domain *domain;
5269 struct intel_iommu *iommu;
5270 struct iommu_group *group;
5274 iommu = device_to_iommu(dev, &bus, &devfn);
5278 iommu_device_link(&iommu->iommu, dev);
5280 if (translation_pre_enabled(iommu))
5281 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5283 group = iommu_group_get_for_dev(dev);
5286 return PTR_ERR(group);
5288 iommu_group_put(group);
5290 domain = iommu_get_domain_for_dev(dev);
5291 dmar_domain = to_dmar_domain(domain);
5292 if (domain->type == IOMMU_DOMAIN_DMA) {
5293 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5294 ret = iommu_request_dm_for_dev(dev);
5296 dmar_remove_one_dev_info(dev);
5297 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5298 domain_add_dev_info(si_domain, dev);
5300 "Device uses a private identity domain.\n");
5304 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5305 ret = iommu_request_dma_domain_for_dev(dev);
5307 dmar_remove_one_dev_info(dev);
5308 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5309 if (!get_private_domain_for_dev(dev)) {
5311 "Failed to get a private domain.\n");
5316 "Device uses a private dma domain.\n");
5324 static void intel_iommu_remove_device(struct device *dev)
5326 struct intel_iommu *iommu;
5329 iommu = device_to_iommu(dev, &bus, &devfn);
5333 dmar_remove_one_dev_info(dev);
5335 iommu_group_remove_device(dev);
5337 iommu_device_unlink(&iommu->iommu, dev);
5338 }
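/*
 * iommu_ops->get_resv_regions: report RMRR ranges that target this
 * device (relaxable where permitted), the ISA floppy window when
 * CONFIG_INTEL_IOMMU_FLOPPY_WA is set, and the IOAPIC/MSI range.
 */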
5340 static void intel_iommu_get_resv_regions(struct device *device,
5341 struct list_head *head)
5343 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5344 struct iommu_resv_region *reg;
5345 struct dmar_rmrr_unit *rmrr;
5346 struct device *i_dev;
5349 down_read(&dmar_global_lock);
5350 for_each_rmrr_units(rmrr) {
5351 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5353 struct iommu_resv_region *resv;
5354 enum iommu_resv_type type;
5357 if (i_dev != device &&
5358 !is_downstream_to_pci_bridge(device, i_dev))
5361 length = rmrr->end_address - rmrr->base_address + 1;
5363 type = device_rmrr_is_relaxable(device) ?
5364 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5366 resv = iommu_alloc_resv_region(rmrr->base_address,
5367 length, prot, type);
5371 list_add_tail(&resv->list, head);
5374 up_read(&dmar_global_lock);
5376 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5377 if (dev_is_pci(device)) {
5378 struct pci_dev *pdev = to_pci_dev(device);
5380 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5381 reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5382 IOMMU_RESV_DIRECT_RELAXABLE);
5383 if (reg)
5384 list_add_tail(&reg->list, head);
5385 }
5386 }
5387 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5389 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5390 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5391 0, IOMMU_RESV_MSI);
5392 if (!reg)
5393 return;
5394 list_add_tail(&reg->list, head);
5395 }
5397 static void intel_iommu_put_resv_regions(struct device *dev,
5398 struct list_head *head)
5400 struct iommu_resv_region *entry, *next;
5402 list_for_each_entry_safe(entry, next, head, list)
5403 kfree(entry);
5404 }
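/*
 * Enable PASID handling for @dev: set the PASID-enable bit in the
 * device's context entry if it is not already set, flush the context
 * cache, and enable PASID/dev-IOTLB support in the device itself if it
 * wasn't enabled before.
 */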
5406 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5408 struct device_domain_info *info;
5409 struct context_entry *context;
5410 struct dmar_domain *domain;
5411 unsigned long flags;
5415 domain = find_domain(dev);
5419 spin_lock_irqsave(&device_domain_lock, flags);
5420 spin_lock(&iommu->lock);
5423 info = dev->archdata.iommu;
5424 if (!info || !info->pasid_supported)
5427 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5428 if (WARN_ON(!context))
5431 ctx_lo = context[0].lo;
5433 if (!(ctx_lo & CONTEXT_PASIDE)) {
5434 ctx_lo |= CONTEXT_PASIDE;
5435 context[0].lo = ctx_lo;
5437 iommu->flush.flush_context(iommu,
5438 domain->iommu_did[iommu->seq_id],
5439 PCI_DEVID(info->bus, info->devfn),
5440 DMA_CCMD_MASK_NOBIT,
5441 DMA_CCMD_DEVICE_INVL);
5444 /* Enable PASID support in the device, if it wasn't already */
5445 if (!info->pasid_enabled)
5446 iommu_enable_dev_iotlb(info);
5451 spin_unlock(&iommu->lock);
5452 spin_unlock_irqrestore(&device_domain_lock, flags);
5457 static void intel_iommu_apply_resv_region(struct device *dev,
5458 struct iommu_domain *domain,
5459 struct iommu_resv_region *region)
5461 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5462 unsigned long start, end;
5464 start = IOVA_PFN(region->start);
5465 end = IOVA_PFN(region->start + region->length - 1);
5467 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5470 #ifdef CONFIG_INTEL_IOMMU_SVM
5471 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5473 struct intel_iommu *iommu;
5476 if (iommu_dummy(dev)) {
5477 dev_warn(dev,
5478 "No IOMMU translation for device; cannot enable SVM\n");
5479 return NULL;
5480 }
5482 iommu = device_to_iommu(dev, &bus, &devfn);
5483 if (!iommu) {
5484 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5485 return NULL;
5486 }
5488 return iommu;
5489 }
5490 #endif /* CONFIG_INTEL_IOMMU_SVM */
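/*
 * Enable auxiliary domain support for @dev: requires scalable mode and
 * PASID support on its IOMMU, enables PASID for the device and marks it
 * so is_aux_domain() will accept it.
 */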
5492 static int intel_iommu_enable_auxd(struct device *dev)
5494 struct device_domain_info *info;
5495 struct intel_iommu *iommu;
5496 unsigned long flags;
5500 iommu = device_to_iommu(dev, &bus, &devfn);
5501 if (!iommu || dmar_disabled)
5504 if (!sm_supported(iommu) || !pasid_supported(iommu))
5507 ret = intel_iommu_enable_pasid(iommu, dev);
5511 spin_lock_irqsave(&device_domain_lock, flags);
5512 info = dev->archdata.iommu;
5513 info->auxd_enabled = 1;
5514 spin_unlock_irqrestore(&device_domain_lock, flags);
5516 return 0;
5517 }
5519 static int intel_iommu_disable_auxd(struct device *dev)
5521 struct device_domain_info *info;
5522 unsigned long flags;
5524 spin_lock_irqsave(&device_domain_lock, flags);
5525 info = dev->archdata.iommu;
5526 if (!WARN_ON(!info))
5527 info->auxd_enabled = 0;
5528 spin_unlock_irqrestore(&device_domain_lock, flags);
5530 return 0;
5531 }
5533 /*
5534 * A PCI Express Designated Vendor-Specific Extended Capability is defined
5535 * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
5536 * so that system software and tools can detect endpoint devices that
5537 * support Intel Scalable I/O Virtualization without any host driver
5538 * dependency.
5539 *
5540 * Returns the address of the matching extended capability structure within
5541 * the device's PCI configuration space, or 0 if the device does not support it.
5542 */
5543 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5544 {
5545 int pos;
5546 u16 vendor, id;
5548 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5549 while (pos) {
5550 pci_read_config_word(pdev, pos + 4, &vendor);
5551 pci_read_config_word(pdev, pos + 8, &id);
5552 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5553 return pos;
5555 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5556 }
5558 return 0;
5559 }
5562 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5564 if (feat == IOMMU_DEV_FEAT_AUX) {
5567 if (!dev_is_pci(dev) || dmar_disabled ||
5568 !scalable_mode_support() || !iommu_pasid_support())
5571 ret = pci_pasid_features(to_pci_dev(dev));
5575 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5582 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5584 if (feat == IOMMU_DEV_FEAT_AUX)
5585 return intel_iommu_enable_auxd(dev);
5591 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5593 if (feat == IOMMU_DEV_FEAT_AUX)
5594 return intel_iommu_disable_auxd(dev);
5600 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5602 struct device_domain_info *info = dev->archdata.iommu;
5604 if (feat == IOMMU_DEV_FEAT_AUX)
5605 return scalable_mode_support() && info && info->auxd_enabled;
5611 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5613 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5615 return dmar_domain->default_pasid > 0 ?
5616 dmar_domain->default_pasid : -EINVAL;
5619 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5622 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5625 const struct iommu_ops intel_iommu_ops = {
5626 .capable = intel_iommu_capable,
5627 .domain_alloc = intel_iommu_domain_alloc,
5628 .domain_free = intel_iommu_domain_free,
5629 .attach_dev = intel_iommu_attach_device,
5630 .detach_dev = intel_iommu_detach_device,
5631 .aux_attach_dev = intel_iommu_aux_attach_device,
5632 .aux_detach_dev = intel_iommu_aux_detach_device,
5633 .aux_get_pasid = intel_iommu_aux_get_pasid,
5634 .map = intel_iommu_map,
5635 .unmap = intel_iommu_unmap,
5636 .iova_to_phys = intel_iommu_iova_to_phys,
5637 .add_device = intel_iommu_add_device,
5638 .remove_device = intel_iommu_remove_device,
5639 .get_resv_regions = intel_iommu_get_resv_regions,
5640 .put_resv_regions = intel_iommu_put_resv_regions,
5641 .apply_resv_region = intel_iommu_apply_resv_region,
5642 .device_group = pci_device_group,
5643 .dev_has_feat = intel_iommu_dev_has_feat,
5644 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
5645 .dev_enable_feat = intel_iommu_dev_enable_feat,
5646 .dev_disable_feat = intel_iommu_dev_disable_feat,
5647 .is_attach_deferred = intel_iommu_is_attach_deferred,
5648 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
5651 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5653 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5654 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5655 dmar_map_gfx = 0;
5656 }
5658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5659 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5660 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5661 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5663 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5664 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5666 static void quirk_iommu_rwbf(struct pci_dev *dev)
5668 /*
5669 * Mobile 4 Series Chipset neglects to set RWBF capability,
5670 * but needs it. Same seems to hold for the desktop versions.
5671 */
5672 pci_info(dev, "Forcing write-buffer flush capability\n");
5673 rwbf_quirk = 1;
5674 }
5676 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5677 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5678 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5679 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5680 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5681 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5682 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5685 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
5686 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
5687 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
5688 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
5689 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
5690 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
5691 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
5692 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
5694 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5695 {
5696 unsigned short ggc;
5698 if (pci_read_config_word(dev, GGC, &ggc))
5699 return;
5701 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5702 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5703 dmar_map_gfx = 0;
5704 } else if (dmar_map_gfx) {
5705 /* we have to ensure the gfx device is idle before we flush */
5706 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5707 intel_iommu_strict = 1;
5708 }
5709 }
5710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5715 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5716 ISOCH DMAR unit for the Azalia sound device, but not give it any
5717 TLB entries, which causes it to deadlock. Check for that. We do
5718 this in a function called from init_dmars(), instead of in a PCI
5719 quirk, because we don't want to print the obnoxious "BIOS broken"
5720 message if VT-d is actually disabled.
5721 */
5722 static void __init check_tylersburg_isoch(void)
5724 struct pci_dev *pdev;
5725 uint32_t vtisochctrl;
5727 /* If there's no Azalia in the system anyway, forget it. */
5728 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5733 /* System Management Registers. Might be hidden, in which case
5734 we can't do the sanity check. But that's OK, because the
5735 known-broken BIOSes _don't_ actually hide it, so far. */
5736 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5740 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5747 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5748 if (vtisochctrl & 1)
5751 /* Drop all bits other than the number of TLB entries */
5752 vtisochctrl &= 0x1c;
5754 /* If we have the recommended number of TLB entries (16), fine. */
5755 if (vtisochctrl == 0x10)
5758 /* Zero TLB entries? You get to ride the short bus to school. */
5759 if (!vtisochctrl) {
5760 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5761 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5762 dmi_get_system_info(DMI_BIOS_VENDOR),
5763 dmi_get_system_info(DMI_BIOS_VERSION),
5764 dmi_get_system_info(DMI_PRODUCT_VERSION));
5765 iommu_identity_mapping |= IDENTMAP_AZALIA;
5766 return;
5767 }
5769 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5770 vtisochctrl);
5771 }