2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
18 * Joerg Roedel <jroedel@suse.de>
21 #define pr_fmt(fmt) "DMAR: " fmt
22 #define dev_fmt(fmt) pr_fmt(fmt)
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/memory.h>
37 #include <linux/cpu.h>
38 #include <linux/timer.h>
40 #include <linux/iova.h>
41 #include <linux/iommu.h>
42 #include <linux/intel-iommu.h>
43 #include <linux/syscore_ops.h>
44 #include <linux/tboot.h>
45 #include <linux/dmi.h>
46 #include <linux/pci-ats.h>
47 #include <linux/memblock.h>
48 #include <linux/dma-contiguous.h>
49 #include <linux/dma-direct.h>
50 #include <linux/crash_dump.h>
51 #include <linux/numa.h>
52 #include <asm/irq_remapping.h>
53 #include <asm/cacheflush.h>
54 #include <asm/iommu.h>
56 #include "irq_remapping.h"
57 #include "intel-pasid.h"
59 #define ROOT_SIZE VTD_PAGE_SIZE
60 #define CONTEXT_SIZE VTD_PAGE_SIZE
62 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
63 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
64 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
65 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
67 #define IOAPIC_RANGE_START (0xfee00000)
68 #define IOAPIC_RANGE_END (0xfeefffff)
69 #define IOVA_START_ADDR (0x1000)
71 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
73 #define MAX_AGAW_WIDTH 64
74 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
76 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
77 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
79 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
80 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
81 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
82 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
83 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
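/*
 * Worked example (illustrative): with gaw = 48 and VTD_PAGE_SHIFT = 12,
 *   __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1
 *   DOMAIN_MAX_ADDR(48)  = ((1ULL << 36) - 1) << 12 = 2^48 - 4096
 * On a 32-bit kernel DOMAIN_MAX_PFN() clamps the result to ULONG_MAX, so
 * PFNs always fit in an unsigned long, as the comment above explains.
 */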
85 /* IO virtual address start page frame number */
86 #define IOVA_START_PFN (1)
88 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
90 /* page table handling */
91 #define LEVEL_STRIDE (9)
92 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
95 * This bitmap is used to advertise the page sizes our hardware supports
96 * to the IOMMU core, which will then use this information to split
97 * physically contiguous memory regions it is mapping into page sizes that we support.
100 * Traditionally the IOMMU core just handed us the mappings directly,
101 * after making sure the size is a power-of-two multiple of 4KiB and that the
102 * mapping has natural alignment.
104 * To retain this behavior, we currently advertise that we support
105 * all page sizes that are a power-of-two multiple of 4KiB.
107 * If at some point we'd like to utilize the IOMMU core's new behavior,
108 * we could change this to advertise the real page sizes we support.
110 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
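/*
 * Example (illustrative): each set bit n in this bitmap advertises support
 * for a page size of 2^n bytes, so ~0xFFFUL sets every bit from 12 upwards
 * and claims 4KiB, 8KiB, 16KiB and so on, i.e. every power-of-two multiple
 * of 4KiB as described above.  Advertising only the real hardware sizes
 * would instead look something like (SZ_4K | SZ_2M | SZ_1G) from
 * <linux/sizes.h>, assuming 2MiB and 1GiB superpage support.
 */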
112 static inline int agaw_to_level(int agaw)
117 static inline int agaw_to_width(int agaw)
119 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
122 static inline int width_to_agaw(int width)
124 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
127 static inline unsigned int level_to_offset_bits(int level)
129 return (level - 1) * LEVEL_STRIDE;
132 static inline int pfn_level_offset(unsigned long pfn, int level)
134 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
137 static inline unsigned long level_mask(int level)
139 return -1UL << level_to_offset_bits(level);
142 static inline unsigned long level_size(int level)
144 return 1UL << level_to_offset_bits(level);
147 static inline unsigned long align_to_level(unsigned long pfn, int level)
149 return (pfn + level_size(level) - 1) & level_mask(level);
152 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
154 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
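/*
 * Worked example (illustrative) of the level helpers above, assuming a
 * 4-level (48-bit, agaw 2) page table and LEVEL_STRIDE = 9:
 *   level 1 indexes PFN bits 0..8   (4KiB pages)
 *   level 2 indexes PFN bits 9..17  (2MiB per entry)
 *   level 3 indexes PFN bits 18..26 (1GiB per entry)
 *   level 4 indexes PFN bits 27..35 (512GiB per entry)
 * so pfn_level_offset(pfn, 2) == (pfn >> 9) & 0x1ff and
 * lvl_to_nr_pages(2) == 512 pages == 2MiB.
 */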
157 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
158 are never going to work. */
159 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
161 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
164 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
166 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
168 static inline unsigned long page_to_dma_pfn(struct page *pg)
170 return mm_to_dma_pfn(page_to_pfn(pg));
172 static inline unsigned long virt_to_dma_pfn(void *p)
174 return page_to_dma_pfn(virt_to_page(p));
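/*
 * Example (illustrative): on x86 PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so
 * mm_to_dma_pfn()/dma_to_mm_pfn() shift by 0 and are effectively identity.
 * With 64KiB MM pages the shift would be 4, i.e. one MM PFN corresponds to
 * 16 consecutive VT-d PFNs, which is why VT-d pages must never be larger
 * than MM pages, per the comment above.
 */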
177 /* global iommu list, set NULL for ignored DMAR units */
178 static struct intel_iommu **g_iommus;
180 static void __init check_tylersburg_isoch(void);
181 static int rwbf_quirk;
184 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
185 * (used when the kernel is launched with TXT)
187 static int force_on = 0;
188 int intel_iommu_tboot_noforce;
189 static int no_platform_optin;
191 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
194 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
197 static phys_addr_t root_entry_lctp(struct root_entry *re)
202 return re->lo & VTD_PAGE_MASK;
206 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
209 static phys_addr_t root_entry_uctp(struct root_entry *re)
214 return re->hi & VTD_PAGE_MASK;
217 static inline void context_clear_pasid_enable(struct context_entry *context)
219 context->lo &= ~(1ULL << 11);
222 static inline bool context_pasid_enabled(struct context_entry *context)
224 return !!(context->lo & (1ULL << 11));
227 static inline void context_set_copied(struct context_entry *context)
229 context->hi |= (1ull << 3);
232 static inline bool context_copied(struct context_entry *context)
234 return !!(context->hi & (1ULL << 3));
237 static inline bool __context_present(struct context_entry *context)
239 return (context->lo & 1);
242 bool context_present(struct context_entry *context)
244 return context_pasid_enabled(context) ?
245 __context_present(context) :
246 __context_present(context) && !context_copied(context);
249 static inline void context_set_present(struct context_entry *context)
254 static inline void context_set_fault_enable(struct context_entry *context)
256 context->lo &= (((u64)-1) << 2) | 1;
259 static inline void context_set_translation_type(struct context_entry *context,
262 context->lo &= (((u64)-1) << 4) | 3;
263 context->lo |= (value & 3) << 2;
266 static inline void context_set_address_root(struct context_entry *context,
269 context->lo &= ~VTD_PAGE_MASK;
270 context->lo |= value & VTD_PAGE_MASK;
273 static inline void context_set_address_width(struct context_entry *context,
276 context->hi |= value & 7;
279 static inline void context_set_domain_id(struct context_entry *context,
282 context->hi |= (value & ((1 << 16) - 1)) << 8;
285 static inline int context_domain_id(struct context_entry *c)
287 return((c->hi >> 8) & 0xffff);
290 static inline void context_clear_entry(struct context_entry *context)
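/*
 * Sketch (illustrative) of the legacy-mode context entry layout implied by
 * the bit positions the helpers above manipulate:
 *   lo bit  0     : present
 *   lo bit  1     : fault processing disable (cleared by context_set_fault_enable)
 *   lo bits 2-3   : translation type
 *   lo bits 12-63 : address of the second-level page table root
 *   hi bits 0-2   : address width (AGAW)
 *   hi bits 8-23  : domain id
 * A typical setup calls context_set_domain_id(), context_set_address_root(),
 * context_set_address_width(), context_set_translation_type(),
 * context_set_fault_enable() and finally context_set_present(), as
 * domain_context_mapping_one() does further down in this file.
 */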
297 * This domain is a static identity mapping domain.
298 * 1. This domain creates a static 1:1 mapping of all usable memory.
299 * 2. It maps to each iommu if successful.
300 * 3. Each iommu maps to this domain if successful.
302 static struct dmar_domain *si_domain;
303 static int hw_pass_through = 1;
306 * Domain represents a virtual machine; more than one device across
307 * iommus may be owned by one domain, e.g. a kvm guest.
309 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
311 /* si_domain contains multiple devices */
312 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
314 #define for_each_domain_iommu(idx, domain) \
315 for (idx = 0; idx < g_num_of_iommus; idx++) \
316 if (domain->iommu_refcnt[idx])
318 struct dmar_rmrr_unit {
319 struct list_head list; /* list of rmrr units */
320 struct acpi_dmar_header *hdr; /* ACPI header */
321 u64 base_address; /* reserved base address*/
322 u64 end_address; /* reserved end address */
323 struct dmar_dev_scope *devices; /* target devices */
324 int devices_cnt; /* target device count */
325 struct iommu_resv_region *resv; /* reserved region handle */
328 struct dmar_atsr_unit {
329 struct list_head list; /* list of ATSR units */
330 struct acpi_dmar_header *hdr; /* ACPI header */
331 struct dmar_dev_scope *devices; /* target devices */
332 int devices_cnt; /* target device count */
333 u8 include_all:1; /* include all ports */
336 static LIST_HEAD(dmar_atsr_units);
337 static LIST_HEAD(dmar_rmrr_units);
339 #define for_each_rmrr_units(rmrr) \
340 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
342 /* bitmap for indexing intel_iommus */
343 static int g_num_of_iommus;
345 static void domain_exit(struct dmar_domain *domain);
346 static void domain_remove_dev_info(struct dmar_domain *domain);
347 static void dmar_remove_one_dev_info(struct device *dev);
348 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
349 static void domain_context_clear(struct intel_iommu *iommu,
351 static int domain_detach_iommu(struct dmar_domain *domain,
352 struct intel_iommu *iommu);
353 static bool device_is_rmrr_locked(struct device *dev);
355 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
356 int dmar_disabled = 0;
358 int dmar_disabled = 1;
359 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
362 int intel_iommu_enabled = 0;
363 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
365 static int dmar_map_gfx = 1;
366 static int dmar_forcedac;
367 static int intel_iommu_strict;
368 static int intel_iommu_superpage = 1;
369 static int iommu_identity_mapping;
371 #define IDENTMAP_ALL 1
372 #define IDENTMAP_GFX 2
373 #define IDENTMAP_AZALIA 4
375 int intel_iommu_gfx_mapped;
376 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
378 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
379 static DEFINE_SPINLOCK(device_domain_lock);
380 static LIST_HEAD(device_domain_list);
383 * Iterate over elements in device_domain_list and call the specified
384 * callback @fn against each element.
386 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
387 void *data), void *data)
391 struct device_domain_info *info;
393 spin_lock_irqsave(&device_domain_lock, flags);
394 list_for_each_entry(info, &device_domain_list, global) {
395 ret = fn(info, data);
397 spin_unlock_irqrestore(&device_domain_lock, flags);
401 spin_unlock_irqrestore(&device_domain_lock, flags);
406 const struct iommu_ops intel_iommu_ops;
408 static bool translation_pre_enabled(struct intel_iommu *iommu)
410 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
413 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
415 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
418 static void init_translation_status(struct intel_iommu *iommu)
422 gsts = readl(iommu->reg + DMAR_GSTS_REG);
423 if (gsts & DMA_GSTS_TES)
424 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
427 /* Convert generic 'struct iommu_domain' to private struct dmar_domain */
428 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
430 return container_of(dom, struct dmar_domain, domain);
433 static int __init intel_iommu_setup(char *str)
438 if (!strncmp(str, "on", 2)) {
440 pr_info("IOMMU enabled\n");
441 } else if (!strncmp(str, "off", 3)) {
443 no_platform_optin = 1;
444 pr_info("IOMMU disabled\n");
445 } else if (!strncmp(str, "igfx_off", 8)) {
447 pr_info("Disable GFX device mapping\n");
448 } else if (!strncmp(str, "forcedac", 8)) {
449 pr_info("Forcing DAC for PCI devices\n");
451 } else if (!strncmp(str, "strict", 6)) {
452 pr_info("Disable batched IOTLB flush\n");
453 intel_iommu_strict = 1;
454 } else if (!strncmp(str, "sp_off", 6)) {
455 pr_info("Disable supported super page\n");
456 intel_iommu_superpage = 0;
457 } else if (!strncmp(str, "sm_on", 5)) {
458 pr_info("Intel-IOMMU: scalable mode supported\n");
460 } else if (!strncmp(str, "tboot_noforce", 13)) {
462 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
463 intel_iommu_tboot_noforce = 1;
466 str += strcspn(str, ",");
472 __setup("intel_iommu=", intel_iommu_setup);
474 static struct kmem_cache *iommu_domain_cache;
475 static struct kmem_cache *iommu_devinfo_cache;
477 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
479 struct dmar_domain **domains;
482 domains = iommu->domains[idx];
486 return domains[did & 0xff];
489 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
490 struct dmar_domain *domain)
492 struct dmar_domain **domains;
495 if (!iommu->domains[idx]) {
496 size_t size = 256 * sizeof(struct dmar_domain *);
497 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500 domains = iommu->domains[idx];
501 if (WARN_ON(!domains))
504 domains[did & 0xff] = domain;
507 void *alloc_pgtable_page(int node)
512 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
514 vaddr = page_address(page);
518 void free_pgtable_page(void *vaddr)
520 free_page((unsigned long)vaddr);
523 static inline void *alloc_domain_mem(void)
525 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528 static void free_domain_mem(void *vaddr)
530 kmem_cache_free(iommu_domain_cache, vaddr);
533 static inline void * alloc_devinfo_mem(void)
535 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538 static inline void free_devinfo_mem(void *vaddr)
540 kmem_cache_free(iommu_devinfo_cache, vaddr);
543 static inline int domain_type_is_vm(struct dmar_domain *domain)
545 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
548 static inline int domain_type_is_si(struct dmar_domain *domain)
550 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
553 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
555 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
556 DOMAIN_FLAG_STATIC_IDENTITY);
559 static inline int domain_pfn_supported(struct dmar_domain *domain,
562 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
564 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
567 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
572 sagaw = cap_sagaw(iommu->cap);
573 for (agaw = width_to_agaw(max_gaw);
575 if (test_bit(agaw, &sagaw))
583 * Calculate max SAGAW for each iommu.
585 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
587 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
591 * calculate agaw for each iommu.
592 * "SAGAW" may be different across iommus; use a default agaw and fall
593 * back to a smaller supported agaw for iommus that don't support it.
595 int iommu_calculate_agaw(struct intel_iommu *iommu)
597 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
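/*
 * Worked example (illustrative): DEFAULT_DOMAIN_ADDRESS_WIDTH is 57, and
 * width_to_agaw(57) = DIV_ROUND_UP(57 - 30, 9) = 3.  If the IOMMU's SAGAW
 * capability only advertises 4-level tables (bit 2 in the VT-d spec
 * encoding), the loop in __iommu_calculate_agaw() walks down until
 * test_bit() succeeds and returns agaw 2, i.e. a 48-bit, 4-level page
 * table: the smaller supported agaw the comment above refers to.
 */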
600 /* This function only returns a single iommu in a domain */
601 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
605 /* si_domain and vm domain should not get here. */
606 BUG_ON(domain_type_is_vm_or_si(domain));
607 for_each_domain_iommu(iommu_id, domain)
610 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
613 return g_iommus[iommu_id];
616 static void domain_update_iommu_coherency(struct dmar_domain *domain)
618 struct dmar_drhd_unit *drhd;
619 struct intel_iommu *iommu;
623 domain->iommu_coherency = 1;
625 for_each_domain_iommu(i, domain) {
627 if (!ecap_coherent(g_iommus[i]->ecap)) {
628 domain->iommu_coherency = 0;
635 /* No hardware attached; use lowest common denominator */
637 for_each_active_iommu(iommu, drhd) {
638 if (!ecap_coherent(iommu->ecap)) {
639 domain->iommu_coherency = 0;
646 static int domain_update_iommu_snooping(struct intel_iommu *skip)
648 struct dmar_drhd_unit *drhd;
649 struct intel_iommu *iommu;
653 for_each_active_iommu(iommu, drhd) {
655 if (!ecap_sc_support(iommu->ecap)) {
666 static int domain_update_iommu_superpage(struct intel_iommu *skip)
668 struct dmar_drhd_unit *drhd;
669 struct intel_iommu *iommu;
672 if (!intel_iommu_superpage) {
676 /* set iommu_superpage to the smallest common denominator */
678 for_each_active_iommu(iommu, drhd) {
680 mask &= cap_super_page_val(iommu->cap);
690 /* Some capabilities may be different across iommus */
691 static void domain_update_iommu_cap(struct dmar_domain *domain)
693 domain_update_iommu_coherency(domain);
694 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
695 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
698 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
701 struct root_entry *root = &iommu->root_entry[bus];
702 struct context_entry *context;
706 if (sm_supported(iommu)) {
714 context = phys_to_virt(*entry & VTD_PAGE_MASK);
716 unsigned long phy_addr;
720 context = alloc_pgtable_page(iommu->node);
724 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
725 phy_addr = virt_to_phys((void *)context);
726 *entry = phy_addr | 1;
727 __iommu_flush_cache(iommu, entry, sizeof(*entry));
729 return &context[devfn];
732 static int iommu_dummy(struct device *dev)
734 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
737 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
739 struct dmar_drhd_unit *drhd = NULL;
740 struct intel_iommu *iommu;
742 struct pci_dev *ptmp, *pdev = NULL;
746 if (iommu_dummy(dev))
749 if (dev_is_pci(dev)) {
750 struct pci_dev *pf_pdev;
752 pdev = to_pci_dev(dev);
755 /* VMD child devices currently cannot be handled individually */
756 if (is_vmd(pdev->bus))
760 /* VFs aren't listed in scope tables; we need to look up
761 * the PF instead to find the IOMMU. */
762 pf_pdev = pci_physfn(pdev);
764 segment = pci_domain_nr(pdev->bus);
765 } else if (has_acpi_companion(dev))
766 dev = &ACPI_COMPANION(dev)->dev;
769 for_each_active_iommu(iommu, drhd) {
770 if (pdev && segment != drhd->segment)
773 for_each_active_dev_scope(drhd->devices,
774 drhd->devices_cnt, i, tmp) {
776 /* For a VF use its original BDF# not that of the PF
777 * which we used for the IOMMU lookup. Strictly speaking
778 * we could do this for all PCI devices; we only need to
779 * get the BDF# from the scope table for ACPI matches. */
780 if (pdev && pdev->is_virtfn)
783 *bus = drhd->devices[i].bus;
784 *devfn = drhd->devices[i].devfn;
788 if (!pdev || !dev_is_pci(tmp))
791 ptmp = to_pci_dev(tmp);
792 if (ptmp->subordinate &&
793 ptmp->subordinate->number <= pdev->bus->number &&
794 ptmp->subordinate->busn_res.end >= pdev->bus->number)
798 if (pdev && drhd->include_all) {
800 *bus = pdev->bus->number;
801 *devfn = pdev->devfn;
812 static void domain_flush_cache(struct dmar_domain *domain,
813 void *addr, int size)
815 if (!domain->iommu_coherency)
816 clflush_cache_range(addr, size);
819 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
821 struct context_entry *context;
825 spin_lock_irqsave(&iommu->lock, flags);
826 context = iommu_context_addr(iommu, bus, devfn, 0);
828 ret = context_present(context);
829 spin_unlock_irqrestore(&iommu->lock, flags);
833 static void free_context_table(struct intel_iommu *iommu)
837 struct context_entry *context;
839 spin_lock_irqsave(&iommu->lock, flags);
840 if (!iommu->root_entry) {
843 for (i = 0; i < ROOT_ENTRY_NR; i++) {
844 context = iommu_context_addr(iommu, i, 0, 0);
846 free_pgtable_page(context);
848 if (!sm_supported(iommu))
851 context = iommu_context_addr(iommu, i, 0x80, 0);
853 free_pgtable_page(context);
856 free_pgtable_page(iommu->root_entry);
857 iommu->root_entry = NULL;
859 spin_unlock_irqrestore(&iommu->lock, flags);
862 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
863 unsigned long pfn, int *target_level)
865 struct dma_pte *parent, *pte;
866 int level = agaw_to_level(domain->agaw);
869 BUG_ON(!domain->pgd);
871 if (!domain_pfn_supported(domain, pfn))
872 /* Address beyond IOMMU's addressing capabilities. */
875 parent = domain->pgd;
880 offset = pfn_level_offset(pfn, level);
881 pte = &parent[offset];
882 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
884 if (level == *target_level)
887 if (!dma_pte_present(pte)) {
890 tmp_page = alloc_pgtable_page(domain->nid);
895 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
896 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
897 if (cmpxchg64(&pte->val, 0ULL, pteval))
898 /* Someone else set it while we were thinking; use theirs. */
899 free_pgtable_page(tmp_page);
901 domain_flush_cache(domain, pte, sizeof(*pte));
906 parent = phys_to_virt(dma_pte_addr(pte));
911 *target_level = level;
917 /* return the pte for an address at a specific level */
918 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
920 int level, int *large_page)
922 struct dma_pte *parent, *pte;
923 int total = agaw_to_level(domain->agaw);
926 parent = domain->pgd;
927 while (level <= total) {
928 offset = pfn_level_offset(pfn, total);
929 pte = &parent[offset];
933 if (!dma_pte_present(pte)) {
938 if (dma_pte_superpage(pte)) {
943 parent = phys_to_virt(dma_pte_addr(pte));
949 /* clear last level pte; a tlb flush should follow */
950 static void dma_pte_clear_range(struct dmar_domain *domain,
951 unsigned long start_pfn,
952 unsigned long last_pfn)
954 unsigned int large_page;
955 struct dma_pte *first_pte, *pte;
957 BUG_ON(!domain_pfn_supported(domain, start_pfn));
958 BUG_ON(!domain_pfn_supported(domain, last_pfn));
959 BUG_ON(start_pfn > last_pfn);
961 /* we don't need lock here; nobody else touches the iova range */
964 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
966 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
971 start_pfn += lvl_to_nr_pages(large_page);
973 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
975 domain_flush_cache(domain, first_pte,
976 (void *)pte - (void *)first_pte);
978 } while (start_pfn && start_pfn <= last_pfn);
981 static void dma_pte_free_level(struct dmar_domain *domain, int level,
982 int retain_level, struct dma_pte *pte,
983 unsigned long pfn, unsigned long start_pfn,
984 unsigned long last_pfn)
986 pfn = max(start_pfn, pfn);
987 pte = &pte[pfn_level_offset(pfn, level)];
990 unsigned long level_pfn;
991 struct dma_pte *level_pte;
993 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
996 level_pfn = pfn & level_mask(level);
997 level_pte = phys_to_virt(dma_pte_addr(pte));
1000 dma_pte_free_level(domain, level - 1, retain_level,
1001 level_pte, level_pfn, start_pfn,
1006 * Free the page table if we're below the level we want to
1007 * retain and the range covers the entire table.
1009 if (level < retain_level && !(start_pfn > level_pfn ||
1010 last_pfn < level_pfn + level_size(level) - 1)) {
1012 domain_flush_cache(domain, pte, sizeof(*pte));
1013 free_pgtable_page(level_pte);
1016 pfn += level_size(level);
1017 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1021 * clear last level (leaf) ptes and free page table pages below the
1022 * level we wish to keep intact.
1024 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1025 unsigned long start_pfn,
1026 unsigned long last_pfn,
1029 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1030 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1031 BUG_ON(start_pfn > last_pfn);
1033 dma_pte_clear_range(domain, start_pfn, last_pfn);
1035 /* We don't need lock here; nobody else touches the iova range */
1036 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1037 domain->pgd, 0, start_pfn, last_pfn);
1040 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1041 free_pgtable_page(domain->pgd);
1046 /* When a page at a given level is being unlinked from its parent, we don't
1047 need to *modify* it at all. All we need to do is make a list of all the
1048 pages which can be freed just as soon as we've flushed the IOTLB and we
1049 know the hardware page-walk will no longer touch them.
1050 The 'pte' argument is the *parent* PTE, pointing to the page that is to be freed. */
1052 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1053 int level, struct dma_pte *pte,
1054 struct page *freelist)
1058 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1059 pg->freelist = freelist;
1065 pte = page_address(pg);
1067 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1068 freelist = dma_pte_list_pagetables(domain, level - 1,
1071 } while (!first_pte_in_page(pte));
1076 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1077 struct dma_pte *pte, unsigned long pfn,
1078 unsigned long start_pfn,
1079 unsigned long last_pfn,
1080 struct page *freelist)
1082 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1084 pfn = max(start_pfn, pfn);
1085 pte = &pte[pfn_level_offset(pfn, level)];
1088 unsigned long level_pfn;
1090 if (!dma_pte_present(pte))
1093 level_pfn = pfn & level_mask(level);
1095 /* If range covers entire pagetable, free it */
1096 if (start_pfn <= level_pfn &&
1097 last_pfn >= level_pfn + level_size(level) - 1) {
1098 /* These subordinate page tables are going away entirely. Don't
1099 bother to clear them; we're just going to *free* them. */
1100 if (level > 1 && !dma_pte_superpage(pte))
1101 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1107 } else if (level > 1) {
1108 /* Recurse down into a level that isn't *entirely* obsolete */
1109 freelist = dma_pte_clear_level(domain, level - 1,
1110 phys_to_virt(dma_pte_addr(pte)),
1111 level_pfn, start_pfn, last_pfn,
1115 pfn += level_size(level);
1116 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1119 domain_flush_cache(domain, first_pte,
1120 (void *)++last_pte - (void *)first_pte);
1125 /* We can't just free the pages because the IOMMU may still be walking
1126 the page tables, and may have cached the intermediate levels. The
1127 pages can only be freed after the IOTLB flush has been done. */
1128 static struct page *domain_unmap(struct dmar_domain *domain,
1129 unsigned long start_pfn,
1130 unsigned long last_pfn)
1132 struct page *freelist;
1134 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1135 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1136 BUG_ON(start_pfn > last_pfn);
1138 /* we don't need lock here; nobody else touches the iova range */
1139 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1140 domain->pgd, 0, start_pfn, last_pfn, NULL);
1143 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1144 struct page *pgd_page = virt_to_page(domain->pgd);
1145 pgd_page->freelist = freelist;
1146 freelist = pgd_page;
1154 static void dma_free_pagelist(struct page *freelist)
1158 while ((pg = freelist)) {
1159 freelist = pg->freelist;
1160 free_pgtable_page(page_address(pg));
1164 static void iova_entry_free(unsigned long data)
1166 struct page *freelist = (struct page *)data;
1168 dma_free_pagelist(freelist);
1171 /* iommu handling */
1172 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1174 struct root_entry *root;
1175 unsigned long flags;
1177 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1179 pr_err("Allocating root entry for %s failed\n",
1184 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1186 spin_lock_irqsave(&iommu->lock, flags);
1187 iommu->root_entry = root;
1188 spin_unlock_irqrestore(&iommu->lock, flags);
1193 static void iommu_set_root_entry(struct intel_iommu *iommu)
1199 addr = virt_to_phys(iommu->root_entry);
1200 if (sm_supported(iommu))
1201 addr |= DMA_RTADDR_SMT;
1203 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1204 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1206 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1208 /* Make sure hardware completes it */
1209 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1210 readl, (sts & DMA_GSTS_RTPS), sts);
1212 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1215 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1220 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1223 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1224 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1226 /* Make sure hardware completes it */
1227 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1228 readl, (!(val & DMA_GSTS_WBFS)), val);
1230 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1233 /* return value determines whether we need a write buffer flush */
1234 static void __iommu_flush_context(struct intel_iommu *iommu,
1235 u16 did, u16 source_id, u8 function_mask,
1242 case DMA_CCMD_GLOBAL_INVL:
1243 val = DMA_CCMD_GLOBAL_INVL;
1245 case DMA_CCMD_DOMAIN_INVL:
1246 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1248 case DMA_CCMD_DEVICE_INVL:
1249 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1250 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1255 val |= DMA_CCMD_ICC;
1257 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1258 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1260 /* Make sure hardware completes it */
1261 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1262 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1264 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1267 /* return value determines whether we need a write buffer flush */
1268 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1269 u64 addr, unsigned int size_order, u64 type)
1271 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1272 u64 val = 0, val_iva = 0;
1276 case DMA_TLB_GLOBAL_FLUSH:
1277 /* a global flush doesn't need to set IVA_REG */
1278 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1280 case DMA_TLB_DSI_FLUSH:
1281 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1283 case DMA_TLB_PSI_FLUSH:
1284 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1285 /* IH bit is passed in as part of address */
1286 val_iva = size_order | addr;
1291 /* Note: set drain read/write */
1294 * This is probably just to be super secure. Looks like we can
1295 * ignore it without any impact.
1297 if (cap_read_drain(iommu->cap))
1298 val |= DMA_TLB_READ_DRAIN;
1300 if (cap_write_drain(iommu->cap))
1301 val |= DMA_TLB_WRITE_DRAIN;
1303 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1304 /* Note: Only uses first TLB reg currently */
1306 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1307 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1309 /* Make sure hardware completes it */
1310 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1311 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1313 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1315 /* check IOTLB invalidation granularity */
1316 if (DMA_TLB_IAIG(val) == 0)
1317 pr_err("Flush IOTLB failed\n");
1318 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1319 pr_debug("TLB flush request %Lx, actual %Lx\n",
1320 (unsigned long long)DMA_TLB_IIRG(type),
1321 (unsigned long long)DMA_TLB_IAIG(val));
1324 static struct device_domain_info *
1325 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1328 struct device_domain_info *info;
1330 assert_spin_locked(&device_domain_lock);
1335 list_for_each_entry(info, &domain->devices, link)
1336 if (info->iommu == iommu && info->bus == bus &&
1337 info->devfn == devfn) {
1338 if (info->ats_supported && info->dev)
1346 static void domain_update_iotlb(struct dmar_domain *domain)
1348 struct device_domain_info *info;
1349 bool has_iotlb_device = false;
1351 assert_spin_locked(&device_domain_lock);
1353 list_for_each_entry(info, &domain->devices, link) {
1354 struct pci_dev *pdev;
1356 if (!info->dev || !dev_is_pci(info->dev))
1359 pdev = to_pci_dev(info->dev);
1360 if (pdev->ats_enabled) {
1361 has_iotlb_device = true;
1366 domain->has_iotlb_device = has_iotlb_device;
1369 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1371 struct pci_dev *pdev;
1373 assert_spin_locked(&device_domain_lock);
1375 if (!info || !dev_is_pci(info->dev))
1378 pdev = to_pci_dev(info->dev);
1379 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1380 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1381 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1382 * reserved, which should be set to 0.
1384 if (!ecap_dit(info->iommu->ecap))
1387 struct pci_dev *pf_pdev;
1389 /* pdev will be returned if device is not a vf */
1390 pf_pdev = pci_physfn(pdev);
1391 info->pfsid = pci_dev_id(pf_pdev);
1394 #ifdef CONFIG_INTEL_IOMMU_SVM
1395 /* The PCIe spec, in its wisdom, declares that the behaviour of
1396 the device if you enable PASID support after ATS support is
1397 undefined. So always enable PASID support on devices which
1398 * have it, even if we can't yet know if we're ever going to use it. */
1400 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1401 info->pasid_enabled = 1;
1403 if (info->pri_supported &&
1404 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1405 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1406 info->pri_enabled = 1;
1408 if (!pdev->untrusted && info->ats_supported &&
1409 pci_ats_page_aligned(pdev) &&
1410 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1411 info->ats_enabled = 1;
1412 domain_update_iotlb(info->domain);
1413 info->ats_qdep = pci_ats_queue_depth(pdev);
1417 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1419 struct pci_dev *pdev;
1421 assert_spin_locked(&device_domain_lock);
1423 if (!dev_is_pci(info->dev))
1426 pdev = to_pci_dev(info->dev);
1428 if (info->ats_enabled) {
1429 pci_disable_ats(pdev);
1430 info->ats_enabled = 0;
1431 domain_update_iotlb(info->domain);
1433 #ifdef CONFIG_INTEL_IOMMU_SVM
1434 if (info->pri_enabled) {
1435 pci_disable_pri(pdev);
1436 info->pri_enabled = 0;
1438 if (info->pasid_enabled) {
1439 pci_disable_pasid(pdev);
1440 info->pasid_enabled = 0;
1445 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1446 u64 addr, unsigned mask)
1449 unsigned long flags;
1450 struct device_domain_info *info;
1452 if (!domain->has_iotlb_device)
1455 spin_lock_irqsave(&device_domain_lock, flags);
1456 list_for_each_entry(info, &domain->devices, link) {
1457 if (!info->ats_enabled)
1460 sid = info->bus << 8 | info->devfn;
1461 qdep = info->ats_qdep;
1462 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1465 spin_unlock_irqrestore(&device_domain_lock, flags);
1468 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1469 struct dmar_domain *domain,
1470 unsigned long pfn, unsigned int pages,
1473 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1474 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1475 u16 did = domain->iommu_did[iommu->seq_id];
1482 * Fall back to domain-selective flush if there is no PSI support or the size is too big.
1484 * PSI requires the page size to be 2 ^ x, and the base address to be naturally
1485 * aligned to the size.
1487 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1488 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1491 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1495 * In caching mode, changes of pages from non-present to present require
1496 * flush. However, device IOTLB doesn't need to be flushed in this case.
1498 if (!cap_caching_mode(iommu->cap) || !map)
1499 iommu_flush_dev_iotlb(domain, addr, mask);
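/*
 * Worked example (illustrative): flushing pages = 5 at some pfn gives
 * mask = ilog2(__roundup_pow_of_two(5)) = 3, i.e. a PSI invalidation
 * covering 2^3 = 8 pages; per the comment above the base address must be
 * naturally aligned to that size.  If mask exceeded cap_max_amask_val(),
 * or PSI were not supported at all, the code falls back to the
 * domain-selective (DSI) flush instead.
 */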
1502 /* Notification for newly created mappings */
1503 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1504 struct dmar_domain *domain,
1505 unsigned long pfn, unsigned int pages)
1507 /* It's a non-present to present mapping. Only flush if caching mode */
1508 if (cap_caching_mode(iommu->cap))
1509 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1511 iommu_flush_write_buffer(iommu);
1514 static void iommu_flush_iova(struct iova_domain *iovad)
1516 struct dmar_domain *domain;
1519 domain = container_of(iovad, struct dmar_domain, iovad);
1521 for_each_domain_iommu(idx, domain) {
1522 struct intel_iommu *iommu = g_iommus[idx];
1523 u16 did = domain->iommu_did[iommu->seq_id];
1525 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1527 if (!cap_caching_mode(iommu->cap))
1528 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1529 0, MAX_AGAW_PFN_WIDTH);
1533 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1536 unsigned long flags;
1538 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1541 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1542 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1543 pmen &= ~DMA_PMEN_EPM;
1544 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1546 /* wait for the protected region status bit to clear */
1547 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1548 readl, !(pmen & DMA_PMEN_PRS), pmen);
1550 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1553 static void iommu_enable_translation(struct intel_iommu *iommu)
1556 unsigned long flags;
1558 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1559 iommu->gcmd |= DMA_GCMD_TE;
1560 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1562 /* Make sure hardware completes it */
1563 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1564 readl, (sts & DMA_GSTS_TES), sts);
1566 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1569 static void iommu_disable_translation(struct intel_iommu *iommu)
1574 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1575 iommu->gcmd &= ~DMA_GCMD_TE;
1576 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1578 /* Make sure hardware completes it */
1579 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1580 readl, (!(sts & DMA_GSTS_TES)), sts);
1582 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1586 static int iommu_init_domains(struct intel_iommu *iommu)
1588 u32 ndomains, nlongs;
1591 ndomains = cap_ndoms(iommu->cap);
1592 pr_debug("%s: Number of Domains supported <%d>\n",
1593 iommu->name, ndomains);
1594 nlongs = BITS_TO_LONGS(ndomains);
1596 spin_lock_init(&iommu->lock);
1598 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1599 if (!iommu->domain_ids) {
1600 pr_err("%s: Allocating domain id array failed\n",
1605 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1606 iommu->domains = kzalloc(size, GFP_KERNEL);
1608 if (iommu->domains) {
1609 size = 256 * sizeof(struct dmar_domain *);
1610 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1613 if (!iommu->domains || !iommu->domains[0]) {
1614 pr_err("%s: Allocating domain array failed\n",
1616 kfree(iommu->domain_ids);
1617 kfree(iommu->domains);
1618 iommu->domain_ids = NULL;
1619 iommu->domains = NULL;
1626 * If Caching mode is set, then invalid translations are tagged
1627 * with domain-id 0, hence we need to pre-allocate it. We also
1628 * use domain-id 0 as a marker for non-allocated domain-id, so
1629 * make sure it is not used for a real domain.
1631 set_bit(0, iommu->domain_ids);
1634 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1635 * entry for first-level or pass-through translation modes should
1636 * be programmed with a domain id different from those used for
1637 * second-level or nested translation. We reserve a domain id for this purpose.
1640 if (sm_supported(iommu))
1641 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1646 static void disable_dmar_iommu(struct intel_iommu *iommu)
1648 struct device_domain_info *info, *tmp;
1649 unsigned long flags;
1651 if (!iommu->domains || !iommu->domain_ids)
1655 spin_lock_irqsave(&device_domain_lock, flags);
1656 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1657 struct dmar_domain *domain;
1659 if (info->iommu != iommu)
1662 if (!info->dev || !info->domain)
1665 domain = info->domain;
1667 __dmar_remove_one_dev_info(info);
1669 if (!domain_type_is_vm_or_si(domain)) {
1671 * The domain_exit() function can't be called under
1672 * device_domain_lock, as it takes this lock itself.
1673 * So release the lock here and re-run the loop
1676 spin_unlock_irqrestore(&device_domain_lock, flags);
1677 domain_exit(domain);
1681 spin_unlock_irqrestore(&device_domain_lock, flags);
1683 if (iommu->gcmd & DMA_GCMD_TE)
1684 iommu_disable_translation(iommu);
1687 static void free_dmar_iommu(struct intel_iommu *iommu)
1689 if ((iommu->domains) && (iommu->domain_ids)) {
1690 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1693 for (i = 0; i < elems; i++)
1694 kfree(iommu->domains[i]);
1695 kfree(iommu->domains);
1696 kfree(iommu->domain_ids);
1697 iommu->domains = NULL;
1698 iommu->domain_ids = NULL;
1701 g_iommus[iommu->seq_id] = NULL;
1703 /* free context mapping */
1704 free_context_table(iommu);
1706 #ifdef CONFIG_INTEL_IOMMU_SVM
1707 if (pasid_supported(iommu)) {
1708 if (ecap_prs(iommu->ecap))
1709 intel_svm_finish_prq(iommu);
1714 static struct dmar_domain *alloc_domain(int flags)
1716 struct dmar_domain *domain;
1718 domain = alloc_domain_mem();
1722 memset(domain, 0, sizeof(*domain));
1723 domain->nid = NUMA_NO_NODE;
1724 domain->flags = flags;
1725 domain->has_iotlb_device = false;
1726 INIT_LIST_HEAD(&domain->devices);
1731 /* Must be called with iommu->lock */
1732 static int domain_attach_iommu(struct dmar_domain *domain,
1733 struct intel_iommu *iommu)
1735 unsigned long ndomains;
1738 assert_spin_locked(&device_domain_lock);
1739 assert_spin_locked(&iommu->lock);
1741 domain->iommu_refcnt[iommu->seq_id] += 1;
1742 domain->iommu_count += 1;
1743 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1744 ndomains = cap_ndoms(iommu->cap);
1745 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1747 if (num >= ndomains) {
1748 pr_err("%s: No free domain ids\n", iommu->name);
1749 domain->iommu_refcnt[iommu->seq_id] -= 1;
1750 domain->iommu_count -= 1;
1754 set_bit(num, iommu->domain_ids);
1755 set_iommu_domain(iommu, num, domain);
1757 domain->iommu_did[iommu->seq_id] = num;
1758 domain->nid = iommu->node;
1760 domain_update_iommu_cap(domain);
1766 static int domain_detach_iommu(struct dmar_domain *domain,
1767 struct intel_iommu *iommu)
1771 assert_spin_locked(&device_domain_lock);
1772 assert_spin_locked(&iommu->lock);
1774 domain->iommu_refcnt[iommu->seq_id] -= 1;
1775 count = --domain->iommu_count;
1776 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1777 num = domain->iommu_did[iommu->seq_id];
1778 clear_bit(num, iommu->domain_ids);
1779 set_iommu_domain(iommu, num, NULL);
1781 domain_update_iommu_cap(domain);
1782 domain->iommu_did[iommu->seq_id] = 0;
1788 static struct iova_domain reserved_iova_list;
1789 static struct lock_class_key reserved_rbtree_key;
1791 static int dmar_init_reserved_ranges(void)
1793 struct pci_dev *pdev = NULL;
1797 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1799 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1800 &reserved_rbtree_key);
1802 /* IOAPIC ranges shouldn't be accessed by DMA */
1803 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1804 IOVA_PFN(IOAPIC_RANGE_END));
1806 pr_err("Reserve IOAPIC range failed\n");
1810 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1811 for_each_pci_dev(pdev) {
1814 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1815 r = &pdev->resource[i];
1816 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1818 iova = reserve_iova(&reserved_iova_list,
1822 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1830 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1832 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1835 static inline int guestwidth_to_adjustwidth(int gaw)
1838 int r = (gaw - 12) % 9;
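/*
 * Worked example (illustrative): the adjusted width rounds the guest width
 * up so that (width - 12) is a whole number of 9-bit levels.  gaw = 48
 * gives r = (48 - 12) % 9 = 0 and is kept as-is, while gaw = 50 gives
 * r = 2 and is rounded up to 57 (12 + 5 * 9).
 */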
1849 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1852 int adjust_width, agaw;
1853 unsigned long sagaw;
1856 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1858 err = init_iova_flush_queue(&domain->iovad,
1859 iommu_flush_iova, iova_entry_free);
1863 domain_reserve_special_ranges(domain);
1865 /* calculate AGAW */
1866 if (guest_width > cap_mgaw(iommu->cap))
1867 guest_width = cap_mgaw(iommu->cap);
1868 domain->gaw = guest_width;
1869 adjust_width = guestwidth_to_adjustwidth(guest_width);
1870 agaw = width_to_agaw(adjust_width);
1871 sagaw = cap_sagaw(iommu->cap);
1872 if (!test_bit(agaw, &sagaw)) {
1873 /* hardware doesn't support it, choose a bigger one */
1874 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1875 agaw = find_next_bit(&sagaw, 5, agaw);
1879 domain->agaw = agaw;
1881 if (ecap_coherent(iommu->ecap))
1882 domain->iommu_coherency = 1;
1884 domain->iommu_coherency = 0;
1886 if (ecap_sc_support(iommu->ecap))
1887 domain->iommu_snooping = 1;
1889 domain->iommu_snooping = 0;
1891 if (intel_iommu_superpage)
1892 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1894 domain->iommu_superpage = 0;
1896 domain->nid = iommu->node;
1898 /* always allocate the top pgd */
1899 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1902 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1906 static void domain_exit(struct dmar_domain *domain)
1908 struct page *freelist;
1910 /* Remove associated devices and clear attached or cached domains */
1911 domain_remove_dev_info(domain);
1914 put_iova_domain(&domain->iovad);
1916 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1918 dma_free_pagelist(freelist);
1920 free_domain_mem(domain);
1924 * Get the PASID directory size for scalable mode context entry.
1925 * Value of X in the PDTS field of a scalable mode context entry
1926 * indicates PASID directory with 2^(X + 7) entries.
1928 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1932 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1933 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
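/*
 * Worked example (illustrative), assuming PASID_PDE_SHIFT is 6 (64 PASID
 * table entries per directory entry): for a 20-bit PASID space,
 * max_pasid = 1 << 20, so max_pde = 1 << 14 and the directory needs 2^14
 * entries, which corresponds to PDTS value X = 7 in the 2^(X + 7)
 * encoding described above.
 */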
1941 * Set the RID_PASID field of a scalable mode context entry. The
1942 * IOMMU hardware will use the PASID value set in this field for
1943 * DMA translations of DMA requests without PASID.
1946 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1948 context->hi |= pasid & ((1 << 20) - 1);
1949 context->hi |= (1 << 20);
1953 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1956 static inline void context_set_sm_dte(struct context_entry *context)
1958 context->lo |= (1 << 2);
1962 * Set the PRE(Page Request Enable) field of a scalable mode context
1965 static inline void context_set_sm_pre(struct context_entry *context)
1967 context->lo |= (1 << 4);
1970 /* Convert value to context PASID directory size field coding. */
1971 #define context_pdts(pds) (((pds) & 0x7) << 9)
1973 static int domain_context_mapping_one(struct dmar_domain *domain,
1974 struct intel_iommu *iommu,
1975 struct pasid_table *table,
1978 u16 did = domain->iommu_did[iommu->seq_id];
1979 int translation = CONTEXT_TT_MULTI_LEVEL;
1980 struct device_domain_info *info = NULL;
1981 struct context_entry *context;
1982 unsigned long flags;
1987 if (hw_pass_through && domain_type_is_si(domain))
1988 translation = CONTEXT_TT_PASS_THROUGH;
1990 pr_debug("Set context mapping for %02x:%02x.%d\n",
1991 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1993 BUG_ON(!domain->pgd);
1995 spin_lock_irqsave(&device_domain_lock, flags);
1996 spin_lock(&iommu->lock);
1999 context = iommu_context_addr(iommu, bus, devfn, 1);
2004 if (context_present(context))
2008 * For kdump cases, old valid entries may be cached due to the
2009 * in-flight DMA and copied pgtable, but there is no unmapping
2010 * behaviour for them, thus we need an explicit cache flush for
2011 * the newly-mapped device. For kdump, at this point, the device
2012 * is supposed to finish reset at its driver probe stage, so no
2013 * in-flight DMA will exist, and we don't need to worry anymore hereafter.
2016 if (context_copied(context)) {
2017 u16 did_old = context_domain_id(context);
2019 if (did_old < cap_ndoms(iommu->cap)) {
2020 iommu->flush.flush_context(iommu, did_old,
2021 (((u16)bus) << 8) | devfn,
2022 DMA_CCMD_MASK_NOBIT,
2023 DMA_CCMD_DEVICE_INVL);
2024 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2029 context_clear_entry(context);
2031 if (sm_supported(iommu)) {
2036 /* Setup the PASID DIR pointer: */
2037 pds = context_get_sm_pds(table);
2038 context->lo = (u64)virt_to_phys(table->table) |
2041 /* Setup the RID_PASID field: */
2042 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2045 * Setup the Device-TLB enable bit and Page request
2048 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2049 if (info && info->ats_supported)
2050 context_set_sm_dte(context);
2051 if (info && info->pri_supported)
2052 context_set_sm_pre(context);
2054 struct dma_pte *pgd = domain->pgd;
2057 context_set_domain_id(context, did);
2059 if (translation != CONTEXT_TT_PASS_THROUGH) {
2061 * Skip top levels of page tables for iommu which has
2062 * less agaw than default. Unnecessary for PT mode.
2064 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2066 pgd = phys_to_virt(dma_pte_addr(pgd));
2067 if (!dma_pte_present(pgd))
2071 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2072 if (info && info->ats_supported)
2073 translation = CONTEXT_TT_DEV_IOTLB;
2075 translation = CONTEXT_TT_MULTI_LEVEL;
2077 context_set_address_root(context, virt_to_phys(pgd));
2078 context_set_address_width(context, agaw);
2081 * In pass through mode, AW must be programmed to
2082 * indicate the largest AGAW value supported by
2083 * hardware. And ASR is ignored by hardware.
2085 context_set_address_width(context, iommu->msagaw);
2088 context_set_translation_type(context, translation);
2091 context_set_fault_enable(context);
2092 context_set_present(context);
2093 domain_flush_cache(domain, context, sizeof(*context));
2096 * It's a non-present to present mapping. If hardware doesn't cache
2097 * non-present entry we only need to flush the write-buffer. If it
2098 * _does_ cache non-present entries, then it does so in the special
2099 * domain #0, which we have to flush:
2101 if (cap_caching_mode(iommu->cap)) {
2102 iommu->flush.flush_context(iommu, 0,
2103 (((u16)bus) << 8) | devfn,
2104 DMA_CCMD_MASK_NOBIT,
2105 DMA_CCMD_DEVICE_INVL);
2106 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2108 iommu_flush_write_buffer(iommu);
2110 iommu_enable_dev_iotlb(info);
2115 spin_unlock(&iommu->lock);
2116 spin_unlock_irqrestore(&device_domain_lock, flags);
2121 struct domain_context_mapping_data {
2122 struct dmar_domain *domain;
2123 struct intel_iommu *iommu;
2124 struct pasid_table *table;
2127 static int domain_context_mapping_cb(struct pci_dev *pdev,
2128 u16 alias, void *opaque)
2130 struct domain_context_mapping_data *data = opaque;
2132 return domain_context_mapping_one(data->domain, data->iommu,
2133 data->table, PCI_BUS_NUM(alias),
2138 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2140 struct domain_context_mapping_data data;
2141 struct pasid_table *table;
2142 struct intel_iommu *iommu;
2145 iommu = device_to_iommu(dev, &bus, &devfn);
2149 table = intel_pasid_get_table(dev);
2151 if (!dev_is_pci(dev))
2152 return domain_context_mapping_one(domain, iommu, table,
2155 data.domain = domain;
2159 return pci_for_each_dma_alias(to_pci_dev(dev),
2160 &domain_context_mapping_cb, &data);
2163 static int domain_context_mapped_cb(struct pci_dev *pdev,
2164 u16 alias, void *opaque)
2166 struct intel_iommu *iommu = opaque;
2168 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2171 static int domain_context_mapped(struct device *dev)
2173 struct intel_iommu *iommu;
2176 iommu = device_to_iommu(dev, &bus, &devfn);
2180 if (!dev_is_pci(dev))
2181 return device_context_mapped(iommu, bus, devfn);
2183 return !pci_for_each_dma_alias(to_pci_dev(dev),
2184 domain_context_mapped_cb, iommu);
2187 /* Returns a number of VTD pages, but aligned to MM page size */
2188 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2191 host_addr &= ~PAGE_MASK;
2192 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
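/*
 * Worked example (illustrative), assuming 4KiB MM pages: host_addr = 0x1234
 * and size = 0x2000 leave an in-page offset of 0x234, so
 * PAGE_ALIGN(0x234 + 0x2000) = 0x3000 and the mapping needs 3 VT-d pages,
 * even though the length alone is only two pages.
 */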
2195 /* Return largest possible superpage level for a given mapping */
2196 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2197 unsigned long iov_pfn,
2198 unsigned long phy_pfn,
2199 unsigned long pages)
2201 int support, level = 1;
2202 unsigned long pfnmerge;
2204 support = domain->iommu_superpage;
2206 /* To use a large page, the virtual *and* physical addresses
2207 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2208 of them will mean we have to use smaller pages. So just
2209 merge them and check both at once. */
2210 pfnmerge = iov_pfn | phy_pfn;
2212 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2213 pages >>= VTD_STRIDE_SHIFT;
2216 pfnmerge >>= VTD_STRIDE_SHIFT;
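/*
 * Worked example (illustrative): mapping 1024 pages at iov_pfn = 0x200 and
 * phy_pfn = 0x400 (both multiples of 512) merges to pfnmerge = 0x600, whose
 * low 9 bits are clear, so with superpage support the loop settles on
 * level 2, i.e. a 2MiB superpage.  If either PFN had any of its low 9 bits
 * set, the loop would never be entered and plain 4KiB pages (level 1)
 * would be used.
 */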
2223 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2224 struct scatterlist *sg, unsigned long phys_pfn,
2225 unsigned long nr_pages, int prot)
2227 struct dma_pte *first_pte = NULL, *pte = NULL;
2228 phys_addr_t uninitialized_var(pteval);
2229 unsigned long sg_res = 0;
2230 unsigned int largepage_lvl = 0;
2231 unsigned long lvl_pages = 0;
2233 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2235 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2238 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2242 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2245 while (nr_pages > 0) {
2249 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2251 sg_res = aligned_nrpages(sg->offset, sg->length);
2252 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2253 sg->dma_length = sg->length;
2254 pteval = (sg_phys(sg) - pgoff) | prot;
2255 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2259 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2261 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2264 /* It is a large page */
2265 if (largepage_lvl > 1) {
2266 unsigned long nr_superpages, end_pfn;
2268 pteval |= DMA_PTE_LARGE_PAGE;
2269 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2271 nr_superpages = sg_res / lvl_pages;
2272 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2275 * Ensure that old small page tables are
2276 * removed to make room for superpage(s).
2277 * We're adding new large pages, so make sure
2278 * we don't remove their parent tables.
2280 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2283 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2287 /* We don't need lock here, nobody else
2288 * touches the iova range
2290 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2292 static int dumps = 5;
2293 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2294 iov_pfn, tmp, (unsigned long long)pteval);
2297 debug_dma_dump_mappings(NULL);
2302 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2304 BUG_ON(nr_pages < lvl_pages);
2305 BUG_ON(sg_res < lvl_pages);
2307 nr_pages -= lvl_pages;
2308 iov_pfn += lvl_pages;
2309 phys_pfn += lvl_pages;
2310 pteval += lvl_pages * VTD_PAGE_SIZE;
2311 sg_res -= lvl_pages;
2313 /* If the next PTE would be the first in a new page, then we
2314 need to flush the cache on the entries we've just written.
2315 And then we'll need to recalculate 'pte', so clear it and
2316 let it get set again in the if (!pte) block above.
2318 If we're done (!nr_pages) we need to flush the cache too.
2320 Also if we've been setting superpages, we may need to
2321 recalculate 'pte' and switch back to smaller pages for the
2322 end of the mapping, if the trailing size is not enough to
2323 use another superpage (i.e. sg_res < lvl_pages). */
2325 if (!nr_pages || first_pte_in_page(pte) ||
2326 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2327 domain_flush_cache(domain, first_pte,
2328 (void *)pte - (void *)first_pte);
2332 if (!sg_res && nr_pages)
2338 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2339 struct scatterlist *sg, unsigned long phys_pfn,
2340 unsigned long nr_pages, int prot)
2343 struct intel_iommu *iommu;
2345 /* Do the real mapping first */
2346 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2350 /* Notify about the new mapping */
2351 if (domain_type_is_vm(domain)) {
2352 /* VM typed domains can have more than one IOMMU */
2355 for_each_domain_iommu(iommu_id, domain) {
2356 iommu = g_iommus[iommu_id];
2357 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2360 /* General domains only have one IOMMU */
2361 iommu = domain_get_iommu(domain);
2362 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2368 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2369 struct scatterlist *sg, unsigned long nr_pages,
2372 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2375 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2376 unsigned long phys_pfn, unsigned long nr_pages,
2379 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
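/*
 * Illustrative usage sketch (hypothetical values): mapping a 64KiB
 * physically contiguous buffer read/write through the wrapper above.
 * 64KiB is 16 VT-d pages, so nr_pages is 16 and both the IOVA and the
 * physical address are passed as 4KiB page frame numbers.
 *
 *	ret = domain_pfn_mapping(domain,
 *				 iova >> VTD_PAGE_SHIFT,	// IOVA pfn
 *				 phys >> VTD_PAGE_SHIFT,	// physical pfn
 *				 16,				// 64KiB / 4KiB
 *				 DMA_PTE_READ | DMA_PTE_WRITE);
 */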
2382 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2384 unsigned long flags;
2385 struct context_entry *context;
2391 spin_lock_irqsave(&iommu->lock, flags);
2392 context = iommu_context_addr(iommu, bus, devfn, 0);
2394 spin_unlock_irqrestore(&iommu->lock, flags);
2397 did_old = context_domain_id(context);
2398 context_clear_entry(context);
2399 __iommu_flush_cache(iommu, context, sizeof(*context));
2400 spin_unlock_irqrestore(&iommu->lock, flags);
2401 iommu->flush.flush_context(iommu,
2403 (((u16)bus) << 8) | devfn,
2404 DMA_CCMD_MASK_NOBIT,
2405 DMA_CCMD_DEVICE_INVL);
2406 iommu->flush.flush_iotlb(iommu,
2413 static inline void unlink_domain_info(struct device_domain_info *info)
2415 assert_spin_locked(&device_domain_lock);
2416 list_del(&info->link);
2417 list_del(&info->global);
2419 info->dev->archdata.iommu = NULL;
2422 static void domain_remove_dev_info(struct dmar_domain *domain)
2424 struct device_domain_info *info, *tmp;
2425 unsigned long flags;
2427 spin_lock_irqsave(&device_domain_lock, flags);
2428 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2429 __dmar_remove_one_dev_info(info);
2430 spin_unlock_irqrestore(&device_domain_lock, flags);
2435 * Note: we use struct device->archdata.iommu to store the info
2437 static struct dmar_domain *find_domain(struct device *dev)
2439 struct device_domain_info *info;
2441 /* No lock here, assumes no domain exit in normal case */
2442 info = dev->archdata.iommu;
2444 return info->domain;
2448 static inline struct device_domain_info *
2449 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2451 struct device_domain_info *info;
2453 list_for_each_entry(info, &device_domain_list, global)
2454 if (info->iommu->segment == segment && info->bus == bus &&
2455 info->devfn == devfn)
2461 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2464 struct dmar_domain *domain)
2466 struct dmar_domain *found = NULL;
2467 struct device_domain_info *info;
2468 unsigned long flags;
2471 info = alloc_devinfo_mem();
2476 info->devfn = devfn;
2477 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2478 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2481 info->domain = domain;
2482 info->iommu = iommu;
2483 info->pasid_table = NULL;
2484 info->auxd_enabled = 0;
2485 INIT_LIST_HEAD(&info->auxiliary_domains);
2487 if (dev && dev_is_pci(dev)) {
2488 struct pci_dev *pdev = to_pci_dev(info->dev);
2490 if (!pdev->untrusted &&
2491 !pci_ats_disabled() &&
2492 ecap_dev_iotlb_support(iommu->ecap) &&
2493 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2494 dmar_find_matched_atsr_unit(pdev))
2495 info->ats_supported = 1;
2497 if (sm_supported(iommu)) {
2498 if (pasid_supported(iommu)) {
2499 int features = pci_pasid_features(pdev);
2501 info->pasid_supported = features | 1;
2504 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2505 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2506 info->pri_supported = 1;
2510 spin_lock_irqsave(&device_domain_lock, flags);
2512 found = find_domain(dev);
2515 struct device_domain_info *info2;
2516 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2518 found = info2->domain;
2524 spin_unlock_irqrestore(&device_domain_lock, flags);
2525 free_devinfo_mem(info);
2526 /* Caller must free the original domain */
2530 spin_lock(&iommu->lock);
2531 ret = domain_attach_iommu(domain, iommu);
2532 spin_unlock(&iommu->lock);
2535 spin_unlock_irqrestore(&device_domain_lock, flags);
2536 free_devinfo_mem(info);
2540 list_add(&info->link, &domain->devices);
2541 list_add(&info->global, &device_domain_list);
2543 dev->archdata.iommu = info;
2544 spin_unlock_irqrestore(&device_domain_lock, flags);
2546 /* PASID table is mandatory for a PCI device in scalable mode. */
2547 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2548 ret = intel_pasid_alloc_table(dev);
2550 dev_err(dev, "PASID table allocation failed\n");
2551 dmar_remove_one_dev_info(dev);
2555 /* Set up the PASID entry for requests without PASID: */
2556 spin_lock(&iommu->lock);
2557 if (hw_pass_through && domain_type_is_si(domain))
2558 ret = intel_pasid_setup_pass_through(iommu, domain,
2559 dev, PASID_RID2PASID);
2561 ret = intel_pasid_setup_second_level(iommu, domain,
2562 dev, PASID_RID2PASID);
2563 spin_unlock(&iommu->lock);
2565 dev_err(dev, "Setup RID2PASID failed\n");
2566 dmar_remove_one_dev_info(dev);
2571 if (dev && domain_context_mapping(domain, dev)) {
2572 dev_err(dev, "Domain context map failed\n");
2573 dmar_remove_one_dev_info(dev);
2580 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2582 *(u16 *)opaque = alias;
2586 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2588 struct device_domain_info *info;
2589 struct dmar_domain *domain = NULL;
2590 struct intel_iommu *iommu;
2592 unsigned long flags;
2595 iommu = device_to_iommu(dev, &bus, &devfn);
2599 if (dev_is_pci(dev)) {
2600 struct pci_dev *pdev = to_pci_dev(dev);
2602 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2604 spin_lock_irqsave(&device_domain_lock, flags);
2605 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2606 PCI_BUS_NUM(dma_alias),
2609 iommu = info->iommu;
2610 domain = info->domain;
2612 spin_unlock_irqrestore(&device_domain_lock, flags);
2614 /* DMA alias already has a domain, use it */
2619 /* Allocate and initialize new domain for the device */
2620 domain = alloc_domain(0);
2623 if (domain_init(domain, iommu, gaw)) {
2624 domain_exit(domain);
2633 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2634 struct dmar_domain *domain)
2636 struct intel_iommu *iommu;
2637 struct dmar_domain *tmp;
2638 u16 req_id, dma_alias;
2641 iommu = device_to_iommu(dev, &bus, &devfn);
2645 req_id = ((u16)bus << 8) | devfn;
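	/*
	 * Worked example (hypothetical device): the request ID packs the bus
	 * number into the high byte and devfn (device << 3 | function) into
	 * the low byte, e.g. bus 0x3a, device 0x00, function 0x1:
	 *
	 *	u16 example = ((u16)0x3a << 8) | PCI_DEVFN(0x00, 0x1);	// 0x3a01
	 */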
2647 if (dev_is_pci(dev)) {
2648 struct pci_dev *pdev = to_pci_dev(dev);
2650 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2652 /* register PCI DMA alias device */
2653 if (req_id != dma_alias) {
2654 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2655 dma_alias & 0xff, NULL, domain);
2657 if (!tmp || tmp != domain)
2662 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2663 if (!tmp || tmp != domain)
2669 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2671 struct dmar_domain *domain, *tmp;
2673 domain = find_domain(dev);
2677 domain = find_or_alloc_domain(dev, gaw);
2681 tmp = set_domain_for_dev(dev, domain);
2682 if (!tmp || domain != tmp) {
2683 domain_exit(domain);
2692 static int iommu_domain_identity_map(struct dmar_domain *domain,
2693 unsigned long long start,
2694 unsigned long long end)
2696 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2697 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2699 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2700 dma_to_mm_pfn(last_vpfn))) {
2701 pr_err("Reserving iova failed\n");
2705 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2707 * RMRR range might overlap with the physical memory range,
2710 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2712 return __domain_mapping(domain, first_vpfn, NULL,
2713 first_vpfn, last_vpfn - first_vpfn + 1,
2714 DMA_PTE_READ|DMA_PTE_WRITE);
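/*
 * Worked example (hypothetical RMRR): for start = 0xdf000000 and
 * end = 0xdf01ffff the 1:1 window above becomes
 *
 *	first_vpfn = 0xdf000000 >> VTD_PAGE_SHIFT = 0xdf000;
 *	last_vpfn  = 0xdf01ffff >> VTD_PAGE_SHIFT = 0xdf01f;
 *	nr_pages   = last_vpfn - first_vpfn + 1   = 0x20;	// 32 pages, 128KiB
 *
 * The same value is used for both the IOVA pfn and the physical pfn, which
 * is what makes the mapping an identity map.
 */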
2717 static int domain_prepare_identity_map(struct device *dev,
2718 struct dmar_domain *domain,
2719 unsigned long long start,
2720 unsigned long long end)
2722 /* For _hardware_ passthrough, don't bother. But for software
2723 passthrough, we do it anyway -- it may indicate a memory
2724 range which is reserved in E820 and so didn't get set
2725 up to start with in si_domain */
2726 if (domain == si_domain && hw_pass_through) {
2727 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2732 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2735 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2736 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2737 dmi_get_system_info(DMI_BIOS_VENDOR),
2738 dmi_get_system_info(DMI_BIOS_VERSION),
2739 dmi_get_system_info(DMI_PRODUCT_VERSION));
2743 if (end >> agaw_to_width(domain->agaw)) {
2744 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2745 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2746 agaw_to_width(domain->agaw),
2747 dmi_get_system_info(DMI_BIOS_VENDOR),
2748 dmi_get_system_info(DMI_BIOS_VERSION),
2749 dmi_get_system_info(DMI_PRODUCT_VERSION));
2753 return iommu_domain_identity_map(domain, start, end);
2756 static int iommu_prepare_identity_map(struct device *dev,
2757 unsigned long long start,
2758 unsigned long long end)
2760 struct dmar_domain *domain;
2763 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2767 ret = domain_prepare_identity_map(dev, domain, start, end);
2769 domain_exit(domain);
2774 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2777 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2779 return iommu_prepare_identity_map(dev, rmrr->base_address,
2783 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2784 static inline void iommu_prepare_isa(void)
2786 struct pci_dev *pdev;
2789 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2793 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2794 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2797 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2802 static inline void iommu_prepare_isa(void)
2806 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2808 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2810 static int __init si_domain_init(int hw)
2812 struct dmar_rmrr_unit *rmrr;
2816 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2820 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2821 domain_exit(si_domain);
2828 for_each_online_node(nid) {
2829 unsigned long start_pfn, end_pfn;
2832 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2833 ret = iommu_domain_identity_map(si_domain,
2834 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2841 * Normally we use DMA domains for devices which have RMRRs. But we
2842 * relax this requirement for graphics and USB devices. Identity map
2843 * the RMRRs for graphics and USB devices so that they can use the
2846 for_each_rmrr_units(rmrr) {
2847 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2849 unsigned long long start = rmrr->base_address;
2850 unsigned long long end = rmrr->end_address;
2852 if (device_is_rmrr_locked(dev))
2855 if (WARN_ON(end < start ||
2856 end >> agaw_to_width(si_domain->agaw)))
2859 ret = iommu_domain_identity_map(si_domain, start, end);
2868 static int identity_mapping(struct device *dev)
2870 struct device_domain_info *info;
2872 info = dev->archdata.iommu;
2873 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2874 return (info->domain == si_domain);
2879 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2881 struct dmar_domain *ndomain;
2882 struct intel_iommu *iommu;
2885 iommu = device_to_iommu(dev, &bus, &devfn);
2889 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2890 if (ndomain != domain)
2896 static bool device_has_rmrr(struct device *dev)
2898 struct dmar_rmrr_unit *rmrr;
2903 for_each_rmrr_units(rmrr) {
2905 * Return TRUE if this RMRR contains the device that
2908 for_each_active_dev_scope(rmrr->devices,
2909 rmrr->devices_cnt, i, tmp)
2920 * There are a couple of cases where we need to restrict the functionality of
2921 * devices associated with RMRRs. The first is when evaluating a device for
2922 * identity mapping because problems exist when devices are moved in and out
2923 * of domains and their respective RMRR information is lost. This means that
2924 * a device with associated RMRRs will never be in a "passthrough" domain.
2925 * The second is use of the device through the IOMMU API. This interface
2926 * expects to have full control of the IOVA space for the device. We cannot
2927 * satisfy both the requirement that RMRR access is maintained and have an
2928 * unencumbered IOVA space. We also have no ability to quiesce the device's
2929 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2930 * We therefore prevent devices associated with an RMRR from participating in
2931 * the IOMMU API, which eliminates them from device assignment.
2933 * In both cases we assume that PCI USB devices with RMRRs have them largely
2934 * for historical reasons and that the RMRR space is not actively used post
2935 * boot. This exclusion may change if vendors begin to abuse it.
2937 * The same exception is made for graphics devices, with the requirement that
2938 * any use of the RMRR regions will be torn down before assigning the device
2941 static bool device_is_rmrr_locked(struct device *dev)
2943 if (!device_has_rmrr(dev))
2946 if (dev_is_pci(dev)) {
2947 struct pci_dev *pdev = to_pci_dev(dev);
2949 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2957 * Return the required default domain type for a specific device.
2959 * @dev: the device in query
2960 * @startup: true if this is during early boot
2963 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2964 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2965 * - 0: both identity and dynamic domains work for this device
2967 static int device_def_domain_type(struct device *dev, int startup)
2969 if (dev_is_pci(dev)) {
2970 struct pci_dev *pdev = to_pci_dev(dev);
2972 if (device_is_rmrr_locked(dev))
2973 return IOMMU_DOMAIN_DMA;
2976 * Prevent any device marked as untrusted from getting
2977 * placed into the statically identity mapping domain.
2979 if (pdev->untrusted)
2980 return IOMMU_DOMAIN_DMA;
2982 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2983 return IOMMU_DOMAIN_IDENTITY;
2985 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2986 return IOMMU_DOMAIN_IDENTITY;
2989 * We want to start off with all devices in the 1:1 domain, and
2990 * take them out later if we find they can't access all of memory.
2992 * However, we can't do this for PCI devices behind bridges,
2993 * because all PCI devices behind the same bridge will end up
2994 * with the same source-id on their transactions.
2996 * Practically speaking, we can't change things around for these
2997 * devices at run-time, because we can't be sure there'll be no
2998 * DMA transactions in flight for any of their siblings.
3000 * So PCI devices (unless they're on the root bus) as well as
3001 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
3002 * the 1:1 domain, just in _case_ one of their siblings turns out
3003 * not to be able to map all of memory.
3005 if (!pci_is_pcie(pdev)) {
3006 if (!pci_is_root_bus(pdev->bus))
3007 return IOMMU_DOMAIN_DMA;
3008 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
3009 return IOMMU_DOMAIN_DMA;
3010 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
3011 return IOMMU_DOMAIN_DMA;
3013 if (device_has_rmrr(dev))
3014 return IOMMU_DOMAIN_DMA;
3018 * At boot time, we don't yet know if devices will be 64-bit capable.
3019 * Assume that they will -- if they turn out not to be, then we can
3020 * take them out of the 1:1 domain later.
3024 * If the device's dma_mask is less than the system's memory
3025 * size then this is not a candidate for identity mapping.
3027 u64 dma_mask = *dev->dma_mask;
3029 if (dev->coherent_dma_mask &&
3030 dev->coherent_dma_mask < dma_mask)
3031 dma_mask = dev->coherent_dma_mask;
3033 return dma_mask >= dma_get_required_mask(dev);
3036 return (iommu_identity_mapping & IDENTMAP_ALL) ?
3037 IOMMU_DOMAIN_IDENTITY : 0;
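/*
 * Illustrative sketch of the dma_mask check above (hypothetical device): a
 * device advertising only a 32-bit DMA mask on a machine whose RAM needs
 * 33 address bits cannot reach all of memory, so it is not an identity-map
 * candidate and ends up in a dynamic DMA domain.
 *
 *	dma_mask = DMA_BIT_MASK(32);			// 0x00000000ffffffff
 *	required = dma_get_required_mask(dev);		// e.g. 0x00000001ffffffff
 *	identity_ok = (dma_mask >= required);		// false -> DMA domain
 */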
3040 static inline int iommu_should_identity_map(struct device *dev, int startup)
3042 return device_def_domain_type(dev, startup) == IOMMU_DOMAIN_IDENTITY;
3045 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
3049 if (!iommu_should_identity_map(dev, 1))
3052 ret = domain_add_dev_info(si_domain, dev);
3054 dev_info(dev, "%s identity mapping\n",
3055 hw ? "Hardware" : "Software");
3056 else if (ret == -ENODEV)
3057 /* device not associated with an iommu */
3064 static int __init iommu_prepare_static_identity_mapping(int hw)
3066 struct pci_dev *pdev = NULL;
3067 struct dmar_drhd_unit *drhd;
3068 struct intel_iommu *iommu;
3073 for_each_pci_dev(pdev) {
3074 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
3079 for_each_active_iommu(iommu, drhd)
3080 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
3081 struct acpi_device_physical_node *pn;
3082 struct acpi_device *adev;
3084 if (dev->bus != &acpi_bus_type)
3087 adev = to_acpi_device(dev);
3088 mutex_lock(&adev->physical_node_lock);
3089 list_for_each_entry(pn, &adev->physical_node_list, node) {
3090 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3094 mutex_unlock(&adev->physical_node_lock);
3102 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3105 * Start from a sane iommu hardware state.
3106 * If queued invalidation has already been initialized by us
3107 * (for example, while enabling interrupt remapping), then
3108 * things are already rolling from a sane state.
3112 * Clear any previous faults.
3114 dmar_fault(-1, iommu);
3116 * Disable queued invalidation if supported and already enabled
3117 * before OS handover.
3119 dmar_disable_qi(iommu);
3122 if (dmar_enable_qi(iommu)) {
3124 * Queued Invalidate not enabled, use Register Based Invalidate
3126 iommu->flush.flush_context = __iommu_flush_context;
3127 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3128 pr_info("%s: Using Register based invalidation\n",
3131 iommu->flush.flush_context = qi_flush_context;
3132 iommu->flush.flush_iotlb = qi_flush_iotlb;
3133 pr_info("%s: Using Queued invalidation\n", iommu->name);
3137 static int copy_context_table(struct intel_iommu *iommu,
3138 struct root_entry *old_re,
3139 struct context_entry **tbl,
3142 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3143 struct context_entry *new_ce = NULL, ce;
3144 struct context_entry *old_ce = NULL;
3145 struct root_entry re;
3146 phys_addr_t old_ce_phys;
3148 tbl_idx = ext ? bus * 2 : bus;
3149 memcpy(&re, old_re, sizeof(re));
3151 for (devfn = 0; devfn < 256; devfn++) {
3152 /* First calculate the correct index */
3153 idx = (ext ? devfn * 2 : devfn) % 256;
3156 /* First save what we may have and clean up */
3158 tbl[tbl_idx] = new_ce;
3159 __iommu_flush_cache(iommu, new_ce,
3169 old_ce_phys = root_entry_lctp(&re);
3171 old_ce_phys = root_entry_uctp(&re);
3174 if (ext && devfn == 0) {
3175 /* No LCTP, try UCTP */
3184 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3189 new_ce = alloc_pgtable_page(iommu->node);
3196 /* Now copy the context entry */
3197 memcpy(&ce, old_ce + idx, sizeof(ce));
3199 if (!__context_present(&ce))
3202 did = context_domain_id(&ce);
3203 if (did >= 0 && did < cap_ndoms(iommu->cap))
3204 set_bit(did, iommu->domain_ids);
3207 * We need a marker for copied context entries. This
3208 * marker needs to work for the old format as well as
3209 * for extended context entries.
3211 * Bit 67 of the context entry is used. In the old
3212 * format this bit is available to software, in the
3213 * extended format it is the PGE bit, but PGE is ignored
3214 * by HW if PASIDs are disabled (and thus still
3217 * So disable PASIDs first and then mark the entry
3218 * copied. This means that we don't copy PASID
3219 * translations from the old kernel, but this is fine as
3220 * faults there are not fatal.
3222 context_clear_pasid_enable(&ce);
3223 context_set_copied(&ce);
3228 tbl[tbl_idx + pos] = new_ce;
3230 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
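/*
 * Minimal illustrative sketch of the bit-67 "copied" marker described
 * above (not the driver's helpers): assuming the 128-bit context entry is
 * stored as two 64-bit words, lo for bits 0-63 and hi for bits 64-127,
 * bit 67 corresponds to bit 3 of the high word.
 *
 *	ce.hi |= 1ULL << 3;			// mark entry as copied
 *	copied = !!(ce.hi & (1ULL << 3));	// test the marker later
 */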
3239 static int copy_translation_tables(struct intel_iommu *iommu)
3241 struct context_entry **ctxt_tbls;
3242 struct root_entry *old_rt;
3243 phys_addr_t old_rt_phys;
3244 int ctxt_table_entries;
3245 unsigned long flags;
3250 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3251 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3252 new_ext = !!ecap_ecs(iommu->ecap);
3255 * The RTT bit can only be changed when translation is disabled,
3256 * but disabling translation would open a window for data
3257 * corruption. So bail out and don't copy anything if we would
3258 * have to change the bit.
3263 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3267 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3271 /* This is too big for the stack - allocate it from slab */
3272 ctxt_table_entries = ext ? 512 : 256;
3274 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3278 for (bus = 0; bus < 256; bus++) {
3279 ret = copy_context_table(iommu, &old_rt[bus],
3280 ctxt_tbls, bus, ext);
3282 pr_err("%s: Failed to copy context table for bus %d\n",
3288 spin_lock_irqsave(&iommu->lock, flags);
3290 /* Context tables are copied, now write them to the root_entry table */
3291 for (bus = 0; bus < 256; bus++) {
3292 int idx = ext ? bus * 2 : bus;
3295 if (ctxt_tbls[idx]) {
3296 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3297 iommu->root_entry[bus].lo = val;
3300 if (!ext || !ctxt_tbls[idx + 1])
3303 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3304 iommu->root_entry[bus].hi = val;
3307 spin_unlock_irqrestore(&iommu->lock, flags);
3311 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
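/*
 * Illustrative note on the "| 1" used when writing the root entries above
 * (hypothetical address): the low bit of a root-entry word is assumed to
 * be the present bit, with the upper bits holding the 4KiB-aligned
 * physical address of the context table, e.g.
 *
 *	val = 0x12345000ULL | 1;	// context table at 0x12345000, present
 *
 * A page-aligned address has its low 12 bits clear, so the address field
 * is left intact by setting the present bit.
 */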
3321 static int __init init_dmars(void)
3323 struct dmar_drhd_unit *drhd;
3324 struct dmar_rmrr_unit *rmrr;
3325 bool copied_tables = false;
3327 struct intel_iommu *iommu;
3333 * initialize and program root entry to not present
3336 for_each_drhd_unit(drhd) {
3338 * lock not needed as this is only incremented in the single
3339 * threaded kernel __init code path; all other accesses are read
3342 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3346 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3349 /* Preallocate enough resources for IOMMU hot-addition */
3350 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3351 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3353 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3356 pr_err("Allocating global iommu array failed\n");
3361 for_each_active_iommu(iommu, drhd) {
3363 * Find the max pasid size of all IOMMUs in the system.
3364 * We need to ensure the system pasid table is no bigger
3365 * than the smallest supported.
3367 if (pasid_supported(iommu)) {
3368 u32 temp = 2 << ecap_pss(iommu->ecap);
3370 intel_pasid_max_id = min_t(u32, temp,
3371 intel_pasid_max_id);
3374 g_iommus[iommu->seq_id] = iommu;
3376 intel_iommu_init_qi(iommu);
3378 ret = iommu_init_domains(iommu);
3382 init_translation_status(iommu);
3384 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3385 iommu_disable_translation(iommu);
3386 clear_translation_pre_enabled(iommu);
3387 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3393 * we could share the same root & context tables
3394 * among all IOMMUs. Need to split them later.
3396 ret = iommu_alloc_root_entry(iommu);
3400 if (translation_pre_enabled(iommu)) {
3401 pr_info("Translation already enabled - trying to copy translation structures\n");
3403 ret = copy_translation_tables(iommu);
3406 * We found the IOMMU with translation
3407 * enabled - but failed to copy over the
3408 * old root-entry table. Try to proceed
3409 * by disabling translation now and
3410 * allocating a clean root-entry table.
3411 * This might cause DMAR faults, but
3412 * probably the dump will still succeed.
3414 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3416 iommu_disable_translation(iommu);
3417 clear_translation_pre_enabled(iommu);
3419 pr_info("Copied translation tables from previous kernel for %s\n",
3421 copied_tables = true;
3425 if (!ecap_pass_through(iommu->ecap))
3426 hw_pass_through = 0;
3427 #ifdef CONFIG_INTEL_IOMMU_SVM
3428 if (pasid_supported(iommu))
3429 intel_svm_init(iommu);
3434 * Now that qi is enabled on all iommus, set the root entry and flush
3435 * caches. This is required on some Intel X58 chipsets, otherwise the
3436 * flush_context function will loop forever and the boot hangs.
3438 for_each_active_iommu(iommu, drhd) {
3439 iommu_flush_write_buffer(iommu);
3440 iommu_set_root_entry(iommu);
3441 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3442 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3445 if (iommu_pass_through)
3446 iommu_identity_mapping |= IDENTMAP_ALL;
3448 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3453 iommu_identity_mapping |= IDENTMAP_GFX;
3455 check_tylersburg_isoch();
3457 ret = si_domain_init(hw_pass_through);
3463 * If we copied translations from a previous kernel in the kdump
3464 * case, we cannot assign the devices to domains now, as that
3465 * would eliminate the old mappings. So skip this part and defer
3466 * the assignment to device driver initialization time.
3472 * If pass-through is not set or not enabled, set up context entries for
3473 * identity mappings for RMRR, GFX, and ISA, and fall back to static
3474 * identity mapping if iommu_identity_mapping is set.
3476 if (iommu_identity_mapping) {
3477 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3479 pr_crit("Failed to setup IOMMU pass-through\n");
3485 * for each dev attached to rmrr
3487 * locate drhd for dev, alloc domain for dev
3488 * allocate free domain
3489 * allocate page table entries for rmrr
3490 * if context not allocated for bus
3491 * allocate and init context
3492 * set present in root table for this bus
3493 * init context with domain, translation etc
3497 pr_info("Setting RMRR:\n");
3498 for_each_rmrr_units(rmrr) {
3499 /* some BIOSes list nonexistent devices in the DMAR table. */
3500 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3502 ret = iommu_prepare_rmrr_dev(rmrr, dev);
3504 pr_err("Mapping reserved region failed\n");
3508 iommu_prepare_isa();
3515 * global invalidate context cache
3516 * global invalidate iotlb
3517 * enable translation
3519 for_each_iommu(iommu, drhd) {
3520 if (drhd->ignored) {
3522 * we always have to disable PMRs or DMA may fail on
3526 iommu_disable_protect_mem_regions(iommu);
3530 iommu_flush_write_buffer(iommu);
3532 #ifdef CONFIG_INTEL_IOMMU_SVM
3533 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3535 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3536 * could cause a lock race condition, so drop the lock here.
3538 up_write(&dmar_global_lock);
3539 ret = intel_svm_enable_prq(iommu);
3540 down_write(&dmar_global_lock);
3545 ret = dmar_set_interrupt(iommu);
3553 for_each_active_iommu(iommu, drhd) {
3554 disable_dmar_iommu(iommu);
3555 free_dmar_iommu(iommu);
3564 /* This takes a number of _MM_ pages, not VTD pages */
3565 static unsigned long intel_alloc_iova(struct device *dev,
3566 struct dmar_domain *domain,
3567 unsigned long nrpages, uint64_t dma_mask)
3569 unsigned long iova_pfn;
3571 /* Restrict dma_mask to the width that the iommu can handle */
3572 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3573 /* Ensure we reserve the whole size-aligned region */
3574 nrpages = __roundup_pow_of_two(nrpages);
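	/*
	 * Worked example (hypothetical request): a 5-page request is rounded
	 * up to the next power of two here, so the allocation below reserves
	 * the whole size-aligned region:
	 *
	 *	__roundup_pow_of_two(5) == 8	// reserve 8 pages, not 5
	 */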
3576 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3578 * First try to allocate an io virtual address in
3579 * DMA_BIT_MASK(32) and if that fails then try allocating
3582 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3583 IOVA_PFN(DMA_BIT_MASK(32)), false);
3587 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3588 IOVA_PFN(dma_mask), true);
3589 if (unlikely(!iova_pfn)) {
3590 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3597 struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3599 struct dmar_domain *domain, *tmp;
3600 struct dmar_rmrr_unit *rmrr;
3601 struct device *i_dev;
3604 domain = find_domain(dev);
3608 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3612 /* We have a new domain - set up possible RMRRs for the device */
3614 for_each_rmrr_units(rmrr) {
3615 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3620 ret = domain_prepare_identity_map(dev, domain,
3624 dev_err(dev, "Mapping reserved region failed\n");
3629 tmp = set_domain_for_dev(dev, domain);
3630 if (!tmp || domain != tmp) {
3631 domain_exit(domain);
3638 dev_err(dev, "Allocating domain failed\n");
3644 /* Check if the device needs to go through the non-identity map/unmap process. */
3645 static bool iommu_need_mapping(struct device *dev)
3649 if (iommu_dummy(dev))
3652 found = identity_mapping(dev);
3654 if (iommu_should_identity_map(dev, 0))
3658 * 32 bit DMA is removed from si_domain and falls back to
3659 * non-identity mapping.
3661 dmar_remove_one_dev_info(dev);
3662 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3665 * If a 64 bit DMA device was detached from a VM, the device
3666 * is put back into si_domain for identity mapping.
3668 if (iommu_should_identity_map(dev, 0) &&
3669 !domain_add_dev_info(si_domain, dev)) {
3670 dev_info(dev, "64bit DMA uses identity mapping\n");
3678 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3679 size_t size, int dir, u64 dma_mask)
3681 struct dmar_domain *domain;
3682 phys_addr_t start_paddr;
3683 unsigned long iova_pfn;
3686 struct intel_iommu *iommu;
3687 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3689 BUG_ON(dir == DMA_NONE);
3691 domain = get_valid_domain_for_dev(dev);
3693 return DMA_MAPPING_ERROR;
3695 iommu = domain_get_iommu(domain);
3696 size = aligned_nrpages(paddr, size);
3698 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3703 * Check if DMAR supports zero-length reads on write only
3706 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3707 !cap_zlr(iommu->cap))
3708 prot |= DMA_PTE_READ;
3709 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3710 prot |= DMA_PTE_WRITE;
3712 * paddr - (paddr + size) might span a partial page, so we should map the whole
3713 * page. Note: if two parts of one page are separately mapped, we
3714 * might have two guest_addr mappings to the same host paddr, but this
3715 * is not a big problem
3717 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3718 mm_to_dma_pfn(paddr_pfn), size, prot);
3722 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3723 start_paddr += paddr & ~PAGE_MASK;
3728 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3729 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3730 size, (unsigned long long)paddr, dir);
3731 return DMA_MAPPING_ERROR;
3734 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3735 unsigned long offset, size_t size,
3736 enum dma_data_direction dir,
3737 unsigned long attrs)
3739 if (iommu_need_mapping(dev))
3740 return __intel_map_single(dev, page_to_phys(page) + offset,
3741 size, dir, *dev->dma_mask);
3742 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3745 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3746 size_t size, enum dma_data_direction dir,
3747 unsigned long attrs)
3749 if (iommu_need_mapping(dev))
3750 return __intel_map_single(dev, phys_addr, size, dir,
3752 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3755 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3757 struct dmar_domain *domain;
3758 unsigned long start_pfn, last_pfn;
3759 unsigned long nrpages;
3760 unsigned long iova_pfn;
3761 struct intel_iommu *iommu;
3762 struct page *freelist;
3763 struct pci_dev *pdev = NULL;
3765 domain = find_domain(dev);
3768 iommu = domain_get_iommu(domain);
3770 iova_pfn = IOVA_PFN(dev_addr);
3772 nrpages = aligned_nrpages(dev_addr, size);
3773 start_pfn = mm_to_dma_pfn(iova_pfn);
3774 last_pfn = start_pfn + nrpages - 1;
3776 if (dev_is_pci(dev))
3777 pdev = to_pci_dev(dev);
3779 dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3781 freelist = domain_unmap(domain, start_pfn, last_pfn);
3783 if (intel_iommu_strict || (pdev && pdev->untrusted)) {
3784 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3785 nrpages, !freelist, 0);
3787 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3788 dma_free_pagelist(freelist);
3790 queue_iova(&domain->iovad, iova_pfn, nrpages,
3791 (unsigned long)freelist);
3793 * queue up the release of the unmap to save roughly 1/6th of the
3794 * CPU time used up by the iotlb flush operation...
3799 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3800 size_t size, enum dma_data_direction dir,
3801 unsigned long attrs)
3803 if (iommu_need_mapping(dev))
3804 intel_unmap(dev, dev_addr, size);
3806 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3809 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3810 size_t size, enum dma_data_direction dir, unsigned long attrs)
3812 if (iommu_need_mapping(dev))
3813 intel_unmap(dev, dev_addr, size);
3816 static void *intel_alloc_coherent(struct device *dev, size_t size,
3817 dma_addr_t *dma_handle, gfp_t flags,
3818 unsigned long attrs)
3820 struct page *page = NULL;
3823 if (!iommu_need_mapping(dev))
3824 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3826 size = PAGE_ALIGN(size);
3827 order = get_order(size);
3829 if (gfpflags_allow_blocking(flags)) {
3830 unsigned int count = size >> PAGE_SHIFT;
3832 page = dma_alloc_from_contiguous(dev, count, order,
3833 flags & __GFP_NOWARN);
3837 page = alloc_pages(flags, order);
3840 memset(page_address(page), 0, size);
3842 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3844 dev->coherent_dma_mask);
3845 if (*dma_handle != DMA_MAPPING_ERROR)
3846 return page_address(page);
3847 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3848 __free_pages(page, order);
3853 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3854 dma_addr_t dma_handle, unsigned long attrs)
3857 struct page *page = virt_to_page(vaddr);
3859 if (!iommu_need_mapping(dev))
3860 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3862 size = PAGE_ALIGN(size);
3863 order = get_order(size);
3865 intel_unmap(dev, dma_handle, size);
3866 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3867 __free_pages(page, order);
3870 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3871 int nelems, enum dma_data_direction dir,
3872 unsigned long attrs)
3874 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3875 unsigned long nrpages = 0;
3876 struct scatterlist *sg;
3879 if (!iommu_need_mapping(dev))
3880 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3882 for_each_sg(sglist, sg, nelems, i) {
3883 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3886 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
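/*
 * Illustrative sketch of the page-count arithmetic used above and below
 * (hypothetical values): aligned_nrpages() is assumed to count how many
 * 4KiB VT-d pages a (page offset, length) pair touches, e.g.
 *
 *	offset = 0x800, length = 0x1800
 *	pages  = (0x800 + 0x1800 + VTD_PAGE_SIZE - 1) >> VTD_PAGE_SHIFT = 2
 *
 * i.e. a 6KiB buffer that starts half-way into a page still spans two
 * whole VT-d pages and needs two PTEs.
 */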
3889 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3890 enum dma_data_direction dir, unsigned long attrs)
3893 struct dmar_domain *domain;
3896 unsigned long iova_pfn;
3898 struct scatterlist *sg;
3899 unsigned long start_vpfn;
3900 struct intel_iommu *iommu;
3902 BUG_ON(dir == DMA_NONE);
3903 if (!iommu_need_mapping(dev))
3904 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3906 domain = get_valid_domain_for_dev(dev);
3910 iommu = domain_get_iommu(domain);
3912 for_each_sg(sglist, sg, nelems, i)
3913 size += aligned_nrpages(sg->offset, sg->length);
3915 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3918 sglist->dma_length = 0;
3923 * Check if DMAR supports zero-length reads on write only
3926 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3927 !cap_zlr(iommu->cap))
3928 prot |= DMA_PTE_READ;
3929 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3930 prot |= DMA_PTE_WRITE;
3932 start_vpfn = mm_to_dma_pfn(iova_pfn);
3934 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3935 if (unlikely(ret)) {
3936 dma_pte_free_pagetable(domain, start_vpfn,
3937 start_vpfn + size - 1,
3938 agaw_to_level(domain->agaw) + 1);
3939 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3946 static const struct dma_map_ops intel_dma_ops = {
3947 .alloc = intel_alloc_coherent,
3948 .free = intel_free_coherent,
3949 .map_sg = intel_map_sg,
3950 .unmap_sg = intel_unmap_sg,
3951 .map_page = intel_map_page,
3952 .unmap_page = intel_unmap_page,
3953 .map_resource = intel_map_resource,
3954 .unmap_resource = intel_unmap_resource,
3955 .dma_supported = dma_direct_supported,
3958 static inline int iommu_domain_cache_init(void)
3962 iommu_domain_cache = kmem_cache_create("iommu_domain",
3963 sizeof(struct dmar_domain),
3968 if (!iommu_domain_cache) {
3969 pr_err("Couldn't create iommu_domain cache\n");
3976 static inline int iommu_devinfo_cache_init(void)
3980 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3981 sizeof(struct device_domain_info),
3985 if (!iommu_devinfo_cache) {
3986 pr_err("Couldn't create devinfo cache\n");
3993 static int __init iommu_init_mempool(void)
3996 ret = iova_cache_get();
4000 ret = iommu_domain_cache_init();
4004 ret = iommu_devinfo_cache_init();
4008 kmem_cache_destroy(iommu_domain_cache);
4015 static void __init iommu_exit_mempool(void)
4017 kmem_cache_destroy(iommu_devinfo_cache);
4018 kmem_cache_destroy(iommu_domain_cache);
4022 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4024 struct dmar_drhd_unit *drhd;
4028 /* We know that this device on this chipset has its own IOMMU.
4029 * If we find it under a different IOMMU, then the BIOS is lying
4030 * to us. Hope that the IOMMU for this device is actually
4031 * disabled, and it needs no translation...
4033 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4035 /* "can't" happen */
4036 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4039 vtbar &= 0xffff0000;
4041 /* we know that this iommu should be at offset 0xa000 from vtbar */
4042 drhd = dmar_find_matched_drhd_unit(pdev);
4043 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4044 TAINT_FIRMWARE_WORKAROUND,
4045 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4046 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4048 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4050 static void __init init_no_remapping_devices(void)
4052 struct dmar_drhd_unit *drhd;
4056 for_each_drhd_unit(drhd) {
4057 if (!drhd->include_all) {
4058 for_each_active_dev_scope(drhd->devices,
4059 drhd->devices_cnt, i, dev)
4061 /* ignore DMAR unit if no devices exist */
4062 if (i == drhd->devices_cnt)
4067 for_each_active_drhd_unit(drhd) {
4068 if (drhd->include_all)
4071 for_each_active_dev_scope(drhd->devices,
4072 drhd->devices_cnt, i, dev)
4073 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4075 if (i < drhd->devices_cnt)
4078 /* This IOMMU has *only* gfx devices. Either bypass it or
4079 set the gfx_mapped flag, as appropriate */
4080 if (!dmar_map_gfx) {
4082 for_each_active_dev_scope(drhd->devices,
4083 drhd->devices_cnt, i, dev)
4084 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4089 #ifdef CONFIG_SUSPEND
4090 static int init_iommu_hw(void)
4092 struct dmar_drhd_unit *drhd;
4093 struct intel_iommu *iommu = NULL;
4095 for_each_active_iommu(iommu, drhd)
4097 dmar_reenable_qi(iommu);
4099 for_each_iommu(iommu, drhd) {
4100 if (drhd->ignored) {
4102 * we always have to disable PMRs or DMA may fail on
4106 iommu_disable_protect_mem_regions(iommu);
4110 iommu_flush_write_buffer(iommu);
4112 iommu_set_root_entry(iommu);
4114 iommu->flush.flush_context(iommu, 0, 0, 0,
4115 DMA_CCMD_GLOBAL_INVL);
4116 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4117 iommu_enable_translation(iommu);
4118 iommu_disable_protect_mem_regions(iommu);
4124 static void iommu_flush_all(void)
4126 struct dmar_drhd_unit *drhd;
4127 struct intel_iommu *iommu;
4129 for_each_active_iommu(iommu, drhd) {
4130 iommu->flush.flush_context(iommu, 0, 0, 0,
4131 DMA_CCMD_GLOBAL_INVL);
4132 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4133 DMA_TLB_GLOBAL_FLUSH);
4137 static int iommu_suspend(void)
4139 struct dmar_drhd_unit *drhd;
4140 struct intel_iommu *iommu = NULL;
4143 for_each_active_iommu(iommu, drhd) {
4144 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4146 if (!iommu->iommu_state)
4152 for_each_active_iommu(iommu, drhd) {
4153 iommu_disable_translation(iommu);
4155 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4157 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4158 readl(iommu->reg + DMAR_FECTL_REG);
4159 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4160 readl(iommu->reg + DMAR_FEDATA_REG);
4161 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4162 readl(iommu->reg + DMAR_FEADDR_REG);
4163 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4164 readl(iommu->reg + DMAR_FEUADDR_REG);
4166 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4171 for_each_active_iommu(iommu, drhd)
4172 kfree(iommu->iommu_state);
4177 static void iommu_resume(void)
4179 struct dmar_drhd_unit *drhd;
4180 struct intel_iommu *iommu = NULL;
4183 if (init_iommu_hw()) {
4185 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4187 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4191 for_each_active_iommu(iommu, drhd) {
4193 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4195 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4196 iommu->reg + DMAR_FECTL_REG);
4197 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4198 iommu->reg + DMAR_FEDATA_REG);
4199 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4200 iommu->reg + DMAR_FEADDR_REG);
4201 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4202 iommu->reg + DMAR_FEUADDR_REG);
4204 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4207 for_each_active_iommu(iommu, drhd)
4208 kfree(iommu->iommu_state);
4211 static struct syscore_ops iommu_syscore_ops = {
4212 .resume = iommu_resume,
4213 .suspend = iommu_suspend,
4216 static void __init init_iommu_pm_ops(void)
4218 register_syscore_ops(&iommu_syscore_ops);
4222 static inline void init_iommu_pm_ops(void) {}
4223 #endif /* CONFIG_PM */
4226 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4228 struct acpi_dmar_reserved_memory *rmrr;
4229 int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4230 struct dmar_rmrr_unit *rmrru;
4233 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4237 rmrru->hdr = header;
4238 rmrr = (struct acpi_dmar_reserved_memory *)header;
4239 rmrru->base_address = rmrr->base_address;
4240 rmrru->end_address = rmrr->end_address;
4242 length = rmrr->end_address - rmrr->base_address + 1;
4243 rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4248 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4249 ((void *)rmrr) + rmrr->header.length,
4250 &rmrru->devices_cnt);
4251 if (rmrru->devices_cnt && rmrru->devices == NULL)
4254 list_add(&rmrru->list, &dmar_rmrr_units);
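/*
 * Worked example (hypothetical RMRR): base_address = 0xdf000000 and
 * end_address = 0xdf01ffff describe an inclusive range, so
 *
 *	length = 0xdf01ffff - 0xdf000000 + 1 = 0x20000;	// 128KiB
 *
 * and the reserved region handed to the IOMMU core covers exactly those
 * 32 pages.
 */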
4265 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4267 struct dmar_atsr_unit *atsru;
4268 struct acpi_dmar_atsr *tmp;
4270 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4271 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4272 if (atsr->segment != tmp->segment)
4274 if (atsr->header.length != tmp->header.length)
4276 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4283 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4285 struct acpi_dmar_atsr *atsr;
4286 struct dmar_atsr_unit *atsru;
4288 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4291 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4292 atsru = dmar_find_atsr(atsr);
4296 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4301 * If memory is allocated from slab by ACPI _DSM method, we need to
4302 * copy the memory content because the memory buffer will be freed
4305 atsru->hdr = (void *)(atsru + 1);
4306 memcpy(atsru->hdr, hdr, hdr->length);
4307 atsru->include_all = atsr->flags & 0x1;
4308 if (!atsru->include_all) {
4309 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4310 (void *)atsr + atsr->header.length,
4311 &atsru->devices_cnt);
4312 if (atsru->devices_cnt && atsru->devices == NULL) {
4318 list_add_rcu(&atsru->list, &dmar_atsr_units);
4323 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4325 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4329 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4331 struct acpi_dmar_atsr *atsr;
4332 struct dmar_atsr_unit *atsru;
4334 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4335 atsru = dmar_find_atsr(atsr);
4337 list_del_rcu(&atsru->list);
4339 intel_iommu_free_atsr(atsru);
4345 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4349 struct acpi_dmar_atsr *atsr;
4350 struct dmar_atsr_unit *atsru;
4352 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4353 atsru = dmar_find_atsr(atsr);
4357 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4358 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4366 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4369 struct intel_iommu *iommu = dmaru->iommu;
4371 if (g_iommus[iommu->seq_id])
4374 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4375 pr_warn("%s: Doesn't support hardware pass through.\n",
4379 if (!ecap_sc_support(iommu->ecap) &&
4380 domain_update_iommu_snooping(iommu)) {
4381 pr_warn("%s: Doesn't support snooping.\n",
4385 sp = domain_update_iommu_superpage(iommu) - 1;
4386 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4387 pr_warn("%s: Doesn't support large page.\n",
4393 * Disable translation if already enabled prior to OS handover.
4395 if (iommu->gcmd & DMA_GCMD_TE)
4396 iommu_disable_translation(iommu);
4398 g_iommus[iommu->seq_id] = iommu;
4399 ret = iommu_init_domains(iommu);
4401 ret = iommu_alloc_root_entry(iommu);
4405 #ifdef CONFIG_INTEL_IOMMU_SVM
4406 if (pasid_supported(iommu))
4407 intel_svm_init(iommu);
4410 if (dmaru->ignored) {
4412 * we always have to disable PMRs or DMA may fail on this device
4415 iommu_disable_protect_mem_regions(iommu);
4419 intel_iommu_init_qi(iommu);
4420 iommu_flush_write_buffer(iommu);
4422 #ifdef CONFIG_INTEL_IOMMU_SVM
4423 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4424 ret = intel_svm_enable_prq(iommu);
4429 ret = dmar_set_interrupt(iommu);
4433 iommu_set_root_entry(iommu);
4434 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4435 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4436 iommu_enable_translation(iommu);
4438 iommu_disable_protect_mem_regions(iommu);
4442 disable_dmar_iommu(iommu);
4444 free_dmar_iommu(iommu);
4448 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4451 struct intel_iommu *iommu = dmaru->iommu;
4453 if (!intel_iommu_enabled)
4459 ret = intel_iommu_add(dmaru);
4461 disable_dmar_iommu(iommu);
4462 free_dmar_iommu(iommu);
4468 static void intel_iommu_free_dmars(void)
4470 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4471 struct dmar_atsr_unit *atsru, *atsr_n;
4473 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4474 list_del(&rmrru->list);
4475 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4480 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4481 list_del(&atsru->list);
4482 intel_iommu_free_atsr(atsru);
4486 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4489 struct pci_bus *bus;
4490 struct pci_dev *bridge = NULL;
4492 struct acpi_dmar_atsr *atsr;
4493 struct dmar_atsr_unit *atsru;
4495 dev = pci_physfn(dev);
4496 for (bus = dev->bus; bus; bus = bus->parent) {
4498 /* If it's an integrated device, allow ATS */
4501 /* Connected via non-PCIe: no ATS */
4502 if (!pci_is_pcie(bridge) ||
4503 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4505 /* If we found the root port, look it up in the ATSR */
4506 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4511 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4512 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4513 if (atsr->segment != pci_domain_nr(dev->bus))
4516 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4517 if (tmp == &bridge->dev)
4520 if (atsru->include_all)
4530 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4533 struct dmar_rmrr_unit *rmrru;
4534 struct dmar_atsr_unit *atsru;
4535 struct acpi_dmar_atsr *atsr;
4536 struct acpi_dmar_reserved_memory *rmrr;
4538 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4541 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4542 rmrr = container_of(rmrru->hdr,
4543 struct acpi_dmar_reserved_memory, header);
4544 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4545 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4546 ((void *)rmrr) + rmrr->header.length,
4547 rmrr->segment, rmrru->devices,
4548 rmrru->devices_cnt);
4551 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4552 dmar_remove_dev_scope(info, rmrr->segment,
4553 rmrru->devices, rmrru->devices_cnt);
4557 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4558 if (atsru->include_all)
4561 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4562 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4563 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4564 (void *)atsr + atsr->header.length,
4565 atsr->segment, atsru->devices,
4566 atsru->devices_cnt);
4571 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4572 if (dmar_remove_dev_scope(info, atsr->segment,
4573 atsru->devices, atsru->devices_cnt))
4582 * Here we only respond to the action of unbinding a device from its driver.
4584 * A newly added device is not attached to its DMAR domain here yet. That will happen
4585 * when mapping the device to an iova.
4587 static int device_notifier(struct notifier_block *nb,
4588 unsigned long action, void *data)
4590 struct device *dev = data;
4591 struct dmar_domain *domain;
4593 if (iommu_dummy(dev))
4596 if (action == BUS_NOTIFY_REMOVED_DEVICE) {
4597 domain = find_domain(dev);
4601 dmar_remove_one_dev_info(dev);
4602 if (!domain_type_is_vm_or_si(domain) &&
4603 list_empty(&domain->devices))
4604 domain_exit(domain);
4605 } else if (action == BUS_NOTIFY_ADD_DEVICE) {
4606 if (iommu_should_identity_map(dev, 1))
4607 domain_add_dev_info(si_domain, dev);
4613 static struct notifier_block device_nb = {
4614 .notifier_call = device_notifier,
4617 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4618 unsigned long val, void *v)
4620 struct memory_notify *mhp = v;
4621 unsigned long long start, end;
4622 unsigned long start_vpfn, last_vpfn;
4625 case MEM_GOING_ONLINE:
4626 start = mhp->start_pfn << PAGE_SHIFT;
4627 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4628 if (iommu_domain_identity_map(si_domain, start, end)) {
4629 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4636 case MEM_CANCEL_ONLINE:
4637 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4638 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4639 while (start_vpfn <= last_vpfn) {
4641 struct dmar_drhd_unit *drhd;
4642 struct intel_iommu *iommu;
4643 struct page *freelist;
4645 iova = find_iova(&si_domain->iovad, start_vpfn);
4647 pr_debug("Failed get IOVA for PFN %lx\n",
4652 iova = split_and_remove_iova(&si_domain->iovad, iova,
4653 start_vpfn, last_vpfn);
4655 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4656 start_vpfn, last_vpfn);
4660 freelist = domain_unmap(si_domain, iova->pfn_lo,
4664 for_each_active_iommu(iommu, drhd)
4665 iommu_flush_iotlb_psi(iommu, si_domain,
4666 iova->pfn_lo, iova_size(iova),
4669 dma_free_pagelist(freelist);
4671 start_vpfn = iova->pfn_hi + 1;
4672 free_iova_mem(iova);
4680 static struct notifier_block intel_iommu_memory_nb = {
4681 .notifier_call = intel_iommu_memory_notifier,
4685 static void free_all_cpu_cached_iovas(unsigned int cpu)
4689 for (i = 0; i < g_num_of_iommus; i++) {
4690 struct intel_iommu *iommu = g_iommus[i];
4691 struct dmar_domain *domain;
4697 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4698 domain = get_iommu_domain(iommu, (u16)did);
4702 free_cpu_cached_iovas(cpu, &domain->iovad);
4707 static int intel_iommu_cpu_dead(unsigned int cpu)
4709 free_all_cpu_cached_iovas(cpu);
4713 static void intel_disable_iommus(void)
4715 struct intel_iommu *iommu = NULL;
4716 struct dmar_drhd_unit *drhd;
4718 for_each_iommu(iommu, drhd)
4719 iommu_disable_translation(iommu);
4722 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4724 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4726 return container_of(iommu_dev, struct intel_iommu, iommu);
4729 static ssize_t intel_iommu_show_version(struct device *dev,
4730 struct device_attribute *attr,
4733 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4734 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4735 return sprintf(buf, "%d:%d\n",
4736 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4738 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4740 static ssize_t intel_iommu_show_address(struct device *dev,
4741 struct device_attribute *attr,
4744 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4745 return sprintf(buf, "%llx\n", iommu->reg_phys);
4747 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4749 static ssize_t intel_iommu_show_cap(struct device *dev,
4750 struct device_attribute *attr,
4753 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4754 return sprintf(buf, "%llx\n", iommu->cap);
4756 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4758 static ssize_t intel_iommu_show_ecap(struct device *dev,
4759 struct device_attribute *attr,
4762 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4763 return sprintf(buf, "%llx\n", iommu->ecap);
4765 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4767 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4768 struct device_attribute *attr,
4771 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4772 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4774 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4776 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4777 struct device_attribute *attr,
4780 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4781 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4782 cap_ndoms(iommu->cap)));
4784 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4786 static struct attribute *intel_iommu_attrs[] = {
4787 &dev_attr_version.attr,
4788 &dev_attr_address.attr,
4790 &dev_attr_ecap.attr,
4791 &dev_attr_domains_supported.attr,
4792 &dev_attr_domains_used.attr,
4796 static struct attribute_group intel_iommu_group = {
4797 .name = "intel-iommu",
4798 .attrs = intel_iommu_attrs,
4801 const struct attribute_group *intel_iommu_groups[] = {
4806 static int __init platform_optin_force_iommu(void)
4808 struct pci_dev *pdev = NULL;
4809 bool has_untrusted_dev = false;
4811 if (!dmar_platform_optin() || no_platform_optin)
4814 for_each_pci_dev(pdev) {
4815 if (pdev->untrusted) {
4816 has_untrusted_dev = true;
4821 if (!has_untrusted_dev)
4824 if (no_iommu || dmar_disabled)
4825 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4828 * If Intel-IOMMU is disabled by default, we will apply an identity
4829 * map for all devices except those marked as untrusted.
4832 iommu_identity_mapping |= IDENTMAP_ALL;
4835 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4843 int __init intel_iommu_init(void)
4846 struct dmar_drhd_unit *drhd;
4847 struct intel_iommu *iommu;
4850 * Intel IOMMU is required for a TXT/tboot launch or platform
4851 * opt in, so enforce that.
4853 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4855 if (iommu_init_mempool()) {
4857 panic("tboot: Failed to initialize iommu memory\n");
4861 down_write(&dmar_global_lock);
4862 if (dmar_table_init()) {
4864 panic("tboot: Failed to initialize DMAR table\n");
4868 if (dmar_dev_scope_init() < 0) {
4870 panic("tboot: Failed to initialize DMAR device scope\n");
4874 up_write(&dmar_global_lock);
4877 * The bus notifier takes the dmar_global_lock, so lockdep will
4878 * complain later when we register it under the lock.
4880 dmar_register_bus_notifier();
4882 down_write(&dmar_global_lock);
4884 if (no_iommu || dmar_disabled) {
4886 * We exit the function here to ensure IOMMU's remapping and
4887 * mempool aren't set up, which means that the IOMMU's PMRs
4888 * won't be disabled via the call to init_dmars(). So disable
4889 * them explicitly here. The PMRs were set up by tboot prior to
4890 * calling SENTER, but the kernel is expected to reset/tear
4893 if (intel_iommu_tboot_noforce) {
4894 for_each_iommu(iommu, drhd)
4895 iommu_disable_protect_mem_regions(iommu);
4899 * Make sure the IOMMUs are switched off, even when we
4900 * boot into a kexec kernel and the previous kernel left
4903 intel_disable_iommus();
4907 if (list_empty(&dmar_rmrr_units))
4908 pr_info("No RMRR found\n");
4910 if (list_empty(&dmar_atsr_units))
4911 pr_info("No ATSR found\n");
4913 if (dmar_init_reserved_ranges()) {
4914 if (force_on)
4915 panic("tboot: Failed to reserve iommu ranges\n");
4916 goto out_free_reserved_range;
4917 }
4919 if (dmar_map_gfx)
4920 intel_iommu_gfx_mapped = 1;
4922 init_no_remapping_devices();
4924 ret = init_dmars();
4925 if (ret) {
4926 if (force_on)
4927 panic("tboot: Failed to initialize DMARs\n");
4928 pr_err("Initialization failed\n");
4929 goto out_free_reserved_range;
4930 }
4931 up_write(&dmar_global_lock);
4933 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4934 swiotlb = 0;
4935 #endif
4936 dma_ops = &intel_dma_ops;
4938 init_iommu_pm_ops();
4940 for_each_active_iommu(iommu, drhd) {
4941 iommu_device_sysfs_add(&iommu->iommu, NULL,
4942 intel_iommu_groups,
4943 "%s", iommu->name);
4944 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4945 iommu_device_register(&iommu->iommu);
4946 }
4948 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4949 bus_register_notifier(&pci_bus_type, &device_nb);
4950 if (si_domain && !hw_pass_through)
4951 register_memory_notifier(&intel_iommu_memory_nb);
4952 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4953 intel_iommu_cpu_dead);
4955 /* Finally, we enable the DMA remapping hardware. */
4956 for_each_iommu(iommu, drhd) {
4957 if (!translation_pre_enabled(iommu))
4958 iommu_enable_translation(iommu);
4960 iommu_disable_protect_mem_regions(iommu);
4961 }
4962 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4964 intel_iommu_enabled = 1;
4965 intel_iommu_debugfs_init();
4967 return 0;
4969 out_free_reserved_range:
4970 put_iova_domain(&reserved_iova_list);
4971 out_free_dmar:
4972 intel_iommu_free_dmars();
4973 up_write(&dmar_global_lock);
4974 iommu_exit_mempool();
4975 return ret;
4976 }
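/*
 * Illustrative recap (not part of the original source): the happy path of
 * intel_iommu_init() above is roughly: parse the DMAR table, build the
 * device scope, reserve the IOAPIC/ISA IOVA ranges, program the hardware
 * in init_dmars(), register the sysfs entries, iommu ops and notifiers,
 * and finally enable translation on units that were not already enabled
 * by firmware or a previous (kexec) kernel.
 */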
4978 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4979 {
4980 struct intel_iommu *iommu = opaque;
4982 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4984 return 0;
4985 }
4987 * NB - intel-iommu lacks any sort of reference counting for the users of
4988 * dependent devices. If multiple endpoints have intersecting dependent
4989 * devices, unbinding the driver from any one of them will possibly leave
4990 * the others unable to operate.
4992 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4993 {
4994 if (!iommu || !dev || !dev_is_pci(dev))
4995 return;
4997 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4998 }
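/*
 * Illustrative note: pci_for_each_dma_alias() invokes the callback for the
 * device itself and for every requester ID it may alias to (for example a
 * conventional-PCI device behind a PCIe-to-PCI bridge), so every aliased
 * context entry is torn down, not just the one for the device's own BDF.
 */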
5000 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5001 {
5002 struct intel_iommu *iommu;
5003 unsigned long flags;
5005 assert_spin_locked(&device_domain_lock);
5007 if (WARN_ON(!info))
5008 return;
5010 iommu = info->iommu;
5012 if (info->dev) {
5013 if (dev_is_pci(info->dev) && sm_supported(iommu))
5014 intel_pasid_tear_down_entry(iommu, info->dev,
5015 PASID_RID2PASID);
5017 iommu_disable_dev_iotlb(info);
5018 domain_context_clear(iommu, info->dev);
5019 intel_pasid_free_table(info->dev);
5020 }
5022 unlink_domain_info(info);
5024 spin_lock_irqsave(&iommu->lock, flags);
5025 domain_detach_iommu(info->domain, iommu);
5026 spin_unlock_irqrestore(&iommu->lock, flags);
5028 free_devinfo_mem(info);
5029 }
5031 static void dmar_remove_one_dev_info(struct device *dev)
5032 {
5033 struct device_domain_info *info;
5034 unsigned long flags;
5036 spin_lock_irqsave(&device_domain_lock, flags);
5037 info = dev->archdata.iommu;
5038 __dmar_remove_one_dev_info(info);
5039 spin_unlock_irqrestore(&device_domain_lock, flags);
5040 }
5042 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5043 {
5044 int adjust_width;
5046 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5047 domain_reserve_special_ranges(domain);
5049 /* calculate AGAW */
5050 domain->gaw = guest_width;
5051 adjust_width = guestwidth_to_adjustwidth(guest_width);
5052 domain->agaw = width_to_agaw(adjust_width);
5054 domain->iommu_coherency = 0;
5055 domain->iommu_snooping = 0;
5056 domain->iommu_superpage = 0;
5057 domain->max_addr = 0;
5059 /* always allocate the top pgd */
5060 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5061 if (!domain->pgd)
5062 return -ENOMEM;
5063 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5064 return 0;
5065 }
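/*
 * Worked example (illustrative): each page-table level resolves 9 bits on
 * top of the 12-bit page offset, so a guest width of 48 adjusts to
 * 12 + 4*9 = 48 and yields a 4-level table, while the 57-bit default used
 * by intel_iommu_domain_alloc() below yields a 5-level table.
 */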
5067 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5068 {
5069 struct dmar_domain *dmar_domain;
5070 struct iommu_domain *domain;
5072 switch (type) {
5073 case IOMMU_DOMAIN_UNMANAGED:
5074 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
5075 if (!dmar_domain) {
5076 pr_err("Can't allocate dmar_domain\n");
5077 return NULL;
5078 }
5079 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5080 pr_err("Domain initialization failed\n");
5081 domain_exit(dmar_domain);
5082 return NULL;
5083 }
5084 domain_update_iommu_cap(dmar_domain);
5086 domain = &dmar_domain->domain;
5087 domain->geometry.aperture_start = 0;
5088 domain->geometry.aperture_end =
5089 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5090 domain->geometry.force_aperture = true;
5092 return domain;
5093 case IOMMU_DOMAIN_IDENTITY:
5094 return &si_domain->domain;
5095 default:
5096 return NULL;
5097 }
5099 return NULL;
5100 }
5102 static void intel_iommu_domain_free(struct iommu_domain *domain)
5103 {
5104 if (domain != &si_domain->domain)
5105 domain_exit(to_dmar_domain(domain));
5106 }
5108 /*
5109 * Check whether a @domain could be attached to the @dev through the
5110 * aux-domain attach/detach APIs.
5111 */
5112 static bool
5113 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5114 {
5115 struct device_domain_info *info = dev->archdata.iommu;
5117 return info && info->auxd_enabled &&
5118 domain->type == IOMMU_DOMAIN_UNMANAGED;
5119 }
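/*
 * Illustrative note: an "aux" domain is used for mediated devices; instead
 * of owning the device's context entry, the domain is attached under a
 * default PASID (see aux_domain_add_dev() below), so several domains can
 * share one physical device through PASID-tagged second-level translation.
 */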
5121 static void auxiliary_link_device(struct dmar_domain *domain,
5122 struct device *dev)
5123 {
5124 struct device_domain_info *info = dev->archdata.iommu;
5126 assert_spin_locked(&device_domain_lock);
5127 if (WARN_ON(!info))
5128 return;
5130 domain->auxd_refcnt++;
5131 list_add(&domain->auxd, &info->auxiliary_domains);
5132 }
5134 static void auxiliary_unlink_device(struct dmar_domain *domain,
5135 struct device *dev)
5136 {
5137 struct device_domain_info *info = dev->archdata.iommu;
5139 assert_spin_locked(&device_domain_lock);
5140 if (WARN_ON(!info))
5141 return;
5143 list_del(&domain->auxd);
5144 domain->auxd_refcnt--;
5146 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5147 intel_pasid_free_id(domain->default_pasid);
5148 }
5150 static int aux_domain_add_dev(struct dmar_domain *domain,
5151 struct device *dev)
5152 {
5153 int ret;
5154 u8 bus, devfn;
5155 unsigned long flags;
5156 struct intel_iommu *iommu;
5158 iommu = device_to_iommu(dev, &bus, &devfn);
5159 if (!iommu)
5160 return -ENODEV;
5162 if (domain->default_pasid <= 0) {
5163 int pasid;
5165 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5166 pci_max_pasids(to_pci_dev(dev)),
5167 GFP_KERNEL);
5168 if (pasid <= 0) {
5169 pr_err("Can't allocate default pasid\n");
5170 return -ENODEV;
5171 }
5172 domain->default_pasid = pasid;
5173 }
5175 spin_lock_irqsave(&device_domain_lock, flags);
5176 /*
5177 * iommu->lock must be held to attach domain to iommu and setup the
5178 * pasid entry for second level translation.
5179 */
5180 spin_lock(&iommu->lock);
5181 ret = domain_attach_iommu(domain, iommu);
5182 if (ret)
5183 goto attach_failed;
5185 /* Setup the PASID entry for mediated devices: */
5186 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5187 domain->default_pasid);
5188 if (ret)
5189 goto table_failed;
5190 spin_unlock(&iommu->lock);
5192 auxiliary_link_device(domain, dev);
5194 spin_unlock_irqrestore(&device_domain_lock, flags);
5196 return 0;
5198 table_failed:
5199 domain_detach_iommu(domain, iommu);
5200 attach_failed:
5201 spin_unlock(&iommu->lock);
5202 spin_unlock_irqrestore(&device_domain_lock, flags);
5203 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5204 intel_pasid_free_id(domain->default_pasid);
5206 return ret;
5207 }
5209 static void aux_domain_remove_dev(struct dmar_domain *domain,
5210 struct device *dev)
5211 {
5212 struct device_domain_info *info;
5213 struct intel_iommu *iommu;
5214 unsigned long flags;
5216 if (!is_aux_domain(dev, &domain->domain))
5217 return;
5219 spin_lock_irqsave(&device_domain_lock, flags);
5220 info = dev->archdata.iommu;
5221 iommu = info->iommu;
5223 auxiliary_unlink_device(domain, dev);
5225 spin_lock(&iommu->lock);
5226 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5227 domain_detach_iommu(domain, iommu);
5228 spin_unlock(&iommu->lock);
5230 spin_unlock_irqrestore(&device_domain_lock, flags);
5231 }
5233 static int prepare_domain_attach_device(struct iommu_domain *domain,
5234 struct device *dev)
5235 {
5236 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5237 struct intel_iommu *iommu;
5238 int addr_width;
5239 u8 bus, devfn;
5241 iommu = device_to_iommu(dev, &bus, &devfn);
5242 if (!iommu)
5243 return -ENODEV;
5245 /* check if this iommu agaw is sufficient for max mapped address */
5246 addr_width = agaw_to_width(iommu->agaw);
5247 if (addr_width > cap_mgaw(iommu->cap))
5248 addr_width = cap_mgaw(iommu->cap);
5250 if (dmar_domain->max_addr > (1LL << addr_width)) {
5251 dev_err(dev, "%s: iommu width (%d) is not "
5252 "sufficient for the mapped address (%llx)\n",
5253 __func__, addr_width, dmar_domain->max_addr);
5254 return -EFAULT;
5255 }
5256 dmar_domain->gaw = addr_width;
5258 /*
5259 * Knock out extra levels of page tables if necessary
5260 */
5261 while (iommu->agaw < dmar_domain->agaw) {
5262 struct dma_pte *pte;
5264 pte = dmar_domain->pgd;
5265 if (dma_pte_present(pte)) {
5266 dmar_domain->pgd = (struct dma_pte *)
5267 phys_to_virt(dma_pte_addr(pte));
5268 free_pgtable_page(pte);
5269 }
5270 dmar_domain->agaw--;
5271 }
5273 return 0;
5274 }
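/*
 * Worked example (illustrative): if the domain was built with a 5-level
 * table (57-bit AGAW) but this IOMMU only walks a 4-level, 48-bit AGAW,
 * the loop above frees the unused top level and decrements
 * dmar_domain->agaw so the remaining table matches what the hardware
 * can actually walk.
 */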
5276 static int intel_iommu_attach_device(struct iommu_domain *domain,
5277 struct device *dev)
5278 {
5279 int ret;
5281 if (device_is_rmrr_locked(dev)) {
5282 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5283 return -EPERM;
5284 }
5286 if (is_aux_domain(dev, domain))
5287 return -EPERM;
5289 /* normally dev is not mapped */
5290 if (unlikely(domain_context_mapped(dev))) {
5291 struct dmar_domain *old_domain;
5293 old_domain = find_domain(dev);
5294 if (old_domain) {
5295 dmar_remove_one_dev_info(dev);
5297 if (!domain_type_is_vm_or_si(old_domain) &&
5298 list_empty(&old_domain->devices))
5299 domain_exit(old_domain);
5300 }
5301 }
5303 ret = prepare_domain_attach_device(domain, dev);
5304 if (ret)
5305 return ret;
5307 return domain_add_dev_info(to_dmar_domain(domain), dev);
5308 }
5310 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5311 struct device *dev)
5312 {
5313 int ret;
5315 if (!is_aux_domain(dev, domain))
5316 return -EPERM;
5318 ret = prepare_domain_attach_device(domain, dev);
5319 if (ret)
5320 return ret;
5322 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5323 }
5325 static void intel_iommu_detach_device(struct iommu_domain *domain,
5326 struct device *dev)
5327 {
5328 dmar_remove_one_dev_info(dev);
5329 }
5331 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5332 struct device *dev)
5333 {
5334 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5335 }
5337 static int intel_iommu_map(struct iommu_domain *domain,
5338 unsigned long iova, phys_addr_t hpa,
5339 size_t size, int iommu_prot)
5340 {
5341 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5342 u64 max_addr;
5343 int prot = 0;
5344 int ret;
5346 if (iommu_prot & IOMMU_READ)
5347 prot |= DMA_PTE_READ;
5348 if (iommu_prot & IOMMU_WRITE)
5349 prot |= DMA_PTE_WRITE;
5350 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5351 prot |= DMA_PTE_SNP;
5353 max_addr = iova + size;
5354 if (dmar_domain->max_addr < max_addr) {
5355 u64 end;
5357 /* check if minimum agaw is sufficient for mapped address */
5358 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5359 if (end < max_addr) {
5360 pr_err("%s: iommu width (%d) is not "
5361 "sufficient for the mapped address (%llx)\n",
5362 __func__, dmar_domain->gaw, max_addr);
5363 return -EFAULT;
5364 }
5365 dmar_domain->max_addr = max_addr;
5366 }
5367 /* Round up size to next multiple of PAGE_SIZE, if it and
5368 the low bits of hpa would take us onto the next page */
5369 size = aligned_nrpages(hpa, size);
5370 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5371 hpa >> VTD_PAGE_SHIFT, size, prot);
5372 return ret;
5373 }
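/*
 * Worked example (illustrative): mapping hpa 0x12345800 with size 0x1000
 * touches two 4KiB pages (0x12345000 and 0x12346000), so aligned_nrpages()
 * returns 2 and both page frames are mapped, even though "size" alone
 * would suggest a single page.
 */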
5375 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5376 unsigned long iova, size_t size)
5377 {
5378 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5379 struct page *freelist = NULL;
5380 unsigned long start_pfn, last_pfn;
5381 unsigned int npages;
5382 int iommu_id, level = 0;
5384 /* Cope with horrid API which requires us to unmap more than the
5385 size argument if it happens to be a large-page mapping. */
5386 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5388 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5389 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5391 start_pfn = iova >> VTD_PAGE_SHIFT;
5392 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5394 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5396 npages = last_pfn - start_pfn + 1;
5398 for_each_domain_iommu(iommu_id, dmar_domain)
5399 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5400 start_pfn, npages, !freelist, 0);
5402 dma_free_pagelist(freelist);
5404 if (dmar_domain->max_addr == iova + size)
5405 dmar_domain->max_addr = iova;
5407 return size;
5408 }
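/*
 * Illustrative note: because of the rounding above, asking to unmap a
 * single 4KiB page that lies inside a 2MiB superpage mapping ends up
 * unmapping and flushing the whole 2MiB region; the adjusted size is
 * returned so the caller can account for it.
 */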
5410 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5411 dma_addr_t iova)
5412 {
5413 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5414 struct dma_pte *pte;
5415 int level = 0;
5416 u64 phys = 0;
5418 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5419 if (pte)
5420 phys = dma_pte_addr(pte);
5422 return phys;
5423 }
5425 static inline bool scalable_mode_support(void)
5426 {
5427 struct dmar_drhd_unit *drhd;
5428 struct intel_iommu *iommu;
5429 bool ret = true;
5431 rcu_read_lock();
5432 for_each_active_iommu(iommu, drhd) {
5433 if (!sm_supported(iommu)) {
5434 ret = false;
5435 break;
5436 }
5437 }
5438 rcu_read_unlock();
5440 return ret;
5441 }
5443 static inline bool iommu_pasid_support(void)
5444 {
5445 struct dmar_drhd_unit *drhd;
5446 struct intel_iommu *iommu;
5447 bool ret = true;
5449 rcu_read_lock();
5450 for_each_active_iommu(iommu, drhd) {
5451 if (!pasid_supported(iommu)) {
5452 ret = false;
5453 break;
5454 }
5455 }
5456 rcu_read_unlock();
5458 return ret;
5459 }
5461 static bool intel_iommu_capable(enum iommu_cap cap)
5462 {
5463 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5464 return domain_update_iommu_snooping(NULL) == 1;
5465 if (cap == IOMMU_CAP_INTR_REMAP)
5466 return irq_remapping_enabled == 1;
5468 return false;
5469 }
5471 static int intel_iommu_add_device(struct device *dev)
5472 {
5473 struct intel_iommu *iommu;
5474 struct iommu_group *group;
5475 u8 bus, devfn;
5477 iommu = device_to_iommu(dev, &bus, &devfn);
5478 if (!iommu)
5479 return -ENODEV;
5481 iommu_device_link(&iommu->iommu, dev);
5483 group = iommu_group_get_for_dev(dev);
5485 if (IS_ERR(group))
5486 return PTR_ERR(group);
5488 iommu_group_put(group);
5489 return 0;
5490 }
5492 static void intel_iommu_remove_device(struct device *dev)
5493 {
5494 struct intel_iommu *iommu;
5495 u8 bus, devfn;
5497 iommu = device_to_iommu(dev, &bus, &devfn);
5498 if (!iommu)
5499 return;
5501 iommu_group_remove_device(dev);
5503 iommu_device_unlink(&iommu->iommu, dev);
5504 }
5506 static void intel_iommu_get_resv_regions(struct device *device,
5507 struct list_head *head)
5508 {
5509 struct iommu_resv_region *reg;
5510 struct dmar_rmrr_unit *rmrr;
5511 struct device *i_dev;
5512 int i;
5514 rcu_read_lock();
5515 for_each_rmrr_units(rmrr) {
5516 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5517 i, i_dev) {
5518 if (i_dev != device)
5519 continue;
5521 list_add_tail(&rmrr->resv->list, head);
5522 }
5523 }
5524 rcu_read_unlock();
5526 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5527 if (dev_is_pci(device)) {
5528 struct pci_dev *pdev = to_pci_dev(device);
5530 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5531 reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5534 list_add_tail(&reg->list, head);
5537 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5539 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5540 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5541 0, IOMMU_RESV_MSI);
5542 if (!reg)
5543 return;
5544 list_add_tail(&reg->list, head);
5545 }
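/*
 * Illustrative note: reporting 0xfee00000-0xfeefffff as an MSI reserved
 * region makes the IOMMU core keep that window out of every domain's IOVA
 * space, since writes there are interrupt messages to the local/IO APIC
 * rather than ordinary DMA.
 */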
5547 static void intel_iommu_put_resv_regions(struct device *dev,
5548 struct list_head *head)
5549 {
5550 struct iommu_resv_region *entry, *next;
5552 list_for_each_entry_safe(entry, next, head, list) {
5553 if (entry->type == IOMMU_RESV_MSI)
5554 kfree(entry);
5555 }
5556 }
5558 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5559 {
5560 struct device_domain_info *info;
5561 struct context_entry *context;
5562 struct dmar_domain *domain;
5563 unsigned long flags;
5564 u64 ctx_lo;
5565 int ret;
5567 domain = get_valid_domain_for_dev(dev);
5568 if (!domain)
5569 return -EINVAL;
5571 spin_lock_irqsave(&device_domain_lock, flags);
5572 spin_lock(&iommu->lock);
5574 ret = -EINVAL;
5575 info = dev->archdata.iommu;
5576 if (!info || !info->pasid_supported)
5577 goto out;
5579 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5580 if (WARN_ON(!context))
5581 goto out;
5583 ctx_lo = context[0].lo;
5585 if (!(ctx_lo & CONTEXT_PASIDE)) {
5586 ctx_lo |= CONTEXT_PASIDE;
5587 context[0].lo = ctx_lo;
5588 wmb();
5589 iommu->flush.flush_context(iommu,
5590 domain->iommu_did[iommu->seq_id],
5591 PCI_DEVID(info->bus, info->devfn),
5592 DMA_CCMD_MASK_NOBIT,
5593 DMA_CCMD_DEVICE_INVL);
5594 }
5596 /* Enable PASID support in the device, if it wasn't already */
5597 if (!info->pasid_enabled)
5598 iommu_enable_dev_iotlb(info);
5600 ret = 0;
5602 out:
5603 spin_unlock(&iommu->lock);
5604 spin_unlock_irqrestore(&device_domain_lock, flags);
5606 return ret;
5607 }
5609 static void intel_iommu_apply_resv_region(struct device *dev,
5610 struct iommu_domain *domain,
5611 struct iommu_resv_region *region)
5613 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5614 unsigned long start, end;
5616 start = IOVA_PFN(region->start);
5617 end = IOVA_PFN(region->start + region->length - 1);
5619 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5622 #ifdef CONFIG_INTEL_IOMMU_SVM
5623 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5624 {
5625 struct intel_iommu *iommu;
5626 u8 bus, devfn;
5628 if (iommu_dummy(dev)) {
5629 dev_warn(dev,
5630 "No IOMMU translation for device; cannot enable SVM\n");
5631 return NULL;
5632 }
5634 iommu = device_to_iommu(dev, &bus, &devfn);
5635 if ((!iommu)) {
5636 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5637 return NULL;
5638 }
5640 return iommu;
5641 }
5642 #endif /* CONFIG_INTEL_IOMMU_SVM */
5644 static int intel_iommu_enable_auxd(struct device *dev)
5645 {
5646 struct device_domain_info *info;
5647 struct intel_iommu *iommu;
5648 unsigned long flags;
5649 u8 bus, devfn;
5650 int ret;
5652 iommu = device_to_iommu(dev, &bus, &devfn);
5653 if (!iommu || dmar_disabled)
5654 return -EINVAL;
5656 if (!sm_supported(iommu) || !pasid_supported(iommu))
5657 return -EINVAL;
5659 ret = intel_iommu_enable_pasid(iommu, dev);
5660 if (ret)
5661 return -ENODEV;
5663 spin_lock_irqsave(&device_domain_lock, flags);
5664 info = dev->archdata.iommu;
5665 info->auxd_enabled = 1;
5666 spin_unlock_irqrestore(&device_domain_lock, flags);
5668 return 0;
5669 }
5671 static int intel_iommu_disable_auxd(struct device *dev)
5672 {
5673 struct device_domain_info *info;
5674 unsigned long flags;
5676 spin_lock_irqsave(&device_domain_lock, flags);
5677 info = dev->archdata.iommu;
5678 if (!WARN_ON(!info))
5679 info->auxd_enabled = 0;
5680 spin_unlock_irqrestore(&device_domain_lock, flags);
5682 return 0;
5683 }
5685 /*
5686 * A PCI Express designated vendor specific extended capability (DVSEC) is
5687 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5688 * spec so that system software and tools can detect endpoint devices that
5689 * support Intel Scalable IOV without a host driver dependency.
5690 *
5691 * Returns the address of the matching extended capability structure within
5692 * the device's PCI configuration space or 0 if the device does not support
5693 * it.
5694 */
5695 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5696 {
5697 int pos;
5698 u16 vendor, id;
5700 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5701 while (pos) {
5702 pci_read_config_word(pdev, pos + 4, &vendor);
5703 pci_read_config_word(pdev, pos + 8, &id);
5704 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5705 return pos;
5707 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5708 }
5710 return 0;
5711 }
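/*
 * Illustrative sketch of the structure probed above (per the PCIe DVSEC
 * definition): extended capability ID 0x23 is "Designated Vendor-Specific";
 * the word at offset +4 carries the DVSEC vendor ID and the word at offset
 * +8 carries the DVSEC ID. Scalable IOV endpoints advertise vendor 0x8086
 * (PCI_VENDOR_ID_INTEL) with DVSEC ID 5, which is what the loop matches.
 */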
5713 static bool
5714 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5715 {
5716 if (feat == IOMMU_DEV_FEAT_AUX) {
5717 int ret;
5719 if (!dev_is_pci(dev) || dmar_disabled ||
5720 !scalable_mode_support() || !iommu_pasid_support())
5721 return false;
5723 ret = pci_pasid_features(to_pci_dev(dev));
5724 if (ret < 0)
5725 return false;
5727 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5728 }
5730 return false;
5731 }
5733 static int
5734 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5735 {
5736 if (feat == IOMMU_DEV_FEAT_AUX)
5737 return intel_iommu_enable_auxd(dev);
5739 return -ENODEV;
5740 }
5742 static int
5743 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5744 {
5745 if (feat == IOMMU_DEV_FEAT_AUX)
5746 return intel_iommu_disable_auxd(dev);
5748 return -ENODEV;
5749 }
5751 static bool
5752 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5753 {
5754 struct device_domain_info *info = dev->archdata.iommu;
5756 if (feat == IOMMU_DEV_FEAT_AUX)
5757 return scalable_mode_support() && info && info->auxd_enabled;
5759 return false;
5760 }
5762 static int
5763 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5764 {
5765 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5767 return dmar_domain->default_pasid > 0 ?
5768 dmar_domain->default_pasid : -EINVAL;
5769 }
5771 const struct iommu_ops intel_iommu_ops = {
5772 .capable = intel_iommu_capable,
5773 .domain_alloc = intel_iommu_domain_alloc,
5774 .domain_free = intel_iommu_domain_free,
5775 .attach_dev = intel_iommu_attach_device,
5776 .detach_dev = intel_iommu_detach_device,
5777 .aux_attach_dev = intel_iommu_aux_attach_device,
5778 .aux_detach_dev = intel_iommu_aux_detach_device,
5779 .aux_get_pasid = intel_iommu_aux_get_pasid,
5780 .map = intel_iommu_map,
5781 .unmap = intel_iommu_unmap,
5782 .iova_to_phys = intel_iommu_iova_to_phys,
5783 .add_device = intel_iommu_add_device,
5784 .remove_device = intel_iommu_remove_device,
5785 .get_resv_regions = intel_iommu_get_resv_regions,
5786 .put_resv_regions = intel_iommu_put_resv_regions,
5787 .apply_resv_region = intel_iommu_apply_resv_region,
5788 .device_group = pci_device_group,
5789 .dev_has_feat = intel_iommu_dev_has_feat,
5790 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
5791 .dev_enable_feat = intel_iommu_dev_enable_feat,
5792 .dev_disable_feat = intel_iommu_dev_disable_feat,
5793 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
5794 };
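/*
 * Illustrative usage sketch (assumes the generic IOMMU core wrappers of
 * this era, not code from this file): a mediated-device driver would
 * typically check IOMMU_DEV_FEAT_AUX with iommu_dev_has_feature(), turn it
 * on with iommu_dev_enable_feature(), attach an unmanaged domain with
 * iommu_aux_attach_device(), and finally query the PASID to program into
 * the parent device with iommu_aux_get_pasid(); each call lands in the
 * corresponding intel_iommu_* op above.
 */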
5796 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5797 {
5798 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5799 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5800 dmar_map_gfx = 0;
5801 }
5803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5811 static void quirk_iommu_rwbf(struct pci_dev *dev)
5812 {
5813 /*
5814 * Mobile 4 Series Chipset neglects to set RWBF capability,
5815 * but needs it. Same seems to hold for the desktop versions.
5816 */
5817 pci_info(dev, "Forcing write-buffer flush capability\n");
5818 rwbf_quirk = 1;
5819 }
5821 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5822 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5823 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5824 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5825 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5826 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5827 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5830 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
5831 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
5832 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
5833 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
5834 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
5835 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
5836 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
5837 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
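/*
 * Illustrative note (hedged interpretation): GGC is the graphics control
 * register on these host bridges, and the masks above appear to decode the
 * GTT stolen-memory size field in bits 11:8; the "VT" encodings (bit 11
 * set) indicate that the BIOS also carved out a shadow GTT for use with
 * VT-d, which is what the quirk below tests via GGC_MEMORY_VT_ENABLED.
 */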
5839 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5840 {
5841 unsigned short ggc;
5843 if (pci_read_config_word(dev, GGC, &ggc))
5844 return;
5846 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5847 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5848 dmar_map_gfx = 0;
5849 } else if (dmar_map_gfx) {
5850 /* we have to ensure the gfx device is idle before we flush */
5851 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5852 intel_iommu_strict = 1;
5853 }
5854 }
5855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5856 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5857 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5858 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5860 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5861 ISOCH DMAR unit for the Azalia sound device, but not give it any
5862 TLB entries, which causes it to deadlock. Check for that. We do
5863 this in a function called from init_dmars(), instead of in a PCI
5864 quirk, because we don't want to print the obnoxious "BIOS broken"
5865 message if VT-d is actually disabled.
5866 */
5867 static void __init check_tylersburg_isoch(void)
5868 {
5869 struct pci_dev *pdev;
5870 uint32_t vtisochctrl;
5872 /* If there's no Azalia in the system anyway, forget it. */
5873 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5874 if (!pdev)
5875 return;
5876 pci_dev_put(pdev);
5878 /* System Management Registers. Might be hidden, in which case
5879 we can't do the sanity check. But that's OK, because the
5880 known-broken BIOSes _don't_ actually hide it, so far. */
5881 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5882 if (!pdev)
5883 return;
5885 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5886 pci_dev_put(pdev);
5887 return;
5888 }
5890 pci_dev_put(pdev);
5892 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5893 if (vtisochctrl & 1)
5894 return;
5896 /* Drop all bits other than the number of TLB entries */
5897 vtisochctrl &= 0x1c;
5899 /* If we have the recommended number of TLB entries (16), fine. */
5900 if (vtisochctrl == 0x10)
5901 return;
5903 /* Zero TLB entries? You get to ride the short bus to school. */
5904 if (!vtisochctrl) {
5905 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5906 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5907 dmi_get_system_info(DMI_BIOS_VENDOR),
5908 dmi_get_system_info(DMI_BIOS_VERSION),
5909 dmi_get_system_info(DMI_PRODUCT_VERSION));
5910 iommu_identity_mapping |= IDENTMAP_AZALIA;
5911 return;
5912 }
5914 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5915 vtisochctrl);
5916 }