/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 */
#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#include "irq_remapping.h"
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
#define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
#define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))
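
/*
 * Worked example (illustrative, not part of the original source): with the
 * default 48-bit guest address width, __DOMAIN_MAX_PFN(48) is
 * (1 << (48 - 12)) - 1 = 2^36 - 1, the last 4KiB page frame a 48-bit DMA
 * address space can name, and DOMAIN_MAX_ADDR(48) shifts that back up to
 * 0xFFFFFFFFF000.  On a 64-bit kernel the min_t() clamp in DOMAIN_MAX_PFN
 * is a no-op; on 32-bit it caps the PFN at ULONG_MAX so PFN arithmetic
 * stays within 'unsigned long'.
 */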
/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is a power-of-two multiple of 4KiB and that
 * the mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are a power-of-two multiple of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
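
/*
 * Illustrative note (not part of the original source): the pgsize bitmap
 * handed to the IOMMU core uses bit N to mean "2^N-byte pages supported".
 * ~0xFFFUL clears bits 0-11 and sets every bit from 12 up, i.e. it
 * advertises 4KiB, 8KiB, 16KiB, ... so the core never has to split a
 * naturally aligned power-of-two mapping before handing it to us.
 */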
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}
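
/*
 * Worked example (illustrative, not part of the original source): AGAW 2
 * gives agaw_to_width(2) = 30 + 2*9 = 48 bits and agaw_to_level(2) = 4,
 * i.e. a four-level page table.  For a DMA PFN at level 2,
 * pfn_level_offset() uses bits 9-17 of the PFN as the index into that
 * level's 512-entry table, and lvl_to_nr_pages(2) = 512 is the number of
 * 4KiB pages one level-2 entry spans (a 2MiB superpage).
 */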
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
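
/*
 * Illustrative note (not part of the original source): on x86 both
 * PAGE_SHIFT and VTD_PAGE_SHIFT are 12, so these conversions are no-op
 * shifts and a MM PFN equals a DMA PFN.  They only change anything when
 * the CPU page size is larger than the 4KiB VT-d page; e.g. a 16KiB-page
 * kernel would map each MM PFN to four consecutive DMA PFNs.
 */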
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if we can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
/*
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
	u64	val;
	u64	rsvd1;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
static inline bool root_present(struct root_entry *root)
{
	return (root->val & 1);
}
static inline void set_root_present(struct root_entry *root)
{
	root->val |= 1;
}
static inline void set_root_value(struct root_entry *root, unsigned long value)
{
	root->val |= value & VTD_PAGE_MASK;
}

static inline struct context_entry *
get_context_addr_from_root(struct root_entry *root)
{
	return (struct context_entry *)
		(root_present(root) ?
		 phys_to_virt(root->val & VTD_PAGE_MASK) : NULL);
}
/*
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 */
struct context_entry {
	u64 lo;
	u64 hi;
};

static inline bool context_present(struct context_entry *context)
{
	return (context->lo & 1);
}
static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
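
/*
 * Worked example (illustrative, not part of the original source): mapping
 * a device with a 4-level page table (AGAW 2) into domain 5 ends up with
 * context->lo = pgd_phys | (translation type << 2) | 1 (present), and
 * context->hi = (5 << 8) | 2, i.e. the domain id in bits 8-23 and the
 * address width in bits 0-2, built up via the setters above.
 */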
/*
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
	u64 val;
};

static inline void dma_clear_pte(struct dma_pte *pte)
{
	pte->val = 0;
}

static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
	return pte->val & VTD_PAGE_MASK;
#else
	/* Must have a full atomic 64-bit read */
	return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

static inline bool dma_pte_present(struct dma_pte *pte)
{
	return (pte->val & 3) != 0;
}

static inline bool dma_pte_superpage(struct dma_pte *pte)
{
	return (pte->val & (1 << 7));
}

static inline int first_pte_in_page(struct dma_pte *pte)
{
	return !((unsigned long)pte & ~VTD_PAGE_MASK);
}
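
/*
 * Illustrative note (not part of the original source): page-table pages
 * are 4KiB and naturally aligned, so a PTE sits at the start of its page
 * exactly when the low 12 bits of its address are zero.  The walkers
 * below use first_pte_in_page() on an incremented PTE pointer to notice
 * when they have stepped off the end of one 512-entry table and must
 * re-walk from the top.
 */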
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/* devices under the same p2p bridge are owned in one domain */
#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)

/* domain represents a virtual machine, more than one device
 * across iommus may be owned in one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 1)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 2)

/* define the limit of IOMMUs supported in each domain */
#ifdef	CONFIG_X86
# define	IOMMU_UNITS_SUPPORTED	MAX_IO_APICS
#else
# define	IOMMU_UNITS_SUPPORTED	64
#endif
struct dmar_domain {
	int	id;			/* domain id */
	int	nid;			/* node id */
	DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
					/* bitmap of iommus this domain uses */

	struct list_head devices;	/* all devices' list */
	struct iova_domain iovad;	/* iova's that belong to this domain */

	struct dma_pte	*pgd;		/* virtual address */
	int		gaw;		/* max guest address width */

	/* adjusted guest address width, 0 is level 2 30-bit */
	int		agaw;

	int		flags;		/* flags to find out type of domain */

	int		iommu_coherency;/* indicate coherency of iommu access */
	int		iommu_snooping; /* indicate snooping control feature */
	int		iommu_count;	/* reference count of iommu */
	int		iommu_superpage;/* Level of superpages supported:
					   0 == 4KiB (no superpages), 1 == 2MiB,
					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
	spinlock_t	iommu_lock;	/* protect iommu set in domain */
	u64		max_addr;	/* maximum mapped address */
};
/* PCI domain-device relationship */
struct device_domain_info {
	struct list_head link;	/* link to domain siblings */
	struct list_head global; /* link to global list */
	u8 bus;			/* PCI bus number */
	u8 devfn;		/* PCI devfn number */
	struct device *dev;	/* it's NULL for PCIe-to-PCI bridge */
	struct intel_iommu *iommu; /* IOMMU used by this device */
	struct dmar_domain *domain; /* pointer to domain */
};

struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	u64	base_address;		/* reserved base address */
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

static void flush_unmaps_timeout(unsigned long data);

static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);

#define HIGH_WATER_MARK 250
struct deferred_flush_tables {
	int next;
	struct iova *iova[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
	struct page *freelist[HIGH_WATER_MARK];
};

static struct deferred_flush_tables *deferred_flush;

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

static int timer_on;
static long list_size;
static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void domain_remove_one_dev_info(struct dmar_domain *domain,
				       struct pci_dev *pdev);
static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
					   struct device *dev);

#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

static struct iommu_ops intel_iommu_ops;
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			printk(KERN_INFO "Intel-IOMMU: enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			printk(KERN_INFO "Intel-IOMMU: disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			printk(KERN_INFO
				"Intel-IOMMU: disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			printk(KERN_INFO
				"Intel-IOMMU: Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable supported super page\n");
			intel_iommu_superpage = 0;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
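
/*
 * Usage note (illustrative, not part of the original source): these
 * options are comma-separated on the kernel command line, e.g.
 *
 *	intel_iommu=on,strict,sp_off
 *
 * which enables the IOMMU, disables batched (lazy) IOTLB flushing, and
 * turns off superpage use.
 */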
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;

static inline void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

struct iova *alloc_iova_mem(void)
{
	return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
}

void free_iova_mem(struct iova *iova)
{
	kmem_cache_free(iommu_iova_cache, iova);
}
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, so use a default agaw and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
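
/*
 * Worked example (illustrative, not part of the original source): for a
 * unit whose SAGAW capability field reports 3- and 4-level support (bits
 * 1 and 2 set), the default 48-bit width gives width_to_agaw(48) = 2;
 * bit 2 is set, so AGAW 2 (4-level) is chosen.  If only bit 1 were set,
 * the loop would step down and settle on AGAW 1, a 39-bit 3-level table.
 */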
/* This function returns the single iommu in a domain */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
	BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);

	iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}
static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int i, found = 0;

	domain->iommu_coherency = 1;

	for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
		found = 1;
		if (!ecap_coherent(g_iommus[i]->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!ecap_coherent(iommu->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	rcu_read_unlock();
}
static void domain_update_iommu_snooping(struct dmar_domain *domain)
{
	int i;

	domain->iommu_snooping = 1;

	for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
		if (!ecap_sc_support(g_iommus[i]->ecap)) {
			domain->iommu_snooping = 0;
			break;
		}
	}
}
static void domain_update_iommu_superpage(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	int mask = 0xf;

	if (!intel_iommu_superpage) {
		domain->iommu_superpage = 0;
		return;
	}

	/* set iommu_superpage to the smallest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		mask &= cap_super_page_val(iommu->cap);
		if (!mask)
			break;
	}
	rcu_read_unlock();

	domain->iommu_superpage = fls(mask);
}

/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain_update_iommu_snooping(domain);
	domain_update_iommu_superpage(domain);
}
static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct intel_iommu *iommu;
	struct device *dev;
	struct pci_dev *pdev;
	int i;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev) {
			if (!dev_is_pci(dev))
				continue;
			pdev = to_pci_dev(dev);
			if (pdev->bus->number == bus && pdev->devfn == devfn)
				goto out;
			if (pdev->subordinate &&
			    pdev->subordinate->number <= bus &&
			    pdev->subordinate->busn_res.end >= bus)
				goto out;
		}

		if (drhd->include_all)
			goto out;
	}
	iommu = NULL;
out:
	rcu_read_unlock();

	return iommu;
}
static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}
/* Gets context entry for a given bus and devfn */
static struct context_entry *device_to_context_entry(struct intel_iommu *iommu,
						     u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long phy_addr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		context = (struct context_entry *)
				alloc_pgtable_page(iommu->node);
		if (!context) {
			spin_unlock_irqrestore(&iommu->lock, flags);
			return NULL;
		}
		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		set_root_value(root, phy_addr);
		set_root_present(root);
		__iommu_flush_cache(iommu, root, sizeof(*root));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
	return &context[devfn];
}
static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (context)
		ret = context_present(&context[devfn]);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}
static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (context) {
		context_clear_entry(&context[devfn]);
		__iommu_flush_cache(iommu, &context[devfn],
				    sizeof(*context));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}
static void free_context_table(struct intel_iommu *iommu)
{
	struct root_entry *root;
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry)
		goto out;

	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		root = &iommu->root_entry[i];
		context = get_context_addr_from_root(root);
		if (context)
			free_pgtable_page(context);
	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (addr_width < BITS_PER_LONG && pfn >> addr_width)
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);
			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}
/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (pte->val & DMA_PTE_LARGE_PAGE) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}
/* clear last level pte; should be followed by a TLB flush */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	unsigned int large_page = 1;
	struct dma_pte *first_pte, *pte;

	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       struct dma_pte *pte, unsigned long pfn,
			       unsigned long start_pfn, unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level - 1);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2)
			dma_pte_free_level(domain, level - 1, level_pte,
					   level_pfn, start_pfn, last_pfn);

		/* If range covers entire pagetable, free it */
		if (!(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
/* free page table pages. last level pte should already be cleared */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
	BUG_ON(start_pfn > last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw),
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
					    int level, struct dma_pte *pte,
					    struct page *freelist)
{
	struct page *pg;

	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	pg->freelist = freelist;
	freelist = pg;

	if (level == 1)
		return freelist;

	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			freelist = dma_pte_list_pagetables(domain, level - 1,
							   pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));

	return freelist;
}
static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
					struct dma_pte *pte, unsigned long pfn,
					unsigned long start_pfn,
					unsigned long last_pfn,
					struct page *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;

		if (!dma_pte_present(pte))
			goto next;

		level_pfn = pfn & level_mask(level);

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away
			   entirely. Don't bother to clear them; we're just
			   going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			freelist = dma_pte_clear_level(domain, level - 1,
						       phys_to_virt(dma_pte_addr(pte)),
						       level_pfn, start_pfn, last_pfn,
						       freelist);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);

	return freelist;
}
/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
struct page *domain_unmap(struct dmar_domain *domain,
			  unsigned long start_pfn,
			  unsigned long last_pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	struct page *freelist = NULL;

	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
				       domain->pgd, 0, start_pfn, last_pfn, NULL);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		pgd_page->freelist = freelist;
		freelist = pgd_page;

		domain->pgd = NULL;
	}

	return freelist;
}

void dma_free_pagelist(struct page *freelist)
{
	struct page *pg;

	while ((pg = freelist)) {
		freelist = pg->freelist;
		free_pgtable_page(page_address(pg));
	}
}
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
	if (!root)
		return -ENOMEM;

	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	void *addr;
	u32 sts;
	unsigned long flag;

	addr = iommu->root_entry;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* return value determines if we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		      dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* return value determines if we need a write buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		      dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
			 (unsigned long long)DMA_TLB_IIRG(type),
			 (unsigned long long)DMA_TLB_IAIG(val));
}
static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
			u8 bus, u8 devfn)
{
	int found = 0;
	unsigned long flags;
	struct device_domain_info *info;
	struct pci_dev *pdev;

	if (!ecap_dev_iotlb_support(iommu->ecap))
		return NULL;

	if (!iommu->qi)
		return NULL;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link)
		if (info->bus == bus && info->devfn == devfn) {
			found = 1;
			break;
		}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (!found || !info->dev || !dev_is_pci(info->dev))
		return NULL;

	pdev = to_pci_dev(info->dev);

	if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
		return NULL;

	if (!dmar_find_matched_atsr_unit(pdev))
		return NULL;

	return info;
}
static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
	if (!info || !dev_is_pci(info->dev))
		return;

	pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
}

static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
	if (!info->dev || !dev_is_pci(info->dev) ||
	    !pci_ats_enabled(to_pci_dev(info->dev)))
		return;

	pci_disable_ats(to_pci_dev(info->dev));
}

static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	u16 sid, qdep;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		struct pci_dev *pdev;
		if (!info->dev || !dev_is_pci(info->dev))
			continue;

		pdev = to_pci_dev(info->dev);
		if (!pci_ats_enabled(pdev))
			continue;

		sid = info->bus << 8 | info->devfn;
		qdep = pci_ats_queue_depth(pdev);
		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
				  unsigned long pfn, unsigned int pages, int ih, int map)
{
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;

	BUG_ON(pages == 0);

	if (ih)
		ih = 1 << 6;
	/*
	 * Fall back to domain-selective flush if no PSI support or the size is
	 * too big.
	 * PSI requires page size to be 2 ^ x, and the base address is naturally
	 * aligned to the size.
	 */
	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
		iommu->flush.flush_iotlb(iommu, did, 0, 0,
						DMA_TLB_DSI_FLUSH);
	else
		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
						DMA_TLB_PSI_FLUSH);

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
}
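
/*
 * Worked example (illustrative, not part of the original source): flushing
 * 5 pages gives mask = ilog2(roundup_pow_of_two(5)) = ilog2(8) = 3, i.e. a
 * page-selective invalidation covering 2^3 = 8 pages whose base address
 * must be 8-page (32KiB) aligned; if the unit's max address mask value
 * (MAMV) is smaller than 3, the code falls back to a domain-selective
 * flush instead.
 */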
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		      readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
static int iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
	return 0;
}

static int iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	return 0;
}
static int iommu_init_domains(struct intel_iommu *iommu)
{
	unsigned long ndomains;
	unsigned long nlongs;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
		 iommu->seq_id, ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	spin_lock_init(&iommu->lock);

	/* TBD: there might be 64K domains,
	 * consider other allocation for future chip
	 */
	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		pr_err("IOMMU%d: allocating domain id array failed\n",
		       iommu->seq_id);
		return -ENOMEM;
	}
	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
				 GFP_KERNEL);
	if (!iommu->domains) {
		pr_err("IOMMU%d: allocating domain array failed\n",
		       iommu->seq_id);
		kfree(iommu->domain_ids);
		iommu->domain_ids = NULL;
		return -ENOMEM;
	}

	/*
	 * if Caching mode is set, then invalid translations are tagged
	 * with domainid 0. Hence we need to pre-allocate it.
	 */
	if (cap_caching_mode(iommu->cap))
		set_bit(0, iommu->domain_ids);
	return 0;
}
static void free_dmar_iommu(struct intel_iommu *iommu)
{
	struct dmar_domain *domain;
	int i, count;
	unsigned long flags;

	if ((iommu->domains) && (iommu->domain_ids)) {
		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
			/*
			 * Domain id 0 is reserved for invalid translation
			 * if hardware supports caching mode.
			 */
			if (cap_caching_mode(iommu->cap) && i == 0)
				continue;

			domain = iommu->domains[i];
			clear_bit(i, iommu->domain_ids);

			spin_lock_irqsave(&domain->iommu_lock, flags);
			count = --domain->iommu_count;
			spin_unlock_irqrestore(&domain->iommu_lock, flags);
			if (count == 0)
				domain_exit(domain);
		}
	}

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	kfree(iommu->domains);
	kfree(iommu->domain_ids);
	iommu->domains = NULL;
	iommu->domain_ids = NULL;

	g_iommus[iommu->seq_id] = NULL;

	/* free context mapping */
	free_context_table(iommu);
}
static struct dmar_domain *alloc_domain(bool vm)
{
	/* domain id for virtual machine, it won't be set in context */
	static atomic_t vm_domid = ATOMIC_INIT(0);
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	domain->nid = -1;
	domain->iommu_count = 0;
	memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
	domain->flags = 0;
	spin_lock_init(&domain->iommu_lock);
	INIT_LIST_HEAD(&domain->devices);
	if (vm) {
		domain->id = atomic_inc_return(&vm_domid);
		domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
	}

	return domain;
}
static int iommu_attach_domain(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num;
	unsigned long ndomains;
	unsigned long flags;

	ndomains = cap_ndoms(iommu->cap);

	spin_lock_irqsave(&iommu->lock, flags);

	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		printk(KERN_ERR "IOMMU: no free domain ids\n");
		return -ENOMEM;
	}

	domain->id = num;
	domain->iommu_count++;
	set_bit(num, iommu->domain_ids);
	set_bit(iommu->seq_id, domain->iommu_bmp);
	iommu->domains[num] = domain;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static void iommu_detach_domain(struct dmar_domain *domain,
				struct intel_iommu *iommu)
{
	unsigned long flags;
	int num, ndomains;

	spin_lock_irqsave(&iommu->lock, flags);
	ndomains = cap_ndoms(iommu->cap);
	for_each_set_bit(num, iommu->domain_ids, ndomains) {
		if (iommu->domains[num] == domain) {
			clear_bit(num, iommu->domain_ids);
			iommu->domains[num] = NULL;
			break;
		}
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}
static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_rbtree_key;

static int dmar_init_reserved_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *iova;
	int i;

	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);

	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
			  &reserved_rbtree_key);

	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
			    IOVA_PFN(IOAPIC_RANGE_END));
	if (!iova) {
		printk(KERN_ERR "Reserve IOAPIC range failed\n");
		return -ENODEV;
	}

	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		struct resource *r;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
				continue;
			iova = reserve_iova(&reserved_iova_list,
					    IOVA_PFN(r->start),
					    IOVA_PFN(r->end));
			if (!iova) {
				printk(KERN_ERR "Reserve iova failed\n");
				return -ENODEV;
			}
		}
	}
	return 0;
}

static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}
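
/*
 * Worked example (illustrative, not part of the original source): a guest
 * width of 48 gives r = (48 - 12) % 9 = 0, so the width is already a whole
 * number of 9-bit levels above the 12-bit page offset and is kept as-is.
 * A width of 40 gives r = 1, so it is rounded up to 40 + 9 - 1 = 48 bits,
 * the next width the page-table format can actually express.
 */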
static int domain_init(struct dmar_domain *domain, int guest_width)
{
	struct intel_iommu *iommu;
	int adjust_width, agaw;
	unsigned long sagaw;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	iommu = domain_get_iommu(domain);
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;

	if (ecap_coherent(iommu->ecap))
		domain->iommu_coherency = 1;
	else
		domain->iommu_coherency = 0;

	if (ecap_sc_support(iommu->ecap))
		domain->iommu_snooping = 1;
	else
		domain->iommu_snooping = 0;

	if (intel_iommu_superpage)
		domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
	else
		domain->iommu_superpage = 0;

	domain->nid = iommu->node;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
	return 0;
}
static void domain_exit(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	struct page *freelist = NULL;

	/* Domain 0 is reserved, so dont process it */
	if (!domain)
		return;

	/* Flush any lazy unmaps that may reference this domain */
	if (!intel_iommu_strict)
		flush_unmaps_timeout(0);

	/* remove associated devices */
	domain_remove_dev_info(domain);

	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	/* clear attached or cached domains */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd)
		if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
		    test_bit(iommu->seq_id, domain->iommu_bmp))
			iommu_detach_domain(domain, iommu);
	rcu_read_unlock();

	dma_free_pagelist(freelist);

	free_domain_mem(domain);
}
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      u8 bus, u8 devfn, int translation)
{
	struct context_entry *context;
	unsigned long flags;
	struct dma_pte *pgd;
	unsigned long num;
	unsigned long ndomains;
	int id;
	int agaw;
	struct device_domain_info *info = NULL;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);
	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
	       translation != CONTEXT_TT_MULTI_LEVEL);

	context = device_to_context_entry(iommu, bus, devfn);
	if (!context)
		return -ENOMEM;
	spin_lock_irqsave(&iommu->lock, flags);
	if (context_present(context)) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		return 0;
	}

	id = domain->id;
	pgd = domain->pgd;

	if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
	    domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
		int found = 0;

		/* find an available domain id for this device in iommu */
		ndomains = cap_ndoms(iommu->cap);
		for_each_set_bit(num, iommu->domain_ids, ndomains) {
			if (iommu->domains[num] == domain) {
				id = num;
				found = 1;
				break;
			}
		}

		if (found == 0) {
			num = find_first_zero_bit(iommu->domain_ids, ndomains);
			if (num >= ndomains) {
				spin_unlock_irqrestore(&iommu->lock, flags);
				printk(KERN_ERR "IOMMU: no free domain ids\n");
				return -EFAULT;
			}

			set_bit(num, iommu->domain_ids);
			iommu->domains[num] = domain;
			id = num;
		}

		/* Skip top levels of page tables for
		 * iommu which has less agaw than default.
		 * Unnecessary for PT mode.
		 */
		if (translation != CONTEXT_TT_PASS_THROUGH) {
			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd)) {
					spin_unlock_irqrestore(&iommu->lock, flags);
					return -ENOMEM;
				}
			}
		}
	}

	context_set_domain_id(context, id);

	if (translation != CONTEXT_TT_PASS_THROUGH) {
		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
		translation = info ? CONTEXT_TT_DEV_IOTLB :
				     CONTEXT_TT_MULTI_LEVEL;
	}
	/*
	 * In pass through mode, AW must be programmed to indicate the largest
	 * AGAW value supported by hardware. And ASR is ignored by hardware.
	 */
	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
		context_set_address_width(context, iommu->msagaw);
	else {
		context_set_address_root(context, virt_to_phys(pgd));
		context_set_address_width(context, iommu->agaw);
	}

	context_set_translation_type(context, translation);
	context_set_fault_enable(context);
	context_set_present(context);
	domain_flush_cache(domain, context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	iommu_enable_dev_iotlb(info);
	spin_unlock_irqrestore(&iommu->lock, flags);

	spin_lock_irqsave(&domain->iommu_lock, flags);
	if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
		domain->iommu_count++;
		if (domain->iommu_count == 1)
			domain->nid = iommu->node;
		domain_update_iommu_cap(domain);
	}
	spin_unlock_irqrestore(&domain->iommu_lock, flags);
	return 0;
}
static int
domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
		       int translation)
{
	int ret;
	struct pci_dev *tmp, *parent;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return -ENODEV;

	ret = domain_context_mapping_one(domain, iommu,
					 pdev->bus->number, pdev->devfn,
					 translation);
	if (ret)
		return ret;

	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return 0;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = domain_context_mapping_one(domain, iommu,
						 parent->bus->number,
						 parent->devfn, translation);
		if (ret)
			return ret;
		parent = parent->bus->self;
	}
	if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
		return domain_context_mapping_one(domain, iommu,
					tmp->subordinate->number, 0,
					translation);
	else /* this is a legacy PCI bridge */
		return domain_context_mapping_one(domain, iommu,
						  tmp->bus->number,
						  tmp->devfn,
						  translation);
}
static int domain_context_mapped(struct pci_dev *pdev)
{
	int ret;
	struct pci_dev *tmp, *parent;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return -ENODEV;

	ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
	if (!ret)
		return ret;
	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return ret;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = device_context_mapped(iommu, parent->bus->number,
					    parent->devfn);
		if (!ret)
			return ret;
		parent = parent->bus->self;
	}
	if (pci_is_pcie(tmp))
		return device_context_mapped(iommu, tmp->subordinate->number,
					     0);
	else
		return device_context_mapped(iommu, tmp->bus->number,
					     tmp->devfn);
}
/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
{
	host_addr &= ~PAGE_MASK;
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
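
/*
 * Worked example (illustrative, not part of the original source): a buffer
 * starting at host offset 0x800 with size 0x1000 spans two 4KiB MM pages
 * (0x800 + 0x1000 = 0x1800, rounded up to 0x2000), so this returns
 * 0x2000 >> 12 = 2 VT-d pages even though only 0x1000 bytes were asked for.
 */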
/* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
					  unsigned long iov_pfn,
					  unsigned long phy_pfn,
					  unsigned long pages)
{
	int support, level = 1;
	unsigned long pfnmerge;

	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}
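
/*
 * Worked example (illustrative, not part of the original source): mapping
 * 1024 pages with iov_pfn = 0x400 and phy_pfn = 0x200000 on a domain with
 * iommu_superpage = 2.  pfnmerge = 0x200400 has its low 9 bits clear, so
 * one 2MiB step is possible: pages becomes 1024 >> 9 = 2 and level becomes
 * 2.  On the next pass pfnmerge >> 9 = 0x1002 is no longer 512-aligned, so
 * the loop stops and level 2 (2MiB superpages) is returned.
 */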
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	unsigned long sg_res;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;

	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

	if (sg)
		sg_res = 0;
	else {
		sg_res = nr_pages + 1;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!sg_res) {
			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
			sg->dma_length = sg->length;
			pteval = page_to_phys(sg_page(sg)) | prot;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is large page */
			if (largepage_lvl > 1) {
				pteval |= DMA_PTE_LARGE_PAGE;
				/* Ensure that old small page tables are removed to make room
				   for superpage, if they exist. */
				dma_pte_clear_range(domain, iov_pfn,
						    iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
				dma_pte_free_pagetable(domain, iov_pfn,
						       iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			static int dumps = 5;
			printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
			       iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}

static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				     unsigned long phys_pfn, unsigned long nr_pages,
				     int prot)
{
	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	if (!iommu)
		return;

	clear_context_table(iommu, bus, devfn);
	iommu->flush.flush_context(iommu, 0, 0, 0,
				   DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}
static inline void unlink_domain_info(struct device_domain_info *info)
{
	assert_spin_locked(&device_domain_lock);
	list_del(&info->link);
	list_del(&info->global);
	if (info->dev)
		info->dev->archdata.iommu = NULL;
}
static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	unsigned long flags, flags2;

	spin_lock_irqsave(&device_domain_lock, flags);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
				  struct device_domain_info, link);
		unlink_domain_info(info);
		spin_unlock_irqrestore(&device_domain_lock, flags);

		iommu_disable_dev_iotlb(info);
		iommu_detach_dev(info->iommu, info->bus, info->devfn);

		if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
			iommu_detach_dependent_devices(info->iommu, info->dev);
			/* clear this iommu in iommu_bmp, update iommu count
			 * and capabilities
			 */
			spin_lock_irqsave(&domain->iommu_lock, flags2);
			if (test_and_clear_bit(info->iommu->seq_id,
					       domain->iommu_bmp)) {
				domain->iommu_count--;
				domain_update_iommu_cap(domain);
			}
			spin_unlock_irqrestore(&domain->iommu_lock, flags2);
		}

		free_devinfo_mem(info);
		spin_lock_irqsave(&device_domain_lock, flags);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
/*
 * find_domain
 * Note: we use struct device->archdata.iommu to store the device's domain info
 */
static struct dmar_domain *find_domain(struct device *dev)
{
	struct device_domain_info *info;

	/* No lock here, assumes no domain exit in normal case */
	info = dev->archdata.iommu;
	if (info)
		return info->domain;

	return NULL;
}
static inline struct device_domain_info *
dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
{
	struct device_domain_info *info;

	list_for_each_entry(info, &device_domain_list, global)
		if (info->iommu->segment == segment && info->bus == bus &&
		    info->devfn == devfn)
			return info;

	return NULL;
}
static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
						int bus, int devfn,
						struct device *dev,
						struct dmar_domain *domain)
{
	struct dmar_domain *found = NULL;
	struct device_domain_info *info;
	unsigned long flags;

	info = alloc_devinfo_mem();
	if (!info)
		return NULL;

	info->bus = bus;
	info->devfn = devfn;
	info->dev = dev;
	info->domain = domain;
	info->iommu = iommu;
	if (!dev)
		domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;

	spin_lock_irqsave(&device_domain_lock, flags);
	if (dev)
		found = find_domain(dev);
	else {
		struct device_domain_info *info2;
		info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
		if (info2)
			found = info2->domain;
	}
	if (found) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		/* Caller must free the original domain */
		return found;
	}

	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	if (dev)
		dev->archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return domain;
}
/* domain is initialized */
static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
{
	struct dmar_domain *domain, *free = NULL;
	struct intel_iommu *iommu = NULL;
	struct device_domain_info *info;
	struct dmar_drhd_unit *drhd;
	struct pci_dev *dev_tmp;
	unsigned long flags;
	int bus = 0, devfn = 0;
	int segment;

	domain = find_domain(&pdev->dev);
	if (domain)
		return domain;

	segment = pci_domain_nr(pdev->bus);

	dev_tmp = pci_find_upstream_pcie_bridge(pdev);
	if (dev_tmp) {
		if (pci_is_pcie(dev_tmp)) {
			bus = dev_tmp->subordinate->number;
			devfn = 0;
		} else {
			bus = dev_tmp->bus->number;
			devfn = dev_tmp->devfn;
		}
		spin_lock_irqsave(&device_domain_lock, flags);
		info = dmar_search_domain_by_dev_info(segment, bus, devfn);
		if (info) {
			iommu = info->iommu;
			domain = info->domain;
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
		if (info)
			goto found_domain;
	}

	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd) {
		printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
		       pci_name(pdev));
		return NULL;
	}
	iommu = drhd->iommu;

	/* Allocate and initialize new domain for the device */
	domain = alloc_domain(false);
	if (!domain)
		goto error;
	if (iommu_attach_domain(domain, iommu)) {
		free_domain_mem(domain);
		domain = NULL;
		goto error;
	}
	free = domain;
	if (domain_init(domain, gaw))
		goto error;

	/* register pcie-to-pci device */
	if (dev_tmp) {
		domain = dmar_insert_dev_info(iommu, bus, devfn, NULL,
					      domain);
		if (!domain)
			goto error;
	}

found_domain:
	domain = dmar_insert_dev_info(iommu, pdev->bus->number,
				      pdev->devfn, &pdev->dev, domain);
error:
	if (free != domain)
		domain_exit(free);

	return domain;
}
static int iommu_identity_mapping;
#define IDENTMAP_ALL		1
#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long long start,
				     unsigned long long end)
{
	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;

	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
			  dma_to_mm_pfn(last_vpfn))) {
		printk(KERN_ERR "IOMMU: reserve iova failed\n");
		return -ENOMEM;
	}

	pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
		 start, end, domain->id);
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
				  last_vpfn - first_vpfn + 1,
				  DMA_PTE_READ|DMA_PTE_WRITE);
}
static int iommu_prepare_identity_map(struct pci_dev *pdev,
				      unsigned long long start,
				      unsigned long long end)
{
	struct dmar_domain *domain;
	int ret;

	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		return -ENOMEM;

	/* For _hardware_ passthrough, don't bother. But for software
	   passthrough, we do it anyway -- it may indicate a memory
	   range which is reserved in E820, so which didn't get set
	   up to start with in si_domain */
	if (domain == si_domain && hw_pass_through) {
		printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
		       pci_name(pdev), start, end);
		return 0;
	}

	printk(KERN_INFO
	       "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
	       pci_name(pdev), start, end);

	if (end < start) {
		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
			"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
			dmi_get_system_info(DMI_BIOS_VENDOR),
			dmi_get_system_info(DMI_BIOS_VERSION),
			dmi_get_system_info(DMI_PRODUCT_VERSION));
		ret = -EIO;
		goto error;
	}

	if (end >> agaw_to_width(domain->agaw)) {
		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     agaw_to_width(domain->agaw),
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		ret = -EIO;
		goto error;
	}

	ret = iommu_domain_identity_map(domain, start, end);
	if (ret)
		goto error;

	/* context entry init */
	ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
	if (ret)
		goto error;

	return 0;

 error:
	domain_exit(domain);
	return ret;
}
2360 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2361 struct pci_dev *pdev)
2363 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2365 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2369 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2370 static inline void iommu_prepare_isa(void)
2372 struct pci_dev *pdev;
2375 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2379 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2380 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2383 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2384 "floppy might not work\n");
2388 static inline void iommu_prepare_isa(void)
2392 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2394 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2396 static int __init si_domain_init(int hw)
2398 struct dmar_drhd_unit *drhd;
2399 struct intel_iommu *iommu;
2402 si_domain = alloc_domain(false);
2406 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2408 for_each_active_iommu(iommu, drhd) {
2409 ret = iommu_attach_domain(si_domain, iommu);
2411 domain_exit(si_domain);
2416 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2417 domain_exit(si_domain);
2421 pr_debug("IOMMU: identity mapping domain is domain %d\n",
2427 for_each_online_node(nid) {
2428 unsigned long start_pfn, end_pfn;
2431 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2432 ret = iommu_domain_identity_map(si_domain,
2433 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
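	/*
	 * At this point si_domain 1:1-maps every usable RAM range reported
	 * by memblock; the memory hotplug notifier further down keeps that
	 * map in sync as memory goes on- and offline.
	 */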
2442 static int identity_mapping(struct pci_dev *pdev)
2444 struct device_domain_info *info;
2446 if (likely(!iommu_identity_mapping))
2449 info = pdev->dev.archdata.iommu;
2450 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2451 return (info->domain == si_domain);
2456 static int domain_add_dev_info(struct dmar_domain *domain,
2457 struct pci_dev *pdev,
2460 struct dmar_domain *ndomain;
2461 struct intel_iommu *iommu;
2464 iommu = device_to_iommu(pci_domain_nr(pdev->bus),
2465 pdev->bus->number, pdev->devfn);
2469 ndomain = dmar_insert_dev_info(iommu, pdev->bus->number, pdev->devfn,
2470 &pdev->dev, domain);
2471 if (ndomain != domain)
2474 ret = domain_context_mapping(domain, pdev, translation);
2476 domain_remove_one_dev_info(domain, pdev);
2483 static bool device_has_rmrr(struct pci_dev *dev)
2485 struct dmar_rmrr_unit *rmrr;
2490 for_each_rmrr_units(rmrr) {
2492 * Return TRUE if this RMRR contains the device that is passed in.
2495 for_each_active_dev_scope(rmrr->devices,
2496 rmrr->devices_cnt, i, tmp)
2497 if (tmp == &dev->dev) {
2506 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2510 * We want to prevent any device associated with an RMRR from
2511 * getting placed into the SI Domain. This is done because
2512 * problems exist when devices are moved in and out of domains
2513 * and their respective RMRR info is lost. We exempt USB devices
2514 * from this process because they use RMRRs that are known not to
2515 * be needed after the BIOS hands off to the OS.
2517 if (device_has_rmrr(pdev) &&
2518 (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2521 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2524 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2527 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2531 * We want to start off with all devices in the 1:1 domain, and
2532 * take them out later if we find they can't access all of memory.
2534 * However, we can't do this for PCI devices behind bridges,
2535 * because all PCI devices behind the same bridge will end up
2536 * with the same source-id on their transactions.
2538 * Practically speaking, we can't change things around for these
2539 * devices at run-time, because we can't be sure there'll be no
2540 * DMA transactions in flight for any of their siblings.
2542 * So PCI devices (unless they're on the root bus) as well as
2543 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2544 * the 1:1 domain, just in _case_ one of their siblings turns out
2545 * not to be able to map all of memory.
2547 if (!pci_is_pcie(pdev)) {
2548 if (!pci_is_root_bus(pdev->bus))
2550 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2552 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2556 * At boot time, we don't yet know if devices will be 64-bit capable.
2557 * Assume that they will -- if they turn out not to be, then we can
2558 * take them out of the 1:1 domain later.
2562 * If the device's dma_mask is less than the system's memory
2563 * size then this is not a candidate for identity mapping.
2565 u64 dma_mask = pdev->dma_mask;
2567 if (pdev->dev.coherent_dma_mask &&
2568 pdev->dev.coherent_dma_mask < dma_mask)
2569 dma_mask = pdev->dev.coherent_dma_mask;
2571 return dma_mask >= dma_get_required_mask(&pdev->dev);
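	/*
	 * Example: a device with a 32-bit dma_mask on a machine with more
	 * than 4GiB of RAM fails this check, so it is kept out of (or later
	 * dropped from) the 1:1 domain and gets translated mappings instead.
	 */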
2577 static int __init iommu_prepare_static_identity_mapping(int hw)
2579 struct pci_dev *pdev = NULL;
2582 ret = si_domain_init(hw);
2586 for_each_pci_dev(pdev) {
2587 if (iommu_should_identity_map(pdev, 1)) {
2588 ret = domain_add_dev_info(si_domain, pdev,
2589 hw ? CONTEXT_TT_PASS_THROUGH :
2590 CONTEXT_TT_MULTI_LEVEL);
2592 /* device not associated with an iommu */
2597 pr_info("IOMMU: %s identity mapping for device %s\n",
2598 hw ? "hardware" : "software", pci_name(pdev));
2605 static int __init init_dmars(void)
2607 struct dmar_drhd_unit *drhd;
2608 struct dmar_rmrr_unit *rmrr;
2610 struct intel_iommu *iommu;
2616 * initialize and program root entry to not present
2619 for_each_drhd_unit(drhd) {
2621 * lock not needed as this is only incremented in the single
2622 * threaded kernel __init code path; all other accesses are reads
2625 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2629 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2630 IOMMU_UNITS_SUPPORTED);
2633 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2636 printk(KERN_ERR "Allocating global iommu array failed\n");
2641 deferred_flush = kzalloc(g_num_of_iommus *
2642 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2643 if (!deferred_flush) {
2648 for_each_active_iommu(iommu, drhd) {
2649 g_iommus[iommu->seq_id] = iommu;
2651 ret = iommu_init_domains(iommu);
2657 * we could share the same root & context tables
2658 * among all IOMMUs; need to split them later.
2660 ret = iommu_alloc_root_entry(iommu);
2662 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2665 if (!ecap_pass_through(iommu->ecap))
2666 hw_pass_through = 0;
2670 * Start from a sane IOMMU hardware state.
2672 for_each_active_iommu(iommu, drhd) {
2674 * If queued invalidation was already initialized by us
2675 * (for example, while enabling interrupt remapping) then
2676 * things are already rolling from a sane state.
2682 * Clear any previous faults.
2684 dmar_fault(-1, iommu);
2686 * Disable queued invalidation if supported and already enabled
2687 * before OS handover.
2689 dmar_disable_qi(iommu);
2692 for_each_active_iommu(iommu, drhd) {
2693 if (dmar_enable_qi(iommu)) {
2695 * Queued Invalidation is not enabled; use Register Based Invalidation
2698 iommu->flush.flush_context = __iommu_flush_context;
2699 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2700 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2703 (unsigned long long)drhd->reg_base_addr);
2705 iommu->flush.flush_context = qi_flush_context;
2706 iommu->flush.flush_iotlb = qi_flush_iotlb;
2707 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2710 (unsigned long long)drhd->reg_base_addr);
2714 if (iommu_pass_through)
2715 iommu_identity_mapping |= IDENTMAP_ALL;
2717 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2718 iommu_identity_mapping |= IDENTMAP_GFX;
2721 check_tylersburg_isoch();
2724 * If pass-through is not set or not enabled, set up context entries
2725 * for identity mappings for RMRR, GFX and ISA, and possibly fall back
2726 * to static identity mapping if iommu_identity_mapping is set.
2728 if (iommu_identity_mapping) {
2729 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2731 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2737 * for each dev attached to rmrr
2739 * locate drhd for dev, alloc domain for dev
2740 * allocate free domain
2741 * allocate page table entries for rmrr
2742 * if context not allocated for bus
2743 * allocate and init context
2744 * set present in root table for this bus
2745 * init context with domain, translation etc
2749 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2750 for_each_rmrr_units(rmrr) {
2751 /* Some BIOSes list nonexistent devices in the DMAR table. */
2752 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2754 if (!dev_is_pci(dev))
2756 ret = iommu_prepare_rmrr_dev(rmrr, to_pci_dev(dev));
2759 "IOMMU: mapping reserved region failed\n");
2763 iommu_prepare_isa();
2768 * global invalidate context cache
2769 * global invalidate iotlb
2770 * enable translation
2772 for_each_iommu(iommu, drhd) {
2773 if (drhd->ignored) {
2775 * we always have to disable PMRs or DMA may fail on this device
2779 iommu_disable_protect_mem_regions(iommu);
2783 iommu_flush_write_buffer(iommu);
2785 ret = dmar_set_interrupt(iommu);
2789 iommu_set_root_entry(iommu);
2791 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2792 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2794 ret = iommu_enable_translation(iommu);
2798 iommu_disable_protect_mem_regions(iommu);
2804 for_each_active_iommu(iommu, drhd)
2805 free_dmar_iommu(iommu);
2806 kfree(deferred_flush);
2813 /* This takes a number of _MM_ pages, not VTD pages */
2814 static struct iova *intel_alloc_iova(struct device *dev,
2815 struct dmar_domain *domain,
2816 unsigned long nrpages, uint64_t dma_mask)
2818 struct pci_dev *pdev = to_pci_dev(dev);
2819 struct iova *iova = NULL;
2821 /* Restrict dma_mask to the width that the iommu can handle */
2822 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
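	/*
	 * e.g. a device advertising DMA_BIT_MASK(64) is still clamped to the
	 * domain's guest address width here, since the page tables cannot
	 * express IOVAs beyond DOMAIN_MAX_ADDR(domain->gaw).
	 */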
2824 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2826 * First try to allocate an io virtual address in
2827 * DMA_BIT_MASK(32) and if that fails then try allocating
2830 iova = alloc_iova(&domain->iovad, nrpages,
2831 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2835 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2836 if (unlikely(!iova)) {
2837 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2838 nrpages, pci_name(pdev));
2845 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2847 struct dmar_domain *domain;
2850 domain = get_domain_for_dev(pdev,
2851 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2854 "Allocating domain for %s failed", pci_name(pdev));
2858 /* make sure context mapping is ok */
2859 if (unlikely(!domain_context_mapped(pdev))) {
2860 ret = domain_context_mapping(domain, pdev,
2861 CONTEXT_TT_MULTI_LEVEL);
2864 "Domain context map for %s failed",
2873 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2875 struct device_domain_info *info;
2877 /* No lock here, assumes no domain exit in normal case */
2878 info = dev->dev.archdata.iommu;
2880 return info->domain;
2882 return __get_valid_domain_for_dev(dev);
2885 static int iommu_dummy(struct device *dev)
2887 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2890 /* Check if the device needs to go through the non-identity map and unmap process. */
2891 static int iommu_no_mapping(struct device *dev)
2893 struct pci_dev *pdev;
2896 if (unlikely(!dev_is_pci(dev)))
2899 if (iommu_dummy(dev))
2902 if (!iommu_identity_mapping)
2905 pdev = to_pci_dev(dev);
2906 found = identity_mapping(pdev);
2908 if (iommu_should_identity_map(pdev, 0))
2912 * The 32-bit DMA device is removed from si_domain and falls back
2913 * to non-identity mapping.
2915 domain_remove_one_dev_info(si_domain, pdev);
2916 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2922 * In case a 64-bit DMA device is detached from a VM, the device
2923 * is put back into si_domain for identity mapping.
2925 if (iommu_should_identity_map(pdev, 0)) {
2927 ret = domain_add_dev_info(si_domain, pdev,
2929 CONTEXT_TT_PASS_THROUGH :
2930 CONTEXT_TT_MULTI_LEVEL);
2932 printk(KERN_INFO "64bit %s uses identity mapping\n",
2942 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2943 size_t size, int dir, u64 dma_mask)
2945 struct pci_dev *pdev = to_pci_dev(hwdev);
2946 struct dmar_domain *domain;
2947 phys_addr_t start_paddr;
2951 struct intel_iommu *iommu;
2952 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2954 BUG_ON(dir == DMA_NONE);
2956 if (iommu_no_mapping(hwdev))
2959 domain = get_valid_domain_for_dev(pdev);
2963 iommu = domain_get_iommu(domain);
2964 size = aligned_nrpages(paddr, size);
2966 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2971 * Check if DMAR supports zero-length reads on write-only mappings
2974 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2975 !cap_zlr(iommu->cap))
2976 prot |= DMA_PTE_READ;
2977 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2978 prot |= DMA_PTE_WRITE;
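	/*
	 * e.g. a DMA_FROM_DEVICE mapping on hardware without the ZLR
	 * capability ends up DMA_PTE_READ|DMA_PTE_WRITE: a write-only PTE
	 * could otherwise fault on zero-length read requests.
	 */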
2980 * paddr .. (paddr + size) might be a partial page; we should map the whole
2981 * page. Note: if two parts of one page are separately mapped, we
2982 * might have two guest addresses mapping to the same host paddr, but this
2983 * is not a big problem
2985 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2986 mm_to_dma_pfn(paddr_pfn), size, prot);
2990 /* it's a non-present to present mapping. Only flush if caching mode */
2991 if (cap_caching_mode(iommu->cap))
2992 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
2994 iommu_flush_write_buffer(iommu);
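	/*
	 * In caching mode (typically a virtualized IOMMU) even non-present
	 * entries may be cached, so the page-selective invalidation above is
	 * required; on real hardware a write-buffer flush is sufficient for
	 * a non-present to present transition.
	 */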
2996 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2997 start_paddr += paddr & ~PAGE_MASK;
3002 __free_iova(&domain->iovad, iova);
3003 printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
3004 pci_name(pdev), size, (unsigned long long)paddr, dir);
3008 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3009 unsigned long offset, size_t size,
3010 enum dma_data_direction dir,
3011 struct dma_attrs *attrs)
3013 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3014 dir, to_pci_dev(dev)->dma_mask);
3017 static void flush_unmaps(void)
3023 /* just flush them all */
3024 for (i = 0; i < g_num_of_iommus; i++) {
3025 struct intel_iommu *iommu = g_iommus[i];
3029 if (!deferred_flush[i].next)
3032 /* In caching mode, global flushes make emulation expensive */
3033 if (!cap_caching_mode(iommu->cap))
3034 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3035 DMA_TLB_GLOBAL_FLUSH);
3036 for (j = 0; j < deferred_flush[i].next; j++) {
3038 struct iova *iova = deferred_flush[i].iova[j];
3039 struct dmar_domain *domain = deferred_flush[i].domain[j];
3041 /* On real hardware multiple invalidations are expensive */
3042 if (cap_caching_mode(iommu->cap))
3043 iommu_flush_iotlb_psi(iommu, domain->id,
3044 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1,
3045 !deferred_flush[i].freelist[j], 0);
3047 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
3048 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3049 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3051 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3052 if (deferred_flush[i].freelist[j])
3053 dma_free_pagelist(deferred_flush[i].freelist[j]);
3055 deferred_flush[i].next = 0;
3061 static void flush_unmaps_timeout(unsigned long data)
3063 unsigned long flags;
3065 spin_lock_irqsave(&async_umap_flush_lock, flags);
3067 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3070 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3072 unsigned long flags;
3074 struct intel_iommu *iommu;
3076 spin_lock_irqsave(&async_umap_flush_lock, flags);
3077 if (list_size == HIGH_WATER_MARK)
3080 iommu = domain_get_iommu(dom);
3081 iommu_id = iommu->seq_id;
3083 next = deferred_flush[iommu_id].next;
3084 deferred_flush[iommu_id].domain[next] = dom;
3085 deferred_flush[iommu_id].iova[next] = iova;
3086 deferred_flush[iommu_id].freelist[next] = freelist;
3087 deferred_flush[iommu_id].next++;
3090 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3094 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
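/*
 * Deferred unmaps are thus flushed in batches: either when the table
 * reaches HIGH_WATER_MARK entries or when the 10ms unmap_timer fires,
 * whichever comes first.
 */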
3097 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3098 size_t size, enum dma_data_direction dir,
3099 struct dma_attrs *attrs)
3101 struct pci_dev *pdev = to_pci_dev(dev);
3102 struct dmar_domain *domain;
3103 unsigned long start_pfn, last_pfn;
3105 struct intel_iommu *iommu;
3106 struct page *freelist;
3108 if (iommu_no_mapping(dev))
3111 domain = find_domain(dev);
3114 iommu = domain_get_iommu(domain);
3116 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3117 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3118 (unsigned long long)dev_addr))
3121 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3122 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3124 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3125 pci_name(pdev), start_pfn, last_pfn);
3127 freelist = domain_unmap(domain, start_pfn, last_pfn);
3129 if (intel_iommu_strict) {
3130 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3131 last_pfn - start_pfn + 1, !freelist, 0);
3133 __free_iova(&domain->iovad, iova);
3134 dma_free_pagelist(freelist);
3136 add_unmap(domain, iova, freelist);
3138 * queue up the release of the unmap to save the 1/6th of the
3139 * CPU time used up by the IOTLB flush operation...
3144 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3145 dma_addr_t *dma_handle, gfp_t flags,
3146 struct dma_attrs *attrs)
3151 size = PAGE_ALIGN(size);
3152 order = get_order(size);
3154 if (!iommu_no_mapping(hwdev))
3155 flags &= ~(GFP_DMA | GFP_DMA32);
3156 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3157 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3163 vaddr = (void *)__get_free_pages(flags, order);
3166 memset(vaddr, 0, size);
3168 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3170 hwdev->coherent_dma_mask);
3173 free_pages((unsigned long)vaddr, order);
3177 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3178 dma_addr_t dma_handle, struct dma_attrs *attrs)
3182 size = PAGE_ALIGN(size);
3183 order = get_order(size);
3185 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3186 free_pages((unsigned long)vaddr, order);
3189 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3190 int nelems, enum dma_data_direction dir,
3191 struct dma_attrs *attrs)
3193 struct dmar_domain *domain;
3194 unsigned long start_pfn, last_pfn;
3196 struct intel_iommu *iommu;
3197 struct page *freelist;
3199 if (iommu_no_mapping(hwdev))
3202 domain = find_domain(hwdev);
3205 iommu = domain_get_iommu(domain);
3207 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3208 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3209 (unsigned long long)sglist[0].dma_address))
3212 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3213 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3215 freelist = domain_unmap(domain, start_pfn, last_pfn);
3217 if (intel_iommu_strict) {
3218 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3219 last_pfn - start_pfn + 1, !freelist, 0);
3221 __free_iova(&domain->iovad, iova);
3222 dma_free_pagelist(freelist);
3224 add_unmap(domain, iova, freelist);
3226 * queue up the release of the unmap to save the 1/6th of the
3227 * CPU time used up by the IOTLB flush operation...
3232 static int intel_nontranslate_map_sg(struct device *hwdev,
3233 struct scatterlist *sglist, int nelems, int dir)
3236 struct scatterlist *sg;
3238 for_each_sg(sglist, sg, nelems, i) {
3239 BUG_ON(!sg_page(sg));
3240 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3241 sg->dma_length = sg->length;
3246 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3247 enum dma_data_direction dir, struct dma_attrs *attrs)
3250 struct pci_dev *pdev = to_pci_dev(hwdev);
3251 struct dmar_domain *domain;
3254 struct iova *iova = NULL;
3256 struct scatterlist *sg;
3257 unsigned long start_vpfn;
3258 struct intel_iommu *iommu;
3260 BUG_ON(dir == DMA_NONE);
3261 if (iommu_no_mapping(hwdev))
3262 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3264 domain = get_valid_domain_for_dev(pdev);
3268 iommu = domain_get_iommu(domain);
3270 for_each_sg(sglist, sg, nelems, i)
3271 size += aligned_nrpages(sg->offset, sg->length);
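	/*
	 * Note that aligned_nrpages() rounds each entry up individually: a
	 * 100-byte entry starting at page offset 0xff0 still costs two 4KiB
	 * pages of IOVA space, for example.
	 */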
3273 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3276 sglist->dma_length = 0;
3281 * Check if DMAR supports zero-length reads on write-only mappings
3284 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3285 !cap_zlr(iommu->cap))
3286 prot |= DMA_PTE_READ;
3287 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3288 prot |= DMA_PTE_WRITE;
3290 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3292 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3293 if (unlikely(ret)) {
3294 /* clear the page */
3295 dma_pte_clear_range(domain, start_vpfn,
3296 start_vpfn + size - 1);
3297 /* free page tables */
3298 dma_pte_free_pagetable(domain, start_vpfn,
3299 start_vpfn + size - 1);
3301 __free_iova(&domain->iovad, iova);
3305 /* it's a non-present to present mapping. Only flush if caching mode */
3306 if (cap_caching_mode(iommu->cap))
3307 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3309 iommu_flush_write_buffer(iommu);
3314 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3319 struct dma_map_ops intel_dma_ops = {
3320 .alloc = intel_alloc_coherent,
3321 .free = intel_free_coherent,
3322 .map_sg = intel_map_sg,
3323 .unmap_sg = intel_unmap_sg,
3324 .map_page = intel_map_page,
3325 .unmap_page = intel_unmap_page,
3326 .mapping_error = intel_mapping_error,
3329 static inline int iommu_domain_cache_init(void)
3333 iommu_domain_cache = kmem_cache_create("iommu_domain",
3334 sizeof(struct dmar_domain),
3339 if (!iommu_domain_cache) {
3340 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3347 static inline int iommu_devinfo_cache_init(void)
3351 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3352 sizeof(struct device_domain_info),
3356 if (!iommu_devinfo_cache) {
3357 printk(KERN_ERR "Couldn't create devinfo cache\n");
3364 static inline int iommu_iova_cache_init(void)
3368 iommu_iova_cache = kmem_cache_create("iommu_iova",
3369 sizeof(struct iova),
3373 if (!iommu_iova_cache) {
3374 printk(KERN_ERR "Couldn't create iova cache\n");
3381 static int __init iommu_init_mempool(void)
3384 ret = iommu_iova_cache_init();
3388 ret = iommu_domain_cache_init();
3392 ret = iommu_devinfo_cache_init();
3396 kmem_cache_destroy(iommu_domain_cache);
3398 kmem_cache_destroy(iommu_iova_cache);
3403 static void __init iommu_exit_mempool(void)
3405 kmem_cache_destroy(iommu_devinfo_cache);
3406 kmem_cache_destroy(iommu_domain_cache);
3407 kmem_cache_destroy(iommu_iova_cache);
3411 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3413 struct dmar_drhd_unit *drhd;
3417 /* We know that this device on this chipset has its own IOMMU.
3418 * If we find it under a different IOMMU, then the BIOS is lying
3419 * to us. Hope that the IOMMU for this device is actually
3420 * disabled, and it needs no translation...
3422 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3424 /* "can't" happen */
3425 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3428 vtbar &= 0xffff0000;
3430 /* we know that this iommu should be at offset 0xa000 from vtbar */
3431 drhd = dmar_find_matched_drhd_unit(pdev);
3432 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3433 TAINT_FIRMWARE_WORKAROUND,
3434 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3435 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3437 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
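/*
 * Marking archdata.iommu as DUMMY_DEVICE_DOMAIN_INFO makes iommu_dummy()
 * (and hence iommu_no_mapping()) true for the device, so it bypasses
 * DMA translation entirely from then on.
 */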
3439 static void __init init_no_remapping_devices(void)
3441 struct dmar_drhd_unit *drhd;
3445 for_each_drhd_unit(drhd) {
3446 if (!drhd->include_all) {
3447 for_each_active_dev_scope(drhd->devices,
3448 drhd->devices_cnt, i, dev)
3450 /* ignore DMAR unit if no devices exist */
3451 if (i == drhd->devices_cnt)
3456 for_each_active_drhd_unit(drhd) {
3457 if (drhd->include_all)
3460 for_each_active_dev_scope(drhd->devices,
3461 drhd->devices_cnt, i, dev)
3462 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3464 if (i < drhd->devices_cnt)
3467 /* This IOMMU has *only* gfx devices. Either bypass it or
3468 set the gfx_mapped flag, as appropriate */
3470 intel_iommu_gfx_mapped = 1;
3473 for_each_active_dev_scope(drhd->devices,
3474 drhd->devices_cnt, i, dev)
3475 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3480 #ifdef CONFIG_SUSPEND
3481 static int init_iommu_hw(void)
3483 struct dmar_drhd_unit *drhd;
3484 struct intel_iommu *iommu = NULL;
3486 for_each_active_iommu(iommu, drhd)
3488 dmar_reenable_qi(iommu);
3490 for_each_iommu(iommu, drhd) {
3491 if (drhd->ignored) {
3493 * we always have to disable PMRs or DMA may fail on this device
3497 iommu_disable_protect_mem_regions(iommu);
3501 iommu_flush_write_buffer(iommu);
3503 iommu_set_root_entry(iommu);
3505 iommu->flush.flush_context(iommu, 0, 0, 0,
3506 DMA_CCMD_GLOBAL_INVL);
3507 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3508 DMA_TLB_GLOBAL_FLUSH);
3509 if (iommu_enable_translation(iommu))
3511 iommu_disable_protect_mem_regions(iommu);
3517 static void iommu_flush_all(void)
3519 struct dmar_drhd_unit *drhd;
3520 struct intel_iommu *iommu;
3522 for_each_active_iommu(iommu, drhd) {
3523 iommu->flush.flush_context(iommu, 0, 0, 0,
3524 DMA_CCMD_GLOBAL_INVL);
3525 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3526 DMA_TLB_GLOBAL_FLUSH);
3530 static int iommu_suspend(void)
3532 struct dmar_drhd_unit *drhd;
3533 struct intel_iommu *iommu = NULL;
3536 for_each_active_iommu(iommu, drhd) {
3537 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3539 if (!iommu->iommu_state)
3545 for_each_active_iommu(iommu, drhd) {
3546 iommu_disable_translation(iommu);
3548 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3550 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3551 readl(iommu->reg + DMAR_FECTL_REG);
3552 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3553 readl(iommu->reg + DMAR_FEDATA_REG);
3554 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3555 readl(iommu->reg + DMAR_FEADDR_REG);
3556 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3557 readl(iommu->reg + DMAR_FEUADDR_REG);
3559 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3564 for_each_active_iommu(iommu, drhd)
3565 kfree(iommu->iommu_state);
3570 static void iommu_resume(void)
3572 struct dmar_drhd_unit *drhd;
3573 struct intel_iommu *iommu = NULL;
3576 if (init_iommu_hw()) {
3578 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3580 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3584 for_each_active_iommu(iommu, drhd) {
3586 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3588 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3589 iommu->reg + DMAR_FECTL_REG);
3590 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3591 iommu->reg + DMAR_FEDATA_REG);
3592 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3593 iommu->reg + DMAR_FEADDR_REG);
3594 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3595 iommu->reg + DMAR_FEUADDR_REG);
3597 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3600 for_each_active_iommu(iommu, drhd)
3601 kfree(iommu->iommu_state);
3604 static struct syscore_ops iommu_syscore_ops = {
3605 .resume = iommu_resume,
3606 .suspend = iommu_suspend,
3609 static void __init init_iommu_pm_ops(void)
3611 register_syscore_ops(&iommu_syscore_ops);
3615 static inline void init_iommu_pm_ops(void) {}
3616 #endif /* CONFIG_SUSPEND */
3619 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3621 struct acpi_dmar_reserved_memory *rmrr;
3622 struct dmar_rmrr_unit *rmrru;
3624 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3628 rmrru->hdr = header;
3629 rmrr = (struct acpi_dmar_reserved_memory *)header;
3630 rmrru->base_address = rmrr->base_address;
3631 rmrru->end_address = rmrr->end_address;
3632 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3633 ((void *)rmrr) + rmrr->header.length,
3634 &rmrru->devices_cnt);
3635 if (rmrru->devices_cnt && rmrru->devices == NULL) {
3640 list_add(&rmrru->list, &dmar_rmrr_units);
3645 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3647 struct acpi_dmar_atsr *atsr;
3648 struct dmar_atsr_unit *atsru;
3650 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3651 atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3656 atsru->include_all = atsr->flags & 0x1;
3657 if (!atsru->include_all) {
3658 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3659 (void *)atsr + atsr->header.length,
3660 &atsru->devices_cnt);
3661 if (atsru->devices_cnt && atsru->devices == NULL) {
3667 list_add_rcu(&atsru->list, &dmar_atsr_units);
3672 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3674 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3678 static void intel_iommu_free_dmars(void)
3680 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3681 struct dmar_atsr_unit *atsru, *atsr_n;
3683 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3684 list_del(&rmrru->list);
3685 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3689 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3690 list_del(&atsru->list);
3691 intel_iommu_free_atsr(atsru);
3695 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3698 struct pci_bus *bus;
3699 struct pci_dev *bridge = NULL;
3701 struct acpi_dmar_atsr *atsr;
3702 struct dmar_atsr_unit *atsru;
3704 dev = pci_physfn(dev);
3705 for (bus = dev->bus; bus; bus = bus->parent) {
3707 if (!bridge || !pci_is_pcie(bridge) ||
3708 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3710 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3717 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3718 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3719 if (atsr->segment != pci_domain_nr(dev->bus))
3722 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3723 if (tmp == &bridge->dev)
3726 if (atsru->include_all)
3736 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3739 struct dmar_rmrr_unit *rmrru;
3740 struct dmar_atsr_unit *atsru;
3741 struct acpi_dmar_atsr *atsr;
3742 struct acpi_dmar_reserved_memory *rmrr;
3744 if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3747 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3748 rmrr = container_of(rmrru->hdr,
3749 struct acpi_dmar_reserved_memory, header);
3750 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3751 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3752 ((void *)rmrr) + rmrr->header.length,
3753 rmrr->segment, rmrru->devices,
3754 rmrru->devices_cnt);
3759 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3760 if (dmar_remove_dev_scope(info, rmrr->segment,
3761 rmrru->devices, rmrru->devices_cnt))
3766 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3767 if (atsru->include_all)
3770 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3771 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3772 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3773 (void *)atsr + atsr->header.length,
3774 atsr->segment, atsru->devices,
3775 atsru->devices_cnt);
3780 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3781 if (dmar_remove_dev_scope(info, atsr->segment,
3782 atsru->devices, atsru->devices_cnt))
3791 * Here we only respond to the action of a device being unbound from its driver.
3793 * A newly added device is not attached to its DMAR domain here yet; that will
3794 * happen when the device is first mapped to an iova.
3796 static int device_notifier(struct notifier_block *nb,
3797 unsigned long action, void *data)
3799 struct device *dev = data;
3800 struct pci_dev *pdev = to_pci_dev(dev);
3801 struct dmar_domain *domain;
3803 if (iommu_dummy(dev))
3806 if (action != BUS_NOTIFY_UNBOUND_DRIVER &&
3807 action != BUS_NOTIFY_DEL_DEVICE)
3810 domain = find_domain(dev);
3814 down_read(&dmar_global_lock);
3815 domain_remove_one_dev_info(domain, pdev);
3816 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3817 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3818 list_empty(&domain->devices))
3819 domain_exit(domain);
3820 up_read(&dmar_global_lock);
3825 static struct notifier_block device_nb = {
3826 .notifier_call = device_notifier,
3829 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3830 unsigned long val, void *v)
3832 struct memory_notify *mhp = v;
3833 unsigned long long start, end;
3834 unsigned long start_vpfn, last_vpfn;
3837 case MEM_GOING_ONLINE:
3838 start = mhp->start_pfn << PAGE_SHIFT;
3839 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
3840 if (iommu_domain_identity_map(si_domain, start, end)) {
3841 pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
3848 case MEM_CANCEL_ONLINE:
3849 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3850 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
3851 while (start_vpfn <= last_vpfn) {
3853 struct dmar_drhd_unit *drhd;
3854 struct intel_iommu *iommu;
3855 struct page *freelist;
3857 iova = find_iova(&si_domain->iovad, start_vpfn);
3859 pr_debug("dmar: failed to get IOVA for PFN %lx\n",
3864 iova = split_and_remove_iova(&si_domain->iovad, iova,
3865 start_vpfn, last_vpfn);
3867 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
3868 start_vpfn, last_vpfn);
3872 freelist = domain_unmap(si_domain, iova->pfn_lo,
3876 for_each_active_iommu(iommu, drhd)
3877 iommu_flush_iotlb_psi(iommu, si_domain->id,
3879 iova->pfn_hi - iova->pfn_lo + 1,
3882 dma_free_pagelist(freelist);
3884 start_vpfn = iova->pfn_hi + 1;
3885 free_iova_mem(iova);
3893 static struct notifier_block intel_iommu_memory_nb = {
3894 .notifier_call = intel_iommu_memory_notifier,
3898 int __init intel_iommu_init(void)
3901 struct dmar_drhd_unit *drhd;
3902 struct intel_iommu *iommu;
3904 /* VT-d is required for a TXT/tboot launch, so enforce that */
3905 force_on = tboot_force_iommu();
3907 if (iommu_init_mempool()) {
3909 panic("tboot: Failed to initialize iommu memory\n");
3913 down_write(&dmar_global_lock);
3914 if (dmar_table_init()) {
3916 panic("tboot: Failed to initialize DMAR table\n");
3921 * Disable translation if already enabled prior to OS handover.
3923 for_each_active_iommu(iommu, drhd)
3924 if (iommu->gcmd & DMA_GCMD_TE)
3925 iommu_disable_translation(iommu);
3927 if (dmar_dev_scope_init() < 0) {
3929 panic("tboot: Failed to initialize DMAR device scope\n");
3933 if (no_iommu || dmar_disabled)
3936 if (list_empty(&dmar_rmrr_units))
3937 printk(KERN_INFO "DMAR: No RMRR found\n");
3939 if (list_empty(&dmar_atsr_units))
3940 printk(KERN_INFO "DMAR: No ATSR found\n");
3942 if (dmar_init_reserved_ranges()) {
3944 panic("tboot: Failed to reserve iommu ranges\n");
3945 goto out_free_reserved_range;
3948 init_no_remapping_devices();
3953 panic("tboot: Failed to initialize DMARs\n");
3954 printk(KERN_ERR "IOMMU: dmar init failed\n");
3955 goto out_free_reserved_range;
3957 up_write(&dmar_global_lock);
3959 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3961 init_timer(&unmap_timer);
3962 #ifdef CONFIG_SWIOTLB
3965 dma_ops = &intel_dma_ops;
3967 init_iommu_pm_ops();
3969 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3970 bus_register_notifier(&pci_bus_type, &device_nb);
3971 if (si_domain && !hw_pass_through)
3972 register_memory_notifier(&intel_iommu_memory_nb);
3974 intel_iommu_enabled = 1;
3978 out_free_reserved_range:
3979 put_iova_domain(&reserved_iova_list);
3981 intel_iommu_free_dmars();
3982 up_write(&dmar_global_lock);
3983 iommu_exit_mempool();
3987 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3990 struct pci_dev *tmp, *parent, *pdev;
3992 if (!iommu || !dev || !dev_is_pci(dev))
3995 pdev = to_pci_dev(dev);
3997 /* dependent device detach */
3998 tmp = pci_find_upstream_pcie_bridge(pdev);
3999 /* Secondary interface's bus number and devfn 0 */
4001 parent = pdev->bus->self;
4002 while (parent != tmp) {
4003 iommu_detach_dev(iommu, parent->bus->number,
4005 parent = parent->bus->self;
4007 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
4008 iommu_detach_dev(iommu,
4009 tmp->subordinate->number, 0);
4010 else /* this is a legacy PCI bridge */
4011 iommu_detach_dev(iommu, tmp->bus->number,
4016 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4017 struct pci_dev *pdev)
4019 struct device_domain_info *info, *tmp;
4020 struct intel_iommu *iommu;
4021 unsigned long flags;
4024 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4029 spin_lock_irqsave(&device_domain_lock, flags);
4030 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4031 if (info->iommu->segment == pci_domain_nr(pdev->bus) &&
4032 info->bus == pdev->bus->number &&
4033 info->devfn == pdev->devfn) {
4034 unlink_domain_info(info);
4035 spin_unlock_irqrestore(&device_domain_lock, flags);
4037 iommu_disable_dev_iotlb(info);
4038 iommu_detach_dev(iommu, info->bus, info->devfn);
4039 iommu_detach_dependent_devices(iommu, &pdev->dev);
4040 free_devinfo_mem(info);
4042 spin_lock_irqsave(&device_domain_lock, flags);
4050 /* if there are no other devices under the same iommu
4051 * owned by this domain, clear this iommu from iommu_bmp,
4052 * update the iommu count and coherency
4054 if (info->iommu == iommu)
4058 spin_unlock_irqrestore(&device_domain_lock, flags);
4061 unsigned long tmp_flags;
4062 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
4063 clear_bit(iommu->seq_id, domain->iommu_bmp);
4064 domain->iommu_count--;
4065 domain_update_iommu_cap(domain);
4066 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
4068 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
4069 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
4070 spin_lock_irqsave(&iommu->lock, tmp_flags);
4071 clear_bit(domain->id, iommu->domain_ids);
4072 iommu->domains[domain->id] = NULL;
4073 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
4078 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4082 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
4083 domain_reserve_special_ranges(domain);
4085 /* calculate AGAW */
4086 domain->gaw = guest_width;
4087 adjust_width = guestwidth_to_adjustwidth(guest_width);
4088 domain->agaw = width_to_agaw(adjust_width);
4090 domain->iommu_coherency = 0;
4091 domain->iommu_snooping = 0;
4092 domain->iommu_superpage = 0;
4093 domain->max_addr = 0;
4096 /* always allocate the top pgd */
4097 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4100 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4104 static int intel_iommu_domain_init(struct iommu_domain *domain)
4106 struct dmar_domain *dmar_domain;
4108 dmar_domain = alloc_domain(true);
4111 "intel_iommu_domain_init: dmar_domain == NULL\n");
4114 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4116 "intel_iommu_domain_init() failed\n");
4117 domain_exit(dmar_domain);
4120 domain_update_iommu_cap(dmar_domain);
4121 domain->priv = dmar_domain;
4123 domain->geometry.aperture_start = 0;
4124 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4125 domain->geometry.force_aperture = true;
4130 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4132 struct dmar_domain *dmar_domain = domain->priv;
4134 domain->priv = NULL;
4135 domain_exit(dmar_domain);
4138 static int intel_iommu_attach_device(struct iommu_domain *domain,
4141 struct dmar_domain *dmar_domain = domain->priv;
4142 struct pci_dev *pdev = to_pci_dev(dev);
4143 struct intel_iommu *iommu;
4146 /* normally pdev is not mapped */
4147 if (unlikely(domain_context_mapped(pdev))) {
4148 struct dmar_domain *old_domain;
4150 old_domain = find_domain(dev);
4152 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4153 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4154 domain_remove_one_dev_info(old_domain, pdev);
4156 domain_remove_dev_info(old_domain);
4160 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4165 /* check if this iommu agaw is sufficient for max mapped address */
4166 addr_width = agaw_to_width(iommu->agaw);
4167 if (addr_width > cap_mgaw(iommu->cap))
4168 addr_width = cap_mgaw(iommu->cap);
4170 if (dmar_domain->max_addr > (1LL << addr_width)) {
4171 printk(KERN_ERR "%s: iommu width (%d) is not "
4172 "sufficient for the mapped address (%llx)\n",
4173 __func__, addr_width, dmar_domain->max_addr);
4176 dmar_domain->gaw = addr_width;
4179 * Knock out extra levels of page tables if necessary
4181 while (iommu->agaw < dmar_domain->agaw) {
4182 struct dma_pte *pte;
4184 pte = dmar_domain->pgd;
4185 if (dma_pte_present(pte)) {
4186 dmar_domain->pgd = (struct dma_pte *)
4187 phys_to_virt(dma_pte_addr(pte));
4188 free_pgtable_page(pte);
4190 dmar_domain->agaw--;
4193 return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4196 static void intel_iommu_detach_device(struct iommu_domain *domain,
4199 struct dmar_domain *dmar_domain = domain->priv;
4200 struct pci_dev *pdev = to_pci_dev(dev);
4202 domain_remove_one_dev_info(dmar_domain, pdev);
4205 static int intel_iommu_map(struct iommu_domain *domain,
4206 unsigned long iova, phys_addr_t hpa,
4207 size_t size, int iommu_prot)
4209 struct dmar_domain *dmar_domain = domain->priv;
4214 if (iommu_prot & IOMMU_READ)
4215 prot |= DMA_PTE_READ;
4216 if (iommu_prot & IOMMU_WRITE)
4217 prot |= DMA_PTE_WRITE;
4218 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4219 prot |= DMA_PTE_SNP;
4221 max_addr = iova + size;
4222 if (dmar_domain->max_addr < max_addr) {
4225 /* check if minimum agaw is sufficient for mapped address */
4226 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4227 if (end < max_addr) {
4228 printk(KERN_ERR "%s: iommu width (%d) is not "
4229 "sufficient for the mapped address (%llx)\n",
4230 __func__, dmar_domain->gaw, max_addr);
4233 dmar_domain->max_addr = max_addr;
4235 /* Round up size to next multiple of PAGE_SIZE, if it and
4236 the low bits of hpa would take us onto the next page */
4237 size = aligned_nrpages(hpa, size);
4238 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4239 hpa >> VTD_PAGE_SHIFT, size, prot);
4243 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4244 unsigned long iova, size_t size)
4246 struct dmar_domain *dmar_domain = domain->priv;
4247 struct page *freelist = NULL;
4248 struct intel_iommu *iommu;
4249 unsigned long start_pfn, last_pfn;
4250 unsigned int npages;
4251 int iommu_id, num, ndomains, level = 0;
4253 /* Cope with horrid API which requires us to unmap more than the
4254 size argument if it happens to be a large-page mapping. */
4255 if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4258 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4259 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
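	/*
	 * e.g. a 4KiB unmap request that lands inside a 2MiB superpage is
	 * widened here to the full 2MiB, since the large-page entry can only
	 * be cleared as a whole.
	 */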
4261 start_pfn = iova >> VTD_PAGE_SHIFT;
4262 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4264 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4266 npages = last_pfn - start_pfn + 1;
4268 for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4269 iommu = g_iommus[iommu_id];
4272 * find bit position of dmar_domain
4274 ndomains = cap_ndoms(iommu->cap);
4275 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4276 if (iommu->domains[num] == dmar_domain)
4277 iommu_flush_iotlb_psi(iommu, num, start_pfn,
4278 npages, !freelist, 0);
4283 dma_free_pagelist(freelist);
4285 if (dmar_domain->max_addr == iova + size)
4286 dmar_domain->max_addr = iova;
4291 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4294 struct dmar_domain *dmar_domain = domain->priv;
4295 struct dma_pte *pte;
4299 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4301 phys = dma_pte_addr(pte);
4306 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4309 struct dmar_domain *dmar_domain = domain->priv;
4311 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4312 return dmar_domain->iommu_snooping;
4313 if (cap == IOMMU_CAP_INTR_REMAP)
4314 return irq_remapping_enabled;
4319 #define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
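/*
 * Peer-to-peer isolation requires ACS Source Validation, Request Redirect,
 * Completion Redirect and Upstream Forwarding; without all four enabled
 * along the path, devices are lumped into a shared IOMMU group below.
 */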
4321 static int intel_iommu_add_device(struct device *dev)
4323 struct pci_dev *pdev = to_pci_dev(dev);
4324 struct pci_dev *bridge, *dma_pdev = NULL;
4325 struct iommu_group *group;
4328 if (!device_to_iommu(pci_domain_nr(pdev->bus),
4329 pdev->bus->number, pdev->devfn))
4332 bridge = pci_find_upstream_pcie_bridge(pdev);
4334 if (pci_is_pcie(bridge))
4335 dma_pdev = pci_get_domain_bus_and_slot(
4336 pci_domain_nr(pdev->bus),
4337 bridge->subordinate->number, 0);
4339 dma_pdev = pci_dev_get(bridge);
4341 dma_pdev = pci_dev_get(pdev);
4343 /* Account for quirked devices */
4344 swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4347 * If it's a multifunction device that does not support our
4348 * required ACS flags, add it to the same group as the lowest-
4349 * numbered function that also does not support the required ACS flags.
4351 if (dma_pdev->multifunction &&
4352 !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4353 u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4355 for (i = 0; i < 8; i++) {
4356 struct pci_dev *tmp;
4358 tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4362 if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4363 swap_pci_ref(&dma_pdev, tmp);
4371 * Devices on the root bus go through the iommu. If that's not us,
4372 * find the next upstream device and test ACS up to the root bus.
4373 * Finding the next device may require skipping virtual buses.
4375 while (!pci_is_root_bus(dma_pdev->bus)) {
4376 struct pci_bus *bus = dma_pdev->bus;
4378 while (!bus->self) {
4379 if (!pci_is_root_bus(bus))
4385 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4388 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4392 group = iommu_group_get(&dma_pdev->dev);
4393 pci_dev_put(dma_pdev);
4395 group = iommu_group_alloc();
4397 return PTR_ERR(group);
4400 ret = iommu_group_add_device(group, dev);
4402 iommu_group_put(group);
4406 static void intel_iommu_remove_device(struct device *dev)
4408 iommu_group_remove_device(dev);
4411 static struct iommu_ops intel_iommu_ops = {
4412 .domain_init = intel_iommu_domain_init,
4413 .domain_destroy = intel_iommu_domain_destroy,
4414 .attach_dev = intel_iommu_attach_device,
4415 .detach_dev = intel_iommu_detach_device,
4416 .map = intel_iommu_map,
4417 .unmap = intel_iommu_unmap,
4418 .iova_to_phys = intel_iommu_iova_to_phys,
4419 .domain_has_cap = intel_iommu_domain_has_cap,
4420 .add_device = intel_iommu_add_device,
4421 .remove_device = intel_iommu_remove_device,
4422 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4425 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4427 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4428 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4432 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4433 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4434 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4435 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4436 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4437 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4438 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4440 static void quirk_iommu_rwbf(struct pci_dev *dev)
4443 * Mobile 4 Series Chipset neglects to set RWBF capability,
4444 * but needs it. Same seems to hold for the desktop versions.
4446 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4450 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4451 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4452 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4453 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4454 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4455 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4456 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4459 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4460 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4461 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4462 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4463 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4464 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4465 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4466 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4468 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4472 if (pci_read_config_word(dev, GGC, &ggc))
4475 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4476 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4478 } else if (dmar_map_gfx) {
4479 /* we have to ensure the gfx device is idle before we flush */
4480 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4481 intel_iommu_strict = 1;
4484 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4485 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4486 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4487 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4489 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4490 ISOCH DMAR unit for the Azalia sound device, but not give it any
4491 TLB entries, which causes it to deadlock. Check for that. We do
4492 this in a function called from init_dmars(), instead of in a PCI
4493 quirk, because we don't want to print the obnoxious "BIOS broken"
4494 message if VT-d is actually disabled.
4496 static void __init check_tylersburg_isoch(void)
4498 struct pci_dev *pdev;
4499 uint32_t vtisochctrl;
4501 /* If there's no Azalia in the system anyway, forget it. */
4502 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4507 /* System Management Registers. Might be hidden, in which case
4508 we can't do the sanity check. But that's OK, because the
4509 known-broken BIOSes _don't_ actually hide it, so far. */
4510 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4514 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4521 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4522 if (vtisochctrl & 1)
4525 /* Drop all bits other than the number of TLB entries */
4526 vtisochctrl &= 0x1c;
4528 /* If we have the recommended number of TLB entries (16), fine. */
4529 if (vtisochctrl == 0x10)
4532 /* Zero TLB entries? You get to ride the short bus to school. */
4534 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4535 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4536 dmi_get_system_info(DMI_BIOS_VENDOR),
4537 dmi_get_system_info(DMI_BIOS_VERSION),
4538 dmi_get_system_info(DMI_PRODUCT_VERSION));
4539 iommu_identity_mapping |= IDENTMAP_AZALIA;
4543 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",