2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <asm/irq_remapping.h>
43 #include <asm/cacheflush.h>
44 #include <asm/iommu.h>
46 #include "irq_remapping.h"
49 #define ROOT_SIZE VTD_PAGE_SIZE
50 #define CONTEXT_SIZE VTD_PAGE_SIZE
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
56 #define IOAPIC_RANGE_START (0xfee00000)
57 #define IOAPIC_RANGE_END (0xfeefffff)
58 #define IOVA_START_ADDR (0x1000)
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
62 #define MAX_AGAW_WIDTH 64
63 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
65 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
66 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
68 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
69 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
70 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
71 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
72 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
74 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
75 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
76 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
78 /* page table handling */
79 #define LEVEL_STRIDE (9)
80 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
83 * This bitmap is used to advertise the page sizes our hardware supports
84 * to the IOMMU core, which will then use this information to split
85 * physically contiguous memory regions it is mapping into page sizes that we support.
88 * Traditionally the IOMMU core just handed us the mappings directly,
89 * after making sure the size is an order of a 4KiB page and that the
90 * mapping has natural alignment.
92 * To retain this behavior, we currently advertise that we support
93 * all page sizes that are an order of 4KiB.
95 * If at some point we'd like to utilize the IOMMU core's new behavior,
96 * we could change this to advertise the real page sizes we support.
98 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
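/*
 * Illustrative sketch (not part of the original driver): with the mask
 * above, any power-of-two size of at least 4KiB tests as a supported
 * page size, which mirrors the guarantee described in the comment above.
 */
static inline bool intel_iommu_pgsize_ok(unsigned long size)
{
	/* power of two, and its bit is set in INTEL_IOMMU_PGSIZES */
	return size && !(size & (size - 1)) && (size & INTEL_IOMMU_PGSIZES);
}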
100 static inline int agaw_to_level(int agaw)
105 static inline int agaw_to_width(int agaw)
107 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
110 static inline int width_to_agaw(int width)
112 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
115 static inline unsigned int level_to_offset_bits(int level)
117 return (level - 1) * LEVEL_STRIDE;
120 static inline int pfn_level_offset(unsigned long pfn, int level)
122 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
125 static inline unsigned long level_mask(int level)
127 return -1UL << level_to_offset_bits(level);
130 static inline unsigned long level_size(int level)
132 return 1UL << level_to_offset_bits(level);
135 static inline unsigned long align_to_level(unsigned long pfn, int level)
137 return (pfn + level_size(level) - 1) & level_mask(level);
140 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
142 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
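/*
 * Illustrative sketch (not part of the original source): for the default
 * 48-bit guest address width, width_to_agaw(48) == DIV_ROUND_UP(18, 9) == 2,
 * which with agaw_to_level() above corresponds to a 4-level page table.
 */
static inline int default_width_agaw_example(void)
{
	return width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);	/* == 2 */
}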
145 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
146 are never going to work. */
147 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
149 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
152 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
154 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 static inline unsigned long page_to_dma_pfn(struct page *pg)
158 return mm_to_dma_pfn(page_to_pfn(pg));
160 static inline unsigned long virt_to_dma_pfn(void *p)
162 return page_to_dma_pfn(virt_to_page(p));
165 /* global iommu list, set NULL for ignored DMAR units */
166 static struct intel_iommu **g_iommus;
168 static void __init check_tylersburg_isoch(void);
169 static int rwbf_quirk;
172 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
173 * (used when the kernel is launched w/ TXT)
175 static int force_on = 0;
180 * 12-63: Context Ptr (12 - (haw-1))
187 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 static inline bool root_present(struct root_entry *root)
190 return (root->val & 1);
192 static inline void set_root_present(struct root_entry *root)
196 static inline void set_root_value(struct root_entry *root, unsigned long value)
198 root->val |= value & VTD_PAGE_MASK;
201 static inline struct context_entry *
202 get_context_addr_from_root(struct root_entry *root)
204 return (struct context_entry *)
205 (root_present(root)?phys_to_virt(
206 root->val & VTD_PAGE_MASK) :
213 * 1: fault processing disable
214 * 2-3: translation type
215 * 12-63: address space root
221 struct context_entry {
226 static inline bool context_present(struct context_entry *context)
228 return (context->lo & 1);
230 static inline void context_set_present(struct context_entry *context)
235 static inline void context_set_fault_enable(struct context_entry *context)
237 context->lo &= (((u64)-1) << 2) | 1;
240 static inline void context_set_translation_type(struct context_entry *context,
243 context->lo &= (((u64)-1) << 4) | 3;
244 context->lo |= (value & 3) << 2;
247 static inline void context_set_address_root(struct context_entry *context,
250 context->lo |= value & VTD_PAGE_MASK;
253 static inline void context_set_address_width(struct context_entry *context,
256 context->hi |= value & 7;
259 static inline void context_set_domain_id(struct context_entry *context,
262 context->hi |= (value & ((1 << 16) - 1)) << 8;
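/*
 * Illustrative sketch (not part of the original driver): the order in
 * which a context entry is normally composed from the helpers above;
 * compare with domain_context_mapping_one() later in this file.
 */
static inline void context_compose_example(struct context_entry *context,
					   u64 pgd_phys, int agaw, int did)
{
	context_set_domain_id(context, did);
	context_set_address_root(context, pgd_phys);
	context_set_address_width(context, agaw);
	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(context);
	context_set_present(context);
}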
265 static inline void context_clear_entry(struct context_entry *context)
278 * 12-63: Host physical address
284 static inline void dma_clear_pte(struct dma_pte *pte)
289 static inline u64 dma_pte_addr(struct dma_pte *pte)
292 return pte->val & VTD_PAGE_MASK;
294 /* Must have a full atomic 64-bit read */
295 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
299 static inline bool dma_pte_present(struct dma_pte *pte)
301 return (pte->val & 3) != 0;
304 static inline bool dma_pte_superpage(struct dma_pte *pte)
306 return (pte->val & (1 << 7));
309 static inline int first_pte_in_page(struct dma_pte *pte)
311 return !((unsigned long)pte & ~VTD_PAGE_MASK);
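/*
 * Illustrative sketch (not part of the original code): pull the host
 * page frame number out of a present leaf PTE using the helpers above.
 */
static inline unsigned long dma_pte_to_hpfn_example(struct dma_pte *pte)
{
	if (!dma_pte_present(pte))
		return 0;
	/* bits 12-63 of the PTE hold the host physical address */
	return (unsigned long)(dma_pte_addr(pte) >> VTD_PAGE_SHIFT);
}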
315 * This domain is a static identity mapping domain.
316 * 1. This domain creates a static 1:1 mapping to all usable memory.
317 * 2. It maps to each iommu if successful.
318 * 3. Each iommu maps to this domain if successful.
320 static struct dmar_domain *si_domain;
321 static int hw_pass_through = 1;
323 /* devices under the same p2p bridge are owned in one domain */
324 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
326 /* domain represents a virtual machine, more than one devices
327 * across iommus may be owned in one domain, e.g. kvm guest.
329 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
331 /* si_domain contains multiple devices */
332 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
334 /* define the limit of IOMMUs supported in each domain */
336 # define IOMMU_UNITS_SUPPORTED MAX_IO_APICS
338 # define IOMMU_UNITS_SUPPORTED 64
342 int id; /* domain id */
343 int nid; /* node id */
344 DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
345 /* bitmap of iommus this domain uses*/
347 struct list_head devices; /* all devices' list */
348 struct iova_domain iovad; /* iova's that belong to this domain */
350 struct dma_pte *pgd; /* virtual address */
351 int gaw; /* max guest address width */
353 /* adjusted guest address width, 0 is level 2 30-bit */
356 int flags; /* flags to find out type of domain */
358 int iommu_coherency;/* indicate coherency of iommu access */
359 int iommu_snooping; /* indicate snooping control feature*/
360 int iommu_count; /* reference count of iommu */
361 int iommu_superpage;/* Level of superpages supported:
362 0 == 4KiB (no superpages), 1 == 2MiB,
363 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
364 spinlock_t iommu_lock; /* protect iommu set in domain */
365 u64 max_addr; /* maximum mapped address */
368 /* PCI domain-device relationship */
369 struct device_domain_info {
370 struct list_head link; /* link to domain siblings */
371 struct list_head global; /* link to global list */
372 int segment; /* PCI domain */
373 u8 bus; /* PCI bus number */
374 u8 devfn; /* PCI devfn number */
375 struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
376 struct intel_iommu *iommu; /* IOMMU used by this device */
377 struct dmar_domain *domain; /* pointer to domain */
380 struct dmar_rmrr_unit {
381 struct list_head list; /* list of rmrr units */
382 struct acpi_dmar_header *hdr; /* ACPI header */
383 u64 base_address; /* reserved base address*/
384 u64 end_address; /* reserved end address */
385 struct dmar_dev_scope *devices; /* target devices */
386 int devices_cnt; /* target device count */
389 struct dmar_atsr_unit {
390 struct list_head list; /* list of ATSR units */
391 struct acpi_dmar_header *hdr; /* ACPI header */
392 struct dmar_dev_scope *devices; /* target devices */
393 int devices_cnt; /* target device count */
394 u8 include_all:1; /* include all ports */
397 static LIST_HEAD(dmar_atsr_units);
398 static LIST_HEAD(dmar_rmrr_units);
400 #define for_each_rmrr_units(rmrr) \
401 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
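/*
 * Illustrative usage sketch (not part of the original driver): walking
 * every RMRR unit with the helper macro above.
 */
static inline void dump_rmrr_ranges_example(void)
{
	struct dmar_rmrr_unit *rmrr;

	for_each_rmrr_units(rmrr)
		pr_debug("RMRR: 0x%llx - 0x%llx\n",
			 (unsigned long long)rmrr->base_address,
			 (unsigned long long)rmrr->end_address);
}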
403 static void flush_unmaps_timeout(unsigned long data);
405 static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
407 #define HIGH_WATER_MARK 250
408 struct deferred_flush_tables {
410 struct iova *iova[HIGH_WATER_MARK];
411 struct dmar_domain *domain[HIGH_WATER_MARK];
412 struct page *freelist[HIGH_WATER_MARK];
415 static struct deferred_flush_tables *deferred_flush;
417 /* bitmap for indexing intel_iommus */
418 static int g_num_of_iommus;
420 static DEFINE_SPINLOCK(async_umap_flush_lock);
421 static LIST_HEAD(unmaps_to_do);
424 static long list_size;
426 static void domain_exit(struct dmar_domain *domain);
427 static void domain_remove_dev_info(struct dmar_domain *domain);
428 static void domain_remove_one_dev_info(struct dmar_domain *domain,
429 struct pci_dev *pdev);
430 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
431 struct pci_dev *pdev);
433 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
434 int dmar_disabled = 0;
436 int dmar_disabled = 1;
437 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
439 int intel_iommu_enabled = 0;
440 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
442 static int dmar_map_gfx = 1;
443 static int dmar_forcedac;
444 static int intel_iommu_strict;
445 static int intel_iommu_superpage = 1;
447 int intel_iommu_gfx_mapped;
448 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
450 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
451 static DEFINE_SPINLOCK(device_domain_lock);
452 static LIST_HEAD(device_domain_list);
454 static struct iommu_ops intel_iommu_ops;
456 static int __init intel_iommu_setup(char *str)
461 if (!strncmp(str, "on", 2)) {
463 printk(KERN_INFO "Intel-IOMMU: enabled\n");
464 } else if (!strncmp(str, "off", 3)) {
466 printk(KERN_INFO "Intel-IOMMU: disabled\n");
467 } else if (!strncmp(str, "igfx_off", 8)) {
470 "Intel-IOMMU: disable GFX device mapping\n");
471 } else if (!strncmp(str, "forcedac", 8)) {
473 "Intel-IOMMU: Forcing DAC for PCI devices\n");
475 } else if (!strncmp(str, "strict", 6)) {
477 "Intel-IOMMU: disable batched IOTLB flush\n");
478 intel_iommu_strict = 1;
479 } else if (!strncmp(str, "sp_off", 6)) {
481 "Intel-IOMMU: disable supported super page\n");
482 intel_iommu_superpage = 0;
485 str += strcspn(str, ",");
491 __setup("intel_iommu=", intel_iommu_setup);
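/*
 * Usage note (illustrative): the options parsed above combine on the
 * kernel command line as a comma-separated list, e.g.
 *
 *	intel_iommu=on,strict,sp_off
 *
 * which enables the IOMMU, disables batched IOTLB flushing and disables
 * superpage support.
 */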
493 static struct kmem_cache *iommu_domain_cache;
494 static struct kmem_cache *iommu_devinfo_cache;
495 static struct kmem_cache *iommu_iova_cache;
497 static inline void *alloc_pgtable_page(int node)
502 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
504 vaddr = page_address(page);
508 static inline void free_pgtable_page(void *vaddr)
510 free_page((unsigned long)vaddr);
513 static inline void *alloc_domain_mem(void)
515 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
518 static void free_domain_mem(void *vaddr)
520 kmem_cache_free(iommu_domain_cache, vaddr);
523 static inline void * alloc_devinfo_mem(void)
525 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
528 static inline void free_devinfo_mem(void *vaddr)
530 kmem_cache_free(iommu_devinfo_cache, vaddr);
533 struct iova *alloc_iova_mem(void)
535 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
538 void free_iova_mem(struct iova *iova)
540 kmem_cache_free(iommu_iova_cache, iova);
544 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
549 sagaw = cap_sagaw(iommu->cap);
550 for (agaw = width_to_agaw(max_gaw);
552 if (test_bit(agaw, &sagaw))
560 * Calculate max SAGAW for each iommu.
562 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
564 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
568 * calculate agaw for each iommu.
569 * "SAGAW" may be different across iommus; use a default agaw, and
570 * fall back to a smaller supported agaw for iommus that don't support the default.
572 int iommu_calculate_agaw(struct intel_iommu *iommu)
574 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
577 /* This function only returns a single iommu in a domain */
578 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
582 /* si_domain and vm domain should not get here. */
583 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
584 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
586 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
587 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
590 return g_iommus[iommu_id];
593 static void domain_update_iommu_coherency(struct dmar_domain *domain)
595 struct dmar_drhd_unit *drhd;
596 struct intel_iommu *iommu;
599 domain->iommu_coherency = 1;
601 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
603 if (!ecap_coherent(g_iommus[i]->ecap)) {
604 domain->iommu_coherency = 0;
611 /* No hardware attached; use lowest common denominator */
613 for_each_active_iommu(iommu, drhd) {
614 if (!ecap_coherent(iommu->ecap)) {
615 domain->iommu_coherency = 0;
622 static void domain_update_iommu_snooping(struct dmar_domain *domain)
626 domain->iommu_snooping = 1;
628 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
629 if (!ecap_sc_support(g_iommus[i]->ecap)) {
630 domain->iommu_snooping = 0;
636 static void domain_update_iommu_superpage(struct dmar_domain *domain)
638 struct dmar_drhd_unit *drhd;
639 struct intel_iommu *iommu = NULL;
642 if (!intel_iommu_superpage) {
643 domain->iommu_superpage = 0;
647 /* set iommu_superpage to the smallest common denominator */
649 for_each_active_iommu(iommu, drhd) {
650 mask &= cap_super_page_val(iommu->cap);
657 domain->iommu_superpage = fls(mask);
660 /* Some capabilities may be different across iommus */
661 static void domain_update_iommu_cap(struct dmar_domain *domain)
663 domain_update_iommu_coherency(domain);
664 domain_update_iommu_snooping(domain);
665 domain_update_iommu_superpage(domain);
668 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
670 struct dmar_drhd_unit *drhd = NULL;
671 struct intel_iommu *iommu;
673 struct pci_dev *pdev;
677 for_each_active_iommu(iommu, drhd) {
678 if (segment != drhd->segment)
681 for_each_active_dev_scope(drhd->devices,
682 drhd->devices_cnt, i, dev) {
683 if (!dev_is_pci(dev))
685 pdev = to_pci_dev(dev);
686 if (pdev->bus->number == bus && pdev->devfn == devfn)
688 if (pdev->subordinate &&
689 pdev->subordinate->number <= bus &&
690 pdev->subordinate->busn_res.end >= bus)
694 if (drhd->include_all)
704 static void domain_flush_cache(struct dmar_domain *domain,
705 void *addr, int size)
707 if (!domain->iommu_coherency)
708 clflush_cache_range(addr, size);
711 /* Gets context entry for a given bus and devfn */
712 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
715 struct root_entry *root;
716 struct context_entry *context;
717 unsigned long phy_addr;
720 spin_lock_irqsave(&iommu->lock, flags);
721 root = &iommu->root_entry[bus];
722 context = get_context_addr_from_root(root);
724 context = (struct context_entry *)
725 alloc_pgtable_page(iommu->node);
727 spin_unlock_irqrestore(&iommu->lock, flags);
730 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
731 phy_addr = virt_to_phys((void *)context);
732 set_root_value(root, phy_addr);
733 set_root_present(root);
734 __iommu_flush_cache(iommu, root, sizeof(*root));
736 spin_unlock_irqrestore(&iommu->lock, flags);
737 return &context[devfn];
740 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
742 struct root_entry *root;
743 struct context_entry *context;
747 spin_lock_irqsave(&iommu->lock, flags);
748 root = &iommu->root_entry[bus];
749 context = get_context_addr_from_root(root);
754 ret = context_present(&context[devfn]);
756 spin_unlock_irqrestore(&iommu->lock, flags);
760 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
762 struct root_entry *root;
763 struct context_entry *context;
766 spin_lock_irqsave(&iommu->lock, flags);
767 root = &iommu->root_entry[bus];
768 context = get_context_addr_from_root(root);
770 context_clear_entry(&context[devfn]);
771 __iommu_flush_cache(iommu, &context[devfn], \
774 spin_unlock_irqrestore(&iommu->lock, flags);
777 static void free_context_table(struct intel_iommu *iommu)
779 struct root_entry *root;
782 struct context_entry *context;
784 spin_lock_irqsave(&iommu->lock, flags);
785 if (!iommu->root_entry) {
788 for (i = 0; i < ROOT_ENTRY_NR; i++) {
789 root = &iommu->root_entry[i];
790 context = get_context_addr_from_root(root);
792 free_pgtable_page(context);
794 free_pgtable_page(iommu->root_entry);
795 iommu->root_entry = NULL;
797 spin_unlock_irqrestore(&iommu->lock, flags);
800 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
801 unsigned long pfn, int *target_level)
803 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
804 struct dma_pte *parent, *pte = NULL;
805 int level = agaw_to_level(domain->agaw);
808 BUG_ON(!domain->pgd);
810 if (addr_width < BITS_PER_LONG && pfn >> addr_width)
811 /* Address beyond IOMMU's addressing capabilities. */
814 parent = domain->pgd;
819 offset = pfn_level_offset(pfn, level);
820 pte = &parent[offset];
821 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
823 if (level == *target_level)
826 if (!dma_pte_present(pte)) {
829 tmp_page = alloc_pgtable_page(domain->nid);
834 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
835 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
836 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
837 /* Someone else set it while we were thinking; use theirs. */
838 free_pgtable_page(tmp_page);
841 domain_flush_cache(domain, pte, sizeof(*pte));
847 parent = phys_to_virt(dma_pte_addr(pte));
852 *target_level = level;
858 /* return address's pte at specific level */
859 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
861 int level, int *large_page)
863 struct dma_pte *parent, *pte = NULL;
864 int total = agaw_to_level(domain->agaw);
867 parent = domain->pgd;
868 while (level <= total) {
869 offset = pfn_level_offset(pfn, total);
870 pte = &parent[offset];
874 if (!dma_pte_present(pte)) {
879 if (pte->val & DMA_PTE_LARGE_PAGE) {
884 parent = phys_to_virt(dma_pte_addr(pte));
890 /* clear last level pte; a tlb flush should follow */
891 static void dma_pte_clear_range(struct dmar_domain *domain,
892 unsigned long start_pfn,
893 unsigned long last_pfn)
895 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
896 unsigned int large_page = 1;
897 struct dma_pte *first_pte, *pte;
899 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
900 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
901 BUG_ON(start_pfn > last_pfn);
903 /* we don't need lock here; nobody else touches the iova range */
906 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
908 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
913 start_pfn += lvl_to_nr_pages(large_page);
915 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
917 domain_flush_cache(domain, first_pte,
918 (void *)pte - (void *)first_pte);
920 } while (start_pfn && start_pfn <= last_pfn);
923 static void dma_pte_free_level(struct dmar_domain *domain, int level,
924 struct dma_pte *pte, unsigned long pfn,
925 unsigned long start_pfn, unsigned long last_pfn)
927 pfn = max(start_pfn, pfn);
928 pte = &pte[pfn_level_offset(pfn, level)];
931 unsigned long level_pfn;
932 struct dma_pte *level_pte;
934 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
937 level_pfn = pfn & level_mask(level - 1);
938 level_pte = phys_to_virt(dma_pte_addr(pte));
941 dma_pte_free_level(domain, level - 1, level_pte,
942 level_pfn, start_pfn, last_pfn);
944 /* If range covers entire pagetable, free it */
945 if (!(start_pfn > level_pfn ||
946 last_pfn < level_pfn + level_size(level) - 1)) {
948 domain_flush_cache(domain, pte, sizeof(*pte));
949 free_pgtable_page(level_pte);
952 pfn += level_size(level);
953 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
956 /* free page table pages. last level pte should already be cleared */
957 static void dma_pte_free_pagetable(struct dmar_domain *domain,
958 unsigned long start_pfn,
959 unsigned long last_pfn)
961 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
963 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
964 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
965 BUG_ON(start_pfn > last_pfn);
967 /* We don't need lock here; nobody else touches the iova range */
968 dma_pte_free_level(domain, agaw_to_level(domain->agaw),
969 domain->pgd, 0, start_pfn, last_pfn);
972 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
973 free_pgtable_page(domain->pgd);
978 /* When a page at a given level is being unlinked from its parent, we don't
979 need to *modify* it at all. All we need to do is make a list of all the
980 pages which can be freed just as soon as we've flushed the IOTLB and we
981 know the hardware page-walk will no longer touch them.
982 The 'pte' argument is the *parent* PTE, pointing to the page that is to
984 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
985 int level, struct dma_pte *pte,
986 struct page *freelist)
990 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
991 pg->freelist = freelist;
997 for (pte = page_address(pg); !first_pte_in_page(pte); pte++) {
998 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
999 freelist = dma_pte_list_pagetables(domain, level - 1,
1006 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1007 struct dma_pte *pte, unsigned long pfn,
1008 unsigned long start_pfn,
1009 unsigned long last_pfn,
1010 struct page *freelist)
1012 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1014 pfn = max(start_pfn, pfn);
1015 pte = &pte[pfn_level_offset(pfn, level)];
1018 unsigned long level_pfn;
1020 if (!dma_pte_present(pte))
1023 level_pfn = pfn & level_mask(level);
1025 /* If range covers entire pagetable, free it */
1026 if (start_pfn <= level_pfn &&
1027 last_pfn >= level_pfn + level_size(level) - 1) {
1028 /* These subordinate page tables are going away entirely. Don't
1029 bother to clear them; we're just going to *free* them. */
1030 if (level > 1 && !dma_pte_superpage(pte))
1031 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1037 } else if (level > 1) {
1038 /* Recurse down into a level that isn't *entirely* obsolete */
1039 freelist = dma_pte_clear_level(domain, level - 1,
1040 phys_to_virt(dma_pte_addr(pte)),
1041 level_pfn, start_pfn, last_pfn,
1045 pfn += level_size(level);
1046 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1049 domain_flush_cache(domain, first_pte,
1050 (void *)++last_pte - (void *)first_pte);
1055 /* We can't just free the pages because the IOMMU may still be walking
1056 the page tables, and may have cached the intermediate levels. The
1057 pages can only be freed after the IOTLB flush has been done. */
1058 struct page *domain_unmap(struct dmar_domain *domain,
1059 unsigned long start_pfn,
1060 unsigned long last_pfn)
1062 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1063 struct page *freelist = NULL;
1065 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
1066 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
1067 BUG_ON(start_pfn > last_pfn);
1069 /* we don't need lock here; nobody else touches the iova range */
1070 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1071 domain->pgd, 0, start_pfn, last_pfn, NULL);
1074 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1075 struct page *pgd_page = virt_to_page(domain->pgd);
1076 pgd_page->freelist = freelist;
1077 freelist = pgd_page;
1085 void dma_free_pagelist(struct page *freelist)
1089 while ((pg = freelist)) {
1090 freelist = pg->freelist;
1091 free_pgtable_page(page_address(pg));
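/*
 * Illustrative sketch (not part of the original code): pages returned by
 * domain_unmap() may only be handed back after the IOTLB (and any device
 * IOTLBs) have been flushed, otherwise the hardware may still walk them.
 */
static inline void unmap_flush_free_example(struct dmar_domain *domain,
					    unsigned long start_pfn,
					    unsigned long last_pfn)
{
	struct page *freelist = domain_unmap(domain, start_pfn, last_pfn);

	/* ... flush the IOTLB for [start_pfn, last_pfn] here ... */

	dma_free_pagelist(freelist);
}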
1095 /* iommu handling */
1096 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1098 struct root_entry *root;
1099 unsigned long flags;
1101 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1105 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1107 spin_lock_irqsave(&iommu->lock, flags);
1108 iommu->root_entry = root;
1109 spin_unlock_irqrestore(&iommu->lock, flags);
1114 static void iommu_set_root_entry(struct intel_iommu *iommu)
1120 addr = iommu->root_entry;
1122 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1123 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1125 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1127 /* Make sure hardware completes it */
1128 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1129 readl, (sts & DMA_GSTS_RTPS), sts);
1131 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1134 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1139 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1142 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1143 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1145 /* Make sure hardware completes it */
1146 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1147 readl, (!(val & DMA_GSTS_WBFS)), val);
1149 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1152 /* return value determines whether we need a write buffer flush */
1153 static void __iommu_flush_context(struct intel_iommu *iommu,
1154 u16 did, u16 source_id, u8 function_mask,
1161 case DMA_CCMD_GLOBAL_INVL:
1162 val = DMA_CCMD_GLOBAL_INVL;
1164 case DMA_CCMD_DOMAIN_INVL:
1165 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1167 case DMA_CCMD_DEVICE_INVL:
1168 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1169 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1174 val |= DMA_CCMD_ICC;
1176 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1177 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1179 /* Make sure hardware completes it */
1180 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1181 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1183 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1186 /* return value determines whether we need a write buffer flush */
1187 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1188 u64 addr, unsigned int size_order, u64 type)
1190 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1191 u64 val = 0, val_iva = 0;
1195 case DMA_TLB_GLOBAL_FLUSH:
1196 /* global flush doesn't need to set IVA_REG */
1197 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1199 case DMA_TLB_DSI_FLUSH:
1200 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1202 case DMA_TLB_PSI_FLUSH:
1203 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1204 /* IH bit is passed in as part of address */
1205 val_iva = size_order | addr;
1210 /* Note: set drain read/write */
1213 * This is probably meant to be super-secure. Looks like we can
1214 * ignore it without any impact.
1216 if (cap_read_drain(iommu->cap))
1217 val |= DMA_TLB_READ_DRAIN;
1219 if (cap_write_drain(iommu->cap))
1220 val |= DMA_TLB_WRITE_DRAIN;
1222 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1223 /* Note: Only uses first TLB reg currently */
1225 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1226 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1228 /* Make sure hardware completes it */
1229 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1230 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1232 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1234 /* check IOTLB invalidation granularity */
1235 if (DMA_TLB_IAIG(val) == 0)
1236 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1237 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1238 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1239 (unsigned long long)DMA_TLB_IIRG(type),
1240 (unsigned long long)DMA_TLB_IAIG(val));
1243 static struct device_domain_info *iommu_support_dev_iotlb(
1244 struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1247 unsigned long flags;
1248 struct device_domain_info *info;
1249 struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1251 if (!ecap_dev_iotlb_support(iommu->ecap))
1257 spin_lock_irqsave(&device_domain_lock, flags);
1258 list_for_each_entry(info, &domain->devices, link)
1259 if (info->bus == bus && info->devfn == devfn) {
1263 spin_unlock_irqrestore(&device_domain_lock, flags);
1265 if (!found || !info->dev)
1268 if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1271 if (!dmar_find_matched_atsr_unit(info->dev))
1274 info->iommu = iommu;
1279 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1284 pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1287 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1289 if (!info->dev || !pci_ats_enabled(info->dev))
1292 pci_disable_ats(info->dev);
1295 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1296 u64 addr, unsigned mask)
1299 unsigned long flags;
1300 struct device_domain_info *info;
1302 spin_lock_irqsave(&device_domain_lock, flags);
1303 list_for_each_entry(info, &domain->devices, link) {
1304 if (!info->dev || !pci_ats_enabled(info->dev))
1307 sid = info->bus << 8 | info->devfn;
1308 qdep = pci_ats_queue_depth(info->dev);
1309 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1311 spin_unlock_irqrestore(&device_domain_lock, flags);
1314 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1315 unsigned long pfn, unsigned int pages, int ih, int map)
1317 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1318 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1325 * Fallback to domain selective flush if no PSI support or the size is too big.
1327 * PSI requires page size to be 2 ^ x, and the base address is naturally
1328 * aligned to the size
1330 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1331 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1334 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1338 * In caching mode, changes of pages from non-present to present require
1339 * flush. However, device IOTLB doesn't need to be flushed in this case.
1341 if (!cap_caching_mode(iommu->cap) || !map)
1342 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1345 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1348 unsigned long flags;
1350 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1351 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1352 pmen &= ~DMA_PMEN_EPM;
1353 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1355 /* wait for the protected region status bit to clear */
1356 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1357 readl, !(pmen & DMA_PMEN_PRS), pmen);
1359 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1362 static int iommu_enable_translation(struct intel_iommu *iommu)
1365 unsigned long flags;
1367 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1368 iommu->gcmd |= DMA_GCMD_TE;
1369 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1371 /* Make sure hardware completes it */
1372 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1373 readl, (sts & DMA_GSTS_TES), sts);
1375 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1379 static int iommu_disable_translation(struct intel_iommu *iommu)
1384 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1385 iommu->gcmd &= ~DMA_GCMD_TE;
1386 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1388 /* Make sure hardware completes it */
1389 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1390 readl, (!(sts & DMA_GSTS_TES)), sts);
1392 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1397 static int iommu_init_domains(struct intel_iommu *iommu)
1399 unsigned long ndomains;
1400 unsigned long nlongs;
1402 ndomains = cap_ndoms(iommu->cap);
1403 pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1404 iommu->seq_id, ndomains);
1405 nlongs = BITS_TO_LONGS(ndomains);
1407 spin_lock_init(&iommu->lock);
1409 /* TBD: there might be 64K domains,
1410 * consider other allocation for future chip
1412 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1413 if (!iommu->domain_ids) {
1414 pr_err("IOMMU%d: allocating domain id array failed\n",
1418 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1420 if (!iommu->domains) {
1421 pr_err("IOMMU%d: allocating domain array failed\n",
1423 kfree(iommu->domain_ids);
1424 iommu->domain_ids = NULL;
1429 * if Caching mode is set, then invalid translations are tagged
1430 * with domain id 0. Hence we need to pre-allocate it.
1432 if (cap_caching_mode(iommu->cap))
1433 set_bit(0, iommu->domain_ids);
1437 static void free_dmar_iommu(struct intel_iommu *iommu)
1439 struct dmar_domain *domain;
1441 unsigned long flags;
1443 if ((iommu->domains) && (iommu->domain_ids)) {
1444 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1446 * Domain id 0 is reserved for invalid translation
1447 * if hardware supports caching mode.
1449 if (cap_caching_mode(iommu->cap) && i == 0)
1452 domain = iommu->domains[i];
1453 clear_bit(i, iommu->domain_ids);
1455 spin_lock_irqsave(&domain->iommu_lock, flags);
1456 count = --domain->iommu_count;
1457 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1459 domain_exit(domain);
1463 if (iommu->gcmd & DMA_GCMD_TE)
1464 iommu_disable_translation(iommu);
1466 kfree(iommu->domains);
1467 kfree(iommu->domain_ids);
1468 iommu->domains = NULL;
1469 iommu->domain_ids = NULL;
1471 g_iommus[iommu->seq_id] = NULL;
1473 /* free context mapping */
1474 free_context_table(iommu);
1477 static struct dmar_domain *alloc_domain(bool vm)
1479 /* domain id for virtual machine, it won't be set in context */
1480 static atomic_t vm_domid = ATOMIC_INIT(0);
1481 struct dmar_domain *domain;
1483 domain = alloc_domain_mem();
1488 domain->iommu_count = 0;
1489 memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1491 spin_lock_init(&domain->iommu_lock);
1492 INIT_LIST_HEAD(&domain->devices);
1494 domain->id = atomic_inc_return(&vm_domid);
1495 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
1501 static int iommu_attach_domain(struct dmar_domain *domain,
1502 struct intel_iommu *iommu)
1505 unsigned long ndomains;
1506 unsigned long flags;
1508 ndomains = cap_ndoms(iommu->cap);
1510 spin_lock_irqsave(&iommu->lock, flags);
1512 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1513 if (num >= ndomains) {
1514 spin_unlock_irqrestore(&iommu->lock, flags);
1515 printk(KERN_ERR "IOMMU: no free domain ids\n");
1520 domain->iommu_count++;
1521 set_bit(num, iommu->domain_ids);
1522 set_bit(iommu->seq_id, domain->iommu_bmp);
1523 iommu->domains[num] = domain;
1524 spin_unlock_irqrestore(&iommu->lock, flags);
1529 static void iommu_detach_domain(struct dmar_domain *domain,
1530 struct intel_iommu *iommu)
1532 unsigned long flags;
1535 spin_lock_irqsave(&iommu->lock, flags);
1536 ndomains = cap_ndoms(iommu->cap);
1537 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1538 if (iommu->domains[num] == domain) {
1539 clear_bit(num, iommu->domain_ids);
1540 iommu->domains[num] = NULL;
1544 spin_unlock_irqrestore(&iommu->lock, flags);
1547 static struct iova_domain reserved_iova_list;
1548 static struct lock_class_key reserved_rbtree_key;
1550 static int dmar_init_reserved_ranges(void)
1552 struct pci_dev *pdev = NULL;
1556 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1558 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1559 &reserved_rbtree_key);
1561 /* IOAPIC ranges shouldn't be accessed by DMA */
1562 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1563 IOVA_PFN(IOAPIC_RANGE_END));
1565 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1569 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1570 for_each_pci_dev(pdev) {
1573 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1574 r = &pdev->resource[i];
1575 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1577 iova = reserve_iova(&reserved_iova_list,
1581 printk(KERN_ERR "Reserve iova failed\n");
1589 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1591 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1594 static inline int guestwidth_to_adjustwidth(int gaw)
1597 int r = (gaw - 12) % 9;
1608 static int domain_init(struct dmar_domain *domain, int guest_width)
1610 struct intel_iommu *iommu;
1611 int adjust_width, agaw;
1612 unsigned long sagaw;
1614 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1615 domain_reserve_special_ranges(domain);
1617 /* calculate AGAW */
1618 iommu = domain_get_iommu(domain);
1619 if (guest_width > cap_mgaw(iommu->cap))
1620 guest_width = cap_mgaw(iommu->cap);
1621 domain->gaw = guest_width;
1622 adjust_width = guestwidth_to_adjustwidth(guest_width);
1623 agaw = width_to_agaw(adjust_width);
1624 sagaw = cap_sagaw(iommu->cap);
1625 if (!test_bit(agaw, &sagaw)) {
1626 /* hardware doesn't support it, choose a bigger one */
1627 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1628 agaw = find_next_bit(&sagaw, 5, agaw);
1632 domain->agaw = agaw;
1634 if (ecap_coherent(iommu->ecap))
1635 domain->iommu_coherency = 1;
1637 domain->iommu_coherency = 0;
1639 if (ecap_sc_support(iommu->ecap))
1640 domain->iommu_snooping = 1;
1642 domain->iommu_snooping = 0;
1644 if (intel_iommu_superpage)
1645 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1647 domain->iommu_superpage = 0;
1649 domain->nid = iommu->node;
1651 /* always allocate the top pgd */
1652 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1655 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1659 static void domain_exit(struct dmar_domain *domain)
1661 struct dmar_drhd_unit *drhd;
1662 struct intel_iommu *iommu;
1663 struct page *freelist = NULL;
1665 /* Domain 0 is reserved, so don't process it */
1669 /* Flush any lazy unmaps that may reference this domain */
1670 if (!intel_iommu_strict)
1671 flush_unmaps_timeout(0);
1673 /* remove associated devices */
1674 domain_remove_dev_info(domain);
1677 put_iova_domain(&domain->iovad);
1679 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1681 /* clear attached or cached domains */
1683 for_each_active_iommu(iommu, drhd)
1684 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1685 test_bit(iommu->seq_id, domain->iommu_bmp))
1686 iommu_detach_domain(domain, iommu);
1689 dma_free_pagelist(freelist);
1691 free_domain_mem(domain);
1694 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1695 u8 bus, u8 devfn, int translation)
1697 struct context_entry *context;
1698 unsigned long flags;
1699 struct intel_iommu *iommu;
1700 struct dma_pte *pgd;
1702 unsigned long ndomains;
1705 struct device_domain_info *info = NULL;
1707 pr_debug("Set context mapping for %02x:%02x.%d\n",
1708 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1710 BUG_ON(!domain->pgd);
1711 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1712 translation != CONTEXT_TT_MULTI_LEVEL);
1714 iommu = device_to_iommu(segment, bus, devfn);
1718 context = device_to_context_entry(iommu, bus, devfn);
1721 spin_lock_irqsave(&iommu->lock, flags);
1722 if (context_present(context)) {
1723 spin_unlock_irqrestore(&iommu->lock, flags);
1730 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1731 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1734 /* find an available domain id for this device in iommu */
1735 ndomains = cap_ndoms(iommu->cap);
1736 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1737 if (iommu->domains[num] == domain) {
1745 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1746 if (num >= ndomains) {
1747 spin_unlock_irqrestore(&iommu->lock, flags);
1748 printk(KERN_ERR "IOMMU: no free domain ids\n");
1752 set_bit(num, iommu->domain_ids);
1753 iommu->domains[num] = domain;
1757 /* Skip top levels of page tables for
1758 * iommu which has less agaw than default.
1759 * Unnecessary for PT mode.
1761 if (translation != CONTEXT_TT_PASS_THROUGH) {
1762 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1763 pgd = phys_to_virt(dma_pte_addr(pgd));
1764 if (!dma_pte_present(pgd)) {
1765 spin_unlock_irqrestore(&iommu->lock, flags);
1772 context_set_domain_id(context, id);
1774 if (translation != CONTEXT_TT_PASS_THROUGH) {
1775 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1776 translation = info ? CONTEXT_TT_DEV_IOTLB :
1777 CONTEXT_TT_MULTI_LEVEL;
1780 * In pass through mode, AW must be programmed to indicate the largest
1781 * AGAW value supported by hardware. And ASR is ignored by hardware.
1783 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1784 context_set_address_width(context, iommu->msagaw);
1786 context_set_address_root(context, virt_to_phys(pgd));
1787 context_set_address_width(context, iommu->agaw);
1790 context_set_translation_type(context, translation);
1791 context_set_fault_enable(context);
1792 context_set_present(context);
1793 domain_flush_cache(domain, context, sizeof(*context));
1796 * It's a non-present to present mapping. If hardware doesn't cache
1797 * non-present entries we only need to flush the write-buffer. If it
1798 * _does_ cache non-present entries, then it does so in the special
1799 * domain #0, which we have to flush:
1801 if (cap_caching_mode(iommu->cap)) {
1802 iommu->flush.flush_context(iommu, 0,
1803 (((u16)bus) << 8) | devfn,
1804 DMA_CCMD_MASK_NOBIT,
1805 DMA_CCMD_DEVICE_INVL);
1806 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1808 iommu_flush_write_buffer(iommu);
1810 iommu_enable_dev_iotlb(info);
1811 spin_unlock_irqrestore(&iommu->lock, flags);
1813 spin_lock_irqsave(&domain->iommu_lock, flags);
1814 if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1815 domain->iommu_count++;
1816 if (domain->iommu_count == 1)
1817 domain->nid = iommu->node;
1818 domain_update_iommu_cap(domain);
1820 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1825 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1829 struct pci_dev *tmp, *parent;
1831 ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1832 pdev->bus->number, pdev->devfn,
1837 /* dependent device mapping */
1838 tmp = pci_find_upstream_pcie_bridge(pdev);
1841 /* Secondary interface's bus number and devfn 0 */
1842 parent = pdev->bus->self;
1843 while (parent != tmp) {
1844 ret = domain_context_mapping_one(domain,
1845 pci_domain_nr(parent->bus),
1846 parent->bus->number,
1847 parent->devfn, translation);
1850 parent = parent->bus->self;
1852 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1853 return domain_context_mapping_one(domain,
1854 pci_domain_nr(tmp->subordinate),
1855 tmp->subordinate->number, 0,
1857 else /* this is a legacy PCI bridge */
1858 return domain_context_mapping_one(domain,
1859 pci_domain_nr(tmp->bus),
1865 static int domain_context_mapped(struct pci_dev *pdev)
1868 struct pci_dev *tmp, *parent;
1869 struct intel_iommu *iommu;
1871 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1876 ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1879 /* dependent device mapping */
1880 tmp = pci_find_upstream_pcie_bridge(pdev);
1883 /* Secondary interface's bus number and devfn 0 */
1884 parent = pdev->bus->self;
1885 while (parent != tmp) {
1886 ret = device_context_mapped(iommu, parent->bus->number,
1890 parent = parent->bus->self;
1892 if (pci_is_pcie(tmp))
1893 return device_context_mapped(iommu, tmp->subordinate->number,
1896 return device_context_mapped(iommu, tmp->bus->number,
1900 /* Returns the number of VT-d pages, but aligned to MM page size */
1901 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1904 host_addr &= ~PAGE_MASK;
1905 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
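/*
 * Worked example (illustrative, not part of the original source): a buffer
 * of 0x1800 bytes starting 0x800 bytes into a 4KiB MM page gives
 * PAGE_ALIGN(0x800 + 0x1800) >> VTD_PAGE_SHIFT == 0x2000 >> 12 == 2 VT-d pages.
 */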
1908 /* Return largest possible superpage level for a given mapping */
1909 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1910 unsigned long iov_pfn,
1911 unsigned long phy_pfn,
1912 unsigned long pages)
1914 int support, level = 1;
1915 unsigned long pfnmerge;
1917 support = domain->iommu_superpage;
1919 /* To use a large page, the virtual *and* physical addresses
1920 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1921 of them will mean we have to use smaller pages. So just
1922 merge them and check both at once. */
1923 pfnmerge = iov_pfn | phy_pfn;
1925 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1926 pages >>= VTD_STRIDE_SHIFT;
1929 pfnmerge >>= VTD_STRIDE_SHIFT;
1936 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1937 struct scatterlist *sg, unsigned long phys_pfn,
1938 unsigned long nr_pages, int prot)
1940 struct dma_pte *first_pte = NULL, *pte = NULL;
1941 phys_addr_t uninitialized_var(pteval);
1942 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1943 unsigned long sg_res;
1944 unsigned int largepage_lvl = 0;
1945 unsigned long lvl_pages = 0;
1947 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1949 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1952 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1957 sg_res = nr_pages + 1;
1958 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1961 while (nr_pages > 0) {
1965 sg_res = aligned_nrpages(sg->offset, sg->length);
1966 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1967 sg->dma_length = sg->length;
1968 pteval = page_to_phys(sg_page(sg)) | prot;
1969 phys_pfn = pteval >> VTD_PAGE_SHIFT;
1973 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1975 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
1978 /* It is a large page */
1979 if (largepage_lvl > 1) {
1980 pteval |= DMA_PTE_LARGE_PAGE;
1981 /* Ensure that old small page tables are removed to make room
1982 for superpage, if they exist. */
1983 dma_pte_clear_range(domain, iov_pfn,
1984 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1985 dma_pte_free_pagetable(domain, iov_pfn,
1986 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1988 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1992 /* We don't need lock here, nobody else
1993 * touches the iova range
1995 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1997 static int dumps = 5;
1998 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1999 iov_pfn, tmp, (unsigned long long)pteval);
2002 debug_dma_dump_mappings(NULL);
2007 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2009 BUG_ON(nr_pages < lvl_pages);
2010 BUG_ON(sg_res < lvl_pages);
2012 nr_pages -= lvl_pages;
2013 iov_pfn += lvl_pages;
2014 phys_pfn += lvl_pages;
2015 pteval += lvl_pages * VTD_PAGE_SIZE;
2016 sg_res -= lvl_pages;
2018 /* If the next PTE would be the first in a new page, then we
2019 need to flush the cache on the entries we've just written.
2020 And then we'll need to recalculate 'pte', so clear it and
2021 let it get set again in the if (!pte) block above.
2023 If we're done (!nr_pages) we need to flush the cache too.
2025 Also if we've been setting superpages, we may need to
2026 recalculate 'pte' and switch back to smaller pages for the
2027 end of the mapping, if the trailing size is not enough to
2028 use another superpage (i.e. sg_res < lvl_pages). */
2030 if (!nr_pages || first_pte_in_page(pte) ||
2031 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2032 domain_flush_cache(domain, first_pte,
2033 (void *)pte - (void *)first_pte);
2037 if (!sg_res && nr_pages)
2043 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2044 struct scatterlist *sg, unsigned long nr_pages,
2047 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2050 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2051 unsigned long phys_pfn, unsigned long nr_pages,
2054 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2057 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2062 clear_context_table(iommu, bus, devfn);
2063 iommu->flush.flush_context(iommu, 0, 0, 0,
2064 DMA_CCMD_GLOBAL_INVL);
2065 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2068 static inline void unlink_domain_info(struct device_domain_info *info)
2070 assert_spin_locked(&device_domain_lock);
2071 list_del(&info->link);
2072 list_del(&info->global);
2074 info->dev->dev.archdata.iommu = NULL;
2077 static void domain_remove_dev_info(struct dmar_domain *domain)
2079 struct device_domain_info *info;
2080 unsigned long flags, flags2;
2081 struct intel_iommu *iommu;
2083 spin_lock_irqsave(&device_domain_lock, flags);
2084 while (!list_empty(&domain->devices)) {
2085 info = list_entry(domain->devices.next,
2086 struct device_domain_info, link);
2087 unlink_domain_info(info);
2088 spin_unlock_irqrestore(&device_domain_lock, flags);
2090 iommu_disable_dev_iotlb(info);
2091 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
2092 iommu_detach_dev(iommu, info->bus, info->devfn);
2094 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
2095 iommu_detach_dependent_devices(iommu, info->dev);
2096 /* clear this iommu in iommu_bmp, update iommu count and capabilities */
2099 spin_lock_irqsave(&domain->iommu_lock, flags2);
2100 if (test_and_clear_bit(iommu->seq_id,
2101 domain->iommu_bmp)) {
2102 domain->iommu_count--;
2103 domain_update_iommu_cap(domain);
2105 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2108 free_devinfo_mem(info);
2109 spin_lock_irqsave(&device_domain_lock, flags);
2111 spin_unlock_irqrestore(&device_domain_lock, flags);
2116 * Note: struct device->archdata.iommu stores this info
2118 static struct dmar_domain *find_domain(struct device *dev)
2120 struct device_domain_info *info;
2122 /* No lock here, assumes no domain exit in normal case */
2123 info = dev->archdata.iommu;
2125 return info->domain;
2129 static inline struct dmar_domain *
2130 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2132 struct device_domain_info *info;
2134 list_for_each_entry(info, &device_domain_list, global)
2135 if (info->segment == segment && info->bus == bus &&
2136 info->devfn == devfn)
2137 return info->domain;
2142 static int dmar_insert_dev_info(int segment, int bus, int devfn,
2143 struct pci_dev *dev, struct dmar_domain **domp)
2145 struct dmar_domain *found, *domain = *domp;
2146 struct device_domain_info *info;
2147 unsigned long flags;
2149 info = alloc_devinfo_mem();
2153 info->segment = segment;
2155 info->devfn = devfn;
2157 info->domain = domain;
2159 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2161 spin_lock_irqsave(&device_domain_lock, flags);
2163 found = find_domain(&dev->dev);
2165 found = dmar_search_domain_by_dev_info(segment, bus, devfn);
2167 spin_unlock_irqrestore(&device_domain_lock, flags);
2168 free_devinfo_mem(info);
2169 if (found != domain) {
2170 domain_exit(domain);
2174 list_add(&info->link, &domain->devices);
2175 list_add(&info->global, &device_domain_list);
2177 dev->dev.archdata.iommu = info;
2178 spin_unlock_irqrestore(&device_domain_lock, flags);
2184 /* domain is initialized */
2185 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
2187 struct dmar_domain *domain, *free = NULL;
2188 struct intel_iommu *iommu;
2189 struct dmar_drhd_unit *drhd;
2190 struct pci_dev *dev_tmp;
2191 unsigned long flags;
2192 int bus = 0, devfn = 0;
2195 domain = find_domain(&pdev->dev);
2199 segment = pci_domain_nr(pdev->bus);
2201 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
2203 if (pci_is_pcie(dev_tmp)) {
2204 bus = dev_tmp->subordinate->number;
2207 bus = dev_tmp->bus->number;
2208 devfn = dev_tmp->devfn;
2210 spin_lock_irqsave(&device_domain_lock, flags);
2211 domain = dmar_search_domain_by_dev_info(segment, bus, devfn);
2212 spin_unlock_irqrestore(&device_domain_lock, flags);
2213 /* pcie-pci bridge already has a domain, use it */
2218 drhd = dmar_find_matched_drhd_unit(pdev);
2220 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2224 iommu = drhd->iommu;
2226 /* Allocate and initialize a new domain for the device */
2227 domain = alloc_domain(false);
2230 if (iommu_attach_domain(domain, iommu)) {
2231 free_domain_mem(domain);
2235 if (domain_init(domain, gaw))
2238 /* register pcie-to-pci device */
2240 if (dmar_insert_dev_info(segment, bus, devfn, NULL, &domain))
2247 if (dmar_insert_dev_info(segment, pdev->bus->number, pdev->devfn,
2248 pdev, &domain) == 0)
2253 /* recheck it here, maybe others set it */
2254 return find_domain(&pdev->dev);
2257 static int iommu_identity_mapping;
2258 #define IDENTMAP_ALL 1
2259 #define IDENTMAP_GFX 2
2260 #define IDENTMAP_AZALIA 4
2262 static int iommu_domain_identity_map(struct dmar_domain *domain,
2263 unsigned long long start,
2264 unsigned long long end)
2266 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2267 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2269 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2270 dma_to_mm_pfn(last_vpfn))) {
2271 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2275 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2276 start, end, domain->id);
2278 * RMRR range might have overlap with physical memory range, clear it first
2281 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2283 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2284 last_vpfn - first_vpfn + 1,
2285 DMA_PTE_READ|DMA_PTE_WRITE);
2288 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2289 unsigned long long start,
2290 unsigned long long end)
2292 struct dmar_domain *domain;
2295 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2299 /* For _hardware_ passthrough, don't bother. But for software
2300 passthrough, we do it anyway -- it may indicate a memory
2301 range which is reserved in E820, so which didn't get set
2302 up to start with in si_domain */
2303 if (domain == si_domain && hw_pass_through) {
2304 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2305 pci_name(pdev), start, end);
2310 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2311 pci_name(pdev), start, end);
2314 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2315 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2316 dmi_get_system_info(DMI_BIOS_VENDOR),
2317 dmi_get_system_info(DMI_BIOS_VERSION),
2318 dmi_get_system_info(DMI_PRODUCT_VERSION));
2323 if (end >> agaw_to_width(domain->agaw)) {
2324 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2325 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2326 agaw_to_width(domain->agaw),
2327 dmi_get_system_info(DMI_BIOS_VENDOR),
2328 dmi_get_system_info(DMI_BIOS_VERSION),
2329 dmi_get_system_info(DMI_PRODUCT_VERSION));
2334 ret = iommu_domain_identity_map(domain, start, end);
2338 /* context entry init */
2339 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2346 domain_exit(domain);
2350 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2351 struct pci_dev *pdev)
2353 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2355 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2359 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2360 static inline void iommu_prepare_isa(void)
2362 struct pci_dev *pdev;
2365 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2369 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2370 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2373 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2374 "floppy might not work\n");
2378 static inline void iommu_prepare_isa(void)
2382 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2384 static int md_domain_init(struct dmar_domain *domain, int guest_width);
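/*
 * si_domain is the single static identity domain shared by all devices that
 * use 1:1 mappings.  It is attached to every active IOMMU, and when hardware
 * passthrough is not in use, every usable memory range on every online node
 * is identity-mapped into it up front.
 */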
2386 static int __init si_domain_init(int hw)
2388 struct dmar_drhd_unit *drhd;
2389 struct intel_iommu *iommu;
2392 si_domain = alloc_domain(false);
2396 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2398 for_each_active_iommu(iommu, drhd) {
2399 ret = iommu_attach_domain(si_domain, iommu);
2401 domain_exit(si_domain);
2406 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2407 domain_exit(si_domain);
2411 pr_debug("IOMMU: identity mapping domain is domain %d\n",
2417 for_each_online_node(nid) {
2418 unsigned long start_pfn, end_pfn;
2421 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2422 ret = iommu_domain_identity_map(si_domain,
2423 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2432 static int identity_mapping(struct pci_dev *pdev)
2434 struct device_domain_info *info;
2436 if (likely(!iommu_identity_mapping))
2439 info = pdev->dev.archdata.iommu;
2440 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2441 return (info->domain == si_domain);
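/*
 * Bind @pdev to @domain: allocate a device_domain_info, link it onto the
 * domain's device list and the global device_domain_list, point the device's
 * archdata at it, and program the context entry with the requested
 * translation type.  The info is unlinked and freed again if the context
 * mapping fails.
 */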
2446 static int domain_add_dev_info(struct dmar_domain *domain,
2447 struct pci_dev *pdev,
2450 struct device_domain_info *info;
2451 unsigned long flags;
2454 info = alloc_devinfo_mem();
2458 info->segment = pci_domain_nr(pdev->bus);
2459 info->bus = pdev->bus->number;
2460 info->devfn = pdev->devfn;
2462 info->domain = domain;
2464 spin_lock_irqsave(&device_domain_lock, flags);
2465 list_add(&info->link, &domain->devices);
2466 list_add(&info->global, &device_domain_list);
2467 pdev->dev.archdata.iommu = info;
2468 spin_unlock_irqrestore(&device_domain_lock, flags);
2470 ret = domain_context_mapping(domain, pdev, translation);
2472 spin_lock_irqsave(&device_domain_lock, flags);
2473 unlink_domain_info(info);
2474 spin_unlock_irqrestore(&device_domain_lock, flags);
2475 free_devinfo_mem(info);
2482 static bool device_has_rmrr(struct pci_dev *dev)
2484 struct dmar_rmrr_unit *rmrr;
2489 for_each_rmrr_units(rmrr) {
2491 * Return TRUE if this RMRR contains the device that is passed in.
2494 for_each_active_dev_scope(rmrr->devices,
2495 rmrr->devices_cnt, i, tmp)
2496 if (tmp == &dev->dev) {
2505 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2509 * We want to prevent any device associated with an RMRR from
2510 * getting placed into the SI Domain. This is done because
2511 * problems exist when devices are moved in and out of domains
2512 * and their respective RMRR info is lost. We exempt USB devices
2513 * from this process due to their usage of RMRRs that are known
2514 * to not be needed after BIOS hand-off to OS.
2516 if (device_has_rmrr(pdev) &&
2517 (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2520 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2523 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2526 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2530 * We want to start off with all devices in the 1:1 domain, and
2531 * take them out later if we find they can't access all of memory.
2533 * However, we can't do this for PCI devices behind bridges,
2534 * because all PCI devices behind the same bridge will end up
2535 * with the same source-id on their transactions.
2537 * Practically speaking, we can't change things around for these
2538 * devices at run-time, because we can't be sure there'll be no
2539 * DMA transactions in flight for any of their siblings.
2541 * So PCI devices (unless they're on the root bus) as well as
2542 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2543 * the 1:1 domain, just in _case_ one of their siblings turns out
2544 * not to be able to map all of memory.
2546 if (!pci_is_pcie(pdev)) {
2547 if (!pci_is_root_bus(pdev->bus))
2549 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2551 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2555 * At boot time, we don't yet know if devices will be 64-bit capable.
2556 * Assume that they will -- if they turn out not to be, then we can
2557 * take them out of the 1:1 domain later.
2561 * If the device's dma_mask is less than the system's memory
2562 * size then this is not a candidate for identity mapping.
2564 u64 dma_mask = pdev->dma_mask;
2566 if (pdev->dev.coherent_dma_mask &&
2567 pdev->dev.coherent_dma_mask < dma_mask)
2568 dma_mask = pdev->dev.coherent_dma_mask;
2570 return dma_mask >= dma_get_required_mask(&pdev->dev);
2576 static int __init iommu_prepare_static_identity_mapping(int hw)
2578 struct pci_dev *pdev = NULL;
2581 ret = si_domain_init(hw);
2585 for_each_pci_dev(pdev) {
2586 if (iommu_should_identity_map(pdev, 1)) {
2587 ret = domain_add_dev_info(si_domain, pdev,
2588 hw ? CONTEXT_TT_PASS_THROUGH :
2589 CONTEXT_TT_MULTI_LEVEL);
2591 /* device not associated with an iommu */
2596 pr_info("IOMMU: %s identity mapping for device %s\n",
2597 hw ? "hardware" : "software", pci_name(pdev));
2604 static int __init init_dmars(void)
2606 struct dmar_drhd_unit *drhd;
2607 struct dmar_rmrr_unit *rmrr;
2609 struct intel_iommu *iommu;
2615 * initialize and program root entry to not present
2618 for_each_drhd_unit(drhd) {
2620 * lock not needed as this is only incremented in the single
2621 * threaded kernel __init code path; all other accesses are read-only.
2624 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2628 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2629 IOMMU_UNITS_SUPPORTED);
2632 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2635 printk(KERN_ERR "Allocating global iommu array failed\n");
2640 deferred_flush = kzalloc(g_num_of_iommus *
2641 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2642 if (!deferred_flush) {
2647 for_each_active_iommu(iommu, drhd) {
2648 g_iommus[iommu->seq_id] = iommu;
2650 ret = iommu_init_domains(iommu);
2656 * we could share the same root & context tables
2657 * among all IOMMUs. Need to split it later.
2659 ret = iommu_alloc_root_entry(iommu);
2661 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2664 if (!ecap_pass_through(iommu->ecap))
2665 hw_pass_through = 0;
2669 * Start from a sane iommu hardware state.
2671 for_each_active_iommu(iommu, drhd) {
2673 * If the queued invalidation is already initialized by us
2674 * (for example, while enabling interrupt-remapping) then
2675 * things are already rolling from a sane state.
2681 * Clear any previous faults.
2683 dmar_fault(-1, iommu);
2685 * Disable queued invalidation if supported and already enabled
2686 * before OS handover.
2688 dmar_disable_qi(iommu);
2691 for_each_active_iommu(iommu, drhd) {
2692 if (dmar_enable_qi(iommu)) {
2694 * Queued Invalidate not enabled, use Register Based
2697 iommu->flush.flush_context = __iommu_flush_context;
2698 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2699 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2702 (unsigned long long)drhd->reg_base_addr);
2704 iommu->flush.flush_context = qi_flush_context;
2705 iommu->flush.flush_iotlb = qi_flush_iotlb;
2706 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2709 (unsigned long long)drhd->reg_base_addr);
2713 if (iommu_pass_through)
2714 iommu_identity_mapping |= IDENTMAP_ALL;
2716 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2717 iommu_identity_mapping |= IDENTMAP_GFX;
2720 check_tylersburg_isoch();
2723 * If pass through is not set or not enabled, set up context entries for
2724 * identity mappings for rmrr, gfx, and isa and may fall back to static
2725 * identity mapping if iommu_identity_mapping is set.
2727 if (iommu_identity_mapping) {
2728 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2730 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2736 * for each dev attached to rmrr
2738 * locate drhd for dev, alloc domain for dev
2739 * allocate free domain
2740 * allocate page table entries for rmrr
2741 * if context not allocated for bus
2742 * allocate and init context
2743 * set present in root table for this bus
2744 * init context with domain, translation etc
2748 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2749 for_each_rmrr_units(rmrr) {
2750 /* some BIOSes list nonexistent devices in the DMAR table. */
2751 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2753 if (!dev_is_pci(dev))
2755 ret = iommu_prepare_rmrr_dev(rmrr, to_pci_dev(dev));
2758 "IOMMU: mapping reserved region failed\n");
2762 iommu_prepare_isa();
2767 * global invalidate context cache
2768 * global invalidate iotlb
2769 * enable translation
2771 for_each_iommu(iommu, drhd) {
2772 if (drhd->ignored) {
2774 * we always have to disable PMRs or DMA may fail on
2778 iommu_disable_protect_mem_regions(iommu);
2782 iommu_flush_write_buffer(iommu);
2784 ret = dmar_set_interrupt(iommu);
2788 iommu_set_root_entry(iommu);
2790 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2791 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2793 ret = iommu_enable_translation(iommu);
2797 iommu_disable_protect_mem_regions(iommu);
2803 for_each_active_iommu(iommu, drhd)
2804 free_dmar_iommu(iommu);
2805 kfree(deferred_flush);
2812 /* This takes a number of _MM_ pages, not VTD pages */
2813 static struct iova *intel_alloc_iova(struct device *dev,
2814 struct dmar_domain *domain,
2815 unsigned long nrpages, uint64_t dma_mask)
2817 struct pci_dev *pdev = to_pci_dev(dev);
2818 struct iova *iova = NULL;
2820 /* Restrict dma_mask to the width that the iommu can handle */
2821 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2823 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2825 * First try to allocate an io virtual address in
2826 * DMA_BIT_MASK(32) and if that fails then try allocating
2829 iova = alloc_iova(&domain->iovad, nrpages,
2830 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2834 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2835 if (unlikely(!iova)) {
2836 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2837 nrpages, pci_name(pdev));
2844 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2846 struct dmar_domain *domain;
2849 domain = get_domain_for_dev(pdev,
2850 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2853 "Allocating domain for %s failed", pci_name(pdev));
2857 /* make sure context mapping is ok */
2858 if (unlikely(!domain_context_mapped(pdev))) {
2859 ret = domain_context_mapping(domain, pdev,
2860 CONTEXT_TT_MULTI_LEVEL);
2863 "Domain context map for %s failed",
2872 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2874 struct device_domain_info *info;
2876 /* No lock here, assumes no domain exit in normal case */
2877 info = dev->dev.archdata.iommu;
2879 return info->domain;
2881 return __get_valid_domain_for_dev(dev);
2884 static int iommu_dummy(struct device *dev)
2886 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2889 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2890 static int iommu_no_mapping(struct device *dev)
2892 struct pci_dev *pdev;
2895 if (unlikely(!dev_is_pci(dev)))
2898 if (iommu_dummy(dev))
2901 if (!iommu_identity_mapping)
2904 pdev = to_pci_dev(dev);
2905 found = identity_mapping(pdev);
2907 if (iommu_should_identity_map(pdev, 0))
2911 * 32 bit DMA is removed from si_domain and falls back
2912 * to non-identity mapping.
2914 domain_remove_one_dev_info(si_domain, pdev);
2915 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2921 * In case a 64 bit DMA device is detached from a VM, the device
2922 * is put into si_domain for identity mapping.
2924 if (iommu_should_identity_map(pdev, 0)) {
2926 ret = domain_add_dev_info(si_domain, pdev,
2928 CONTEXT_TT_PASS_THROUGH :
2929 CONTEXT_TT_MULTI_LEVEL);
2931 printk(KERN_INFO "64bit %s uses identity mapping\n",
2941 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2942 size_t size, int dir, u64 dma_mask)
2944 struct pci_dev *pdev = to_pci_dev(hwdev);
2945 struct dmar_domain *domain;
2946 phys_addr_t start_paddr;
2950 struct intel_iommu *iommu;
2951 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2953 BUG_ON(dir == DMA_NONE);
2955 if (iommu_no_mapping(hwdev))
2958 domain = get_valid_domain_for_dev(pdev);
2962 iommu = domain_get_iommu(domain);
2963 size = aligned_nrpages(paddr, size);
2965 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2970 * Check if DMAR supports zero-length reads on write only
2973 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2974 !cap_zlr(iommu->cap))
2975 prot |= DMA_PTE_READ;
2976 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2977 prot |= DMA_PTE_WRITE;
2979 * The range paddr .. paddr + size may start or end in the middle of a
2980 * page, so we map the whole page. Note: if two parts of one page are
2981 * mapped separately, we might have two guest_addr mappings to the same
2982 * host paddr, but this is not a big problem
2984 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2985 mm_to_dma_pfn(paddr_pfn), size, prot);
2989 /* it's a non-present to present mapping. Only flush if caching mode */
2990 if (cap_caching_mode(iommu->cap))
2991 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
2993 iommu_flush_write_buffer(iommu);
2995 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2996 start_paddr += paddr & ~PAGE_MASK;
3001 __free_iova(&domain->iovad, iova);
3002 printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
3003 pci_name(pdev), size, (unsigned long long)paddr, dir);
3007 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3008 unsigned long offset, size_t size,
3009 enum dma_data_direction dir,
3010 struct dma_attrs *attrs)
3012 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3013 dir, to_pci_dev(dev)->dma_mask);
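/*
 * Deferred unmapping: instead of flushing the IOTLB on every single unmap,
 * freed IOVAs are queued per IOMMU in deferred_flush[] by add_unmap() and
 * released in batches here, either from the unmap_timer or once a queue
 * reaches HIGH_WATER_MARK.  See the note in intel_unmap_page() about the
 * CPU cost of per-unmap flushes.
 */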
3016 static void flush_unmaps(void)
3022 /* just flush them all */
3023 for (i = 0; i < g_num_of_iommus; i++) {
3024 struct intel_iommu *iommu = g_iommus[i];
3028 if (!deferred_flush[i].next)
3031 /* In caching mode, global flushes make emulation expensive */
3032 if (!cap_caching_mode(iommu->cap))
3033 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3034 DMA_TLB_GLOBAL_FLUSH);
3035 for (j = 0; j < deferred_flush[i].next; j++) {
3037 struct iova *iova = deferred_flush[i].iova[j];
3038 struct dmar_domain *domain = deferred_flush[i].domain[j];
3040 /* On real hardware multiple invalidations are expensive */
3041 if (cap_caching_mode(iommu->cap))
3042 iommu_flush_iotlb_psi(iommu, domain->id,
3043 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1,
3044 !deferred_flush[i].freelist[j], 0);
3046 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
3047 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3048 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3050 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3051 if (deferred_flush[i].freelist[j])
3052 dma_free_pagelist(deferred_flush[i].freelist[j]);
3054 deferred_flush[i].next = 0;
3060 static void flush_unmaps_timeout(unsigned long data)
3062 unsigned long flags;
3064 spin_lock_irqsave(&async_umap_flush_lock, flags);
3066 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3069 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3071 unsigned long flags;
3073 struct intel_iommu *iommu;
3075 spin_lock_irqsave(&async_umap_flush_lock, flags);
3076 if (list_size == HIGH_WATER_MARK)
3079 iommu = domain_get_iommu(dom);
3080 iommu_id = iommu->seq_id;
3082 next = deferred_flush[iommu_id].next;
3083 deferred_flush[iommu_id].domain[next] = dom;
3084 deferred_flush[iommu_id].iova[next] = iova;
3085 deferred_flush[iommu_id].freelist[next] = freelist;
3086 deferred_flush[iommu_id].next++;
3089 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3093 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3096 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3097 size_t size, enum dma_data_direction dir,
3098 struct dma_attrs *attrs)
3100 struct pci_dev *pdev = to_pci_dev(dev);
3101 struct dmar_domain *domain;
3102 unsigned long start_pfn, last_pfn;
3104 struct intel_iommu *iommu;
3105 struct page *freelist;
3107 if (iommu_no_mapping(dev))
3110 domain = find_domain(dev);
3113 iommu = domain_get_iommu(domain);
3115 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3116 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3117 (unsigned long long)dev_addr))
3120 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3121 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3123 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3124 pci_name(pdev), start_pfn, last_pfn);
3126 freelist = domain_unmap(domain, start_pfn, last_pfn);
3128 if (intel_iommu_strict) {
3129 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3130 last_pfn - start_pfn + 1, !freelist, 0);
3132 __free_iova(&domain->iovad, iova);
3133 dma_free_pagelist(freelist);
3135 add_unmap(domain, iova, freelist);
3137 * queue up the release of the unmap to save the 1/6th of the
3138 * cpu used up by the iotlb flush operation...
3143 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3144 dma_addr_t *dma_handle, gfp_t flags,
3145 struct dma_attrs *attrs)
3150 size = PAGE_ALIGN(size);
3151 order = get_order(size);
3153 if (!iommu_no_mapping(hwdev))
3154 flags &= ~(GFP_DMA | GFP_DMA32);
3155 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3156 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3162 vaddr = (void *)__get_free_pages(flags, order);
3165 memset(vaddr, 0, size);
3167 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3169 hwdev->coherent_dma_mask);
3172 free_pages((unsigned long)vaddr, order);
3176 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3177 dma_addr_t dma_handle, struct dma_attrs *attrs)
3181 size = PAGE_ALIGN(size);
3182 order = get_order(size);
3184 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3185 free_pages((unsigned long)vaddr, order);
3188 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3189 int nelems, enum dma_data_direction dir,
3190 struct dma_attrs *attrs)
3192 struct pci_dev *pdev = to_pci_dev(hwdev);
3193 struct dmar_domain *domain;
3194 unsigned long start_pfn, last_pfn;
3196 struct intel_iommu *iommu;
3197 struct page *freelist;
3199 if (iommu_no_mapping(hwdev))
3202 domain = find_domain(hwdev);
3205 iommu = domain_get_iommu(domain);
3207 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3208 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3209 (unsigned long long)sglist[0].dma_address))
3212 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3213 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3215 freelist = domain_unmap(domain, start_pfn, last_pfn);
3217 if (intel_iommu_strict) {
3218 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3219 last_pfn - start_pfn + 1, !freelist, 0);
3221 __free_iova(&domain->iovad, iova);
3222 dma_free_pagelist(freelist);
3224 add_unmap(domain, iova, freelist);
3226 * queue up the release of the unmap to save the 1/6th of the
3227 * cpu used up by the iotlb flush operation...
3232 static int intel_nontranslate_map_sg(struct device *hddev,
3233 struct scatterlist *sglist, int nelems, int dir)
3236 struct scatterlist *sg;
3238 for_each_sg(sglist, sg, nelems, i) {
3239 BUG_ON(!sg_page(sg));
3240 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3241 sg->dma_length = sg->length;
3246 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3247 enum dma_data_direction dir, struct dma_attrs *attrs)
3250 struct pci_dev *pdev = to_pci_dev(hwdev);
3251 struct dmar_domain *domain;
3254 struct iova *iova = NULL;
3256 struct scatterlist *sg;
3257 unsigned long start_vpfn;
3258 struct intel_iommu *iommu;
3260 BUG_ON(dir == DMA_NONE);
3261 if (iommu_no_mapping(hwdev))
3262 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3264 domain = get_valid_domain_for_dev(pdev);
3268 iommu = domain_get_iommu(domain);
3270 for_each_sg(sglist, sg, nelems, i)
3271 size += aligned_nrpages(sg->offset, sg->length);
3273 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3276 sglist->dma_length = 0;
3281 * Check if DMAR supports zero-length reads on write only
3284 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3285 !cap_zlr(iommu->cap))
3286 prot |= DMA_PTE_READ;
3287 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3288 prot |= DMA_PTE_WRITE;
3290 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3292 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3293 if (unlikely(ret)) {
3294 /* clear the page */
3295 dma_pte_clear_range(domain, start_vpfn,
3296 start_vpfn + size - 1);
3297 /* free page tables */
3298 dma_pte_free_pagetable(domain, start_vpfn,
3299 start_vpfn + size - 1);
3301 __free_iova(&domain->iovad, iova);
3305 /* it's a non-present to present mapping. Only flush if caching mode */
3306 if (cap_caching_mode(iommu->cap))
3307 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3309 iommu_flush_write_buffer(iommu);
3314 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3319 struct dma_map_ops intel_dma_ops = {
3320 .alloc = intel_alloc_coherent,
3321 .free = intel_free_coherent,
3322 .map_sg = intel_map_sg,
3323 .unmap_sg = intel_unmap_sg,
3324 .map_page = intel_map_page,
3325 .unmap_page = intel_unmap_page,
3326 .mapping_error = intel_mapping_error,
3329 static inline int iommu_domain_cache_init(void)
3333 iommu_domain_cache = kmem_cache_create("iommu_domain",
3334 sizeof(struct dmar_domain),
3339 if (!iommu_domain_cache) {
3340 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3347 static inline int iommu_devinfo_cache_init(void)
3351 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3352 sizeof(struct device_domain_info),
3356 if (!iommu_devinfo_cache) {
3357 printk(KERN_ERR "Couldn't create devinfo cache\n");
3364 static inline int iommu_iova_cache_init(void)
3368 iommu_iova_cache = kmem_cache_create("iommu_iova",
3369 sizeof(struct iova),
3373 if (!iommu_iova_cache) {
3374 printk(KERN_ERR "Couldn't create iova cache\n");
3381 static int __init iommu_init_mempool(void)
3384 ret = iommu_iova_cache_init();
3388 ret = iommu_domain_cache_init();
3392 ret = iommu_devinfo_cache_init();
3396 kmem_cache_destroy(iommu_domain_cache);
3398 kmem_cache_destroy(iommu_iova_cache);
3403 static void __init iommu_exit_mempool(void)
3405 kmem_cache_destroy(iommu_devinfo_cache);
3406 kmem_cache_destroy(iommu_domain_cache);
3407 kmem_cache_destroy(iommu_iova_cache);
3411 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3413 struct dmar_drhd_unit *drhd;
3417 /* We know that this device on this chipset has its own IOMMU.
3418 * If we find it under a different IOMMU, then the BIOS is lying
3419 * to us. Hope that the IOMMU for this device is actually
3420 * disabled, and it needs no translation...
3422 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3424 /* "can't" happen */
3425 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3428 vtbar &= 0xffff0000;
3430 /* we know that this iommu should be at offset 0xa000 from vtbar */
3431 drhd = dmar_find_matched_drhd_unit(pdev);
3432 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3433 TAINT_FIRMWARE_WORKAROUND,
3434 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3435 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3437 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
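/*
 * Walk the DRHD units and decide which ones can be ignored: units whose
 * device scope turns out to contain no devices, and units that cover *only*
 * graphics devices (which are either bypassed or flagged via
 * intel_iommu_gfx_mapped).  Devices behind an ignored unit get
 * DUMMY_DEVICE_DOMAIN_INFO so the rest of the driver skips them.
 */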
3439 static void __init init_no_remapping_devices(void)
3441 struct dmar_drhd_unit *drhd;
3445 for_each_drhd_unit(drhd) {
3446 if (!drhd->include_all) {
3447 for_each_active_dev_scope(drhd->devices,
3448 drhd->devices_cnt, i, dev)
3450 /* ignore DMAR unit if no devices exist */
3451 if (i == drhd->devices_cnt)
3456 for_each_active_drhd_unit(drhd) {
3457 if (drhd->include_all)
3460 for_each_active_dev_scope(drhd->devices,
3461 drhd->devices_cnt, i, dev)
3462 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3464 if (i < drhd->devices_cnt)
3467 /* This IOMMU has *only* gfx devices. Either bypass it or
3468 set the gfx_mapped flag, as appropriate */
3470 intel_iommu_gfx_mapped = 1;
3473 for_each_active_dev_scope(drhd->devices,
3474 drhd->devices_cnt, i, dev)
3475 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3480 #ifdef CONFIG_SUSPEND
3481 static int init_iommu_hw(void)
3483 struct dmar_drhd_unit *drhd;
3484 struct intel_iommu *iommu = NULL;
3486 for_each_active_iommu(iommu, drhd)
3488 dmar_reenable_qi(iommu);
3490 for_each_iommu(iommu, drhd) {
3491 if (drhd->ignored) {
3493 * we always have to disable PMRs or DMA may fail on
3497 iommu_disable_protect_mem_regions(iommu);
3501 iommu_flush_write_buffer(iommu);
3503 iommu_set_root_entry(iommu);
3505 iommu->flush.flush_context(iommu, 0, 0, 0,
3506 DMA_CCMD_GLOBAL_INVL);
3507 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3508 DMA_TLB_GLOBAL_FLUSH);
3509 if (iommu_enable_translation(iommu))
3511 iommu_disable_protect_mem_regions(iommu);
3517 static void iommu_flush_all(void)
3519 struct dmar_drhd_unit *drhd;
3520 struct intel_iommu *iommu;
3522 for_each_active_iommu(iommu, drhd) {
3523 iommu->flush.flush_context(iommu, 0, 0, 0,
3524 DMA_CCMD_GLOBAL_INVL);
3525 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3526 DMA_TLB_GLOBAL_FLUSH);
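/*
 * Suspend/resume support: on suspend, translation is disabled and the
 * fault-event control/data/address registers of each IOMMU are saved; on
 * resume, init_iommu_hw() reprograms the root entries and re-enables
 * translation before the saved fault-event registers are written back.
 */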
3530 static int iommu_suspend(void)
3532 struct dmar_drhd_unit *drhd;
3533 struct intel_iommu *iommu = NULL;
3536 for_each_active_iommu(iommu, drhd) {
3537 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3539 if (!iommu->iommu_state)
3545 for_each_active_iommu(iommu, drhd) {
3546 iommu_disable_translation(iommu);
3548 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3550 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3551 readl(iommu->reg + DMAR_FECTL_REG);
3552 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3553 readl(iommu->reg + DMAR_FEDATA_REG);
3554 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3555 readl(iommu->reg + DMAR_FEADDR_REG);
3556 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3557 readl(iommu->reg + DMAR_FEUADDR_REG);
3559 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3564 for_each_active_iommu(iommu, drhd)
3565 kfree(iommu->iommu_state);
3570 static void iommu_resume(void)
3572 struct dmar_drhd_unit *drhd;
3573 struct intel_iommu *iommu = NULL;
3576 if (init_iommu_hw()) {
3578 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3580 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3584 for_each_active_iommu(iommu, drhd) {
3586 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3588 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3589 iommu->reg + DMAR_FECTL_REG);
3590 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3591 iommu->reg + DMAR_FEDATA_REG);
3592 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3593 iommu->reg + DMAR_FEADDR_REG);
3594 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3595 iommu->reg + DMAR_FEUADDR_REG);
3597 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3600 for_each_active_iommu(iommu, drhd)
3601 kfree(iommu->iommu_state);
3604 static struct syscore_ops iommu_syscore_ops = {
3605 .resume = iommu_resume,
3606 .suspend = iommu_suspend,
3609 static void __init init_iommu_pm_ops(void)
3611 register_syscore_ops(&iommu_syscore_ops);
3615 static inline void init_iommu_pm_ops(void) {}
3616 #endif /* CONFIG_PM */
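/*
 * ACPI DMAR parsing helpers: each RMRR (Reserved Memory Region Reporting)
 * and ATSR (Root Port ATS capability reporting) structure is copied into a
 * dmar_rmrr_unit / dmar_atsr_unit together with its device scope, and kept
 * on the dmar_rmrr_units / dmar_atsr_units lists for later lookup.
 */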
3619 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3621 struct acpi_dmar_reserved_memory *rmrr;
3622 struct dmar_rmrr_unit *rmrru;
3624 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3628 rmrru->hdr = header;
3629 rmrr = (struct acpi_dmar_reserved_memory *)header;
3630 rmrru->base_address = rmrr->base_address;
3631 rmrru->end_address = rmrr->end_address;
3632 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3633 ((void *)rmrr) + rmrr->header.length,
3634 &rmrru->devices_cnt);
3635 if (rmrru->devices_cnt && rmrru->devices == NULL) {
3640 list_add(&rmrru->list, &dmar_rmrr_units);
3645 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3647 struct acpi_dmar_atsr *atsr;
3648 struct dmar_atsr_unit *atsru;
3650 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3651 atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3656 atsru->include_all = atsr->flags & 0x1;
3657 if (!atsru->include_all) {
3658 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3659 (void *)atsr + atsr->header.length,
3660 &atsru->devices_cnt);
3661 if (atsru->devices_cnt && atsru->devices == NULL) {
3667 list_add_rcu(&atsru->list, &dmar_atsr_units);
3672 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3674 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3678 static void intel_iommu_free_dmars(void)
3680 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3681 struct dmar_atsr_unit *atsru, *atsr_n;
3683 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3684 list_del(&rmrru->list);
3685 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3689 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3690 list_del(&atsru->list);
3691 intel_iommu_free_atsr(atsru);
3695 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3698 struct pci_bus *bus;
3699 struct pci_dev *bridge = NULL;
3701 struct acpi_dmar_atsr *atsr;
3702 struct dmar_atsr_unit *atsru;
3704 dev = pci_physfn(dev);
3705 for (bus = dev->bus; bus; bus = bus->parent) {
3707 if (!bridge || !pci_is_pcie(bridge) ||
3708 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3710 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3717 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3718 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3719 if (atsr->segment != pci_domain_nr(dev->bus))
3722 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3723 if (tmp == &bridge->dev)
3726 if (atsru->include_all)
3736 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3739 struct dmar_rmrr_unit *rmrru;
3740 struct dmar_atsr_unit *atsru;
3741 struct acpi_dmar_atsr *atsr;
3742 struct acpi_dmar_reserved_memory *rmrr;
3744 if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3747 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3748 rmrr = container_of(rmrru->hdr,
3749 struct acpi_dmar_reserved_memory, header);
3750 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3751 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3752 ((void *)rmrr) + rmrr->header.length,
3753 rmrr->segment, rmrru->devices,
3754 rmrru->devices_cnt);
3759 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3760 if (dmar_remove_dev_scope(info, rmrr->segment,
3761 rmrru->devices, rmrru->devices_cnt))
3766 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3767 if (atsru->include_all)
3770 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3771 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3772 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3773 (void *)atsr + atsr->header.length,
3774 atsr->segment, atsru->devices,
3775 atsru->devices_cnt);
3780 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3781 if (dmar_remove_dev_scope(info, atsr->segment,
3782 atsru->devices, atsru->devices_cnt))
3791 * Here we only respond to a device being unbound from its driver.
3793 * Added device is not attached to its DMAR domain here yet. That will happen
3794 * when mapping the device to iova.
3796 static int device_notifier(struct notifier_block *nb,
3797 unsigned long action, void *data)
3799 struct device *dev = data;
3800 struct pci_dev *pdev = to_pci_dev(dev);
3801 struct dmar_domain *domain;
3803 if (iommu_dummy(dev))
3806 if (action != BUS_NOTIFY_UNBOUND_DRIVER &&
3807 action != BUS_NOTIFY_DEL_DEVICE)
3810 domain = find_domain(dev);
3814 down_read(&dmar_global_lock);
3815 domain_remove_one_dev_info(domain, pdev);
3816 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3817 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3818 list_empty(&domain->devices))
3819 domain_exit(domain);
3820 up_read(&dmar_global_lock);
3825 static struct notifier_block device_nb = {
3826 .notifier_call = device_notifier,
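/*
 * Memory hotplug notifier: when a range is going online it is identity-
 * mapped into si_domain; when onlining is cancelled (or the memory goes
 * away), the corresponding IOVAs are split out of si_domain, unmapped, and
 * the IOTLBs of all active IOMMUs are flushed.
 */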
3829 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3830 unsigned long val, void *v)
3832 struct memory_notify *mhp = v;
3833 unsigned long long start, end;
3834 unsigned long start_vpfn, last_vpfn;
3837 case MEM_GOING_ONLINE:
3838 start = mhp->start_pfn << PAGE_SHIFT;
3839 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
3840 if (iommu_domain_identity_map(si_domain, start, end)) {
3841 pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
3848 case MEM_CANCEL_ONLINE:
3849 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3850 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
3851 while (start_vpfn <= last_vpfn) {
3853 struct dmar_drhd_unit *drhd;
3854 struct intel_iommu *iommu;
3855 struct page *freelist;
3857 iova = find_iova(&si_domain->iovad, start_vpfn);
3859 pr_debug("dmar: failed get IOVA for PFN %lx\n",
3864 iova = split_and_remove_iova(&si_domain->iovad, iova,
3865 start_vpfn, last_vpfn);
3867 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
3868 start_vpfn, last_vpfn);
3872 freelist = domain_unmap(si_domain, iova->pfn_lo,
3876 for_each_active_iommu(iommu, drhd)
3877 iommu_flush_iotlb_psi(iommu, si_domain->id,
3879 iova->pfn_hi - iova->pfn_lo + 1,
3882 dma_free_pagelist(freelist);
3884 start_vpfn = iova->pfn_hi + 1;
3885 free_iova_mem(iova);
3893 static struct notifier_block intel_iommu_memory_nb = {
3894 .notifier_call = intel_iommu_memory_notifier,
3898 int __init intel_iommu_init(void)
3901 struct dmar_drhd_unit *drhd;
3902 struct intel_iommu *iommu;
3904 /* VT-d is required for a TXT/tboot launch, so enforce that */
3905 force_on = tboot_force_iommu();
3907 if (iommu_init_mempool()) {
3909 panic("tboot: Failed to initialize iommu memory\n");
3913 down_write(&dmar_global_lock);
3914 if (dmar_table_init()) {
3916 panic("tboot: Failed to initialize DMAR table\n");
3921 * Disable translation if already enabled prior to OS handover.
3923 for_each_active_iommu(iommu, drhd)
3924 if (iommu->gcmd & DMA_GCMD_TE)
3925 iommu_disable_translation(iommu);
3927 if (dmar_dev_scope_init() < 0) {
3929 panic("tboot: Failed to initialize DMAR device scope\n");
3933 if (no_iommu || dmar_disabled)
3936 if (list_empty(&dmar_rmrr_units))
3937 printk(KERN_INFO "DMAR: No RMRR found\n");
3939 if (list_empty(&dmar_atsr_units))
3940 printk(KERN_INFO "DMAR: No ATSR found\n");
3942 if (dmar_init_reserved_ranges()) {
3944 panic("tboot: Failed to reserve iommu ranges\n");
3945 goto out_free_reserved_range;
3948 init_no_remapping_devices();
3953 panic("tboot: Failed to initialize DMARs\n");
3954 printk(KERN_ERR "IOMMU: dmar init failed\n");
3955 goto out_free_reserved_range;
3957 up_write(&dmar_global_lock);
3959 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3961 init_timer(&unmap_timer);
3962 #ifdef CONFIG_SWIOTLB
3965 dma_ops = &intel_dma_ops;
3967 init_iommu_pm_ops();
3969 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3970 bus_register_notifier(&pci_bus_type, &device_nb);
3971 if (si_domain && !hw_pass_through)
3972 register_memory_notifier(&intel_iommu_memory_nb);
3974 intel_iommu_enabled = 1;
3978 out_free_reserved_range:
3979 put_iova_domain(&reserved_iova_list);
3981 intel_iommu_free_dmars();
3982 up_write(&dmar_global_lock);
3983 iommu_exit_mempool();
3987 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3988 struct pci_dev *pdev)
3990 struct pci_dev *tmp, *parent;
3992 if (!iommu || !pdev)
3995 /* dependent device detach */
3996 tmp = pci_find_upstream_pcie_bridge(pdev);
3997 /* Secondary interface's bus number and devfn 0 */
3999 parent = pdev->bus->self;
4000 while (parent != tmp) {
4001 iommu_detach_dev(iommu, parent->bus->number,
4003 parent = parent->bus->self;
4005 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
4006 iommu_detach_dev(iommu,
4007 tmp->subordinate->number, 0);
4008 else /* this is a legacy PCI bridge */
4009 iommu_detach_dev(iommu, tmp->bus->number,
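/*
 * Detach @pdev from @domain: drop its device_domain_info, disable its device
 * IOTLB, clear the context entries for the device and any bridges it sits
 * behind, and, if no other device on the same IOMMU still uses the domain,
 * release that IOMMU from the domain (and the domain id, for non-VM,
 * non-identity domains).
 */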
4014 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4015 struct pci_dev *pdev)
4017 struct device_domain_info *info, *tmp;
4018 struct intel_iommu *iommu;
4019 unsigned long flags;
4022 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4027 spin_lock_irqsave(&device_domain_lock, flags);
4028 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4029 if (info->segment == pci_domain_nr(pdev->bus) &&
4030 info->bus == pdev->bus->number &&
4031 info->devfn == pdev->devfn) {
4032 unlink_domain_info(info);
4033 spin_unlock_irqrestore(&device_domain_lock, flags);
4035 iommu_disable_dev_iotlb(info);
4036 iommu_detach_dev(iommu, info->bus, info->devfn);
4037 iommu_detach_dependent_devices(iommu, pdev);
4038 free_devinfo_mem(info);
4040 spin_lock_irqsave(&device_domain_lock, flags);
4048 /* if there are no other devices under the same iommu
4049 * owned by this domain, clear this iommu from iommu_bmp and
4050 * update the iommu count and coherency
4052 if (iommu == device_to_iommu(info->segment, info->bus,
4057 spin_unlock_irqrestore(&device_domain_lock, flags);
4060 unsigned long tmp_flags;
4061 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
4062 clear_bit(iommu->seq_id, domain->iommu_bmp);
4063 domain->iommu_count--;
4064 domain_update_iommu_cap(domain);
4065 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
4067 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
4068 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
4069 spin_lock_irqsave(&iommu->lock, tmp_flags);
4070 clear_bit(domain->id, iommu->domain_ids);
4071 iommu->domains[domain->id] = NULL;
4072 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
4077 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4081 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
4082 domain_reserve_special_ranges(domain);
4084 /* calculate AGAW */
4085 domain->gaw = guest_width;
4086 adjust_width = guestwidth_to_adjustwidth(guest_width);
4087 domain->agaw = width_to_agaw(adjust_width);
4089 domain->iommu_coherency = 0;
4090 domain->iommu_snooping = 0;
4091 domain->iommu_superpage = 0;
4092 domain->max_addr = 0;
4095 /* always allocate the top pgd */
4096 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4099 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4103 static int intel_iommu_domain_init(struct iommu_domain *domain)
4105 struct dmar_domain *dmar_domain;
4107 dmar_domain = alloc_domain(true);
4110 "intel_iommu_domain_init: dmar_domain == NULL\n");
4113 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4115 "intel_iommu_domain_init() failed\n");
4116 domain_exit(dmar_domain);
4119 domain_update_iommu_cap(dmar_domain);
4120 domain->priv = dmar_domain;
4122 domain->geometry.aperture_start = 0;
4123 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4124 domain->geometry.force_aperture = true;
4129 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4131 struct dmar_domain *dmar_domain = domain->priv;
4133 domain->priv = NULL;
4134 domain_exit(dmar_domain);
4137 static int intel_iommu_attach_device(struct iommu_domain *domain,
4140 struct dmar_domain *dmar_domain = domain->priv;
4141 struct pci_dev *pdev = to_pci_dev(dev);
4142 struct intel_iommu *iommu;
4145 /* normally pdev is not mapped */
4146 if (unlikely(domain_context_mapped(pdev))) {
4147 struct dmar_domain *old_domain;
4149 old_domain = find_domain(dev);
4151 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4152 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4153 domain_remove_one_dev_info(old_domain, pdev);
4155 domain_remove_dev_info(old_domain);
4159 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4164 /* check if this iommu agaw is sufficient for max mapped address */
4165 addr_width = agaw_to_width(iommu->agaw);
4166 if (addr_width > cap_mgaw(iommu->cap))
4167 addr_width = cap_mgaw(iommu->cap);
4169 if (dmar_domain->max_addr > (1LL << addr_width)) {
4170 printk(KERN_ERR "%s: iommu width (%d) is not "
4171 "sufficient for the mapped address (%llx)\n",
4172 __func__, addr_width, dmar_domain->max_addr);
4175 dmar_domain->gaw = addr_width;
4178 * Knock out extra levels of page tables if necessary
4180 while (iommu->agaw < dmar_domain->agaw) {
4181 struct dma_pte *pte;
4183 pte = dmar_domain->pgd;
4184 if (dma_pte_present(pte)) {
4185 dmar_domain->pgd = (struct dma_pte *)
4186 phys_to_virt(dma_pte_addr(pte));
4187 free_pgtable_page(pte);
4189 dmar_domain->agaw--;
4192 return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4195 static void intel_iommu_detach_device(struct iommu_domain *domain,
4198 struct dmar_domain *dmar_domain = domain->priv;
4199 struct pci_dev *pdev = to_pci_dev(dev);
4201 domain_remove_one_dev_info(dmar_domain, pdev);
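/*
 * IOMMU-API map/unmap entry points.  Map converts IOMMU_READ/WRITE/CACHE
 * into DMA_PTE_READ/WRITE/SNP, grows max_addr (checking that it still fits
 * the domain's address width) and installs the PTEs; unmap may have to tear
 * down more than @size if the IOVA hits a large-page mapping, and flushes
 * the IOTLB of every IOMMU the domain is attached to.
 */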
4204 static int intel_iommu_map(struct iommu_domain *domain,
4205 unsigned long iova, phys_addr_t hpa,
4206 size_t size, int iommu_prot)
4208 struct dmar_domain *dmar_domain = domain->priv;
4213 if (iommu_prot & IOMMU_READ)
4214 prot |= DMA_PTE_READ;
4215 if (iommu_prot & IOMMU_WRITE)
4216 prot |= DMA_PTE_WRITE;
4217 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4218 prot |= DMA_PTE_SNP;
4220 max_addr = iova + size;
4221 if (dmar_domain->max_addr < max_addr) {
4224 /* check if minimum agaw is sufficient for mapped address */
4225 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4226 if (end < max_addr) {
4227 printk(KERN_ERR "%s: iommu width (%d) is not "
4228 "sufficient for the mapped address (%llx)\n",
4229 __func__, dmar_domain->gaw, max_addr);
4232 dmar_domain->max_addr = max_addr;
4234 /* Round up size to next multiple of PAGE_SIZE, if it and
4235 the low bits of hpa would take us onto the next page */
4236 size = aligned_nrpages(hpa, size);
4237 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4238 hpa >> VTD_PAGE_SHIFT, size, prot);
4242 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4243 unsigned long iova, size_t size)
4245 struct dmar_domain *dmar_domain = domain->priv;
4246 struct page *freelist = NULL;
4247 struct intel_iommu *iommu;
4248 unsigned long start_pfn, last_pfn;
4249 unsigned int npages;
4250 int iommu_id, num, ndomains, level = 0;
4252 /* Cope with horrid API which requires us to unmap more than the
4253 size argument if it happens to be a large-page mapping. */
4254 if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4257 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4258 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4260 start_pfn = iova >> VTD_PAGE_SHIFT;
4261 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4263 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4265 npages = last_pfn - start_pfn + 1;
4267 for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4268 iommu = g_iommus[iommu_id];
4271 * find bit position of dmar_domain
4273 ndomains = cap_ndoms(iommu->cap);
4274 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4275 if (iommu->domains[num] == dmar_domain)
4276 iommu_flush_iotlb_psi(iommu, num, start_pfn,
4277 npages, !freelist, 0);
4282 dma_free_pagelist(freelist);
4284 if (dmar_domain->max_addr == iova + size)
4285 dmar_domain->max_addr = iova;
4290 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4293 struct dmar_domain *dmar_domain = domain->priv;
4294 struct dma_pte *pte;
4298 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4300 phys = dma_pte_addr(pte);
4305 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4308 struct dmar_domain *dmar_domain = domain->priv;
4310 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4311 return dmar_domain->iommu_snooping;
4312 if (cap == IOMMU_CAP_INTR_REMAP)
4313 return irq_remapping_enabled;
4318 #define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
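/*
 * add_device callback: put the new device into an IOMMU group.  The group is
 * keyed on the device that actually issues DMA on its behalf (an upstream
 * PCIe-to-PCI bridge, a DMA-source quirk alias, or a multifunction sibling /
 * upstream bridge lacking REQ_ACS_FLAGS isolation), so devices that cannot
 * be isolated from each other end up sharing one group.
 */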
4320 static int intel_iommu_add_device(struct device *dev)
4322 struct pci_dev *pdev = to_pci_dev(dev);
4323 struct pci_dev *bridge, *dma_pdev = NULL;
4324 struct iommu_group *group;
4327 if (!device_to_iommu(pci_domain_nr(pdev->bus),
4328 pdev->bus->number, pdev->devfn))
4331 bridge = pci_find_upstream_pcie_bridge(pdev);
4333 if (pci_is_pcie(bridge))
4334 dma_pdev = pci_get_domain_bus_and_slot(
4335 pci_domain_nr(pdev->bus),
4336 bridge->subordinate->number, 0);
4338 dma_pdev = pci_dev_get(bridge);
4340 dma_pdev = pci_dev_get(pdev);
4342 /* Account for quirked devices */
4343 swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4346 * If it's a multifunction device that does not support our
4347 * required ACS flags, add it to the same group as the lowest-numbered
4348 * function that also does not support the required ACS flags.
4350 if (dma_pdev->multifunction &&
4351 !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4352 u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4354 for (i = 0; i < 8; i++) {
4355 struct pci_dev *tmp;
4357 tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4361 if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4362 swap_pci_ref(&dma_pdev, tmp);
4370 * Devices on the root bus go through the iommu. If that's not us,
4371 * find the next upstream device and test ACS up to the root bus.
4372 * Finding the next device may require skipping virtual buses.
4374 while (!pci_is_root_bus(dma_pdev->bus)) {
4375 struct pci_bus *bus = dma_pdev->bus;
4377 while (!bus->self) {
4378 if (!pci_is_root_bus(bus))
4384 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4387 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4391 group = iommu_group_get(&dma_pdev->dev);
4392 pci_dev_put(dma_pdev);
4394 group = iommu_group_alloc();
4396 return PTR_ERR(group);
4399 ret = iommu_group_add_device(group, dev);
4401 iommu_group_put(group);
4405 static void intel_iommu_remove_device(struct device *dev)
4407 iommu_group_remove_device(dev);
4410 static struct iommu_ops intel_iommu_ops = {
4411 .domain_init = intel_iommu_domain_init,
4412 .domain_destroy = intel_iommu_domain_destroy,
4413 .attach_dev = intel_iommu_attach_device,
4414 .detach_dev = intel_iommu_detach_device,
4415 .map = intel_iommu_map,
4416 .unmap = intel_iommu_unmap,
4417 .iova_to_phys = intel_iommu_iova_to_phys,
4418 .domain_has_cap = intel_iommu_domain_has_cap,
4419 .add_device = intel_iommu_add_device,
4420 .remove_device = intel_iommu_remove_device,
4421 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4424 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4426 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4427 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4431 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4432 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4433 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4434 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4435 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4436 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4437 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4439 static void quirk_iommu_rwbf(struct pci_dev *dev)
4442 * Mobile 4 Series Chipset neglects to set RWBF capability,
4443 * but needs it. Same seems to hold for the desktop versions.
4445 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4449 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4450 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4451 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4452 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4453 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4454 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4455 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4458 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4459 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4460 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4461 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4462 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4463 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4464 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4465 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4467 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4471 if (pci_read_config_word(dev, GGC, &ggc))
4474 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4475 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4477 } else if (dmar_map_gfx) {
4478 /* we have to ensure the gfx device is idle before we flush */
4479 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4480 intel_iommu_strict = 1;
4483 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4484 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4485 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4486 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4488 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4489 ISOCH DMAR unit for the Azalia sound device, but not give it any
4490 TLB entries, which causes it to deadlock. Check for that. We do
4491 this in a function called from init_dmars(), instead of in a PCI
4492 quirk, because we don't want to print the obnoxious "BIOS broken"
4493 message if VT-d is actually disabled.
4495 static void __init check_tylersburg_isoch(void)
4497 struct pci_dev *pdev;
4498 uint32_t vtisochctrl;
4500 /* If there's no Azalia in the system anyway, forget it. */
4501 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4506 /* System Management Registers. Might be hidden, in which case
4507 we can't do the sanity check. But that's OK, because the
4508 known-broken BIOSes _don't_ actually hide it, so far. */
4509 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4513 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4520 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4521 if (vtisochctrl & 1)
4524 /* Drop all bits other than the number of TLB entries */
4525 vtisochctrl &= 0x1c;
4527 /* If we have the recommended number of TLB entries (16), fine. */
4528 if (vtisochctrl == 0x10)
4531 /* Zero TLB entries? You get to ride the short bus to school. */
4533 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4534 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4535 dmi_get_system_info(DMI_BIOS_VENDOR),
4536 dmi_get_system_info(DMI_BIOS_VERSION),
4537 dmi_get_system_info(DMI_PRODUCT_VERSION));
4538 iommu_identity_mapping |= IDENTMAP_AZALIA;
4542 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",