iommu/vt-d: Unify format of the printed messages
drivers/iommu/intel-iommu.c (linux-2.6-microblaze.git)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is a power-of-two multiple of 4KiB and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are a power-of-two multiple of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
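/*
 * Example: ~0xFFFUL sets every bit from bit 12 (4KiB) upwards, i.e. all
 * power-of-two sizes >= 4KiB are advertised, even though the hardware only
 * installs 4KiB, 2MiB and 1GiB leaf PTEs.
 */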
105
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
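/*
 * Worked example for the helpers above: a 48-bit address width gives
 * width_to_agaw(48) == 2, agaw_to_level(2) == 4 (a 4-level page table),
 * and agaw_to_width(2) == 48 again.
 */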
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline unsigned long level_mask(int level)
132 {
133         return -1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long level_size(int level)
137 {
138         return 1UL << level_to_offset_bits(level);
139 }
140
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
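/*
 * Example: at level 2 a single PTE spans level_size(2) == 512 4KiB pages
 * (2MiB) and lvl_to_nr_pages(2) == 512; level 3 spans 1GiB.
 */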
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * Set to 1 to panic the kernel if VT-d can't be enabled successfully
179  * (used when the kernel is launched with TXT).
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
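/*
 * In legacy (non-scalable/PASID) mode, a context entry that was copied from
 * a previous kernel (see context_set_copied()) is reported as not present,
 * so a fresh entry is set up instead of reusing the copied one; with PASID
 * enabled the raw present bit is returned as-is.
 */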
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
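/*
 * Field summary for the legacy context-entry helpers above: lo bit 0 is the
 * present bit, lo bits 2-3 the translation type, and the page-aligned upper
 * bits of lo the address root; hi bits 0-2 hold the address width and
 * hi bits 8-23 the domain id.
 */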
289
290 /*
291  * This domain is a static identity mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
301
302 /*
303  * This is a DMA domain allocated through the iommu domain allocation
304  * interface. But one or more devices belonging to this domain have
305  * been chosen to use a private domain. We should avoid using the
306  * map/unmap/iova_to_phys APIs on it.
307  */
308 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
309
310 /*
311  * When VT-d works in scalable mode, it allows DMA translation to
312  * happen through either the first-level or the second-level page table.
313  * This bit marks that the DMA translation for the domain goes through
314  * the first-level page table; otherwise, it goes through the second level.
315  */
316 #define DOMAIN_FLAG_USE_FIRST_LEVEL             BIT(2)
317
318 /*
319  * The domain represents a virtual machine which demands IOMMU nested
320  * translation mode support.
321  */
322 #define DOMAIN_FLAG_NESTING_MODE                BIT(3)
323
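/*
 * Iterate over the IOMMUs that hold a reference on @domain. This expands to
 * a for loop with a bare if, so the loop body must be a single statement or
 * a braced block, as in the uses below.
 */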
324 #define for_each_domain_iommu(idx, domain)                      \
325         for (idx = 0; idx < g_num_of_iommus; idx++)             \
326                 if (domain->iommu_refcnt[idx])
327
328 struct dmar_rmrr_unit {
329         struct list_head list;          /* list of rmrr units   */
330         struct acpi_dmar_header *hdr;   /* ACPI header          */
331         u64     base_address;           /* reserved base address*/
332         u64     end_address;            /* reserved end address */
333         struct dmar_dev_scope *devices; /* target devices */
334         int     devices_cnt;            /* target device count */
335 };
336
337 struct dmar_atsr_unit {
338         struct list_head list;          /* list of ATSR units */
339         struct acpi_dmar_header *hdr;   /* ACPI header */
340         struct dmar_dev_scope *devices; /* target devices */
341         int devices_cnt;                /* target device count */
342         u8 include_all:1;               /* include all ports */
343 };
344
345 static LIST_HEAD(dmar_atsr_units);
346 static LIST_HEAD(dmar_rmrr_units);
347
348 #define for_each_rmrr_units(rmrr) \
349         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
350
351 /* number of IOMMUs in the system; used to index g_iommus */
352 static int g_num_of_iommus;
353
354 static void domain_exit(struct dmar_domain *domain);
355 static void domain_remove_dev_info(struct dmar_domain *domain);
356 static void dmar_remove_one_dev_info(struct device *dev);
357 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
358 static int intel_iommu_attach_device(struct iommu_domain *domain,
359                                      struct device *dev);
360 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
361                                             dma_addr_t iova);
362
363 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
364 int dmar_disabled = 0;
365 #else
366 int dmar_disabled = 1;
367 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
368
369 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
370 int intel_iommu_sm = 1;
371 #else
372 int intel_iommu_sm;
373 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
374
375 int intel_iommu_enabled = 0;
376 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
377
378 static int dmar_map_gfx = 1;
379 static int dmar_forcedac;
380 static int intel_iommu_strict;
381 static int intel_iommu_superpage = 1;
382 static int iommu_identity_mapping;
383 static int intel_no_bounce;
384
385 #define IDENTMAP_GFX            2
386 #define IDENTMAP_AZALIA         4
387
388 int intel_iommu_gfx_mapped;
389 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
390
391 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
392 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
393 DEFINE_SPINLOCK(device_domain_lock);
394 static LIST_HEAD(device_domain_list);
395
396 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
397                                 to_pci_dev(d)->untrusted)
398
399 /*
400  * Iterate over elements in device_domain_list and call the specified
401  * callback @fn against each element.
402  */
403 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
404                                      void *data), void *data)
405 {
406         int ret = 0;
407         unsigned long flags;
408         struct device_domain_info *info;
409
410         spin_lock_irqsave(&device_domain_lock, flags);
411         list_for_each_entry(info, &device_domain_list, global) {
412                 ret = fn(info, data);
413                 if (ret) {
414                         spin_unlock_irqrestore(&device_domain_lock, flags);
415                         return ret;
416                 }
417         }
418         spin_unlock_irqrestore(&device_domain_lock, flags);
419
420         return 0;
421 }
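/*
 * A minimal, hypothetical usage sketch (names below are illustrative only):
 * the callback inspects each device_domain_info and returns non-zero to
 * stop the walk early, e.g.
 *
 *	static int match_bus(struct device_domain_info *info, void *data)
 *	{
 *		return info->bus == *(u8 *)data;
 *	}
 *
 *	ret = for_each_device_domain(match_bus, &bus);
 */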
422
423 const struct iommu_ops intel_iommu_ops;
424
425 static bool translation_pre_enabled(struct intel_iommu *iommu)
426 {
427         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
428 }
429
430 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
431 {
432         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
433 }
434
435 static void init_translation_status(struct intel_iommu *iommu)
436 {
437         u32 gsts;
438
439         gsts = readl(iommu->reg + DMAR_GSTS_REG);
440         if (gsts & DMA_GSTS_TES)
441                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
442 }
443
444 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
445 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
446 {
447         return container_of(dom, struct dmar_domain, domain);
448 }
449
450 static int __init intel_iommu_setup(char *str)
451 {
452         if (!str)
453                 return -EINVAL;
454         while (*str) {
455                 if (!strncmp(str, "on", 2)) {
456                         dmar_disabled = 0;
457                         pr_info("IOMMU enabled\n");
458                 } else if (!strncmp(str, "off", 3)) {
459                         dmar_disabled = 1;
460                         no_platform_optin = 1;
461                         pr_info("IOMMU disabled\n");
462                 } else if (!strncmp(str, "igfx_off", 8)) {
463                         dmar_map_gfx = 0;
464                         pr_info("Disable GFX device mapping\n");
465                 } else if (!strncmp(str, "forcedac", 8)) {
466                         pr_info("Forcing DAC for PCI devices\n");
467                         dmar_forcedac = 1;
468                 } else if (!strncmp(str, "strict", 6)) {
469                         pr_info("Disable batched IOTLB flush\n");
470                         intel_iommu_strict = 1;
471                 } else if (!strncmp(str, "sp_off", 6)) {
472                         pr_info("Disable supported super page\n");
473                         intel_iommu_superpage = 0;
474                 } else if (!strncmp(str, "sm_on", 5)) {
475                         pr_info("Intel-IOMMU: scalable mode supported\n");
476                         intel_iommu_sm = 1;
477                 } else if (!strncmp(str, "tboot_noforce", 13)) {
478                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
479                         intel_iommu_tboot_noforce = 1;
480                 } else if (!strncmp(str, "nobounce", 8)) {
481                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
482                         intel_no_bounce = 1;
483                 }
484
485                 str += strcspn(str, ",");
486                 while (*str == ',')
487                         str++;
488         }
489         return 0;
490 }
491 __setup("intel_iommu=", intel_iommu_setup);
492
493 static struct kmem_cache *iommu_domain_cache;
494 static struct kmem_cache *iommu_devinfo_cache;
495
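/*
 * Per-IOMMU domain-ID lookup: iommu->domains is a two-level table in which
 * the high byte of the DID selects a 256-entry chunk and the low byte
 * indexes into it; chunks are allocated lazily in set_iommu_domain().
 */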
496 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
497 {
498         struct dmar_domain **domains;
499         int idx = did >> 8;
500
501         domains = iommu->domains[idx];
502         if (!domains)
503                 return NULL;
504
505         return domains[did & 0xff];
506 }
507
508 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
509                              struct dmar_domain *domain)
510 {
511         struct dmar_domain **domains;
512         int idx = did >> 8;
513
514         if (!iommu->domains[idx]) {
515                 size_t size = 256 * sizeof(struct dmar_domain *);
516                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
517         }
518
519         domains = iommu->domains[idx];
520         if (WARN_ON(!domains))
521                 return;
522         else
523                 domains[did & 0xff] = domain;
524 }
525
526 void *alloc_pgtable_page(int node)
527 {
528         struct page *page;
529         void *vaddr = NULL;
530
531         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
532         if (page)
533                 vaddr = page_address(page);
534         return vaddr;
535 }
536
537 void free_pgtable_page(void *vaddr)
538 {
539         free_page((unsigned long)vaddr);
540 }
541
542 static inline void *alloc_domain_mem(void)
543 {
544         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
545 }
546
547 static void free_domain_mem(void *vaddr)
548 {
549         kmem_cache_free(iommu_domain_cache, vaddr);
550 }
551
552 static inline void *alloc_devinfo_mem(void)
553 {
554         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
555 }
556
557 static inline void free_devinfo_mem(void *vaddr)
558 {
559         kmem_cache_free(iommu_devinfo_cache, vaddr);
560 }
561
562 static inline int domain_type_is_si(struct dmar_domain *domain)
563 {
564         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
565 }
566
567 static inline bool domain_use_first_level(struct dmar_domain *domain)
568 {
569         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
570 }
571
572 static inline int domain_pfn_supported(struct dmar_domain *domain,
573                                        unsigned long pfn)
574 {
575         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
576
577         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
578 }
579
580 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
581 {
582         unsigned long sagaw;
583         int agaw = -1;
584
585         sagaw = cap_sagaw(iommu->cap);
586         for (agaw = width_to_agaw(max_gaw);
587              agaw >= 0; agaw--) {
588                 if (test_bit(agaw, &sagaw))
589                         break;
590         }
591
592         return agaw;
593 }
594
595 /*
596  * Calculate max SAGAW for each iommu.
597  */
598 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
599 {
600         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
601 }
602
603 /*
604  * Calculate the agaw for each iommu.
605  * "SAGAW" may differ across iommus, so use a default agaw and fall back
606  * to a smaller supported agaw for iommus that don't support the default.
607  */
608 int iommu_calculate_agaw(struct intel_iommu *iommu)
609 {
610         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
611 }
612
613 /* This function only returns a single iommu in a domain */
614 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
615 {
616         int iommu_id;
617
618         /* si_domain and vm domain should not get here. */
619         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
620                 return NULL;
621
622         for_each_domain_iommu(iommu_id, domain)
623                 break;
624
625         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
626                 return NULL;
627
628         return g_iommus[iommu_id];
629 }
630
631 static void domain_update_iommu_coherency(struct dmar_domain *domain)
632 {
633         struct dmar_drhd_unit *drhd;
634         struct intel_iommu *iommu;
635         bool found = false;
636         int i;
637
638         domain->iommu_coherency = 1;
639
640         for_each_domain_iommu(i, domain) {
641                 found = true;
642                 if (!ecap_coherent(g_iommus[i]->ecap)) {
643                         domain->iommu_coherency = 0;
644                         break;
645                 }
646         }
647         if (found)
648                 return;
649
650         /* No hardware attached; use lowest common denominator */
651         rcu_read_lock();
652         for_each_active_iommu(iommu, drhd) {
653                 if (!ecap_coherent(iommu->ecap)) {
654                         domain->iommu_coherency = 0;
655                         break;
656                 }
657         }
658         rcu_read_unlock();
659 }
660
661 static int domain_update_iommu_snooping(struct intel_iommu *skip)
662 {
663         struct dmar_drhd_unit *drhd;
664         struct intel_iommu *iommu;
665         int ret = 1;
666
667         rcu_read_lock();
668         for_each_active_iommu(iommu, drhd) {
669                 if (iommu != skip) {
670                         if (!ecap_sc_support(iommu->ecap)) {
671                                 ret = 0;
672                                 break;
673                         }
674                 }
675         }
676         rcu_read_unlock();
677
678         return ret;
679 }
680
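/*
 * Returns the number of large-page levels supported by all relevant IOMMUs:
 * 0 means 4KiB only, 1 adds 2MiB and 2 adds 1GiB pages (fls() of the
 * two-bit capability mask).
 */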
681 static int domain_update_iommu_superpage(struct dmar_domain *domain,
682                                          struct intel_iommu *skip)
683 {
684         struct dmar_drhd_unit *drhd;
685         struct intel_iommu *iommu;
686         int mask = 0x3;
687
688         if (!intel_iommu_superpage) {
689                 return 0;
690         }
691
692         /* set iommu_superpage to the smallest common denominator */
693         rcu_read_lock();
694         for_each_active_iommu(iommu, drhd) {
695                 if (iommu != skip) {
696                         if (domain && domain_use_first_level(domain)) {
697                                 if (!cap_fl1gp_support(iommu->cap))
698                                         mask = 0x1;
699                         } else {
700                                 mask &= cap_super_page_val(iommu->cap);
701                         }
702
703                         if (!mask)
704                                 break;
705                 }
706         }
707         rcu_read_unlock();
708
709         return fls(mask);
710 }
711
712 /* Some capabilities may be different across iommus */
713 static void domain_update_iommu_cap(struct dmar_domain *domain)
714 {
715         domain_update_iommu_coherency(domain);
716         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
717         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
718 }
719
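/*
 * Return the context entry for bus/devfn, optionally allocating the context
 * table page. In scalable mode each root entry is split in two: the low
 * 64 bits cover devfn 0x00-0x7f and the high 64 bits cover 0x80-0xff, and
 * each context entry is twice the legacy size (hence devfn *= 2).
 */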
720 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
721                                          u8 devfn, int alloc)
722 {
723         struct root_entry *root = &iommu->root_entry[bus];
724         struct context_entry *context;
725         u64 *entry;
726
727         entry = &root->lo;
728         if (sm_supported(iommu)) {
729                 if (devfn >= 0x80) {
730                         devfn -= 0x80;
731                         entry = &root->hi;
732                 }
733                 devfn *= 2;
734         }
735         if (*entry & 1)
736                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
737         else {
738                 unsigned long phy_addr;
739                 if (!alloc)
740                         return NULL;
741
742                 context = alloc_pgtable_page(iommu->node);
743                 if (!context)
744                         return NULL;
745
746                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
747                 phy_addr = virt_to_phys((void *)context);
748                 *entry = phy_addr | 1;
749                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
750         }
751         return &context[devfn];
752 }
753
754 static int iommu_dummy(struct device *dev)
755 {
756         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
757 }
758
759 static bool attach_deferred(struct device *dev)
760 {
761         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
762 }
763
764 /**
765  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
766  *                               sub-hierarchy of a candidate PCI-PCI bridge
767  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
768  * @bridge: the candidate PCI-PCI bridge
769  *
770  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
771  */
772 static bool
773 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
774 {
775         struct pci_dev *pdev, *pbridge;
776
777         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
778                 return false;
779
780         pdev = to_pci_dev(dev);
781         pbridge = to_pci_dev(bridge);
782
783         if (pbridge->subordinate &&
784             pbridge->subordinate->number <= pdev->bus->number &&
785             pbridge->subordinate->busn_res.end >= pdev->bus->number)
786                 return true;
787
788         return false;
789 }
790
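/*
 * Find the IOMMU that covers @dev by walking the active DRHD units and
 * their device scopes, and return the bus/devfn to use for context-table
 * programming. VFs are matched through their PF, and devices with an ACPI
 * companion are matched through that companion device.
 */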
791 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
792 {
793         struct dmar_drhd_unit *drhd = NULL;
794         struct intel_iommu *iommu;
795         struct device *tmp;
796         struct pci_dev *pdev = NULL;
797         u16 segment = 0;
798         int i;
799
800         if (iommu_dummy(dev))
801                 return NULL;
802
803         if (dev_is_pci(dev)) {
804                 struct pci_dev *pf_pdev;
805
806                 pdev = pci_real_dma_dev(to_pci_dev(dev));
807
808                 /* VFs aren't listed in scope tables; we need to look up
809                  * the PF instead to find the IOMMU. */
810                 pf_pdev = pci_physfn(pdev);
811                 dev = &pf_pdev->dev;
812                 segment = pci_domain_nr(pdev->bus);
813         } else if (has_acpi_companion(dev))
814                 dev = &ACPI_COMPANION(dev)->dev;
815
816         rcu_read_lock();
817         for_each_active_iommu(iommu, drhd) {
818                 if (pdev && segment != drhd->segment)
819                         continue;
820
821                 for_each_active_dev_scope(drhd->devices,
822                                           drhd->devices_cnt, i, tmp) {
823                         if (tmp == dev) {
824                                 /* For a VF use its original BDF# not that of the PF
825                                  * which we used for the IOMMU lookup. Strictly speaking
826                                  * we could do this for all PCI devices; we only need to
827                                  * get the BDF# from the scope table for ACPI matches. */
828                                 if (pdev && pdev->is_virtfn)
829                                         goto got_pdev;
830
831                                 *bus = drhd->devices[i].bus;
832                                 *devfn = drhd->devices[i].devfn;
833                                 goto out;
834                         }
835
836                         if (is_downstream_to_pci_bridge(dev, tmp))
837                                 goto got_pdev;
838                 }
839
840                 if (pdev && drhd->include_all) {
841                 got_pdev:
842                         *bus = pdev->bus->number;
843                         *devfn = pdev->devfn;
844                         goto out;
845                 }
846         }
847         iommu = NULL;
848  out:
849         rcu_read_unlock();
850
851         return iommu;
852 }
853
854 static void domain_flush_cache(struct dmar_domain *domain,
855                                void *addr, int size)
856 {
857         if (!domain->iommu_coherency)
858                 clflush_cache_range(addr, size);
859 }
860
861 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
862 {
863         struct context_entry *context;
864         int ret = 0;
865         unsigned long flags;
866
867         spin_lock_irqsave(&iommu->lock, flags);
868         context = iommu_context_addr(iommu, bus, devfn, 0);
869         if (context)
870                 ret = context_present(context);
871         spin_unlock_irqrestore(&iommu->lock, flags);
872         return ret;
873 }
874
875 static void free_context_table(struct intel_iommu *iommu)
876 {
877         int i;
878         unsigned long flags;
879         struct context_entry *context;
880
881         spin_lock_irqsave(&iommu->lock, flags);
882         if (!iommu->root_entry) {
883                 goto out;
884         }
885         for (i = 0; i < ROOT_ENTRY_NR; i++) {
886                 context = iommu_context_addr(iommu, i, 0, 0);
887                 if (context)
888                         free_pgtable_page(context);
889
890                 if (!sm_supported(iommu))
891                         continue;
892
893                 context = iommu_context_addr(iommu, i, 0x80, 0);
894                 if (context)
895                         free_pgtable_page(context);
896
897         }
898         free_pgtable_page(iommu->root_entry);
899         iommu->root_entry = NULL;
900 out:
901         spin_unlock_irqrestore(&iommu->lock, flags);
902 }
903
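/*
 * Walk the page table down to *target_level for @pfn, allocating missing
 * table pages along the way. A *target_level of 0 means "stop at the
 * existing leaf (superpage or non-present entry) without allocating"; on
 * return *target_level holds the level actually reached. Returns NULL if
 * @pfn is beyond the domain's address width or an allocation fails.
 */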
904 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
905                                       unsigned long pfn, int *target_level)
906 {
907         struct dma_pte *parent, *pte;
908         int level = agaw_to_level(domain->agaw);
909         int offset;
910
911         BUG_ON(!domain->pgd);
912
913         if (!domain_pfn_supported(domain, pfn))
914                 /* Address beyond IOMMU's addressing capabilities. */
915                 return NULL;
916
917         parent = domain->pgd;
918
919         while (1) {
920                 void *tmp_page;
921
922                 offset = pfn_level_offset(pfn, level);
923                 pte = &parent[offset];
924                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
925                         break;
926                 if (level == *target_level)
927                         break;
928
929                 if (!dma_pte_present(pte)) {
930                         uint64_t pteval;
931
932                         tmp_page = alloc_pgtable_page(domain->nid);
933
934                         if (!tmp_page)
935                                 return NULL;
936
937                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
938                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
939                         if (domain_use_first_level(domain))
940                                 pteval |= DMA_FL_PTE_XD;
941                         if (cmpxchg64(&pte->val, 0ULL, pteval))
942                                 /* Someone else set it while we were thinking; use theirs. */
943                                 free_pgtable_page(tmp_page);
944                         else
945                                 domain_flush_cache(domain, pte, sizeof(*pte));
946                 }
947                 if (level == 1)
948                         break;
949
950                 parent = phys_to_virt(dma_pte_addr(pte));
951                 level--;
952         }
953
954         if (!*target_level)
955                 *target_level = level;
956
957         return pte;
958 }
959
960 /* return address's pte at specific level */
961 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
962                                          unsigned long pfn,
963                                          int level, int *large_page)
964 {
965         struct dma_pte *parent, *pte;
966         int total = agaw_to_level(domain->agaw);
967         int offset;
968
969         parent = domain->pgd;
970         while (level <= total) {
971                 offset = pfn_level_offset(pfn, total);
972                 pte = &parent[offset];
973                 if (level == total)
974                         return pte;
975
976                 if (!dma_pte_present(pte)) {
977                         *large_page = total;
978                         break;
979                 }
980
981                 if (dma_pte_superpage(pte)) {
982                         *large_page = total;
983                         return pte;
984                 }
985
986                 parent = phys_to_virt(dma_pte_addr(pte));
987                 total--;
988         }
989         return NULL;
990 }
991
992 /* Clear last-level PTEs; a TLB flush should follow. */
993 static void dma_pte_clear_range(struct dmar_domain *domain,
994                                 unsigned long start_pfn,
995                                 unsigned long last_pfn)
996 {
997         unsigned int large_page;
998         struct dma_pte *first_pte, *pte;
999
1000         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1001         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1002         BUG_ON(start_pfn > last_pfn);
1003
1004         /* we don't need lock here; nobody else touches the iova range */
1005         do {
1006                 large_page = 1;
1007                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1008                 if (!pte) {
1009                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1010                         continue;
1011                 }
1012                 do {
1013                         dma_clear_pte(pte);
1014                         start_pfn += lvl_to_nr_pages(large_page);
1015                         pte++;
1016                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1017
1018                 domain_flush_cache(domain, first_pte,
1019                                    (void *)pte - (void *)first_pte);
1020
1021         } while (start_pfn && start_pfn <= last_pfn);
1022 }
1023
1024 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1025                                int retain_level, struct dma_pte *pte,
1026                                unsigned long pfn, unsigned long start_pfn,
1027                                unsigned long last_pfn)
1028 {
1029         pfn = max(start_pfn, pfn);
1030         pte = &pte[pfn_level_offset(pfn, level)];
1031
1032         do {
1033                 unsigned long level_pfn;
1034                 struct dma_pte *level_pte;
1035
1036                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1037                         goto next;
1038
1039                 level_pfn = pfn & level_mask(level);
1040                 level_pte = phys_to_virt(dma_pte_addr(pte));
1041
1042                 if (level > 2) {
1043                         dma_pte_free_level(domain, level - 1, retain_level,
1044                                            level_pte, level_pfn, start_pfn,
1045                                            last_pfn);
1046                 }
1047
1048                 /*
1049                  * Free the page table if we're below the level we want to
1050                  * retain and the range covers the entire table.
1051                  */
1052                 if (level < retain_level && !(start_pfn > level_pfn ||
1053                       last_pfn < level_pfn + level_size(level) - 1)) {
1054                         dma_clear_pte(pte);
1055                         domain_flush_cache(domain, pte, sizeof(*pte));
1056                         free_pgtable_page(level_pte);
1057                 }
1058 next:
1059                 pfn += level_size(level);
1060         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1061 }
1062
1063 /*
1064  * clear last level (leaf) ptes and free page table pages below the
1065  * level we wish to keep intact.
1066  */
1067 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1068                                    unsigned long start_pfn,
1069                                    unsigned long last_pfn,
1070                                    int retain_level)
1071 {
1072         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1073         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1074         BUG_ON(start_pfn > last_pfn);
1075
1076         dma_pte_clear_range(domain, start_pfn, last_pfn);
1077
1078         /* We don't need lock here; nobody else touches the iova range */
1079         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1080                            domain->pgd, 0, start_pfn, last_pfn);
1081
1082         /* free pgd */
1083         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1084                 free_pgtable_page(domain->pgd);
1085                 domain->pgd = NULL;
1086         }
1087 }
1088
1089 /* When a page at a given level is being unlinked from its parent, we don't
1090    need to *modify* it at all. All we need to do is make a list of all the
1091    pages which can be freed just as soon as we've flushed the IOTLB and we
1092    know the hardware page-walk will no longer touch them.
1093    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1094    be freed. */
1095 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1096                                             int level, struct dma_pte *pte,
1097                                             struct page *freelist)
1098 {
1099         struct page *pg;
1100
1101         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1102         pg->freelist = freelist;
1103         freelist = pg;
1104
1105         if (level == 1)
1106                 return freelist;
1107
1108         pte = page_address(pg);
1109         do {
1110                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1111                         freelist = dma_pte_list_pagetables(domain, level - 1,
1112                                                            pte, freelist);
1113                 pte++;
1114         } while (!first_pte_in_page(pte));
1115
1116         return freelist;
1117 }
1118
1119 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1120                                         struct dma_pte *pte, unsigned long pfn,
1121                                         unsigned long start_pfn,
1122                                         unsigned long last_pfn,
1123                                         struct page *freelist)
1124 {
1125         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1126
1127         pfn = max(start_pfn, pfn);
1128         pte = &pte[pfn_level_offset(pfn, level)];
1129
1130         do {
1131                 unsigned long level_pfn;
1132
1133                 if (!dma_pte_present(pte))
1134                         goto next;
1135
1136                 level_pfn = pfn & level_mask(level);
1137
1138                 /* If range covers entire pagetable, free it */
1139                 if (start_pfn <= level_pfn &&
1140                     last_pfn >= level_pfn + level_size(level) - 1) {
1141                         /* These subordinate page tables are going away entirely. Don't
1142                            bother to clear them; we're just going to *free* them. */
1143                         if (level > 1 && !dma_pte_superpage(pte))
1144                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1145
1146                         dma_clear_pte(pte);
1147                         if (!first_pte)
1148                                 first_pte = pte;
1149                         last_pte = pte;
1150                 } else if (level > 1) {
1151                         /* Recurse down into a level that isn't *entirely* obsolete */
1152                         freelist = dma_pte_clear_level(domain, level - 1,
1153                                                        phys_to_virt(dma_pte_addr(pte)),
1154                                                        level_pfn, start_pfn, last_pfn,
1155                                                        freelist);
1156                 }
1157 next:
1158                 pfn += level_size(level);
1159         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1160
1161         if (first_pte)
1162                 domain_flush_cache(domain, first_pte,
1163                                    (void *)++last_pte - (void *)first_pte);
1164
1165         return freelist;
1166 }
1167
1168 /* We can't just free the pages because the IOMMU may still be walking
1169    the page tables, and may have cached the intermediate levels. The
1170    pages can only be freed after the IOTLB flush has been done. */
1171 static struct page *domain_unmap(struct dmar_domain *domain,
1172                                  unsigned long start_pfn,
1173                                  unsigned long last_pfn)
1174 {
1175         struct page *freelist;
1176
1177         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1178         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1179         BUG_ON(start_pfn > last_pfn);
1180
1181         /* we don't need lock here; nobody else touches the iova range */
1182         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1183                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1184
1185         /* free pgd */
1186         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1187                 struct page *pgd_page = virt_to_page(domain->pgd);
1188                 pgd_page->freelist = freelist;
1189                 freelist = pgd_page;
1190
1191                 domain->pgd = NULL;
1192         }
1193
1194         return freelist;
1195 }
1196
1197 static void dma_free_pagelist(struct page *freelist)
1198 {
1199         struct page *pg;
1200
1201         while ((pg = freelist)) {
1202                 freelist = pg->freelist;
1203                 free_pgtable_page(page_address(pg));
1204         }
1205 }
1206
1207 static void iova_entry_free(unsigned long data)
1208 {
1209         struct page *freelist = (struct page *)data;
1210
1211         dma_free_pagelist(freelist);
1212 }
1213
1214 /* iommu handling */
1215 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1216 {
1217         struct root_entry *root;
1218         unsigned long flags;
1219
1220         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1221         if (!root) {
1222                 pr_err("Allocating root entry for %s failed\n",
1223                         iommu->name);
1224                 return -ENOMEM;
1225         }
1226
1227         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1228
1229         spin_lock_irqsave(&iommu->lock, flags);
1230         iommu->root_entry = root;
1231         spin_unlock_irqrestore(&iommu->lock, flags);
1232
1233         return 0;
1234 }
1235
1236 static void iommu_set_root_entry(struct intel_iommu *iommu)
1237 {
1238         u64 addr;
1239         u32 sts;
1240         unsigned long flag;
1241
1242         addr = virt_to_phys(iommu->root_entry);
1243         if (sm_supported(iommu))
1244                 addr |= DMA_RTADDR_SMT;
1245
1246         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1247         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1248
1249         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1250
1251         /* Make sure the hardware completes it */
1252         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1253                       readl, (sts & DMA_GSTS_RTPS), sts);
1254
1255         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1256 }
1257
1258 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1259 {
1260         u32 val;
1261         unsigned long flag;
1262
1263         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1264                 return;
1265
1266         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1267         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1268
1269         /* Make sure the hardware completes it */
1270         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1271                       readl, (!(val & DMA_GSTS_WBFS)), val);
1272
1273         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1274 }
1275
1276 /* The return value determines whether we need a write buffer flush */
1277 static void __iommu_flush_context(struct intel_iommu *iommu,
1278                                   u16 did, u16 source_id, u8 function_mask,
1279                                   u64 type)
1280 {
1281         u64 val = 0;
1282         unsigned long flag;
1283
1284         switch (type) {
1285         case DMA_CCMD_GLOBAL_INVL:
1286                 val = DMA_CCMD_GLOBAL_INVL;
1287                 break;
1288         case DMA_CCMD_DOMAIN_INVL:
1289                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1290                 break;
1291         case DMA_CCMD_DEVICE_INVL:
1292                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1293                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1294                 break;
1295         default:
1296                 BUG();
1297         }
1298         val |= DMA_CCMD_ICC;
1299
1300         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1301         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1302
1303         /* Make sure the hardware completes it */
1304         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1305                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1306
1307         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1308 }
1309
1310 /* The return value determines whether we need a write buffer flush */
1311 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1312                                 u64 addr, unsigned int size_order, u64 type)
1313 {
1314         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1315         u64 val = 0, val_iva = 0;
1316         unsigned long flag;
1317
1318         switch (type) {
1319         case DMA_TLB_GLOBAL_FLUSH:
1320                 /* A global flush doesn't need to set IVA_REG */
1321                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1322                 break;
1323         case DMA_TLB_DSI_FLUSH:
1324                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1325                 break;
1326         case DMA_TLB_PSI_FLUSH:
1327                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1328                 /* IH bit is passed in as part of address */
1329                 val_iva = size_order | addr;
1330                 break;
1331         default:
1332                 BUG();
1333         }
1334         /* Note: set drain read/write */
1335 #if 0
1336         /*
1337          * This is probably meant to be extra safe. It looks like we can
1338          * ignore it without any impact.
1339          */
1340         if (cap_read_drain(iommu->cap))
1341                 val |= DMA_TLB_READ_DRAIN;
1342 #endif
1343         if (cap_write_drain(iommu->cap))
1344                 val |= DMA_TLB_WRITE_DRAIN;
1345
1346         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1347         /* Note: Only uses first TLB reg currently */
1348         if (val_iva)
1349                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1350         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1351
1352         /* Make sure the hardware completes it */
1353         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1354                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1355
1356         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1357
1358         /* check IOTLB invalidation granularity */
1359         if (DMA_TLB_IAIG(val) == 0)
1360                 pr_err("Flush IOTLB failed\n");
1361         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1362                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1363                         (unsigned long long)DMA_TLB_IIRG(type),
1364                         (unsigned long long)DMA_TLB_IAIG(val));
1365 }
1366
1367 static struct device_domain_info *
1368 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1369                          u8 bus, u8 devfn)
1370 {
1371         struct device_domain_info *info;
1372
1373         assert_spin_locked(&device_domain_lock);
1374
1375         if (!iommu->qi)
1376                 return NULL;
1377
1378         list_for_each_entry(info, &domain->devices, link)
1379                 if (info->iommu == iommu && info->bus == bus &&
1380                     info->devfn == devfn) {
1381                         if (info->ats_supported && info->dev)
1382                                 return info;
1383                         break;
1384                 }
1385
1386         return NULL;
1387 }
1388
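/*
 * Recompute domain->has_iotlb_device: true if any device in the domain
 * currently has ATS enabled, so that iommu_flush_dev_iotlb() knows whether
 * a device-IOTLB flush is needed at all.
 */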
1389 static void domain_update_iotlb(struct dmar_domain *domain)
1390 {
1391         struct device_domain_info *info;
1392         bool has_iotlb_device = false;
1393
1394         assert_spin_locked(&device_domain_lock);
1395
1396         list_for_each_entry(info, &domain->devices, link) {
1397                 struct pci_dev *pdev;
1398
1399                 if (!info->dev || !dev_is_pci(info->dev))
1400                         continue;
1401
1402                 pdev = to_pci_dev(info->dev);
1403                 if (pdev->ats_enabled) {
1404                         has_iotlb_device = true;
1405                         break;
1406                 }
1407         }
1408
1409         domain->has_iotlb_device = has_iotlb_device;
1410 }
1411
1412 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1413 {
1414         struct pci_dev *pdev;
1415
1416         assert_spin_locked(&device_domain_lock);
1417
1418         if (!info || !dev_is_pci(info->dev))
1419                 return;
1420
1421         pdev = to_pci_dev(info->dev);
1422         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1423          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1424          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1425          * reserved, which should be set to 0.
1426          */
1427         if (!ecap_dit(info->iommu->ecap))
1428                 info->pfsid = 0;
1429         else {
1430                 struct pci_dev *pf_pdev;
1431
1432                 /* pdev will be returned if the device is not a VF */
1433                 pf_pdev = pci_physfn(pdev);
1434                 info->pfsid = pci_dev_id(pf_pdev);
1435         }
1436
1437 #ifdef CONFIG_INTEL_IOMMU_SVM
1438         /* The PCIe spec, in its wisdom, declares that the behaviour of
1439            the device if you enable PASID support after ATS support is
1440            undefined. So always enable PASID support on devices which
1441            have it, even if we can't yet know if we're ever going to
1442            use it. */
1443         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1444                 info->pasid_enabled = 1;
1445
1446         if (info->pri_supported &&
1447             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1448             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1449                 info->pri_enabled = 1;
1450 #endif
1451         if (!pdev->untrusted && info->ats_supported &&
1452             pci_ats_page_aligned(pdev) &&
1453             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1454                 info->ats_enabled = 1;
1455                 domain_update_iotlb(info->domain);
1456                 info->ats_qdep = pci_ats_queue_depth(pdev);
1457         }
1458 }
1459
1460 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1461 {
1462         struct pci_dev *pdev;
1463
1464         assert_spin_locked(&device_domain_lock);
1465
1466         if (!dev_is_pci(info->dev))
1467                 return;
1468
1469         pdev = to_pci_dev(info->dev);
1470
1471         if (info->ats_enabled) {
1472                 pci_disable_ats(pdev);
1473                 info->ats_enabled = 0;
1474                 domain_update_iotlb(info->domain);
1475         }
1476 #ifdef CONFIG_INTEL_IOMMU_SVM
1477         if (info->pri_enabled) {
1478                 pci_disable_pri(pdev);
1479                 info->pri_enabled = 0;
1480         }
1481         if (info->pasid_enabled) {
1482                 pci_disable_pasid(pdev);
1483                 info->pasid_enabled = 0;
1484         }
1485 #endif
1486 }
1487
1488 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1489                                   u64 addr, unsigned mask)
1490 {
1491         u16 sid, qdep;
1492         unsigned long flags;
1493         struct device_domain_info *info;
1494
1495         if (!domain->has_iotlb_device)
1496                 return;
1497
1498         spin_lock_irqsave(&device_domain_lock, flags);
1499         list_for_each_entry(info, &domain->devices, link) {
1500                 if (!info->ats_enabled)
1501                         continue;
1502
1503                 sid = info->bus << 8 | info->devfn;
1504                 qdep = info->ats_qdep;
1505                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1506                                 qdep, addr, mask);
1507         }
1508         spin_unlock_irqrestore(&device_domain_lock, flags);
1509 }
1510
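/*
 * PASID-based IOTLB flush for first-level domains: flush the domain's
 * default PASID (if one is set) and, when devices are attached, the
 * RID2PASID entry used for DMA requests without a PASID.
 */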
1511 static void domain_flush_piotlb(struct intel_iommu *iommu,
1512                                 struct dmar_domain *domain,
1513                                 u64 addr, unsigned long npages, bool ih)
1514 {
1515         u16 did = domain->iommu_did[iommu->seq_id];
1516
1517         if (domain->default_pasid)
1518                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1519                                 addr, npages, ih);
1520
1521         if (!list_empty(&domain->devices))
1522                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1523 }
1524
1525 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1526                                   struct dmar_domain *domain,
1527                                   unsigned long pfn, unsigned int pages,
1528                                   int ih, int map)
1529 {
1530         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1531         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1532         u16 did = domain->iommu_did[iommu->seq_id];
1533
1534         BUG_ON(pages == 0);
1535
1536         if (ih)
1537                 ih = 1 << 6;
1538
1539         if (domain_use_first_level(domain)) {
1540                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1541         } else {
1542                 /*
1543                  * Fallback to domain selective flush if no PSI support or
1544                  * the size is too big. PSI requires page size to be 2 ^ x,
1545                  * and the base address is naturally aligned to the size.
1546                  */
1547                 if (!cap_pgsel_inv(iommu->cap) ||
1548                     mask > cap_max_amask_val(iommu->cap))
1549                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1550                                                         DMA_TLB_DSI_FLUSH);
1551                 else
1552                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1553                                                         DMA_TLB_PSI_FLUSH);
1554         }
1555
1556         /*
1557          * In caching mode, changes of pages from non-present to present require
1558          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1559          */
1560         if (!cap_caching_mode(iommu->cap) || !map)
1561                 iommu_flush_dev_iotlb(domain, addr, mask);
1562 }
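
/*
 * Purely illustrative sketch (not compiled): the PSI address-mask encoding
 * used by iommu_flush_iotlb_psi() above. A flush of 2^mask pages starts at a
 * base aligned to that size, so a request for 5 pages is rounded up to 8
 * pages and encoded as mask = 3. The example_psi_mask() helper is hypothetical.
 */
#if 0
static unsigned int example_psi_mask(unsigned int pages)
{
        /* e.g. pages = 5: __roundup_pow_of_two(5) = 8, ilog2(8) = 3 */
        return ilog2(__roundup_pow_of_two(pages));
}
#endif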
1563
1564 /* Notification for newly created mappings */
1565 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1566                                         struct dmar_domain *domain,
1567                                         unsigned long pfn, unsigned int pages)
1568 {
1569         /*
1570          * It's a non-present to present mapping. Only flush if caching mode
1571          * is enabled and the domain uses second-level translation.
1572          */
1573         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1574                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1575         else
1576                 iommu_flush_write_buffer(iommu);
1577 }
1578
1579 static void iommu_flush_iova(struct iova_domain *iovad)
1580 {
1581         struct dmar_domain *domain;
1582         int idx;
1583
1584         domain = container_of(iovad, struct dmar_domain, iovad);
1585
1586         for_each_domain_iommu(idx, domain) {
1587                 struct intel_iommu *iommu = g_iommus[idx];
1588                 u16 did = domain->iommu_did[iommu->seq_id];
1589
1590                 if (domain_use_first_level(domain))
1591                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1592                 else
1593                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1594                                                  DMA_TLB_DSI_FLUSH);
1595
1596                 if (!cap_caching_mode(iommu->cap))
1597                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1598                                               0, MAX_AGAW_PFN_WIDTH);
1599         }
1600 }
1601
1602 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1603 {
1604         u32 pmen;
1605         unsigned long flags;
1606
1607         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1608                 return;
1609
1610         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1611         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1612         pmen &= ~DMA_PMEN_EPM;
1613         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1614
1615         /* wait for the protected region status bit to clear */
1616         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1617                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1618
1619         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1620 }
1621
1622 static void iommu_enable_translation(struct intel_iommu *iommu)
1623 {
1624         u32 sts;
1625         unsigned long flags;
1626
1627         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1628         iommu->gcmd |= DMA_GCMD_TE;
1629         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1630
1631         /* Make sure hardware completes it */
1632         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1633                       readl, (sts & DMA_GSTS_TES), sts);
1634
1635         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1636 }
1637
1638 static void iommu_disable_translation(struct intel_iommu *iommu)
1639 {
1640         u32 sts;
1641         unsigned long flag;
1642
1643         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1644         iommu->gcmd &= ~DMA_GCMD_TE;
1645         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1646
1647         /* Make sure hardware completes it */
1648         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1649                       readl, (!(sts & DMA_GSTS_TES)), sts);
1650
1651         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1652 }
1653
1654 static int iommu_init_domains(struct intel_iommu *iommu)
1655 {
1656         u32 ndomains, nlongs;
1657         size_t size;
1658
1659         ndomains = cap_ndoms(iommu->cap);
1660         pr_debug("%s: Number of Domains supported <%d>\n",
1661                  iommu->name, ndomains);
1662         nlongs = BITS_TO_LONGS(ndomains);
1663
1664         spin_lock_init(&iommu->lock);
1665
1666         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1667         if (!iommu->domain_ids) {
1668                 pr_err("%s: Allocating domain id array failed\n",
1669                        iommu->name);
1670                 return -ENOMEM;
1671         }
1672
1673         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1674         iommu->domains = kzalloc(size, GFP_KERNEL);
1675
1676         if (iommu->domains) {
1677                 size = 256 * sizeof(struct dmar_domain *);
1678                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1679         }
1680
1681         if (!iommu->domains || !iommu->domains[0]) {
1682                 pr_err("%s: Allocating domain array failed\n",
1683                        iommu->name);
1684                 kfree(iommu->domain_ids);
1685                 kfree(iommu->domains);
1686                 iommu->domain_ids = NULL;
1687                 iommu->domains    = NULL;
1688                 return -ENOMEM;
1689         }
1690
1691         /*
1692          * If Caching mode is set, then invalid translations are tagged
1693          * with domain-id 0, hence we need to pre-allocate it. We also
1694          * use domain-id 0 as a marker for non-allocated domain-id, so
1695          * make sure it is not used for a real domain.
1696          */
1697         set_bit(0, iommu->domain_ids);
1698
1699         /*
1700          * The VT-d spec rev3.0 (section 6.2.3.1) requires that each PASID
1701          * entry for first-level or pass-through translation modes be
1702          * programmed with a domain id different from those used for
1703          * second-level or nested translation. We reserve a domain id for
1704          * this purpose.
1705          */
1706         if (sm_supported(iommu))
1707                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1708
1709         return 0;
1710 }
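
/*
 * Purely illustrative sketch (not compiled): the two-level iommu->domains
 * layout initialized above. The top-level array has one slot per block of
 * 256 domain-ids; each allocated slot points at a 256-entry array of
 * struct dmar_domain pointers, and only block 0 is allocated up front. The
 * hypothetical example_domain_lookup() mirrors the get_iommu_domain()
 * helper used elsewhere in this file.
 */
#if 0
static struct dmar_domain *example_domain_lookup(struct intel_iommu *iommu,
                                                 u16 did)
{
        struct dmar_domain **block = iommu->domains[did >> 8];

        return block ? block[did & 0xff] : NULL;
}
#endif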
1711
1712 static void disable_dmar_iommu(struct intel_iommu *iommu)
1713 {
1714         struct device_domain_info *info, *tmp;
1715         unsigned long flags;
1716
1717         if (!iommu->domains || !iommu->domain_ids)
1718                 return;
1719
1720         spin_lock_irqsave(&device_domain_lock, flags);
1721         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1722                 if (info->iommu != iommu)
1723                         continue;
1724
1725                 if (!info->dev || !info->domain)
1726                         continue;
1727
1728                 __dmar_remove_one_dev_info(info);
1729         }
1730         spin_unlock_irqrestore(&device_domain_lock, flags);
1731
1732         if (iommu->gcmd & DMA_GCMD_TE)
1733                 iommu_disable_translation(iommu);
1734 }
1735
1736 static void free_dmar_iommu(struct intel_iommu *iommu)
1737 {
1738         if ((iommu->domains) && (iommu->domain_ids)) {
1739                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1740                 int i;
1741
1742                 for (i = 0; i < elems; i++)
1743                         kfree(iommu->domains[i]);
1744                 kfree(iommu->domains);
1745                 kfree(iommu->domain_ids);
1746                 iommu->domains = NULL;
1747                 iommu->domain_ids = NULL;
1748         }
1749
1750         g_iommus[iommu->seq_id] = NULL;
1751
1752         /* free context mapping */
1753         free_context_table(iommu);
1754
1755 #ifdef CONFIG_INTEL_IOMMU_SVM
1756         if (pasid_supported(iommu)) {
1757                 if (ecap_prs(iommu->ecap))
1758                         intel_svm_finish_prq(iommu);
1759         }
1760 #endif
1761 }
1762
1763 /*
1764  * Check and return whether first level is used by default for
1765  * DMA translation.
1766  */
1767 static bool first_level_by_default(void)
1768 {
1769         struct dmar_drhd_unit *drhd;
1770         struct intel_iommu *iommu;
1771         static int first_level_support = -1;
1772
1773         if (likely(first_level_support != -1))
1774                 return first_level_support;
1775
1776         first_level_support = 1;
1777
1778         rcu_read_lock();
1779         for_each_active_iommu(iommu, drhd) {
1780                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1781                         first_level_support = 0;
1782                         break;
1783                 }
1784         }
1785         rcu_read_unlock();
1786
1787         return first_level_support;
1788 }
1789
1790 static struct dmar_domain *alloc_domain(int flags)
1791 {
1792         struct dmar_domain *domain;
1793
1794         domain = alloc_domain_mem();
1795         if (!domain)
1796                 return NULL;
1797
1798         memset(domain, 0, sizeof(*domain));
1799         domain->nid = NUMA_NO_NODE;
1800         domain->flags = flags;
1801         if (first_level_by_default())
1802                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1803         domain->has_iotlb_device = false;
1804         INIT_LIST_HEAD(&domain->devices);
1805
1806         return domain;
1807 }
1808
1809 /* Must be called with device_domain_lock and iommu->lock held */
1810 static int domain_attach_iommu(struct dmar_domain *domain,
1811                                struct intel_iommu *iommu)
1812 {
1813         unsigned long ndomains;
1814         int num;
1815
1816         assert_spin_locked(&device_domain_lock);
1817         assert_spin_locked(&iommu->lock);
1818
1819         domain->iommu_refcnt[iommu->seq_id] += 1;
1820         domain->iommu_count += 1;
1821         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1822                 ndomains = cap_ndoms(iommu->cap);
1823                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1824
1825                 if (num >= ndomains) {
1826                         pr_err("%s: No free domain ids\n", iommu->name);
1827                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1828                         domain->iommu_count -= 1;
1829                         return -ENOSPC;
1830                 }
1831
1832                 set_bit(num, iommu->domain_ids);
1833                 set_iommu_domain(iommu, num, domain);
1834
1835                 domain->iommu_did[iommu->seq_id] = num;
1836                 domain->nid                      = iommu->node;
1837
1838                 domain_update_iommu_cap(domain);
1839         }
1840
1841         return 0;
1842 }
1843
1844 static int domain_detach_iommu(struct dmar_domain *domain,
1845                                struct intel_iommu *iommu)
1846 {
1847         int num, count;
1848
1849         assert_spin_locked(&device_domain_lock);
1850         assert_spin_locked(&iommu->lock);
1851
1852         domain->iommu_refcnt[iommu->seq_id] -= 1;
1853         count = --domain->iommu_count;
1854         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1855                 num = domain->iommu_did[iommu->seq_id];
1856                 clear_bit(num, iommu->domain_ids);
1857                 set_iommu_domain(iommu, num, NULL);
1858
1859                 domain_update_iommu_cap(domain);
1860                 domain->iommu_did[iommu->seq_id] = 0;
1861         }
1862
1863         return count;
1864 }
1865
1866 static struct iova_domain reserved_iova_list;
1867 static struct lock_class_key reserved_rbtree_key;
1868
1869 static int dmar_init_reserved_ranges(void)
1870 {
1871         struct pci_dev *pdev = NULL;
1872         struct iova *iova;
1873         int i;
1874
1875         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1876
1877         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1878                 &reserved_rbtree_key);
1879
1880         /* IOAPIC ranges shouldn't be accessed by DMA */
1881         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1882                 IOVA_PFN(IOAPIC_RANGE_END));
1883         if (!iova) {
1884                 pr_err("Reserve IOAPIC range failed\n");
1885                 return -ENODEV;
1886         }
1887
1888         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1889         for_each_pci_dev(pdev) {
1890                 struct resource *r;
1891
1892                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1893                         r = &pdev->resource[i];
1894                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1895                                 continue;
1896                         iova = reserve_iova(&reserved_iova_list,
1897                                             IOVA_PFN(r->start),
1898                                             IOVA_PFN(r->end));
1899                         if (!iova) {
1900                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1901                                 return -ENODEV;
1902                         }
1903                 }
1904         }
1905         return 0;
1906 }
1907
1908 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1909 {
1910         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1911 }
1912
1913 static inline int guestwidth_to_adjustwidth(int gaw)
1914 {
1915         int agaw;
1916         int r = (gaw - 12) % 9;
1917
1918         if (r == 0)
1919                 agaw = gaw;
1920         else
1921                 agaw = gaw + 9 - r;
1922         if (agaw > 64)
1923                 agaw = 64;
1924         return agaw;
1925 }
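
/*
 * Purely illustrative sketch (not compiled): the rounding performed by
 * guestwidth_to_adjustwidth() above. Each page-table level translates 9
 * address bits above the 12-bit page offset, so the adjusted width is gaw
 * rounded up until (agaw - 12) is a multiple of 9, capped at 64 bits. The
 * example_check_adjustwidth() helper is hypothetical.
 */
#if 0
static void example_check_adjustwidth(void)
{
        WARN_ON(guestwidth_to_adjustwidth(48) != 48);   /* (48 - 12) % 9 == 0 */
        WARN_ON(guestwidth_to_adjustwidth(50) != 57);   /* 50 + 9 - 2 == 57   */
}
#endif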
1926
1927 static void domain_exit(struct dmar_domain *domain)
1928 {
1929
1930         /* Remove associated devices and clear attached or cached domains */
1931         domain_remove_dev_info(domain);
1932
1933         /* destroy iovas */
1934         put_iova_domain(&domain->iovad);
1935
1936         if (domain->pgd) {
1937                 struct page *freelist;
1938
1939                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1940                 dma_free_pagelist(freelist);
1941         }
1942
1943         free_domain_mem(domain);
1944 }
1945
1946 /*
1947  * Get the PASID directory size for a scalable-mode context entry.
1948  * A value of X in the PDTS field of a scalable-mode context entry
1949  * indicates a PASID directory with 2^(X + 7) entries.
1950  */
1951 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1952 {
1953         int pds, max_pde;
1954
1955         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1956         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1957         if (pds < 7)
1958                 return 0;
1959
1960         return pds - 7;
1961 }
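
/*
 * Purely illustrative sketch (not compiled): the PDTS encoding computed by
 * context_get_sm_pds() above. A returned value of X describes a PASID
 * directory with 2^(X + 7) entries, so pds == 0 is the minimum 128-entry
 * directory and pds == 7 a 16384-entry one. The example_pd_entries()
 * helper is hypothetical.
 */
#if 0
static unsigned long example_pd_entries(unsigned long pds)
{
        return 1UL << (pds + 7);
}
#endif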
1962
1963 /*
1964  * Set the RID_PASID field of a scalable-mode context entry. The
1965  * IOMMU hardware will use the PASID value set in this field for
1966  * translation of DMA requests without PASID.
1967  */
1968 static inline void
1969 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1970 {
1971         context->hi |= pasid & ((1 << 20) - 1);
1972         context->hi |= (1 << 20);
1973 }
1974
1975 /*
1976  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1977  * entry.
1978  */
1979 static inline void context_set_sm_dte(struct context_entry *context)
1980 {
1981         context->lo |= (1 << 2);
1982 }
1983
1984 /*
1985  * Set the PRE(Page Request Enable) field of a scalable mode context
1986  * entry.
1987  */
1988 static inline void context_set_sm_pre(struct context_entry *context)
1989 {
1990         context->lo |= (1 << 4);
1991 }
1992
1993 /* Convert value to context PASID directory size field coding. */
1994 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1995
1996 static int domain_context_mapping_one(struct dmar_domain *domain,
1997                                       struct intel_iommu *iommu,
1998                                       struct pasid_table *table,
1999                                       u8 bus, u8 devfn)
2000 {
2001         u16 did = domain->iommu_did[iommu->seq_id];
2002         int translation = CONTEXT_TT_MULTI_LEVEL;
2003         struct device_domain_info *info = NULL;
2004         struct context_entry *context;
2005         unsigned long flags;
2006         int ret;
2007
2008         WARN_ON(did == 0);
2009
2010         if (hw_pass_through && domain_type_is_si(domain))
2011                 translation = CONTEXT_TT_PASS_THROUGH;
2012
2013         pr_debug("Set context mapping for %02x:%02x.%d\n",
2014                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2015
2016         BUG_ON(!domain->pgd);
2017
2018         spin_lock_irqsave(&device_domain_lock, flags);
2019         spin_lock(&iommu->lock);
2020
2021         ret = -ENOMEM;
2022         context = iommu_context_addr(iommu, bus, devfn, 1);
2023         if (!context)
2024                 goto out_unlock;
2025
2026         ret = 0;
2027         if (context_present(context))
2028                 goto out_unlock;
2029
2030         /*
2031          * For kdump cases, old valid entries may be cached due to the
2032          * in-flight DMA and copied pgtable, but there is no unmapping
2033          * behaviour for them, thus we need an explicit cache flush for
2034          * the newly-mapped device. For kdump, at this point, the device
2035          * is supposed to have finished reset at its driver probe stage, so no
2036          * in-flight DMA will exist, and we don't need to worry about it
2037          * hereafter.
2038          */
2039         if (context_copied(context)) {
2040                 u16 did_old = context_domain_id(context);
2041
2042                 if (did_old < cap_ndoms(iommu->cap)) {
2043                         iommu->flush.flush_context(iommu, did_old,
2044                                                    (((u16)bus) << 8) | devfn,
2045                                                    DMA_CCMD_MASK_NOBIT,
2046                                                    DMA_CCMD_DEVICE_INVL);
2047                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2048                                                  DMA_TLB_DSI_FLUSH);
2049                 }
2050         }
2051
2052         context_clear_entry(context);
2053
2054         if (sm_supported(iommu)) {
2055                 unsigned long pds;
2056
2057                 WARN_ON(!table);
2058
2059                 /* Set up the PASID directory pointer: */
2060                 pds = context_get_sm_pds(table);
2061                 context->lo = (u64)virt_to_phys(table->table) |
2062                                 context_pdts(pds);
2063
2064                 /* Set up the RID_PASID field: */
2065                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2066
2067                 /*
2068                  * Set up the Device-TLB Enable bit and the Page Request
2069                  * Enable bit:
2070                  */
2071                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2072                 if (info && info->ats_supported)
2073                         context_set_sm_dte(context);
2074                 if (info && info->pri_supported)
2075                         context_set_sm_pre(context);
2076         } else {
2077                 struct dma_pte *pgd = domain->pgd;
2078                 int agaw;
2079
2080                 context_set_domain_id(context, did);
2081
2082                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2083                         /*
2084                          * Skip top levels of page tables for an IOMMU whose
2085                          * AGAW is smaller than the default. Unnecessary for PT mode.
2086                          */
2087                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2088                                 ret = -ENOMEM;
2089                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2090                                 if (!dma_pte_present(pgd))
2091                                         goto out_unlock;
2092                         }
2093
2094                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2095                         if (info && info->ats_supported)
2096                                 translation = CONTEXT_TT_DEV_IOTLB;
2097                         else
2098                                 translation = CONTEXT_TT_MULTI_LEVEL;
2099
2100                         context_set_address_root(context, virt_to_phys(pgd));
2101                         context_set_address_width(context, agaw);
2102                 } else {
2103                         /*
2104                          * In pass-through mode, AW must be programmed to
2105                          * indicate the largest AGAW value supported by
2106                          * hardware; ASR is ignored by hardware.
2107                          */
2108                         context_set_address_width(context, iommu->msagaw);
2109                 }
2110
2111                 context_set_translation_type(context, translation);
2112         }
2113
2114         context_set_fault_enable(context);
2115         context_set_present(context);
2116         domain_flush_cache(domain, context, sizeof(*context));
2117
2118         /*
2119          * It's a non-present to present mapping. If hardware doesn't cache
2120          * non-present entries we only need to flush the write-buffer. If it
2121          * _does_ cache non-present entries, then it does so in the special
2122          * domain #0, which we have to flush:
2123          */
2124         if (cap_caching_mode(iommu->cap)) {
2125                 iommu->flush.flush_context(iommu, 0,
2126                                            (((u16)bus) << 8) | devfn,
2127                                            DMA_CCMD_MASK_NOBIT,
2128                                            DMA_CCMD_DEVICE_INVL);
2129                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2130         } else {
2131                 iommu_flush_write_buffer(iommu);
2132         }
2133         iommu_enable_dev_iotlb(info);
2134
2135         ret = 0;
2136
2137 out_unlock:
2138         spin_unlock(&iommu->lock);
2139         spin_unlock_irqrestore(&device_domain_lock, flags);
2140
2141         return ret;
2142 }
2143
2144 struct domain_context_mapping_data {
2145         struct dmar_domain *domain;
2146         struct intel_iommu *iommu;
2147         struct pasid_table *table;
2148 };
2149
2150 static int domain_context_mapping_cb(struct pci_dev *pdev,
2151                                      u16 alias, void *opaque)
2152 {
2153         struct domain_context_mapping_data *data = opaque;
2154
2155         return domain_context_mapping_one(data->domain, data->iommu,
2156                                           data->table, PCI_BUS_NUM(alias),
2157                                           alias & 0xff);
2158 }
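
/*
 * Purely illustrative sketch (not compiled): a DMA alias is a 16-bit
 * requester-id, so the callback above splits it back into bus and devfn.
 * For alias 0x3a01, PCI_BUS_NUM() yields bus 0x3a and the low byte yields
 * devfn 0x01. The example_split_alias() helper is hypothetical.
 */
#if 0
static void example_split_alias(u16 alias, u8 *bus, u8 *devfn)
{
        *bus   = PCI_BUS_NUM(alias);
        *devfn = alias & 0xff;
}
#endif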
2159
2160 static int
2161 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2162 {
2163         struct domain_context_mapping_data data;
2164         struct pasid_table *table;
2165         struct intel_iommu *iommu;
2166         u8 bus, devfn;
2167
2168         iommu = device_to_iommu(dev, &bus, &devfn);
2169         if (!iommu)
2170                 return -ENODEV;
2171
2172         table = intel_pasid_get_table(dev);
2173
2174         if (!dev_is_pci(dev))
2175                 return domain_context_mapping_one(domain, iommu, table,
2176                                                   bus, devfn);
2177
2178         data.domain = domain;
2179         data.iommu = iommu;
2180         data.table = table;
2181
2182         return pci_for_each_dma_alias(to_pci_dev(dev),
2183                                       &domain_context_mapping_cb, &data);
2184 }
2185
2186 static int domain_context_mapped_cb(struct pci_dev *pdev,
2187                                     u16 alias, void *opaque)
2188 {
2189         struct intel_iommu *iommu = opaque;
2190
2191         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2192 }
2193
2194 static int domain_context_mapped(struct device *dev)
2195 {
2196         struct intel_iommu *iommu;
2197         u8 bus, devfn;
2198
2199         iommu = device_to_iommu(dev, &bus, &devfn);
2200         if (!iommu)
2201                 return -ENODEV;
2202
2203         if (!dev_is_pci(dev))
2204                 return device_context_mapped(iommu, bus, devfn);
2205
2206         return !pci_for_each_dma_alias(to_pci_dev(dev),
2207                                        domain_context_mapped_cb, iommu);
2208 }
2209
2210 /* Returns the number of VT-d pages, but aligned to the MM page size */
2211 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2212                                             size_t size)
2213 {
2214         host_addr &= ~PAGE_MASK;
2215         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2216 }
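
/*
 * Purely illustrative sketch (not compiled), assuming 4KiB MM pages: a
 * buffer starting at page offset 0x800 with a length of 0x1000 bytes spans
 * two pages, since PAGE_ALIGN(0x800 + 0x1000) = 0x2000 and
 * 0x2000 >> VTD_PAGE_SHIFT = 2. The example_check_nrpages() helper is
 * hypothetical.
 */
#if 0
static void example_check_nrpages(void)
{
        WARN_ON(aligned_nrpages(0x800, 0x1000) != 2);
}
#endif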
2217
2218 /* Return the largest possible superpage level for a given mapping */
2219 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2220                                           unsigned long iov_pfn,
2221                                           unsigned long phy_pfn,
2222                                           unsigned long pages)
2223 {
2224         int support, level = 1;
2225         unsigned long pfnmerge;
2226
2227         support = domain->iommu_superpage;
2228
2229         /* To use a large page, the virtual *and* physical addresses
2230            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2231            of them will mean we have to use smaller pages. So just
2232            merge them and check both at once. */
2233         pfnmerge = iov_pfn | phy_pfn;
2234
2235         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2236                 pages >>= VTD_STRIDE_SHIFT;
2237                 if (!pages)
2238                         break;
2239                 pfnmerge >>= VTD_STRIDE_SHIFT;
2240                 level++;
2241                 support--;
2242         }
2243         return level;
2244 }
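
/*
 * Purely illustrative sketch (not compiled): hardware_largepage_caps()
 * above only picks a 2MiB superpage (level 2) when both the IOVA and the
 * physical PFN have their low 9 bits clear (i.e. both addresses are 2MiB
 * aligned) and at least 512 pages remain, and only if the hardware
 * advertises superpage support for that level. The helper below is
 * hypothetical.
 */
#if 0
static int example_superpage_level(struct dmar_domain *domain)
{
        /* 2MiB-aligned IOVA and phys PFNs, 1024 pages: level 2 if supported */
        return hardware_largepage_caps(domain, 0x200, 0x400, 1024);
}
#endif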
2245
2246 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2247                             struct scatterlist *sg, unsigned long phys_pfn,
2248                             unsigned long nr_pages, int prot)
2249 {
2250         struct dma_pte *first_pte = NULL, *pte = NULL;
2251         phys_addr_t uninitialized_var(pteval);
2252         unsigned long sg_res = 0;
2253         unsigned int largepage_lvl = 0;
2254         unsigned long lvl_pages = 0;
2255         u64 attr;
2256
2257         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2258
2259         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2260                 return -EINVAL;
2261
2262         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2263         if (domain_use_first_level(domain))
2264                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
2265
2266         if (!sg) {
2267                 sg_res = nr_pages;
2268                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2269         }
2270
2271         while (nr_pages > 0) {
2272                 uint64_t tmp;
2273
2274                 if (!sg_res) {
2275                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2276
2277                         sg_res = aligned_nrpages(sg->offset, sg->length);
2278                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2279                         sg->dma_length = sg->length;
2280                         pteval = (sg_phys(sg) - pgoff) | attr;
2281                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2282                 }
2283
2284                 if (!pte) {
2285                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2286
2287                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2288                         if (!pte)
2289                                 return -ENOMEM;
2290                         /* It is a large page */
2291                         if (largepage_lvl > 1) {
2292                                 unsigned long nr_superpages, end_pfn;
2293
2294                                 pteval |= DMA_PTE_LARGE_PAGE;
2295                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2296
2297                                 nr_superpages = sg_res / lvl_pages;
2298                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2299
2300                                 /*
2301                                  * Ensure that old small page tables are
2302                                  * removed to make room for superpage(s).
2303                                  * We're adding new large pages, so make sure
2304                                  * we don't remove their parent tables.
2305                                  */
2306                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2307                                                        largepage_lvl + 1);
2308                         } else {
2309                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2310                         }
2311
2312                 }
2313                 /* We don't need a lock here; nobody else
2314                  * touches this IOVA range.
2315                  */
2316                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2317                 if (tmp) {
2318                         static int dumps = 5;
2319                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2320                                 iov_pfn, tmp, (unsigned long long)pteval);
2321                         if (dumps) {
2322                                 dumps--;
2323                                 debug_dma_dump_mappings(NULL);
2324                         }
2325                         WARN_ON(1);
2326                 }
2327
2328                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2329
2330                 BUG_ON(nr_pages < lvl_pages);
2331                 BUG_ON(sg_res < lvl_pages);
2332
2333                 nr_pages -= lvl_pages;
2334                 iov_pfn += lvl_pages;
2335                 phys_pfn += lvl_pages;
2336                 pteval += lvl_pages * VTD_PAGE_SIZE;
2337                 sg_res -= lvl_pages;
2338
2339                 /* If the next PTE would be the first in a new page, then we
2340                    need to flush the cache on the entries we've just written.
2341                    And then we'll need to recalculate 'pte', so clear it and
2342                    let it get set again in the if (!pte) block above.
2343
2344                    If we're done (!nr_pages) we need to flush the cache too.
2345
2346                    Also if we've been setting superpages, we may need to
2347                    recalculate 'pte' and switch back to smaller pages for the
2348                    end of the mapping, if the trailing size is not enough to
2349                    use another superpage (i.e. sg_res < lvl_pages). */
2350                 pte++;
2351                 if (!nr_pages || first_pte_in_page(pte) ||
2352                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2353                         domain_flush_cache(domain, first_pte,
2354                                            (void *)pte - (void *)first_pte);
2355                         pte = NULL;
2356                 }
2357
2358                 if (!sg_res && nr_pages)
2359                         sg = sg_next(sg);
2360         }
2361         return 0;
2362 }
2363
2364 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2365                           struct scatterlist *sg, unsigned long phys_pfn,
2366                           unsigned long nr_pages, int prot)
2367 {
2368         int iommu_id, ret;
2369         struct intel_iommu *iommu;
2370
2371         /* Do the real mapping first */
2372         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2373         if (ret)
2374                 return ret;
2375
2376         for_each_domain_iommu(iommu_id, domain) {
2377                 iommu = g_iommus[iommu_id];
2378                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2379         }
2380
2381         return 0;
2382 }
2383
2384 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2385                                     struct scatterlist *sg, unsigned long nr_pages,
2386                                     int prot)
2387 {
2388         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2389 }
2390
2391 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2392                                      unsigned long phys_pfn, unsigned long nr_pages,
2393                                      int prot)
2394 {
2395         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2396 }
2397
2398 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2399 {
2400         unsigned long flags;
2401         struct context_entry *context;
2402         u16 did_old;
2403
2404         if (!iommu)
2405                 return;
2406
2407         spin_lock_irqsave(&iommu->lock, flags);
2408         context = iommu_context_addr(iommu, bus, devfn, 0);
2409         if (!context) {
2410                 spin_unlock_irqrestore(&iommu->lock, flags);
2411                 return;
2412         }
2413         did_old = context_domain_id(context);
2414         context_clear_entry(context);
2415         __iommu_flush_cache(iommu, context, sizeof(*context));
2416         spin_unlock_irqrestore(&iommu->lock, flags);
2417         iommu->flush.flush_context(iommu,
2418                                    did_old,
2419                                    (((u16)bus) << 8) | devfn,
2420                                    DMA_CCMD_MASK_NOBIT,
2421                                    DMA_CCMD_DEVICE_INVL);
2422         iommu->flush.flush_iotlb(iommu,
2423                                  did_old,
2424                                  0,
2425                                  0,
2426                                  DMA_TLB_DSI_FLUSH);
2427 }
2428
2429 static inline void unlink_domain_info(struct device_domain_info *info)
2430 {
2431         assert_spin_locked(&device_domain_lock);
2432         list_del(&info->link);
2433         list_del(&info->global);
2434         if (info->dev)
2435                 info->dev->archdata.iommu = NULL;
2436 }
2437
2438 static void domain_remove_dev_info(struct dmar_domain *domain)
2439 {
2440         struct device_domain_info *info, *tmp;
2441         unsigned long flags;
2442
2443         spin_lock_irqsave(&device_domain_lock, flags);
2444         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2445                 __dmar_remove_one_dev_info(info);
2446         spin_unlock_irqrestore(&device_domain_lock, flags);
2447 }
2448
2449 struct dmar_domain *find_domain(struct device *dev)
2450 {
2451         struct device_domain_info *info;
2452
2453         if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2454                 return NULL;
2455
2456         if (dev_is_pci(dev))
2457                 dev = &pci_real_dma_dev(to_pci_dev(dev))->dev;
2458
2459         /* No lock here, assumes no domain exit in normal case */
2460         info = dev->archdata.iommu;
2461         if (likely(info))
2462                 return info->domain;
2463
2464         return NULL;
2465 }
2466
2467 static void do_deferred_attach(struct device *dev)
2468 {
2469         struct iommu_domain *domain;
2470
2471         dev->archdata.iommu = NULL;
2472         domain = iommu_get_domain_for_dev(dev);
2473         if (domain)
2474                 intel_iommu_attach_device(domain, dev);
2475 }
2476
2477 static inline struct device_domain_info *
2478 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2479 {
2480         struct device_domain_info *info;
2481
2482         list_for_each_entry(info, &device_domain_list, global)
2483                 if (info->iommu->segment == segment && info->bus == bus &&
2484                     info->devfn == devfn)
2485                         return info;
2486
2487         return NULL;
2488 }
2489
2490 static int domain_setup_first_level(struct intel_iommu *iommu,
2491                                     struct dmar_domain *domain,
2492                                     struct device *dev,
2493                                     int pasid)
2494 {
2495         int flags = PASID_FLAG_SUPERVISOR_MODE;
2496         struct dma_pte *pgd = domain->pgd;
2497         int agaw, level;
2498
2499         /*
2500          * Skip top levels of page tables for an IOMMU whose
2501          * AGAW is smaller than the default. Unnecessary for PT mode.
2502          */
2503         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2504                 pgd = phys_to_virt(dma_pte_addr(pgd));
2505                 if (!dma_pte_present(pgd))
2506                         return -ENOMEM;
2507         }
2508
2509         level = agaw_to_level(agaw);
2510         if (level != 4 && level != 5)
2511                 return -EINVAL;
2512
2513         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2514
2515         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2516                                              domain->iommu_did[iommu->seq_id],
2517                                              flags);
2518 }
2519
2520 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2521                                                     int bus, int devfn,
2522                                                     struct device *dev,
2523                                                     struct dmar_domain *domain)
2524 {
2525         struct dmar_domain *found = NULL;
2526         struct device_domain_info *info;
2527         unsigned long flags;
2528         int ret;
2529
2530         info = alloc_devinfo_mem();
2531         if (!info)
2532                 return NULL;
2533
2534         info->bus = bus;
2535         info->devfn = devfn;
2536         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2537         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2538         info->ats_qdep = 0;
2539         info->dev = dev;
2540         info->domain = domain;
2541         info->iommu = iommu;
2542         info->pasid_table = NULL;
2543         info->auxd_enabled = 0;
2544         INIT_LIST_HEAD(&info->auxiliary_domains);
2545
2546         if (dev && dev_is_pci(dev)) {
2547                 struct pci_dev *pdev = to_pci_dev(info->dev);
2548
2549                 if (!pdev->untrusted &&
2550                     !pci_ats_disabled() &&
2551                     ecap_dev_iotlb_support(iommu->ecap) &&
2552                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2553                     dmar_find_matched_atsr_unit(pdev))
2554                         info->ats_supported = 1;
2555
2556                 if (sm_supported(iommu)) {
2557                         if (pasid_supported(iommu)) {
2558                                 int features = pci_pasid_features(pdev);
2559                                 if (features >= 0)
2560                                         info->pasid_supported = features | 1;
2561                         }
2562
2563                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2564                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2565                                 info->pri_supported = 1;
2566                 }
2567         }
2568
2569         spin_lock_irqsave(&device_domain_lock, flags);
2570         if (dev)
2571                 found = find_domain(dev);
2572
2573         if (!found) {
2574                 struct device_domain_info *info2;
2575                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2576                 if (info2) {
2577                         found      = info2->domain;
2578                         info2->dev = dev;
2579                 }
2580         }
2581
2582         if (found) {
2583                 spin_unlock_irqrestore(&device_domain_lock, flags);
2584                 free_devinfo_mem(info);
2585                 /* Caller must free the original domain */
2586                 return found;
2587         }
2588
2589         spin_lock(&iommu->lock);
2590         ret = domain_attach_iommu(domain, iommu);
2591         spin_unlock(&iommu->lock);
2592
2593         if (ret) {
2594                 spin_unlock_irqrestore(&device_domain_lock, flags);
2595                 free_devinfo_mem(info);
2596                 return NULL;
2597         }
2598
2599         list_add(&info->link, &domain->devices);
2600         list_add(&info->global, &device_domain_list);
2601         if (dev)
2602                 dev->archdata.iommu = info;
2603         spin_unlock_irqrestore(&device_domain_lock, flags);
2604
2605         /* PASID table is mandatory for a PCI device in scalable mode. */
2606         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2607                 ret = intel_pasid_alloc_table(dev);
2608                 if (ret) {
2609                         dev_err(dev, "PASID table allocation failed\n");
2610                         dmar_remove_one_dev_info(dev);
2611                         return NULL;
2612                 }
2613
2614                 /* Set up the PASID entry for requests without PASID: */
2615                 spin_lock(&iommu->lock);
2616                 if (hw_pass_through && domain_type_is_si(domain))
2617                         ret = intel_pasid_setup_pass_through(iommu, domain,
2618                                         dev, PASID_RID2PASID);
2619                 else if (domain_use_first_level(domain))
2620                         ret = domain_setup_first_level(iommu, domain, dev,
2621                                         PASID_RID2PASID);
2622                 else
2623                         ret = intel_pasid_setup_second_level(iommu, domain,
2624                                         dev, PASID_RID2PASID);
2625                 spin_unlock(&iommu->lock);
2626                 if (ret) {
2627                         dev_err(dev, "Setup RID2PASID failed\n");
2628                         dmar_remove_one_dev_info(dev);
2629                         return NULL;
2630                 }
2631         }
2632
2633         if (dev && domain_context_mapping(domain, dev)) {
2634                 dev_err(dev, "Domain context map failed\n");
2635                 dmar_remove_one_dev_info(dev);
2636                 return NULL;
2637         }
2638
2639         return domain;
2640 }
2641
2642 static int iommu_domain_identity_map(struct dmar_domain *domain,
2643                                      unsigned long long start,
2644                                      unsigned long long end)
2645 {
2646         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2647         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2648
2649         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2650                           dma_to_mm_pfn(last_vpfn))) {
2651                 pr_err("Reserving iova failed\n");
2652                 return -ENOMEM;
2653         }
2654
2655         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2656         /*
2657          * RMRR range might have overlap with physical memory range,
2658          * clear it first
2659          */
2660         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2661
2662         return __domain_mapping(domain, first_vpfn, NULL,
2663                                 first_vpfn, last_vpfn - first_vpfn + 1,
2664                                 DMA_PTE_READ|DMA_PTE_WRITE);
2665 }
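
/*
 * Purely illustrative sketch (not compiled): identity-mapping the 1MiB
 * region 0x100000-0x1fffff with the helper above covers VT-d PFNs 0x100
 * through 0x1ff, i.e. 256 pages mapped so that IOVA == physical address
 * across the whole range. The example_identity_pfns() helper is
 * hypothetical.
 */
#if 0
static void example_identity_pfns(void)
{
        WARN_ON((0x100000ULL >> VTD_PAGE_SHIFT) != 0x100);
        WARN_ON((0x1fffffULL >> VTD_PAGE_SHIFT) != 0x1ff);
}
#endif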
2666
2667 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2668
2669 static int __init si_domain_init(int hw)
2670 {
2671         struct dmar_rmrr_unit *rmrr;
2672         struct device *dev;
2673         int i, nid, ret;
2674
2675         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2676         if (!si_domain)
2677                 return -EFAULT;
2678
2679         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2680                 domain_exit(si_domain);
2681                 return -EFAULT;
2682         }
2683
2684         if (hw)
2685                 return 0;
2686
2687         for_each_online_node(nid) {
2688                 unsigned long start_pfn, end_pfn;
2689                 int i;
2690
2691                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2692                         ret = iommu_domain_identity_map(si_domain,
2693                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2694                         if (ret)
2695                                 return ret;
2696                 }
2697         }
2698
2699         /*
2700          * Identity map the RMRRs so that devices with RMRRs can also use
2701          * the si_domain.
2702          */
2703         for_each_rmrr_units(rmrr) {
2704                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2705                                           i, dev) {
2706                         unsigned long long start = rmrr->base_address;
2707                         unsigned long long end = rmrr->end_address;
2708
2709                         if (WARN_ON(end < start ||
2710                                     end >> agaw_to_width(si_domain->agaw)))
2711                                 continue;
2712
2713                         ret = iommu_domain_identity_map(si_domain, start, end);
2714                         if (ret)
2715                                 return ret;
2716                 }
2717         }
2718
2719         return 0;
2720 }
2721
2722 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2723 {
2724         struct dmar_domain *ndomain;
2725         struct intel_iommu *iommu;
2726         u8 bus, devfn;
2727
2728         iommu = device_to_iommu(dev, &bus, &devfn);
2729         if (!iommu)
2730                 return -ENODEV;
2731
2732         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2733         if (ndomain != domain)
2734                 return -EBUSY;
2735
2736         return 0;
2737 }
2738
2739 static bool device_has_rmrr(struct device *dev)
2740 {
2741         struct dmar_rmrr_unit *rmrr;
2742         struct device *tmp;
2743         int i;
2744
2745         rcu_read_lock();
2746         for_each_rmrr_units(rmrr) {
2747                 /*
2748                  * Return TRUE if this RMRR contains the device that
2749                  * is passed in.
2750                  */
2751                 for_each_active_dev_scope(rmrr->devices,
2752                                           rmrr->devices_cnt, i, tmp)
2753                         if (tmp == dev ||
2754                             is_downstream_to_pci_bridge(dev, tmp)) {
2755                                 rcu_read_unlock();
2756                                 return true;
2757                         }
2758         }
2759         rcu_read_unlock();
2760         return false;
2761 }
2762
2763 /**
2764  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2765  * is relaxable (i.e., may be left unenforced under some conditions)
2766  * @dev: device handle
2767  *
2768  * We assume that PCI USB devices with RMRRs have them largely
2769  * for historical reasons and that the RMRR space is not actively used post
2770  * boot.  This exclusion may change if vendors begin to abuse it.
2771  *
2772  * The same exception is made for graphics devices, with the requirement that
2773  * any use of the RMRR regions will be torn down before assigning the device
2774  * to a guest.
2775  *
2776  * Return: true if the RMRR is relaxable, false otherwise
2777  */
2778 static bool device_rmrr_is_relaxable(struct device *dev)
2779 {
2780         struct pci_dev *pdev;
2781
2782         if (!dev_is_pci(dev))
2783                 return false;
2784
2785         pdev = to_pci_dev(dev);
2786         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2787                 return true;
2788         else
2789                 return false;
2790 }
2791
2792 /*
2793  * There are a couple of cases where we need to restrict the functionality of
2794  * devices associated with RMRRs.  The first is when evaluating a device for
2795  * identity mapping because problems exist when devices are moved in and out
2796  * of domains and their respective RMRR information is lost.  This means that
2797  * a device with associated RMRRs will never be in a "passthrough" domain.
2798  * The second is use of the device through the IOMMU API.  This interface
2799  * expects to have full control of the IOVA space for the device.  We cannot
2800  * satisfy both the requirement that RMRR access is maintained and have an
2801  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2802  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2803  * We therefore prevent devices associated with an RMRR from participating in
2804  * the IOMMU API, which eliminates them from device assignment.
2805  *
2806  * In both cases, devices which have relaxable RMRRs are not concerned by this
2807  * restriction. See device_rmrr_is_relaxable comment.
2808  */
2809 static bool device_is_rmrr_locked(struct device *dev)
2810 {
2811         if (!device_has_rmrr(dev))
2812                 return false;
2813
2814         if (device_rmrr_is_relaxable(dev))
2815                 return false;
2816
2817         return true;
2818 }
2819
2820 /*
2821  * Return the required default domain type for a specific device.
2822  *
2823  * @dev: the device in question
2824  *
2825  * Returns:
2826  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2827  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2829  *  - 0: both identity and dynamic domains work for this device
2830  */
2831 static int device_def_domain_type(struct device *dev)
2832 {
2833         if (dev_is_pci(dev)) {
2834                 struct pci_dev *pdev = to_pci_dev(dev);
2835
2836                 /*
2837                  * Prevent any device marked as untrusted from getting
2838                  * placed into the static identity mapping domain.
2839                  */
2840                 if (pdev->untrusted)
2841                         return IOMMU_DOMAIN_DMA;
2842
2843                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2844                         return IOMMU_DOMAIN_IDENTITY;
2845
2846                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2847                         return IOMMU_DOMAIN_IDENTITY;
2848         }
2849
2850         return 0;
2851 }
2852
2853 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2854 {
2855         /*
2856          * Start from a sane IOMMU hardware state.
2857          * If queued invalidation was already initialized by us
2858          * (for example, while enabling interrupt remapping), then
2859          * things are already rolling from a sane state.
2860          */
2861         if (!iommu->qi) {
2862                 /*
2863                  * Clear any previous faults.
2864                  */
2865                 dmar_fault(-1, iommu);
2866                 /*
2867                  * Disable queued invalidation if supported and already enabled
2868                  * before OS handover.
2869                  */
2870                 dmar_disable_qi(iommu);
2871         }
2872
2873         if (dmar_enable_qi(iommu)) {
2874                 /*
2875                  * Queued invalidation is not enabled, use register-based invalidation
2876                  */
2877                 iommu->flush.flush_context = __iommu_flush_context;
2878                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2879                 pr_info("%s: Using Register based invalidation\n",
2880                         iommu->name);
2881         } else {
2882                 iommu->flush.flush_context = qi_flush_context;
2883                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2884                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2885         }
2886 }
2887
2888 static int copy_context_table(struct intel_iommu *iommu,
2889                               struct root_entry *old_re,
2890                               struct context_entry **tbl,
2891                               int bus, bool ext)
2892 {
2893         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2894         struct context_entry *new_ce = NULL, ce;
2895         struct context_entry *old_ce = NULL;
2896         struct root_entry re;
2897         phys_addr_t old_ce_phys;
2898
2899         tbl_idx = ext ? bus * 2 : bus;
2900         memcpy(&re, old_re, sizeof(re));
2901
2902         for (devfn = 0; devfn < 256; devfn++) {
2903                 /* First calculate the correct index */
2904                 idx = (ext ? devfn * 2 : devfn) % 256;
2905
2906                 if (idx == 0) {
2907                         /* First save what we may have and clean up */
2908                         if (new_ce) {
2909                                 tbl[tbl_idx] = new_ce;
2910                                 __iommu_flush_cache(iommu, new_ce,
2911                                                     VTD_PAGE_SIZE);
2912                                 pos = 1;
2913                         }
2914
2915                         if (old_ce)
2916                                 memunmap(old_ce);
2917
2918                         ret = 0;
2919                         if (devfn < 0x80)
2920                                 old_ce_phys = root_entry_lctp(&re);
2921                         else
2922                                 old_ce_phys = root_entry_uctp(&re);
2923
2924                         if (!old_ce_phys) {
2925                                 if (ext && devfn == 0) {
2926                                         /* No LCTP, try UCTP */
2927                                         devfn = 0x7f;
2928                                         continue;
2929                                 } else {
2930                                         goto out;
2931                                 }
2932                         }
2933
2934                         ret = -ENOMEM;
2935                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2936                                         MEMREMAP_WB);
2937                         if (!old_ce)
2938                                 goto out;
2939
2940                         new_ce = alloc_pgtable_page(iommu->node);
2941                         if (!new_ce)
2942                                 goto out_unmap;
2943
2944                         ret = 0;
2945                 }
2946
2947                 /* Now copy the context entry */
2948                 memcpy(&ce, old_ce + idx, sizeof(ce));
2949
2950                 if (!__context_present(&ce))
2951                         continue;
2952
2953                 did = context_domain_id(&ce);
2954                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2955                         set_bit(did, iommu->domain_ids);
2956
2957                 /*
2958                  * We need a marker for copied context entries. This
2959                  * marker needs to work for the old format as well as
2960                  * for extended context entries.
2961                  *
2962                  * Bit 67 of the context entry is used. In the old
2963                  * format this bit is available to software, in the
2964                  * extended format it is the PGE bit, but PGE is ignored
2965                  * by HW if PASIDs are disabled (and thus still
2966                  * available).
2967                  *
2968                  * So disable PASIDs first and then mark the entry
2969                  * copied. This means that we don't copy PASID
2970                  * translations from the old kernel, but this is fine as
2971                  * faults there are not fatal.
2972                  */
2973                 context_clear_pasid_enable(&ce);
2974                 context_set_copied(&ce);
2975
2976                 new_ce[idx] = ce;
2977         }
2978
2979         tbl[tbl_idx + pos] = new_ce;
2980
2981         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2982
2983 out_unmap:
2984         memunmap(old_ce);
2985
2986 out:
2987         return ret;
2988 }
2989
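     /*
      * Copy the root and context tables set up by the previous kernel (the
      * kdump case) so that DMA which is still in flight stays translated
      * while this kernel takes over the hardware.
      */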
2990 static int copy_translation_tables(struct intel_iommu *iommu)
2991 {
2992         struct context_entry **ctxt_tbls;
2993         struct root_entry *old_rt;
2994         phys_addr_t old_rt_phys;
2995         int ctxt_table_entries;
2996         unsigned long flags;
2997         u64 rtaddr_reg;
2998         int bus, ret;
2999         bool new_ext, ext;
3000
3001         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3002         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3003         new_ext    = !!ecap_ecs(iommu->ecap);
3004
3005         /*
3006          * The RTT bit can only be changed when translation is disabled,
3007          * but disabling translation would open a window for data
3008          * corruption. So bail out and don't copy anything if we would
3009          * have to change the bit.
3010          */
3011         if (new_ext != ext)
3012                 return -EINVAL;
3013
3014         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3015         if (!old_rt_phys)
3016                 return -EINVAL;
3017
3018         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3019         if (!old_rt)
3020                 return -ENOMEM;
3021
3022         /* This is too big for the stack - allocate it from slab */
3023         ctxt_table_entries = ext ? 512 : 256;
3024         ret = -ENOMEM;
3025         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3026         if (!ctxt_tbls)
3027                 goto out_unmap;
3028
3029         for (bus = 0; bus < 256; bus++) {
3030                 ret = copy_context_table(iommu, &old_rt[bus],
3031                                          ctxt_tbls, bus, ext);
3032                 if (ret) {
3033                         pr_err("%s: Failed to copy context table for bus %d\n",
3034                                 iommu->name, bus);
3035                         continue;
3036                 }
3037         }
3038
3039         spin_lock_irqsave(&iommu->lock, flags);
3040
3041         /* Context tables are copied, now write them to the root_entry table */
3042         for (bus = 0; bus < 256; bus++) {
3043                 int idx = ext ? bus * 2 : bus;
3044                 u64 val;
3045
3046                 if (ctxt_tbls[idx]) {
3047                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3048                         iommu->root_entry[bus].lo = val;
3049                 }
3050
3051                 if (!ext || !ctxt_tbls[idx + 1])
3052                         continue;
3053
3054                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3055                 iommu->root_entry[bus].hi = val;
3056         }
3057
3058         spin_unlock_irqrestore(&iommu->lock, flags);
3059
3060         kfree(ctxt_tbls);
3061
3062         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3063
3064         ret = 0;
3065
3066 out_unmap:
3067         memunmap(old_rt);
3068
3069         return ret;
3070 }
3071
3072 static int __init init_dmars(void)
3073 {
3074         struct dmar_drhd_unit *drhd;
3075         struct intel_iommu *iommu;
3076         int ret;
3077
3078         /*
3079          * for each drhd
3080          *    allocate root
3081          *    initialize and program root entry to not present
3082          * endfor
3083          */
3084         for_each_drhd_unit(drhd) {
3085                 /*
3086                  * No lock needed: this is only incremented in the
3087                  * single-threaded kernel __init code path; all other
3088                  * accesses are read only.
3089                  */
3090                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3091                         g_num_of_iommus++;
3092                         continue;
3093                 }
3094                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3095         }
3096
3097         /* Preallocate enough resources for IOMMU hot-addition */
3098         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3099                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3100
3101         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3102                         GFP_KERNEL);
3103         if (!g_iommus) {
3104                 pr_err("Allocating global iommu array failed\n");
3105                 ret = -ENOMEM;
3106                 goto error;
3107         }
3108
3109         for_each_iommu(iommu, drhd) {
3110                 if (drhd->ignored) {
3111                         iommu_disable_translation(iommu);
3112                         continue;
3113                 }
3114
3115                 /*
3116                  * Find the max PASID size supported by each IOMMU in the
3117                  * system; the system-wide PASID table must be no bigger
3118                  * than the smallest size any IOMMU supports.
3119                  */
3120                 if (pasid_supported(iommu)) {
3121                         u32 temp = 2 << ecap_pss(iommu->ecap);
3122
3123                         intel_pasid_max_id = min_t(u32, temp,
3124                                                    intel_pasid_max_id);
3125                 }
3126
3127                 g_iommus[iommu->seq_id] = iommu;
3128
3129                 intel_iommu_init_qi(iommu);
3130
3131                 ret = iommu_init_domains(iommu);
3132                 if (ret)
3133                         goto free_iommu;
3134
3135                 init_translation_status(iommu);
3136
3137                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3138                         iommu_disable_translation(iommu);
3139                         clear_translation_pre_enabled(iommu);
3140                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3141                                 iommu->name);
3142                 }
3143
3144                 /*
3145                  * TBD:
3146                  * we could share the same root & context tables
3147                  * among all IOMMUs. This needs to be split out later.
3148                  */
3149                 ret = iommu_alloc_root_entry(iommu);
3150                 if (ret)
3151                         goto free_iommu;
3152
3153                 if (translation_pre_enabled(iommu)) {
3154                         pr_info("Translation already enabled - trying to copy translation structures\n");
3155
3156                         ret = copy_translation_tables(iommu);
3157                         if (ret) {
3158                                 /*
3159                                  * We found the IOMMU with translation
3160                                  * enabled - but failed to copy over the
3161                                  * old root-entry table. Try to proceed
3162                                  * by disabling translation now and
3163                                  * allocating a clean root-entry table.
3164                                  * This might cause DMAR faults, but
3165                                  * probably the dump will still succeed.
3166                                  */
3167                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3168                                        iommu->name);
3169                                 iommu_disable_translation(iommu);
3170                                 clear_translation_pre_enabled(iommu);
3171                         } else {
3172                                 pr_info("Copied translation tables from previous kernel for %s\n",
3173                                         iommu->name);
3174                         }
3175                 }
3176
3177                 if (!ecap_pass_through(iommu->ecap))
3178                         hw_pass_through = 0;
3179                 intel_svm_check(iommu);
3180         }
3181
3182         /*
3183          * Now that qi is enabled on all iommus, set the root entry and flush
3184          * caches. This is required on some Intel X58 chipsets, otherwise the
3185          * flush_context function will loop forever and the boot hangs.
3186          */
3187         for_each_active_iommu(iommu, drhd) {
3188                 iommu_flush_write_buffer(iommu);
3189                 iommu_set_root_entry(iommu);
3190                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3191                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3192         }
3193
3194 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3195         dmar_map_gfx = 0;
3196 #endif
3197
3198         if (!dmar_map_gfx)
3199                 iommu_identity_mapping |= IDENTMAP_GFX;
3200
3201         check_tylersburg_isoch();
3202
3203         ret = si_domain_init(hw_pass_through);
3204         if (ret)
3205                 goto free_iommu;
3206
3207         /*
3208          * for each drhd
3209          *   enable fault log
3210          *   global invalidate context cache
3211          *   global invalidate iotlb
3212          *   enable translation
3213          */
3214         for_each_iommu(iommu, drhd) {
3215                 if (drhd->ignored) {
3216                         /*
3217                          * we always have to disable PMRs or DMA may fail on
3218                          * this device
3219                          */
3220                         if (force_on)
3221                                 iommu_disable_protect_mem_regions(iommu);
3222                         continue;
3223                 }
3224
3225                 iommu_flush_write_buffer(iommu);
3226
3227 #ifdef CONFIG_INTEL_IOMMU_SVM
3228                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3229                         /*
3230                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3231                          * could cause a lock race, so drop the lock across the call.
3232                          */
3233                         up_write(&dmar_global_lock);
3234                         ret = intel_svm_enable_prq(iommu);
3235                         down_write(&dmar_global_lock);
3236                         if (ret)
3237                                 goto free_iommu;
3238                 }
3239 #endif
3240                 ret = dmar_set_interrupt(iommu);
3241                 if (ret)
3242                         goto free_iommu;
3243         }
3244
3245         return 0;
3246
3247 free_iommu:
3248         for_each_active_iommu(iommu, drhd) {
3249                 disable_dmar_iommu(iommu);
3250                 free_dmar_iommu(iommu);
3251         }
3252
3253         kfree(g_iommus);
3254
3255 error:
3256         return ret;
3257 }
3258
3259 /* This takes a number of _MM_ pages, not VTD pages */
3260 static unsigned long intel_alloc_iova(struct device *dev,
3261                                      struct dmar_domain *domain,
3262                                      unsigned long nrpages, uint64_t dma_mask)
3263 {
3264         unsigned long iova_pfn;
3265
3266         /*
3267          * Restrict dma_mask to the width that the iommu can handle.
3268          * First-level translation restricts the input-address to a
3269          * canonical address (i.e., address bits 63:N have the same
3270          * value as address bit [N-1], where N is 48 with 4-level
3271          * paging and 57 with 5-level paging). Hence, skip bit
3272          * [N-1].
3273          */
3274         if (domain_use_first_level(domain))
3275                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3276                                  dma_mask);
3277         else
3278                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3279                                  dma_mask);
3280
3281         /* Ensure we reserve the whole size-aligned region */
3282         nrpages = __roundup_pow_of_two(nrpages);
3283
3284         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3285                 /*
3286                  * First try to allocate an IO virtual address within
3287                  * DMA_BIT_MASK(32); if that fails, try allocating from
3288                  * the higher range.
3289                  */
3290                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3291                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3292                 if (iova_pfn)
3293                         return iova_pfn;
3294         }
3295         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3296                                    IOVA_PFN(dma_mask), true);
3297         if (unlikely(!iova_pfn)) {
3298                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3299                              nrpages);
3300                 return 0;
3301         }
3302
3303         return iova_pfn;
3304 }
3305
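     /*
      * Map a physically contiguous buffer for DMA: allocate an IOVA window
      * below dma_mask, install the page-table entries with the permissions
      * implied by the DMA direction and return the resulting bus address.
      */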
3306 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3307                                      size_t size, int dir, u64 dma_mask)
3308 {
3309         struct dmar_domain *domain;
3310         phys_addr_t start_paddr;
3311         unsigned long iova_pfn;
3312         int prot = 0;
3313         int ret;
3314         struct intel_iommu *iommu;
3315         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3316
3317         BUG_ON(dir == DMA_NONE);
3318
3319         if (unlikely(attach_deferred(dev)))
3320                 do_deferred_attach(dev);
3321
3322         domain = find_domain(dev);
3323         if (!domain)
3324                 return DMA_MAPPING_ERROR;
3325
3326         iommu = domain_get_iommu(domain);
3327         size = aligned_nrpages(paddr, size);
3328
3329         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3330         if (!iova_pfn)
3331                 goto error;
3332
3333         /*
3334          * Check if DMAR supports zero-length reads on write-only
3335          * mappings.
3336          */
3337         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3338                         !cap_zlr(iommu->cap))
3339                 prot |= DMA_PTE_READ;
3340         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3341                 prot |= DMA_PTE_WRITE;
3342         /*
3343          * The range paddr .. paddr + size might only cover part of a
3344          * page, so map the whole page.  Note: if two parts of one page
3345          * are mapped separately, we might end up with two IOVAs mapping
3346          * to the same host paddr, but this is not a big problem.
3347          */
3348         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3349                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3350         if (ret)
3351                 goto error;
3352
3353         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3354         start_paddr += paddr & ~PAGE_MASK;
3355
3356         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3357
3358         return start_paddr;
3359
3360 error:
3361         if (iova_pfn)
3362                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3363         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3364                 size, (unsigned long long)paddr, dir);
3365         return DMA_MAPPING_ERROR;
3366 }
3367
3368 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3369                                  unsigned long offset, size_t size,
3370                                  enum dma_data_direction dir,
3371                                  unsigned long attrs)
3372 {
3373         return __intel_map_single(dev, page_to_phys(page) + offset,
3374                                   size, dir, *dev->dma_mask);
3375 }
3376
3377 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3378                                      size_t size, enum dma_data_direction dir,
3379                                      unsigned long attrs)
3380 {
3381         return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3382 }
3383
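     /*
      * Tear down a DMA mapping: clear the page-table entries and either
      * flush the IOTLB synchronously (strict mode, untrusted devices, or no
      * flush queue) or defer the flush by queueing the IOVA for later.
      */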
3384 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3385 {
3386         struct dmar_domain *domain;
3387         unsigned long start_pfn, last_pfn;
3388         unsigned long nrpages;
3389         unsigned long iova_pfn;
3390         struct intel_iommu *iommu;
3391         struct page *freelist;
3392         struct pci_dev *pdev = NULL;
3393
3394         domain = find_domain(dev);
3395         BUG_ON(!domain);
3396
3397         iommu = domain_get_iommu(domain);
3398
3399         iova_pfn = IOVA_PFN(dev_addr);
3400
3401         nrpages = aligned_nrpages(dev_addr, size);
3402         start_pfn = mm_to_dma_pfn(iova_pfn);
3403         last_pfn = start_pfn + nrpages - 1;
3404
3405         if (dev_is_pci(dev))
3406                 pdev = to_pci_dev(dev);
3407
3408         freelist = domain_unmap(domain, start_pfn, last_pfn);
3409         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3410                         !has_iova_flush_queue(&domain->iovad)) {
3411                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3412                                       nrpages, !freelist, 0);
3413                 /* free iova */
3414                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3415                 dma_free_pagelist(freelist);
3416         } else {
3417                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3418                            (unsigned long)freelist);
3419                 /*
3420                  * Queue up the release of the unmap to save roughly 1/6th
3421                  * of the CPU time used up by the IOTLB flush operation.
3422                  */
3423         }
3424
3425         trace_unmap_single(dev, dev_addr, size);
3426 }
3427
3428 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3429                              size_t size, enum dma_data_direction dir,
3430                              unsigned long attrs)
3431 {
3432         intel_unmap(dev, dev_addr, size);
3433 }
3434
3435 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3436                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3437 {
3438         intel_unmap(dev, dev_addr, size);
3439 }
3440
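     /*
      * dma_alloc_coherent() backend: allocate zeroed pages (from CMA when
      * blocking is allowed) and map them bidirectionally via
      * __intel_map_single().
      */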
3441 static void *intel_alloc_coherent(struct device *dev, size_t size,
3442                                   dma_addr_t *dma_handle, gfp_t flags,
3443                                   unsigned long attrs)
3444 {
3445         struct page *page = NULL;
3446         int order;
3447
3448         if (unlikely(attach_deferred(dev)))
3449                 do_deferred_attach(dev);
3450
3451         size = PAGE_ALIGN(size);
3452         order = get_order(size);
3453
3454         if (gfpflags_allow_blocking(flags)) {
3455                 unsigned int count = size >> PAGE_SHIFT;
3456
3457                 page = dma_alloc_from_contiguous(dev, count, order,
3458                                                  flags & __GFP_NOWARN);
3459         }
3460
3461         if (!page)
3462                 page = alloc_pages(flags, order);
3463         if (!page)
3464                 return NULL;
3465         memset(page_address(page), 0, size);
3466
3467         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3468                                          DMA_BIDIRECTIONAL,
3469                                          dev->coherent_dma_mask);
3470         if (*dma_handle != DMA_MAPPING_ERROR)
3471                 return page_address(page);
3472         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3473                 __free_pages(page, order);
3474
3475         return NULL;
3476 }
3477
3478 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3479                                 dma_addr_t dma_handle, unsigned long attrs)
3480 {
3481         int order;
3482         struct page *page = virt_to_page(vaddr);
3483
3484         size = PAGE_ALIGN(size);
3485         order = get_order(size);
3486
3487         intel_unmap(dev, dma_handle, size);
3488         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3489                 __free_pages(page, order);
3490 }
3491
3492 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3493                            int nelems, enum dma_data_direction dir,
3494                            unsigned long attrs)
3495 {
3496         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3497         unsigned long nrpages = 0;
3498         struct scatterlist *sg;
3499         int i;
3500
3501         for_each_sg(sglist, sg, nelems, i) {
3502                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3503         }
3504
3505         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3506
3507         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3508 }
3509
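     /*
      * Map a scatterlist: reserve a single IOVA window large enough for all
      * segments, then let domain_sg_mapping() populate the page tables
      * segment by segment.
      */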
3510 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3511                         enum dma_data_direction dir, unsigned long attrs)
3512 {
3513         int i;
3514         struct dmar_domain *domain;
3515         size_t size = 0;
3516         int prot = 0;
3517         unsigned long iova_pfn;
3518         int ret;
3519         struct scatterlist *sg;
3520         unsigned long start_vpfn;
3521         struct intel_iommu *iommu;
3522
3523         BUG_ON(dir == DMA_NONE);
3524
3525         if (unlikely(attach_deferred(dev)))
3526                 do_deferred_attach(dev);
3527
3528         domain = find_domain(dev);
3529         if (!domain)
3530                 return 0;
3531
3532         iommu = domain_get_iommu(domain);
3533
3534         for_each_sg(sglist, sg, nelems, i)
3535                 size += aligned_nrpages(sg->offset, sg->length);
3536
3537         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3538                                 *dev->dma_mask);
3539         if (!iova_pfn) {
3540                 sglist->dma_length = 0;
3541                 return 0;
3542         }
3543
3544         /*
3545          * Check if DMAR supports zero-length reads on write-only
3546          * mappings.
3547          */
3548         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3549                         !cap_zlr(iommu->cap))
3550                 prot |= DMA_PTE_READ;
3551         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3552                 prot |= DMA_PTE_WRITE;
3553
3554         start_vpfn = mm_to_dma_pfn(iova_pfn);
3555
3556         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3557         if (unlikely(ret)) {
3558                 dma_pte_free_pagetable(domain, start_vpfn,
3559                                        start_vpfn + size - 1,
3560                                        agaw_to_level(domain->agaw) + 1);
3561                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3562                 return 0;
3563         }
3564
3565         for_each_sg(sglist, sg, nelems, i)
3566                 trace_map_sg(dev, i + 1, nelems, sg);
3567
3568         return nelems;
3569 }
3570
3571 static u64 intel_get_required_mask(struct device *dev)
3572 {
3573         return DMA_BIT_MASK(32);
3574 }
3575
3576 static const struct dma_map_ops intel_dma_ops = {
3577         .alloc = intel_alloc_coherent,
3578         .free = intel_free_coherent,
3579         .map_sg = intel_map_sg,
3580         .unmap_sg = intel_unmap_sg,
3581         .map_page = intel_map_page,
3582         .unmap_page = intel_unmap_page,
3583         .map_resource = intel_map_resource,
3584         .unmap_resource = intel_unmap_resource,
3585         .dma_supported = dma_direct_supported,
3586         .mmap = dma_common_mmap,
3587         .get_sgtable = dma_common_get_sgtable,
3588         .get_required_mask = intel_get_required_mask,
3589 };
3590
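     /*
      * The bounce_* DMA ops below bounce buffers that are not aligned to the
      * IOMMU page size through swiotlb, so that a device (typically an
      * untrusted one) can never reach unrelated data sharing the same page.
      */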
3591 static void
3592 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3593                    enum dma_data_direction dir, enum dma_sync_target target)
3594 {
3595         struct dmar_domain *domain;
3596         phys_addr_t tlb_addr;
3597
3598         domain = find_domain(dev);
3599         if (WARN_ON(!domain))
3600                 return;
3601
3602         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3603         if (is_swiotlb_buffer(tlb_addr))
3604                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3605 }
3606
3607 static dma_addr_t
3608 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3609                   enum dma_data_direction dir, unsigned long attrs,
3610                   u64 dma_mask)
3611 {
3612         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3613         struct dmar_domain *domain;
3614         struct intel_iommu *iommu;
3615         unsigned long iova_pfn;
3616         unsigned long nrpages;
3617         phys_addr_t tlb_addr;
3618         int prot = 0;
3619         int ret;
3620
3621         if (unlikely(attach_deferred(dev)))
3622                 do_deferred_attach(dev);
3623
3624         domain = find_domain(dev);
3625
3626         if (WARN_ON(dir == DMA_NONE || !domain))
3627                 return DMA_MAPPING_ERROR;
3628
3629         iommu = domain_get_iommu(domain);
3630         if (WARN_ON(!iommu))
3631                 return DMA_MAPPING_ERROR;
3632
3633         nrpages = aligned_nrpages(0, size);
3634         iova_pfn = intel_alloc_iova(dev, domain,
3635                                     dma_to_mm_pfn(nrpages), dma_mask);
3636         if (!iova_pfn)
3637                 return DMA_MAPPING_ERROR;
3638
3639         /*
3640          * Check if DMAR supports zero-length reads on write-only
3641          * mappings.
3642          */
3643         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3644                         !cap_zlr(iommu->cap))
3645                 prot |= DMA_PTE_READ;
3646         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3647                 prot |= DMA_PTE_WRITE;
3648
3649         /*
3650          * If both the physical buffer start address and size are
3651          * page aligned, we don't need to use a bounce page.
3652          */
3653         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3654                 tlb_addr = swiotlb_tbl_map_single(dev,
3655                                 __phys_to_dma(dev, io_tlb_start),
3656                                 paddr, size, aligned_size, dir, attrs);
3657                 if (tlb_addr == DMA_MAPPING_ERROR) {
3658                         goto swiotlb_error;
3659                 } else {
3660                         /* Clean up the padding area. */
3661                         void *padding_start = phys_to_virt(tlb_addr);
3662                         size_t padding_size = aligned_size;
3663
3664                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3665                             (dir == DMA_TO_DEVICE ||
3666                              dir == DMA_BIDIRECTIONAL)) {
3667                                 padding_start += size;
3668                                 padding_size -= size;
3669                         }
3670
3671                         memset(padding_start, 0, padding_size);
3672                 }
3673         } else {
3674                 tlb_addr = paddr;
3675         }
3676
3677         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3678                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3679         if (ret)
3680                 goto mapping_error;
3681
3682         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3683
3684         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3685
3686 mapping_error:
3687         if (is_swiotlb_buffer(tlb_addr))
3688                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3689                                          aligned_size, dir, attrs);
3690 swiotlb_error:
3691         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3692         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3693                 size, (unsigned long long)paddr, dir);
3694
3695         return DMA_MAPPING_ERROR;
3696 }
3697
3698 static void
3699 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3700                     enum dma_data_direction dir, unsigned long attrs)
3701 {
3702         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3703         struct dmar_domain *domain;
3704         phys_addr_t tlb_addr;
3705
3706         domain = find_domain(dev);
3707         if (WARN_ON(!domain))
3708                 return;
3709
3710         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3711         if (WARN_ON(!tlb_addr))
3712                 return;
3713
3714         intel_unmap(dev, dev_addr, size);
3715         if (is_swiotlb_buffer(tlb_addr))
3716                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3717                                          aligned_size, dir, attrs);
3718
3719         trace_bounce_unmap_single(dev, dev_addr, size);
3720 }
3721
3722 static dma_addr_t
3723 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3724                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3725 {
3726         return bounce_map_single(dev, page_to_phys(page) + offset,
3727                                  size, dir, attrs, *dev->dma_mask);
3728 }
3729
3730 static dma_addr_t
3731 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3732                     enum dma_data_direction dir, unsigned long attrs)
3733 {
3734         return bounce_map_single(dev, phys_addr, size,
3735                                  dir, attrs, *dev->dma_mask);
3736 }
3737
3738 static void
3739 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3740                   enum dma_data_direction dir, unsigned long attrs)
3741 {
3742         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3743 }
3744
3745 static void
3746 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3747                       enum dma_data_direction dir, unsigned long attrs)
3748 {
3749         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3750 }
3751
3752 static void
3753 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3754                 enum dma_data_direction dir, unsigned long attrs)
3755 {
3756         struct scatterlist *sg;
3757         int i;
3758
3759         for_each_sg(sglist, sg, nelems, i)
3760                 bounce_unmap_page(dev, sg->dma_address,
3761                                   sg_dma_len(sg), dir, attrs);
3762 }
3763
3764 static int
3765 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3766               enum dma_data_direction dir, unsigned long attrs)
3767 {
3768         int i;
3769         struct scatterlist *sg;
3770
3771         for_each_sg(sglist, sg, nelems, i) {
3772                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3773                                                   sg->offset, sg->length,
3774                                                   dir, attrs);
3775                 if (sg->dma_address == DMA_MAPPING_ERROR)
3776                         goto out_unmap;
3777                 sg_dma_len(sg) = sg->length;
3778         }
3779
3780         for_each_sg(sglist, sg, nelems, i)
3781                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3782
3783         return nelems;
3784
3785 out_unmap:
3786         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3787         return 0;
3788 }
3789
3790 static void
3791 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3792                            size_t size, enum dma_data_direction dir)
3793 {
3794         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3795 }
3796
3797 static void
3798 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3799                               size_t size, enum dma_data_direction dir)
3800 {
3801         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3802 }
3803
3804 static void
3805 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3806                        int nelems, enum dma_data_direction dir)
3807 {
3808         struct scatterlist *sg;
3809         int i;
3810
3811         for_each_sg(sglist, sg, nelems, i)
3812                 bounce_sync_single(dev, sg_dma_address(sg),
3813                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
3814 }
3815
3816 static void
3817 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3818                           int nelems, enum dma_data_direction dir)
3819 {
3820         struct scatterlist *sg;
3821         int i;
3822
3823         for_each_sg(sglist, sg, nelems, i)
3824                 bounce_sync_single(dev, sg_dma_address(sg),
3825                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3826 }
3827
3828 static const struct dma_map_ops bounce_dma_ops = {
3829         .alloc                  = intel_alloc_coherent,
3830         .free                   = intel_free_coherent,
3831         .map_sg                 = bounce_map_sg,
3832         .unmap_sg               = bounce_unmap_sg,
3833         .map_page               = bounce_map_page,
3834         .unmap_page             = bounce_unmap_page,
3835         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
3836         .sync_single_for_device = bounce_sync_single_for_device,
3837         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
3838         .sync_sg_for_device     = bounce_sync_sg_for_device,
3839         .map_resource           = bounce_map_resource,
3840         .unmap_resource         = bounce_unmap_resource,
3841         .dma_supported          = dma_direct_supported,
3842 };
3843
3844 static inline int iommu_domain_cache_init(void)
3845 {
3846         int ret = 0;
3847
3848         iommu_domain_cache = kmem_cache_create("iommu_domain",
3849                                          sizeof(struct dmar_domain),
3850                                          0,
3851                                          SLAB_HWCACHE_ALIGN,
3853                                          NULL);
3854         if (!iommu_domain_cache) {
3855                 pr_err("Couldn't create iommu_domain cache\n");
3856                 ret = -ENOMEM;
3857         }
3858
3859         return ret;
3860 }
3861
3862 static inline int iommu_devinfo_cache_init(void)
3863 {
3864         int ret = 0;
3865
3866         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3867                                          sizeof(struct device_domain_info),
3868                                          0,
3869                                          SLAB_HWCACHE_ALIGN,
3870                                          NULL);
3871         if (!iommu_devinfo_cache) {
3872                 pr_err("Couldn't create devinfo cache\n");
3873                 ret = -ENOMEM;
3874         }
3875
3876         return ret;
3877 }
3878
3879 static int __init iommu_init_mempool(void)
3880 {
3881         int ret;
3882         ret = iova_cache_get();
3883         if (ret)
3884                 return ret;
3885
3886         ret = iommu_domain_cache_init();
3887         if (ret)
3888                 goto domain_error;
3889
3890         ret = iommu_devinfo_cache_init();
3891         if (!ret)
3892                 return ret;
3893
3894         kmem_cache_destroy(iommu_domain_cache);
3895 domain_error:
3896         iova_cache_put();
3897
3898         return -ENOMEM;
3899 }
3900
3901 static void __init iommu_exit_mempool(void)
3902 {
3903         kmem_cache_destroy(iommu_devinfo_cache);
3904         kmem_cache_destroy(iommu_domain_cache);
3905         iova_cache_put();
3906 }
3907
3908 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3909 {
3910         struct dmar_drhd_unit *drhd;
3911         u32 vtbar;
3912         int rc;
3913
3914         /* We know that this device on this chipset has its own IOMMU.
3915          * If we find it under a different IOMMU, then the BIOS is lying
3916          * to us. Hope that the IOMMU for this device is actually
3917          * disabled, and it needs no translation...
3918          */
3919         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3920         if (rc) {
3921                 /* "can't" happen */
3922                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3923                 return;
3924         }
3925         vtbar &= 0xffff0000;
3926
3927         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3928         drhd = dmar_find_matched_drhd_unit(pdev);
3929         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
3930                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
3931                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3932                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3933         }
3934 }
3935 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3936
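     /*
      * Mark DRHD units that cover no devices at all, or only graphics
      * devices while gfx mapping is disabled, as ignored so that no IOMMU
      * is set up for them.
      */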
3937 static void __init init_no_remapping_devices(void)
3938 {
3939         struct dmar_drhd_unit *drhd;
3940         struct device *dev;
3941         int i;
3942
3943         for_each_drhd_unit(drhd) {
3944                 if (!drhd->include_all) {
3945                         for_each_active_dev_scope(drhd->devices,
3946                                                   drhd->devices_cnt, i, dev)
3947                                 break;
3948                         /* ignore DMAR unit if no devices exist */
3949                         if (i == drhd->devices_cnt)
3950                                 drhd->ignored = 1;
3951                 }
3952         }
3953
3954         for_each_active_drhd_unit(drhd) {
3955                 if (drhd->include_all)
3956                         continue;
3957
3958                 for_each_active_dev_scope(drhd->devices,
3959                                           drhd->devices_cnt, i, dev)
3960                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3961                                 break;
3962                 if (i < drhd->devices_cnt)
3963                         continue;
3964
3965                 /* This IOMMU has *only* gfx devices. If gfx mapping is
3966                    disabled, bypass the unit entirely. */
3967                 if (!dmar_map_gfx) {
3968                         drhd->ignored = 1;
3969                         for_each_active_dev_scope(drhd->devices,
3970                                                   drhd->devices_cnt, i, dev)
3971                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3972                 }
3973         }
3974 }
3975
3976 #ifdef CONFIG_SUSPEND
3977 static int init_iommu_hw(void)
3978 {
3979         struct dmar_drhd_unit *drhd;
3980         struct intel_iommu *iommu = NULL;
3981
3982         for_each_active_iommu(iommu, drhd)
3983                 if (iommu->qi)
3984                         dmar_reenable_qi(iommu);
3985
3986         for_each_iommu(iommu, drhd) {
3987                 if (drhd->ignored) {
3988                         /*
3989                          * we always have to disable PMRs or DMA may fail on
3990                          * this device
3991                          */
3992                         if (force_on)
3993                                 iommu_disable_protect_mem_regions(iommu);
3994                         continue;
3995                 }
3996
3997                 iommu_flush_write_buffer(iommu);
3998
3999                 iommu_set_root_entry(iommu);
4000
4001                 iommu->flush.flush_context(iommu, 0, 0, 0,
4002                                            DMA_CCMD_GLOBAL_INVL);
4003                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4004                 iommu_enable_translation(iommu);
4005                 iommu_disable_protect_mem_regions(iommu);
4006         }
4007
4008         return 0;
4009 }
4010
4011 static void iommu_flush_all(void)
4012 {
4013         struct dmar_drhd_unit *drhd;
4014         struct intel_iommu *iommu;
4015
4016         for_each_active_iommu(iommu, drhd) {
4017                 iommu->flush.flush_context(iommu, 0, 0, 0,
4018                                            DMA_CCMD_GLOBAL_INVL);
4019                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4020                                          DMA_TLB_GLOBAL_FLUSH);
4021         }
4022 }
4023
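     /*
      * Flush all caches, disable translation and save the fault-reporting
      * registers of every active IOMMU before the system suspends;
      * iommu_resume() restores them and re-enables translation.
      */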
4024 static int iommu_suspend(void)
4025 {
4026         struct dmar_drhd_unit *drhd;
4027         struct intel_iommu *iommu = NULL;
4028         unsigned long flag;
4029
4030         for_each_active_iommu(iommu, drhd) {
4031                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4032                                                  GFP_ATOMIC);
4033                 if (!iommu->iommu_state)
4034                         goto nomem;
4035         }
4036
4037         iommu_flush_all();
4038
4039         for_each_active_iommu(iommu, drhd) {
4040                 iommu_disable_translation(iommu);
4041
4042                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4043
4044                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4045                         readl(iommu->reg + DMAR_FECTL_REG);
4046                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4047                         readl(iommu->reg + DMAR_FEDATA_REG);
4048                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4049                         readl(iommu->reg + DMAR_FEADDR_REG);
4050                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4051                         readl(iommu->reg + DMAR_FEUADDR_REG);
4052
4053                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4054         }
4055         return 0;
4056
4057 nomem:
4058         for_each_active_iommu(iommu, drhd)
4059                 kfree(iommu->iommu_state);
4060
4061         return -ENOMEM;
4062 }
4063
4064 static void iommu_resume(void)
4065 {
4066         struct dmar_drhd_unit *drhd;
4067         struct intel_iommu *iommu = NULL;
4068         unsigned long flag;
4069
4070         if (init_iommu_hw()) {
4071                 if (force_on)
4072                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4073                 else
4074                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4075                 return;
4076         }
4077
4078         for_each_active_iommu(iommu, drhd) {
4079
4080                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4081
4082                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4083                         iommu->reg + DMAR_FECTL_REG);
4084                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4085                         iommu->reg + DMAR_FEDATA_REG);
4086                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4087                         iommu->reg + DMAR_FEADDR_REG);
4088                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4089                         iommu->reg + DMAR_FEUADDR_REG);
4090
4091                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4092         }
4093
4094         for_each_active_iommu(iommu, drhd)
4095                 kfree(iommu->iommu_state);
4096 }
4097
4098 static struct syscore_ops iommu_syscore_ops = {
4099         .resume         = iommu_resume,
4100         .suspend        = iommu_suspend,
4101 };
4102
4103 static void __init init_iommu_pm_ops(void)
4104 {
4105         register_syscore_ops(&iommu_syscore_ops);
4106 }
4107
4108 #else
4109 static inline void init_iommu_pm_ops(void) {}
4110 #endif  /* CONFIG_SUSPEND */
4111
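     /*
      * RMRR regions must be page aligned and non-empty; anything else is a
      * firmware bug and gets reported as such by the caller.
      */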
4112 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4113 {
4114         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4115             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4116             rmrr->end_address <= rmrr->base_address ||
4117             arch_rmrr_sanity_check(rmrr))
4118                 return -EINVAL;
4119
4120         return 0;
4121 }
4122
4123 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4124 {
4125         struct acpi_dmar_reserved_memory *rmrr;
4126         struct dmar_rmrr_unit *rmrru;
4127
4128         rmrr = (struct acpi_dmar_reserved_memory *)header;
4129         if (rmrr_sanity_check(rmrr)) {
4130                 pr_warn(FW_BUG
4131                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4132                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4133                            rmrr->base_address, rmrr->end_address,
4134                            dmi_get_system_info(DMI_BIOS_VENDOR),
4135                            dmi_get_system_info(DMI_BIOS_VERSION),
4136                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4137                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4138         }
4139
4140         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4141         if (!rmrru)
4142                 goto out;
4143
4144         rmrru->hdr = header;
4145
4146         rmrru->base_address = rmrr->base_address;
4147         rmrru->end_address = rmrr->end_address;
4148
4149         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4150                                 ((void *)rmrr) + rmrr->header.length,
4151                                 &rmrru->devices_cnt);
4152         if (rmrru->devices_cnt && rmrru->devices == NULL)
4153                 goto free_rmrru;
4154
4155         list_add(&rmrru->list, &dmar_rmrr_units);
4156
4157         return 0;
4158 free_rmrru:
4159         kfree(rmrru);
4160 out:
4161         return -ENOMEM;
4162 }
4163
4164 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4165 {
4166         struct dmar_atsr_unit *atsru;
4167         struct acpi_dmar_atsr *tmp;
4168
4169         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4170                                 dmar_rcu_check()) {
4171                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4172                 if (atsr->segment != tmp->segment)
4173                         continue;
4174                 if (atsr->header.length != tmp->header.length)
4175                         continue;
4176                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4177                         return atsru;
4178         }
4179
4180         return NULL;
4181 }
4182
4183 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4184 {
4185         struct acpi_dmar_atsr *atsr;
4186         struct dmar_atsr_unit *atsru;
4187
4188         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4189                 return 0;
4190
4191         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4192         atsru = dmar_find_atsr(atsr);
4193         if (atsru)
4194                 return 0;
4195
4196         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4197         if (!atsru)
4198                 return -ENOMEM;
4199
4200         /*
4201          * If memory is allocated from slab by ACPI _DSM method, we need to
4202          * copy the memory content because the memory buffer will be freed
4203          * on return.
4204          */
4205         atsru->hdr = (void *)(atsru + 1);
4206         memcpy(atsru->hdr, hdr, hdr->length);
4207         atsru->include_all = atsr->flags & 0x1;
4208         if (!atsru->include_all) {
4209                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4210                                 (void *)atsr + atsr->header.length,
4211                                 &atsru->devices_cnt);
4212                 if (atsru->devices_cnt && atsru->devices == NULL) {
4213                         kfree(atsru);
4214                         return -ENOMEM;
4215                 }
4216         }
4217
4218         list_add_rcu(&atsru->list, &dmar_atsr_units);
4219
4220         return 0;
4221 }
4222
4223 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4224 {
4225         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4226         kfree(atsru);
4227 }
4228
4229 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4230 {
4231         struct acpi_dmar_atsr *atsr;
4232         struct dmar_atsr_unit *atsru;
4233
4234         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4235         atsru = dmar_find_atsr(atsr);
4236         if (atsru) {
4237                 list_del_rcu(&atsru->list);
4238                 synchronize_rcu();
4239                 intel_iommu_free_atsr(atsru);
4240         }
4241
4242         return 0;
4243 }
4244
4245 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4246 {
4247         int i;
4248         struct device *dev;
4249         struct acpi_dmar_atsr *atsr;
4250         struct dmar_atsr_unit *atsru;
4251
4252         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4253         atsru = dmar_find_atsr(atsr);
4254         if (!atsru)
4255                 return 0;
4256
4257         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4258                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4259                                           i, dev)
4260                         return -EBUSY;
4261         }
4262
4263         return 0;
4264 }
4265
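     /*
      * Bring up a hot-added DMAR unit: set up its domain bookkeeping and
      * root entry, then enable queued invalidation, the fault interrupt and
      * finally translation, unless the unit is ignored.
      */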
4266 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4267 {
4268         int sp, ret;
4269         struct intel_iommu *iommu = dmaru->iommu;
4270
4271         if (g_iommus[iommu->seq_id])
4272                 return 0;
4273
4274         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4275                 pr_warn("%s: Doesn't support hardware pass through.\n",
4276                         iommu->name);
4277                 return -ENXIO;
4278         }
4279         if (!ecap_sc_support(iommu->ecap) &&
4280             domain_update_iommu_snooping(iommu)) {
4281                 pr_warn("%s: Doesn't support snooping.\n",
4282                         iommu->name);
4283                 return -ENXIO;
4284         }
4285         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4286         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4287                 pr_warn("%s: Doesn't support large page.\n",
4288                         iommu->name);
4289                 return -ENXIO;
4290         }
4291
4292         /*
4293          * Disable translation if already enabled prior to OS handover.
4294          */
4295         if (iommu->gcmd & DMA_GCMD_TE)
4296                 iommu_disable_translation(iommu);
4297
4298         g_iommus[iommu->seq_id] = iommu;
4299         ret = iommu_init_domains(iommu);
4300         if (ret == 0)
4301                 ret = iommu_alloc_root_entry(iommu);
4302         if (ret)
4303                 goto out;
4304
4305         intel_svm_check(iommu);
4306
4307         if (dmaru->ignored) {
4308                 /*
4309                  * we always have to disable PMRs or DMA may fail on this device
4310                  */
4311                 if (force_on)
4312                         iommu_disable_protect_mem_regions(iommu);
4313                 return 0;
4314         }
4315
4316         intel_iommu_init_qi(iommu);
4317         iommu_flush_write_buffer(iommu);
4318
4319 #ifdef CONFIG_INTEL_IOMMU_SVM
4320         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4321                 ret = intel_svm_enable_prq(iommu);
4322                 if (ret)
4323                         goto disable_iommu;
4324         }
4325 #endif
4326         ret = dmar_set_interrupt(iommu);
4327         if (ret)
4328                 goto disable_iommu;
4329
4330         iommu_set_root_entry(iommu);
4331         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4332         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4333         iommu_enable_translation(iommu);
4334
4335         iommu_disable_protect_mem_regions(iommu);
4336         return 0;
4337
4338 disable_iommu:
4339         disable_dmar_iommu(iommu);
4340 out:
4341         free_dmar_iommu(iommu);
4342         return ret;
4343 }
4344
4345 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4346 {
4347         int ret = 0;
4348         struct intel_iommu *iommu = dmaru->iommu;
4349
4350         if (!intel_iommu_enabled)
4351                 return 0;
4352         if (iommu == NULL)
4353                 return -EINVAL;
4354
4355         if (insert) {
4356                 ret = intel_iommu_add(dmaru);
4357         } else {
4358                 disable_dmar_iommu(iommu);
4359                 free_dmar_iommu(iommu);
4360         }
4361
4362         return ret;
4363 }
4364
4365 static void intel_iommu_free_dmars(void)
4366 {
4367         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4368         struct dmar_atsr_unit *atsru, *atsr_n;
4369
4370         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4371                 list_del(&rmrru->list);
4372                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4373                 kfree(rmrru);
4374         }
4375
4376         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4377                 list_del(&atsru->list);
4378                 intel_iommu_free_atsr(atsru);
4379         }
4380 }
4381
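     /*
      * Decide whether ATS may be used for @dev by walking up to its PCIe
      * root port and matching that port against the ATSR structures from
      * the DMAR table.
      */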
4382 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4383 {
4384         int i, ret = 1;
4385         struct pci_bus *bus;
4386         struct pci_dev *bridge = NULL;
4387         struct device *tmp;
4388         struct acpi_dmar_atsr *atsr;
4389         struct dmar_atsr_unit *atsru;
4390
4391         dev = pci_physfn(dev);
4392         for (bus = dev->bus; bus; bus = bus->parent) {
4393                 bridge = bus->self;
4394                 /* If it's an integrated device, allow ATS */
4395                 if (!bridge)
4396                         return 1;
4397                 /* Connected via non-PCIe: no ATS */
4398                 if (!pci_is_pcie(bridge) ||
4399                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4400                         return 0;
4401                 /* If we found the root port, look it up in the ATSR */
4402                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4403                         break;
4404         }
4405
4406         rcu_read_lock();
4407         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4408                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4409                 if (atsr->segment != pci_domain_nr(dev->bus))
4410                         continue;
4411
4412                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4413                         if (tmp == &bridge->dev)
4414                                 goto out;
4415
4416                 if (atsru->include_all)
4417                         goto out;
4418         }
4419         ret = 0;
4420 out:
4421         rcu_read_unlock();
4422
4423         return ret;
4424 }
4425
4426 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4427 {
4428         int ret;
4429         struct dmar_rmrr_unit *rmrru;
4430         struct dmar_atsr_unit *atsru;
4431         struct acpi_dmar_atsr *atsr;
4432         struct acpi_dmar_reserved_memory *rmrr;
4433
4434         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4435                 return 0;
4436
4437         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4438                 rmrr = container_of(rmrru->hdr,
4439                                     struct acpi_dmar_reserved_memory, header);
4440                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4441                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4442                                 ((void *)rmrr) + rmrr->header.length,
4443                                 rmrr->segment, rmrru->devices,
4444                                 rmrru->devices_cnt);
4445                         if (ret < 0)
4446                                 return ret;
4447                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4448                         dmar_remove_dev_scope(info, rmrr->segment,
4449                                 rmrru->devices, rmrru->devices_cnt);
4450                 }
4451         }
4452
4453         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4454                 if (atsru->include_all)
4455                         continue;
4456
4457                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4458                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4459                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4460                                         (void *)atsr + atsr->header.length,
4461                                         atsr->segment, atsru->devices,
4462                                         atsru->devices_cnt);
4463                         if (ret > 0)
4464                                 break;
4465                         else if (ret < 0)
4466                                 return ret;
4467                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4468                         if (dmar_remove_dev_scope(info, atsr->segment,
4469                                         atsru->devices, atsru->devices_cnt))
4470                                 break;
4471                 }
4472         }
4473
4474         return 0;
4475 }
4476
4477 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4478                                        unsigned long val, void *v)
4479 {
4480         struct memory_notify *mhp = v;
4481         unsigned long long start, end;
4482         unsigned long start_vpfn, last_vpfn;
4483
4484         switch (val) {
4485         case MEM_GOING_ONLINE:
4486                 start = mhp->start_pfn << PAGE_SHIFT;
4487                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4488                 if (iommu_domain_identity_map(si_domain, start, end)) {
4489                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4490                                 start, end);
4491                         return NOTIFY_BAD;
4492                 }
4493                 break;
4494
4495         case MEM_OFFLINE:
4496         case MEM_CANCEL_ONLINE:
4497                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4498                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4499                 while (start_vpfn <= last_vpfn) {
4500                         struct iova *iova;
4501                         struct dmar_drhd_unit *drhd;
4502                         struct intel_iommu *iommu;
4503                         struct page *freelist;
4504
4505                         iova = find_iova(&si_domain->iovad, start_vpfn);
4506                         if (iova == NULL) {
4507                                 pr_debug("Failed get IOVA for PFN %lx\n",
4508                                          start_vpfn);
4509                                 break;
4510                         }
4511
4512                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4513                                                      start_vpfn, last_vpfn);
4514                         if (iova == NULL) {
4515                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4516                                         start_vpfn, last_vpfn);
4517                                 return NOTIFY_BAD;
4518                         }
4519
4520                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4521                                                iova->pfn_hi);
4522
4523                         rcu_read_lock();
4524                         for_each_active_iommu(iommu, drhd)
4525                                 iommu_flush_iotlb_psi(iommu, si_domain,
4526                                         iova->pfn_lo, iova_size(iova),
4527                                         !freelist, 0);
4528                         rcu_read_unlock();
4529                         dma_free_pagelist(freelist);
4530
4531                         start_vpfn = iova->pfn_hi + 1;
4532                         free_iova_mem(iova);
4533                 }
4534                 break;
4535         }
4536
4537         return NOTIFY_OK;
4538 }
4539
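/*
 * Registered from intel_iommu_init() only when the static identity
 * domain is in use (si_domain && !hw_pass_through), so that memory
 * hot-add and offline events keep the identity map in sync.
 */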
4540 static struct notifier_block intel_iommu_memory_nb = {
4541         .notifier_call = intel_iommu_memory_notifier,
4542         .priority = 0
4543 };
4544
4545 static void free_all_cpu_cached_iovas(unsigned int cpu)
4546 {
4547         int i;
4548
4549         for (i = 0; i < g_num_of_iommus; i++) {
4550                 struct intel_iommu *iommu = g_iommus[i];
4551                 struct dmar_domain *domain;
4552                 int did;
4553
4554                 if (!iommu)
4555                         continue;
4556
4557                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4558                         domain = get_iommu_domain(iommu, (u16)did);
4559
4560                         if (!domain)
4561                                 continue;
4562                         free_cpu_cached_iovas(cpu, &domain->iovad);
4563                 }
4564         }
4565 }
4566
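/*
 * CPU hotplug callback, wired up via cpuhp_setup_state() in
 * intel_iommu_init(): when a CPU goes away, the IOVAs cached on that
 * CPU are released instead of being stranded.
 */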
4567 static int intel_iommu_cpu_dead(unsigned int cpu)
4568 {
4569         free_all_cpu_cached_iovas(cpu);
4570         return 0;
4571 }
4572
4573 static void intel_disable_iommus(void)
4574 {
4575         struct intel_iommu *iommu = NULL;
4576         struct dmar_drhd_unit *drhd;
4577
4578         for_each_iommu(iommu, drhd)
4579                 iommu_disable_translation(iommu);
4580 }
4581
4582 void intel_iommu_shutdown(void)
4583 {
4584         struct dmar_drhd_unit *drhd;
4585         struct intel_iommu *iommu = NULL;
4586
4587         if (no_iommu || dmar_disabled)
4588                 return;
4589
4590         down_write(&dmar_global_lock);
4591
4592         /* Disable PMRs explicitly here. */
4593         for_each_iommu(iommu, drhd)
4594                 iommu_disable_protect_mem_regions(iommu);
4595
4596         /* Make sure the IOMMUs are switched off */
4597         intel_disable_iommus();
4598
4599         up_write(&dmar_global_lock);
4600 }
4601
4602 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4603 {
4604         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4605
4606         return container_of(iommu_dev, struct intel_iommu, iommu);
4607 }
4608
4609 static ssize_t intel_iommu_show_version(struct device *dev,
4610                                         struct device_attribute *attr,
4611                                         char *buf)
4612 {
4613         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4614         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4615         return sprintf(buf, "%d:%d\n",
4616                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4617 }
4618 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4619
4620 static ssize_t intel_iommu_show_address(struct device *dev,
4621                                         struct device_attribute *attr,
4622                                         char *buf)
4623 {
4624         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4625         return sprintf(buf, "%llx\n", iommu->reg_phys);
4626 }
4627 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4628
4629 static ssize_t intel_iommu_show_cap(struct device *dev,
4630                                     struct device_attribute *attr,
4631                                     char *buf)
4632 {
4633         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4634         return sprintf(buf, "%llx\n", iommu->cap);
4635 }
4636 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4637
4638 static ssize_t intel_iommu_show_ecap(struct device *dev,
4639                                     struct device_attribute *attr,
4640                                     char *buf)
4641 {
4642         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4643         return sprintf(buf, "%llx\n", iommu->ecap);
4644 }
4645 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4646
4647 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4648                                       struct device_attribute *attr,
4649                                       char *buf)
4650 {
4651         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4652         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4653 }
4654 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4655
4656 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4657                                            struct device_attribute *attr,
4658                                            char *buf)
4659 {
4660         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4661         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4662                                                   cap_ndoms(iommu->cap)));
4663 }
4664 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4665
4666 static struct attribute *intel_iommu_attrs[] = {
4667         &dev_attr_version.attr,
4668         &dev_attr_address.attr,
4669         &dev_attr_cap.attr,
4670         &dev_attr_ecap.attr,
4671         &dev_attr_domains_supported.attr,
4672         &dev_attr_domains_used.attr,
4673         NULL,
4674 };
4675
4676 static struct attribute_group intel_iommu_group = {
4677         .name = "intel-iommu",
4678         .attrs = intel_iommu_attrs,
4679 };
4680
4681 const struct attribute_group *intel_iommu_groups[] = {
4682         &intel_iommu_group,
4683         NULL,
4684 };
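
/*
 * These attributes show up under the iommu class directory of each
 * remapping unit, e.g. (assuming the first unit is named "dmar0"):
 *
 *	/sys/class/iommu/dmar0/intel-iommu/version
 *	/sys/class/iommu/dmar0/intel-iommu/cap
 *	/sys/class/iommu/dmar0/intel-iommu/domains_used
 *
 * They are registered via iommu_device_sysfs_add() in intel_iommu_init().
 */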
4685
4686 static inline bool has_untrusted_dev(void)
4687 {
4688         struct pci_dev *pdev = NULL;
4689
4690         for_each_pci_dev(pdev)
4691                 if (pdev->untrusted)
4692                         return true;
4693
4694         return false;
4695 }
4696
4697 static int __init platform_optin_force_iommu(void)
4698 {
4699         if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4700                 return 0;
4701
4702         if (no_iommu || dmar_disabled)
4703                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4704
4705         /*
4706          * If Intel-IOMMU is disabled by default, we will apply identity
4707          * map for all devices except those marked as being untrusted.
4708          */
4709         if (dmar_disabled)
4710                 iommu_set_default_passthrough(false);
4711
4712         dmar_disabled = 0;
4713         no_iommu = 0;
4714
4715         return 1;
4716 }
4717
4718 static int __init probe_acpi_namespace_devices(void)
4719 {
4720         struct dmar_drhd_unit *drhd;
4721         /* To avoid a -Wunused-but-set-variable warning. */
4722         struct intel_iommu *iommu __maybe_unused;
4723         struct device *dev;
4724         int i, ret = 0;
4725
4726         for_each_active_iommu(iommu, drhd) {
4727                 for_each_active_dev_scope(drhd->devices,
4728                                           drhd->devices_cnt, i, dev) {
4729                         struct acpi_device_physical_node *pn;
4730                         struct iommu_group *group;
4731                         struct acpi_device *adev;
4732
4733                         if (dev->bus != &acpi_bus_type)
4734                                 continue;
4735
4736                         adev = to_acpi_device(dev);
4737                         mutex_lock(&adev->physical_node_lock);
4738                         list_for_each_entry(pn,
4739                                             &adev->physical_node_list, node) {
4740                                 group = iommu_group_get(pn->dev);
4741                                 if (group) {
4742                                         iommu_group_put(group);
4743                                         continue;
4744                                 }
4745
4746                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4747                                 ret = iommu_probe_device(pn->dev);
4748                                 if (ret)
4749                                         break;
4750                         }
4751                         mutex_unlock(&adev->physical_node_lock);
4752
4753                         if (ret)
4754                                 return ret;
4755                 }
4756         }
4757
4758         return 0;
4759 }
4760
4761 int __init intel_iommu_init(void)
4762 {
4763         int ret = -ENODEV;
4764         struct dmar_drhd_unit *drhd;
4765         struct intel_iommu *iommu;
4766
4767         /*
4768          * Intel IOMMU is required for a TXT/tboot launch or platform
4769          * opt in, so enforce that.
4770          */
4771         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4772
4773         if (iommu_init_mempool()) {
4774                 if (force_on)
4775                         panic("tboot: Failed to initialize iommu memory\n");
4776                 return -ENOMEM;
4777         }
4778
4779         down_write(&dmar_global_lock);
4780         if (dmar_table_init()) {
4781                 if (force_on)
4782                         panic("tboot: Failed to initialize DMAR table\n");
4783                 goto out_free_dmar;
4784         }
4785
4786         if (dmar_dev_scope_init() < 0) {
4787                 if (force_on)
4788                         panic("tboot: Failed to initialize DMAR device scope\n");
4789                 goto out_free_dmar;
4790         }
4791
4792         up_write(&dmar_global_lock);
4793
4794         /*
4795          * The bus notifier takes the dmar_global_lock, so lockdep will
4796          * complain later when we register it under the lock.
4797          */
4798         dmar_register_bus_notifier();
4799
4800         down_write(&dmar_global_lock);
4801
4802         if (!no_iommu)
4803                 intel_iommu_debugfs_init();
4804
4805         if (no_iommu || dmar_disabled) {
4806                 /*
4807                  * We exit the function here to ensure IOMMU's remapping and
4808                  * mempool aren't set up, which means that the IOMMU's PMRs
4809                  * won't be disabled via the call to init_dmars(). So disable
4810                  * it explicitly here. The PMRs were set up by tboot prior to
4811                  * calling SENTER, but the kernel is expected to reset/tear
4812                  * down the PMRs.
4813                  */
4814                 if (intel_iommu_tboot_noforce) {
4815                         for_each_iommu(iommu, drhd)
4816                                 iommu_disable_protect_mem_regions(iommu);
4817                 }
4818
4819                 /*
4820                  * Make sure the IOMMUs are switched off, even when we
4821                  * boot into a kexec kernel and the previous kernel left
4822                  * them enabled
4823                  */
4824                 intel_disable_iommus();
4825                 goto out_free_dmar;
4826         }
4827
4828         if (list_empty(&dmar_rmrr_units))
4829                 pr_info("No RMRR found\n");
4830
4831         if (list_empty(&dmar_atsr_units))
4832                 pr_info("No ATSR found\n");
4833
4834         if (dmar_init_reserved_ranges()) {
4835                 if (force_on)
4836                         panic("tboot: Failed to reserve iommu ranges\n");
4837                 goto out_free_reserved_range;
4838         }
4839
4840         if (dmar_map_gfx)
4841                 intel_iommu_gfx_mapped = 1;
4842
4843         init_no_remapping_devices();
4844
4845         ret = init_dmars();
4846         if (ret) {
4847                 if (force_on)
4848                         panic("tboot: Failed to initialize DMARs\n");
4849                 pr_err("Initialization failed\n");
4850                 goto out_free_reserved_range;
4851         }
4852         up_write(&dmar_global_lock);
4853
4854         init_iommu_pm_ops();
4855
4856         down_read(&dmar_global_lock);
4857         for_each_active_iommu(iommu, drhd) {
4858                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4859                                        intel_iommu_groups,
4860                                        "%s", iommu->name);
4861                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4862                 iommu_device_register(&iommu->iommu);
4863         }
4864         up_read(&dmar_global_lock);
4865
4866         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4867         if (si_domain && !hw_pass_through)
4868                 register_memory_notifier(&intel_iommu_memory_nb);
4869         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4870                           intel_iommu_cpu_dead);
4871
4872         down_read(&dmar_global_lock);
4873         if (probe_acpi_namespace_devices())
4874                 pr_warn("ACPI name space devices didn't probe correctly\n");
4875
4876         /* Finally, we enable the DMA remapping hardware. */
4877         for_each_iommu(iommu, drhd) {
4878                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4879                         iommu_enable_translation(iommu);
4880
4881                 iommu_disable_protect_mem_regions(iommu);
4882         }
4883         up_read(&dmar_global_lock);
4884
4885         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4886
4887         intel_iommu_enabled = 1;
4888
4889         return 0;
4890
4891 out_free_reserved_range:
4892         put_iova_domain(&reserved_iova_list);
4893 out_free_dmar:
4894         intel_iommu_free_dmars();
4895         up_write(&dmar_global_lock);
4896         iommu_exit_mempool();
4897         return ret;
4898 }
4899
4900 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4901 {
4902         struct intel_iommu *iommu = opaque;
4903
4904         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4905         return 0;
4906 }
4907
4908 /*
4909  * NB - intel-iommu lacks any sort of reference counting for the users of
4910  * dependent devices.  If multiple endpoints have intersecting dependent
4911  * devices, unbinding the driver from any one of them will possibly leave
4912  * the others unable to operate.
4913  */
4914 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4915 {
4916         if (!iommu || !dev || !dev_is_pci(dev))
4917                 return;
4918
4919         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4920 }
4921
4922 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4923 {
4924         struct dmar_domain *domain;
4925         struct intel_iommu *iommu;
4926         unsigned long flags;
4927
4928         assert_spin_locked(&device_domain_lock);
4929
4930         if (WARN_ON(!info))
4931                 return;
4932
4933         iommu = info->iommu;
4934         domain = info->domain;
4935
4936         if (info->dev) {
4937                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4938                         intel_pasid_tear_down_entry(iommu, info->dev,
4939                                         PASID_RID2PASID);
4940
4941                 iommu_disable_dev_iotlb(info);
4942                 domain_context_clear(iommu, info->dev);
4943                 intel_pasid_free_table(info->dev);
4944         }
4945
4946         unlink_domain_info(info);
4947
4948         spin_lock_irqsave(&iommu->lock, flags);
4949         domain_detach_iommu(domain, iommu);
4950         spin_unlock_irqrestore(&iommu->lock, flags);
4951
4952         free_devinfo_mem(info);
4953 }
4954
4955 static void dmar_remove_one_dev_info(struct device *dev)
4956 {
4957         struct device_domain_info *info;
4958         unsigned long flags;
4959
4960         spin_lock_irqsave(&device_domain_lock, flags);
4961         info = dev->archdata.iommu;
4962         if (info && info != DEFER_DEVICE_DOMAIN_INFO
4963             && info != DUMMY_DEVICE_DOMAIN_INFO)
4964                 __dmar_remove_one_dev_info(info);
4965         spin_unlock_irqrestore(&device_domain_lock, flags);
4966 }
4967
4968 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4969 {
4970         int adjust_width;
4971
4972         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4973         domain_reserve_special_ranges(domain);
4974
4975         /* calculate AGAW */
4976         domain->gaw = guest_width;
4977         adjust_width = guestwidth_to_adjustwidth(guest_width);
4978         domain->agaw = width_to_agaw(adjust_width);
4979
4980         domain->iommu_coherency = 0;
4981         domain->iommu_snooping = 0;
4982         domain->iommu_superpage = 0;
4983         domain->max_addr = 0;
4984
4985         /* always allocate the top pgd */
4986         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4987         if (!domain->pgd)
4988                 return -ENOMEM;
4989         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4990         return 0;
4991 }
4992
4993 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4994 {
4995         struct dmar_domain *dmar_domain;
4996         struct iommu_domain *domain;
4997         int ret;
4998
4999         switch (type) {
5000         case IOMMU_DOMAIN_DMA:
5001         /* fallthrough */
5002         case IOMMU_DOMAIN_UNMANAGED:
5003                 dmar_domain = alloc_domain(0);
5004                 if (!dmar_domain) {
5005                         pr_err("Can't allocate dmar_domain\n");
5006                         return NULL;
5007                 }
5008                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5009                         pr_err("Domain initialization failed\n");
5010                         domain_exit(dmar_domain);
5011                         return NULL;
5012                 }
5013
5014                 if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
5015                         ret = init_iova_flush_queue(&dmar_domain->iovad,
5016                                                     iommu_flush_iova,
5017                                                     iova_entry_free);
5018                         if (ret)
5019                                 pr_info("IOVA flush queue initialization failed\n");
5020                 }
5021
5022                 domain_update_iommu_cap(dmar_domain);
5023
5024                 domain = &dmar_domain->domain;
5025                 domain->geometry.aperture_start = 0;
5026                 domain->geometry.aperture_end   =
5027                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5028                 domain->geometry.force_aperture = true;
5029
5030                 return domain;
5031         case IOMMU_DOMAIN_IDENTITY:
5032                 return &si_domain->domain;
5033         default:
5034                 return NULL;
5035         }
5036
5037         return NULL;
5038 }
5039
5040 static void intel_iommu_domain_free(struct iommu_domain *domain)
5041 {
5042         if (domain != &si_domain->domain)
5043                 domain_exit(to_dmar_domain(domain));
5044 }
5045
5046 /*
5047  * Check whether a @domain could be attached to the @dev through the
5048  * aux-domain attach/detach APIs.
5049  */
5050 static inline bool
5051 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5052 {
5053         struct device_domain_info *info = dev->archdata.iommu;
5054
5055         return info && info->auxd_enabled &&
5056                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5057 }
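
/*
 * Rough usage sketch, illustrative only: assuming the aux-domain core
 * wrappers backed by the aux_attach_dev/aux_get_pasid ops below, a
 * mediated-device style driver would do something like
 *
 *	iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
 *	domain = iommu_domain_alloc(dev->bus);	(an UNMANAGED domain)
 *	iommu_aux_attach_device(domain, dev);
 *	pasid = iommu_aux_get_pasid(domain, dev);
 *
 * so that DMA tagged with @pasid is translated by @domain while the
 * device's own default domain stays in place.
 */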
5058
5059 static void auxiliary_link_device(struct dmar_domain *domain,
5060                                   struct device *dev)
5061 {
5062         struct device_domain_info *info = dev->archdata.iommu;
5063
5064         assert_spin_locked(&device_domain_lock);
5065         if (WARN_ON(!info))
5066                 return;
5067
5068         domain->auxd_refcnt++;
5069         list_add(&domain->auxd, &info->auxiliary_domains);
5070 }
5071
5072 static void auxiliary_unlink_device(struct dmar_domain *domain,
5073                                     struct device *dev)
5074 {
5075         struct device_domain_info *info = dev->archdata.iommu;
5076
5077         assert_spin_locked(&device_domain_lock);
5078         if (WARN_ON(!info))
5079                 return;
5080
5081         list_del(&domain->auxd);
5082         domain->auxd_refcnt--;
5083
5084         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5085                 ioasid_free(domain->default_pasid);
5086 }
5087
5088 static int aux_domain_add_dev(struct dmar_domain *domain,
5089                               struct device *dev)
5090 {
5091         int ret;
5092         u8 bus, devfn;
5093         unsigned long flags;
5094         struct intel_iommu *iommu;
5095
5096         iommu = device_to_iommu(dev, &bus, &devfn);
5097         if (!iommu)
5098                 return -ENODEV;
5099
5100         if (domain->default_pasid <= 0) {
5101                 int pasid;
5102
5103                 /* No private data needed for the default pasid */
5104                 pasid = ioasid_alloc(NULL, PASID_MIN,
5105                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5106                                      NULL);
5107                 if (pasid == INVALID_IOASID) {
5108                         pr_err("Can't allocate default pasid\n");
5109                         return -ENODEV;
5110                 }
5111                 domain->default_pasid = pasid;
5112         }
5113
5114         spin_lock_irqsave(&device_domain_lock, flags);
5115         /*
5116          * iommu->lock must be held to attach domain to iommu and set up the
5117          * pasid entry for second level translation.
5118          */
5119         spin_lock(&iommu->lock);
5120         ret = domain_attach_iommu(domain, iommu);
5121         if (ret)
5122                 goto attach_failed;
5123
5124         /* Set up the PASID entry for mediated devices: */
5125         if (domain_use_first_level(domain))
5126                 ret = domain_setup_first_level(iommu, domain, dev,
5127                                                domain->default_pasid);
5128         else
5129                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5130                                                      domain->default_pasid);
5131         if (ret)
5132                 goto table_failed;
5133         spin_unlock(&iommu->lock);
5134
5135         auxiliary_link_device(domain, dev);
5136
5137         spin_unlock_irqrestore(&device_domain_lock, flags);
5138
5139         return 0;
5140
5141 table_failed:
5142         domain_detach_iommu(domain, iommu);
5143 attach_failed:
5144         spin_unlock(&iommu->lock);
5145         spin_unlock_irqrestore(&device_domain_lock, flags);
5146         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5147                 ioasid_free(domain->default_pasid);
5148
5149         return ret;
5150 }
5151
5152 static void aux_domain_remove_dev(struct dmar_domain *domain,
5153                                   struct device *dev)
5154 {
5155         struct device_domain_info *info;
5156         struct intel_iommu *iommu;
5157         unsigned long flags;
5158
5159         if (!is_aux_domain(dev, &domain->domain))
5160                 return;
5161
5162         spin_lock_irqsave(&device_domain_lock, flags);
5163         info = dev->archdata.iommu;
5164         iommu = info->iommu;
5165
5166         auxiliary_unlink_device(domain, dev);
5167
5168         spin_lock(&iommu->lock);
5169         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5170         domain_detach_iommu(domain, iommu);
5171         spin_unlock(&iommu->lock);
5172
5173         spin_unlock_irqrestore(&device_domain_lock, flags);
5174 }
5175
5176 static int prepare_domain_attach_device(struct iommu_domain *domain,
5177                                         struct device *dev)
5178 {
5179         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5180         struct intel_iommu *iommu;
5181         int addr_width;
5182         u8 bus, devfn;
5183
5184         iommu = device_to_iommu(dev, &bus, &devfn);
5185         if (!iommu)
5186                 return -ENODEV;
5187
5188         /* check if this iommu agaw is sufficient for max mapped address */
5189         addr_width = agaw_to_width(iommu->agaw);
5190         if (addr_width > cap_mgaw(iommu->cap))
5191                 addr_width = cap_mgaw(iommu->cap);
5192
5193         if (dmar_domain->max_addr > (1LL << addr_width)) {
5194                 dev_err(dev, "%s: iommu width (%d) is not "
5195                         "sufficient for the mapped address (%llx)\n",
5196                         __func__, addr_width, dmar_domain->max_addr);
5197                 return -EFAULT;
5198         }
5199         dmar_domain->gaw = addr_width;
5200
5201         /*
5202          * Knock out extra levels of page tables if necessary
5203          */
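        /*
         * Example: a domain built with a 4-level (48-bit) table whose
         * max_addr already fits the narrower width checked above can
         * only have populated entry 0 of its top table, so dropping
         * that level and descending into it preserves every mapping.
         */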
5204         while (iommu->agaw < dmar_domain->agaw) {
5205                 struct dma_pte *pte;
5206
5207                 pte = dmar_domain->pgd;
5208                 if (dma_pte_present(pte)) {
5209                         dmar_domain->pgd = (struct dma_pte *)
5210                                 phys_to_virt(dma_pte_addr(pte));
5211                         free_pgtable_page(pte);
5212                 }
5213                 dmar_domain->agaw--;
5214         }
5215
5216         return 0;
5217 }
5218
5219 static int intel_iommu_attach_device(struct iommu_domain *domain,
5220                                      struct device *dev)
5221 {
5222         int ret;
5223
5224         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5225             device_is_rmrr_locked(dev)) {
5226                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5227                 return -EPERM;
5228         }
5229
5230         if (is_aux_domain(dev, domain))
5231                 return -EPERM;
5232
5233         /* normally dev is not mapped */
5234         if (unlikely(domain_context_mapped(dev))) {
5235                 struct dmar_domain *old_domain;
5236
5237                 old_domain = find_domain(dev);
5238                 if (old_domain)
5239                         dmar_remove_one_dev_info(dev);
5240         }
5241
5242         ret = prepare_domain_attach_device(domain, dev);
5243         if (ret)
5244                 return ret;
5245
5246         return domain_add_dev_info(to_dmar_domain(domain), dev);
5247 }
5248
5249 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5250                                          struct device *dev)
5251 {
5252         int ret;
5253
5254         if (!is_aux_domain(dev, domain))
5255                 return -EPERM;
5256
5257         ret = prepare_domain_attach_device(domain, dev);
5258         if (ret)
5259                 return ret;
5260
5261         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5262 }
5263
5264 static void intel_iommu_detach_device(struct iommu_domain *domain,
5265                                       struct device *dev)
5266 {
5267         dmar_remove_one_dev_info(dev);
5268 }
5269
5270 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5271                                           struct device *dev)
5272 {
5273         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5274 }
5275
5276 static int intel_iommu_map(struct iommu_domain *domain,
5277                            unsigned long iova, phys_addr_t hpa,
5278                            size_t size, int iommu_prot, gfp_t gfp)
5279 {
5280         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5281         u64 max_addr;
5282         int prot = 0;
5283         int ret;
5284
5285         if (iommu_prot & IOMMU_READ)
5286                 prot |= DMA_PTE_READ;
5287         if (iommu_prot & IOMMU_WRITE)
5288                 prot |= DMA_PTE_WRITE;
5289         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5290                 prot |= DMA_PTE_SNP;
5291
5292         max_addr = iova + size;
5293         if (dmar_domain->max_addr < max_addr) {
5294                 u64 end;
5295
5296                 /* check if minimum agaw is sufficient for mapped address */
5297                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5298                 if (end < max_addr) {
5299                         pr_err("%s: iommu width (%d) is not "
5300                                "sufficient for the mapped address (%llx)\n",
5301                                __func__, dmar_domain->gaw, max_addr);
5302                         return -EFAULT;
5303                 }
5304                 dmar_domain->max_addr = max_addr;
5305         }
5306         /* Round up size to next multiple of PAGE_SIZE, if it and
5307            the low bits of hpa would take us onto the next page */
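        /*
         * E.g. hpa = 0x1fff with size = 2 straddles a 4KiB boundary,
         * so two VT-d pages are mapped below.
         */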
5308         size = aligned_nrpages(hpa, size);
5309         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5310                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5311         return ret;
5312 }
5313
5314 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5315                                 unsigned long iova, size_t size,
5316                                 struct iommu_iotlb_gather *gather)
5317 {
5318         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5319         struct page *freelist = NULL;
5320         unsigned long start_pfn, last_pfn;
5321         unsigned int npages;
5322         int iommu_id, level = 0;
5323
5324         /* Cope with horrid API which requires us to unmap more than the
5325            size argument if it happens to be a large-page mapping. */
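        /*
         * E.g. a 4KiB unmap request that lands inside a 2MiB superpage
         * is widened to the full 2MiB here, and that larger size is
         * what gets returned to the caller.
         */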
5326         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5327
5328         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5329                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5330
5331         start_pfn = iova >> VTD_PAGE_SHIFT;
5332         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5333
5334         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5335
5336         npages = last_pfn - start_pfn + 1;
5337
5338         for_each_domain_iommu(iommu_id, dmar_domain)
5339                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5340                                       start_pfn, npages, !freelist, 0);
5341
5342         dma_free_pagelist(freelist);
5343
5344         if (dmar_domain->max_addr == iova + size)
5345                 dmar_domain->max_addr = iova;
5346
5347         return size;
5348 }
5349
5350 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5351                                             dma_addr_t iova)
5352 {
5353         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5354         struct dma_pte *pte;
5355         int level = 0;
5356         u64 phys = 0;
5357
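        /*
         * Walk to the PTE covering @iova and add back the in-page offset;
         * e.g. for a 2MiB superpage (level 2) the mask below keeps the
         * low 9 + 12 = 21 bits of @iova.
         */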
5358         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5359         if (pte && dma_pte_present(pte))
5360                 phys = dma_pte_addr(pte) +
5361                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5362                                                 VTD_PAGE_SHIFT) - 1));
5363
5364         return phys;
5365 }
5366
5367 static inline bool scalable_mode_support(void)
5368 {
5369         struct dmar_drhd_unit *drhd;
5370         struct intel_iommu *iommu;
5371         bool ret = true;
5372
5373         rcu_read_lock();
5374         for_each_active_iommu(iommu, drhd) {
5375                 if (!sm_supported(iommu)) {
5376                         ret = false;
5377                         break;
5378                 }
5379         }
5380         rcu_read_unlock();
5381
5382         return ret;
5383 }
5384
5385 static inline bool iommu_pasid_support(void)
5386 {
5387         struct dmar_drhd_unit *drhd;
5388         struct intel_iommu *iommu;
5389         bool ret = true;
5390
5391         rcu_read_lock();
5392         for_each_active_iommu(iommu, drhd) {
5393                 if (!pasid_supported(iommu)) {
5394                         ret = false;
5395                         break;
5396                 }
5397         }
5398         rcu_read_unlock();
5399
5400         return ret;
5401 }
5402
5403 static inline bool nested_mode_support(void)
5404 {
5405         struct dmar_drhd_unit *drhd;
5406         struct intel_iommu *iommu;
5407         bool ret = true;
5408
5409         rcu_read_lock();
5410         for_each_active_iommu(iommu, drhd) {
5411                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5412                         ret = false;
5413                         break;
5414                 }
5415         }
5416         rcu_read_unlock();
5417
5418         return ret;
5419 }
5420
5421 static bool intel_iommu_capable(enum iommu_cap cap)
5422 {
5423         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5424                 return domain_update_iommu_snooping(NULL) == 1;
5425         if (cap == IOMMU_CAP_INTR_REMAP)
5426                 return irq_remapping_enabled == 1;
5427
5428         return false;
5429 }
5430
5431 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5432 {
5433         struct intel_iommu *iommu;
5434         u8 bus, devfn;
5435
5436         iommu = device_to_iommu(dev, &bus, &devfn);
5437         if (!iommu)
5438                 return ERR_PTR(-ENODEV);
5439
5440         if (translation_pre_enabled(iommu))
5441                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5442
5443         return &iommu->iommu;
5444 }
5445
5446 static void intel_iommu_release_device(struct device *dev)
5447 {
5448         struct intel_iommu *iommu;
5449         u8 bus, devfn;
5450
5451         iommu = device_to_iommu(dev, &bus, &devfn);
5452         if (!iommu)
5453                 return;
5454
5455         dmar_remove_one_dev_info(dev);
5456
5457         set_dma_ops(dev, NULL);
5458 }
5459
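/*
 * Pick the DMA API backend for @dev once its default domain is known:
 * bounce-buffering ops for devices that need swiotlb protection,
 * intel_dma_ops when the device ends up in a translated DMA domain,
 * and the default direct ops otherwise (identity/passthrough).
 */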
5460 static void intel_iommu_probe_finalize(struct device *dev)
5461 {
5462         struct iommu_domain *domain;
5463
5464         domain = iommu_get_domain_for_dev(dev);
5465         if (device_needs_bounce(dev))
5466                 set_dma_ops(dev, &bounce_dma_ops);
5467         else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5468                 set_dma_ops(dev, &intel_dma_ops);
5469         else
5470                 set_dma_ops(dev, NULL);
5471 }
5472
5473 static void intel_iommu_get_resv_regions(struct device *device,
5474                                          struct list_head *head)
5475 {
5476         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5477         struct iommu_resv_region *reg;
5478         struct dmar_rmrr_unit *rmrr;
5479         struct device *i_dev;
5480         int i;
5481
5482         down_read(&dmar_global_lock);
5483         for_each_rmrr_units(rmrr) {
5484                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5485                                           i, i_dev) {
5486                         struct iommu_resv_region *resv;
5487                         enum iommu_resv_type type;
5488                         size_t length;
5489
5490                         if (i_dev != device &&
5491                             !is_downstream_to_pci_bridge(device, i_dev))
5492                                 continue;
5493
5494                         length = rmrr->end_address - rmrr->base_address + 1;
5495
5496                         type = device_rmrr_is_relaxable(device) ?
5497                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5498
5499                         resv = iommu_alloc_resv_region(rmrr->base_address,
5500                                                        length, prot, type);
5501                         if (!resv)
5502                                 break;
5503
5504                         list_add_tail(&resv->list, head);
5505                 }
5506         }
5507         up_read(&dmar_global_lock);
5508
5509 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5510         if (dev_is_pci(device)) {
5511                 struct pci_dev *pdev = to_pci_dev(device);
5512
5513                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5514                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5515                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5516                         if (reg)
5517                                 list_add_tail(&reg->list, head);
5518                 }
5519         }
5520 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5521
5522         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5523                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5524                                       0, IOMMU_RESV_MSI);
5525         if (!reg)
5526                 return;
5527         list_add_tail(&reg->list, head);
5528 }
5529
5530 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5531 {
5532         struct device_domain_info *info;
5533         struct context_entry *context;
5534         struct dmar_domain *domain;
5535         unsigned long flags;
5536         u64 ctx_lo;
5537         int ret;
5538
5539         domain = find_domain(dev);
5540         if (!domain)
5541                 return -EINVAL;
5542
5543         spin_lock_irqsave(&device_domain_lock, flags);
5544         spin_lock(&iommu->lock);
5545
5546         ret = -EINVAL;
5547         info = dev->archdata.iommu;
5548         if (!info || !info->pasid_supported)
5549                 goto out;
5550
5551         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5552         if (WARN_ON(!context))
5553                 goto out;
5554
5555         ctx_lo = context[0].lo;
5556
5557         if (!(ctx_lo & CONTEXT_PASIDE)) {
5558                 ctx_lo |= CONTEXT_PASIDE;
5559                 context[0].lo = ctx_lo;
5560                 wmb();
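                /*
                 * Order the in-place context-entry update above before
                 * the context-cache invalidation below, so the IOMMU
                 * observes PASIDE once the flush completes.
                 */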
5561                 iommu->flush.flush_context(iommu,
5562                                            domain->iommu_did[iommu->seq_id],
5563                                            PCI_DEVID(info->bus, info->devfn),
5564                                            DMA_CCMD_MASK_NOBIT,
5565                                            DMA_CCMD_DEVICE_INVL);
5566         }
5567
5568         /* Enable PASID support in the device, if it wasn't already */
5569         if (!info->pasid_enabled)
5570                 iommu_enable_dev_iotlb(info);
5571
5572         ret = 0;
5573
5574  out:
5575         spin_unlock(&iommu->lock);
5576         spin_unlock_irqrestore(&device_domain_lock, flags);
5577
5578         return ret;
5579 }
5580
5581 static void intel_iommu_apply_resv_region(struct device *dev,
5582                                           struct iommu_domain *domain,
5583                                           struct iommu_resv_region *region)
5584 {
5585         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5586         unsigned long start, end;
5587
5588         start = IOVA_PFN(region->start);
5589         end   = IOVA_PFN(region->start + region->length - 1);
5590
5591         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5592 }
5593
5594 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5595 {
5596         if (dev_is_pci(dev))
5597                 return pci_device_group(dev);
5598         return generic_device_group(dev);
5599 }
5600
5601 #ifdef CONFIG_INTEL_IOMMU_SVM
5602 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5603 {
5604         struct intel_iommu *iommu;
5605         u8 bus, devfn;
5606
5607         if (iommu_dummy(dev)) {
5608                 dev_warn(dev,
5609                          "No IOMMU translation for device; cannot enable SVM\n");
5610                 return NULL;
5611         }
5612
5613         iommu = device_to_iommu(dev, &bus, &devfn);
5614         if (!iommu) {
5615                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5616                 return NULL;
5617         }
5618
5619         return iommu;
5620 }
5621 #endif /* CONFIG_INTEL_IOMMU_SVM */
5622
5623 static int intel_iommu_enable_auxd(struct device *dev)
5624 {
5625         struct device_domain_info *info;
5626         struct intel_iommu *iommu;
5627         unsigned long flags;
5628         u8 bus, devfn;
5629         int ret;
5630
5631         iommu = device_to_iommu(dev, &bus, &devfn);
5632         if (!iommu || dmar_disabled)
5633                 return -EINVAL;
5634
5635         if (!sm_supported(iommu) || !pasid_supported(iommu))
5636                 return -EINVAL;
5637
5638         ret = intel_iommu_enable_pasid(iommu, dev);
5639         if (ret)
5640                 return -ENODEV;
5641
5642         spin_lock_irqsave(&device_domain_lock, flags);
5643         info = dev->archdata.iommu;
5644         info->auxd_enabled = 1;
5645         spin_unlock_irqrestore(&device_domain_lock, flags);
5646
5647         return 0;
5648 }
5649
5650 static int intel_iommu_disable_auxd(struct device *dev)
5651 {
5652         struct device_domain_info *info;
5653         unsigned long flags;
5654
5655         spin_lock_irqsave(&device_domain_lock, flags);
5656         info = dev->archdata.iommu;
5657         if (!WARN_ON(!info))
5658                 info->auxd_enabled = 0;
5659         spin_unlock_irqrestore(&device_domain_lock, flags);
5660
5661         return 0;
5662 }
5663
5664 /*
5665  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC)
5666  * is defined in section 3.7 of the Intel Scalable I/O Virtualization
5667  * technical spec so that system software and tools can detect endpoint
5668  * devices supporting Intel Scalable I/O Virtualization without a host
5669  * driver dependency.
5670  *
5671  * Returns the config space offset of the matching extended capability
5672  * structure, or 0 if the device does not support it.
5673  */
5674 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5675 {
5676         int pos;
5677         u16 vendor, id;
5678
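        /*
         * 0x23 is the PCI Express DVSEC extended capability ID; in each
         * instance the word at offset 4 holds the DVSEC vendor ID and
         * the word at offset 8 the DVSEC ID (5 identifies SIOV per the
         * spec referenced above).
         */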
5679         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5680         while (pos) {
5681                 pci_read_config_word(pdev, pos + 4, &vendor);
5682                 pci_read_config_word(pdev, pos + 8, &id);
5683                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5684                         return pos;
5685
5686                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5687         }
5688
5689         return 0;
5690 }
5691
5692 static bool
5693 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5694 {
5695         if (feat == IOMMU_DEV_FEAT_AUX) {
5696                 int ret;
5697
5698                 if (!dev_is_pci(dev) || dmar_disabled ||
5699                     !scalable_mode_support() || !iommu_pasid_support())
5700                         return false;
5701
5702                 ret = pci_pasid_features(to_pci_dev(dev));
5703                 if (ret < 0)
5704                         return false;
5705
5706                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5707         }
5708
5709         return false;
5710 }
5711
5712 static int
5713 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5714 {
5715         if (feat == IOMMU_DEV_FEAT_AUX)
5716                 return intel_iommu_enable_auxd(dev);
5717
5718         return -ENODEV;
5719 }
5720
5721 static int
5722 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5723 {
5724         if (feat == IOMMU_DEV_FEAT_AUX)
5725                 return intel_iommu_disable_auxd(dev);
5726
5727         return -ENODEV;
5728 }
5729
5730 static bool
5731 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5732 {
5733         struct device_domain_info *info = dev->archdata.iommu;
5734
5735         if (feat == IOMMU_DEV_FEAT_AUX)
5736                 return scalable_mode_support() && info && info->auxd_enabled;
5737
5738         return false;
5739 }
5740
5741 static int
5742 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5743 {
5744         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5745
5746         return dmar_domain->default_pasid > 0 ?
5747                         dmar_domain->default_pasid : -EINVAL;
5748 }
5749
5750 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5751                                            struct device *dev)
5752 {
5753         return attach_deferred(dev);
5754 }
5755
5756 static int
5757 intel_iommu_domain_set_attr(struct iommu_domain *domain,
5758                             enum iommu_attr attr, void *data)
5759 {
5760         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5761         unsigned long flags;
5762         int ret = 0;
5763
5764         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
5765                 return -EINVAL;
5766
5767         switch (attr) {
5768         case DOMAIN_ATTR_NESTING:
5769                 spin_lock_irqsave(&device_domain_lock, flags);
5770                 if (nested_mode_support() &&
5771                     list_empty(&dmar_domain->devices)) {
5772                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5773                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5774                 } else {
5775                         ret = -ENODEV;
5776                 }
5777                 spin_unlock_irqrestore(&device_domain_lock, flags);
5778                 break;
5779         default:
5780                 ret = -EINVAL;
5781                 break;
5782         }
5783
5784         return ret;
5785 }
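
/*
 * Illustrative only: a caller such as a VFIO-style user is expected to
 * request nesting right after allocating the domain and before any
 * device is attached, roughly
 *
 *	int enable = 1;
 *
 *	domain = iommu_domain_alloc(dev->bus);
 *	iommu_domain_set_attr(domain, DOMAIN_ATTR_NESTING, &enable);
 *	iommu_attach_device(domain, dev);
 *
 * since the handler above rejects DOMAIN_ATTR_NESTING once
 * dmar_domain->devices is no longer empty.
 */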
5786
5787 const struct iommu_ops intel_iommu_ops = {
5788         .capable                = intel_iommu_capable,
5789         .domain_alloc           = intel_iommu_domain_alloc,
5790         .domain_free            = intel_iommu_domain_free,
5791         .domain_set_attr        = intel_iommu_domain_set_attr,
5792         .attach_dev             = intel_iommu_attach_device,
5793         .detach_dev             = intel_iommu_detach_device,
5794         .aux_attach_dev         = intel_iommu_aux_attach_device,
5795         .aux_detach_dev         = intel_iommu_aux_detach_device,
5796         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5797         .map                    = intel_iommu_map,
5798         .unmap                  = intel_iommu_unmap,
5799         .iova_to_phys           = intel_iommu_iova_to_phys,
5800         .probe_device           = intel_iommu_probe_device,
5801         .probe_finalize         = intel_iommu_probe_finalize,
5802         .release_device         = intel_iommu_release_device,
5803         .get_resv_regions       = intel_iommu_get_resv_regions,
5804         .put_resv_regions       = generic_iommu_put_resv_regions,
5805         .apply_resv_region      = intel_iommu_apply_resv_region,
5806         .device_group           = intel_iommu_device_group,
5807         .dev_has_feat           = intel_iommu_dev_has_feat,
5808         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5809         .dev_enable_feat        = intel_iommu_dev_enable_feat,
5810         .dev_disable_feat       = intel_iommu_dev_disable_feat,
5811         .is_attach_deferred     = intel_iommu_is_attach_deferred,
5812         .def_domain_type        = device_def_domain_type,
5813         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5814 };
5815
5816 static void quirk_iommu_igfx(struct pci_dev *dev)
5817 {
5818         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5819         dmar_map_gfx = 0;
5820 }
5821
5822 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5823 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5824 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5825 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5826 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5827 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5828 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5829 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5830
5831 /* Broadwell igfx malfunctions with dmar */
5832 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5833 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5834 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5836 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5837 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5838 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5839 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5840 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5842 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5843 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5844 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5845 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5846 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5847 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5848 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5849 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5850 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5851 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5852 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5856
5857 static void quirk_iommu_rwbf(struct pci_dev *dev)
5858 {
5859         /*
5860          * Mobile 4 Series Chipset neglects to set RWBF capability,
5861          * but needs it. Same seems to hold for the desktop versions.
5862          */
5863         pci_info(dev, "Forcing write-buffer flush capability\n");
5864         rwbf_quirk = 1;
5865 }
5866
5867 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5868 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5869 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5870 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5871 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5872 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5873 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5874
5875 #define GGC 0x52
5876 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5877 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5878 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5879 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5880 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5881 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5882 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5883 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5884
5885 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5886 {
5887         unsigned short ggc;
5888
5889         if (pci_read_config_word(dev, GGC, &ggc))
5890                 return;
5891
5892         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5893                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5894                 dmar_map_gfx = 0;
5895         } else if (dmar_map_gfx) {
5896                 /* we have to ensure the gfx device is idle before we flush */
5897                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5898                 intel_iommu_strict = 1;
5899        }
5900 }
5901 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5902 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5903 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5904 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5905
5906 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5907    ISOCH DMAR unit for the Azalia sound device, but not give it any
5908    TLB entries, which causes it to deadlock. Check for that.  We do
5909    this in a function called from init_dmars(), instead of in a PCI
5910    quirk, because we don't want to print the obnoxious "BIOS broken"
5911    message if VT-d is actually disabled.
5912 */
5913 static void __init check_tylersburg_isoch(void)
5914 {
5915         struct pci_dev *pdev;
5916         uint32_t vtisochctrl;
5917
5918         /* If there's no Azalia in the system anyway, forget it. */
5919         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5920         if (!pdev)
5921                 return;
5922         pci_dev_put(pdev);
5923
5924         /* System Management Registers. Might be hidden, in which case
5925            we can't do the sanity check. But that's OK, because the
5926            known-broken BIOSes _don't_ actually hide it, so far. */
5927         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5928         if (!pdev)
5929                 return;
5930
5931         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5932                 pci_dev_put(pdev);
5933                 return;
5934         }
5935
5936         pci_dev_put(pdev);
5937
5938         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5939         if (vtisochctrl & 1)
5940                 return;
5941
5942         /* Drop all bits other than the number of TLB entries */
5943         vtisochctrl &= 0x1c;
5944
5945         /* If we have the recommended number of TLB entries (16), fine. */
5946         if (vtisochctrl == 0x10)
5947                 return;
5948
5949         /* Zero TLB entries? You get to ride the short bus to school. */
5950         if (!vtisochctrl) {
5951                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5952                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5953                      dmi_get_system_info(DMI_BIOS_VENDOR),
5954                      dmi_get_system_info(DMI_BIOS_VERSION),
5955                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5956                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5957                 return;
5958         }
5959
5960         pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5961                vtisochctrl);
5962 }