iommu/vt-d: Allow 32bit devices to use DMA domain
[linux-2.6-microblaze.git] / drivers/iommu/intel-iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is a power-of-two multiple of 4KiB and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are a power-of-two multiple of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
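
/*
 * Illustrative sketch, not part of the driver: with every bit above
 * bit 11 set in INTEL_IOMMU_PGSIZES, any power-of-two size of at least
 * 4KiB is advertised.  The hypothetical helper below only demonstrates
 * the kind of check the IOMMU core performs against pgsize_bitmap.
 */
#if 0
static bool example_size_is_advertised(unsigned long size)
{
	/* true for 4KiB, 8KiB, ... 2MiB, 1GiB; false for e.g. 2KiB or 6KiB */
	return (size & (size - 1)) == 0 && (size & INTEL_IOMMU_PGSIZES);
}
#endif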
105
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
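
/*
 * Worked example (illustrative only): a 48-bit address width maps to
 * AGAW 2, i.e. a 4-level page table:
 *
 *	width_to_agaw(48) == DIV_ROUND_UP(48 - 30, 9) == 2
 *	agaw_to_level(2)  == 4
 *	agaw_to_width(2)  == min(30 + 2 * 9, 64)      == 48
 */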
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline unsigned long level_mask(int level)
132 {
133         return -1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long level_size(int level)
137 {
138         return 1UL << level_to_offset_bits(level);
139 }
140
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
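
/*
 * Worked example (illustrative only), taking level 2 of the walk:
 *
 *	level_to_offset_bits(2)  == 9
 *	pfn_level_offset(pfn, 2) == (pfn >> 9) & 0x1ff
 *	level_size(2)            == 512 4KiB pages (a 2MiB region)
 *	align_to_level(0x205, 2) == 0x400
 */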
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic the kernel if VT-d can't be successfully enabled
179  * (used when the kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
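
/*
 * Illustrative sketch, not the driver's actual context-mapping path:
 * how the bit-field helpers above combine to build a legacy-mode
 * context entry.  The function name and the "pgd_phys", "did" and
 * "agaw" parameters are hypothetical.
 */
#if 0
static void example_fill_context(struct context_entry *ce,
				 phys_addr_t pgd_phys, u16 did, int agaw)
{
	context_clear_entry(ce);
	context_set_domain_id(ce, did);			/* hi[23:8] */
	context_set_address_width(ce, agaw);		/* hi[2:0] */
	context_set_address_root(ce, pgd_phys);		/* lo[63:12] */
	context_set_translation_type(ce, 0);		/* lo[3:2], 0 == translate through page table */
	context_set_fault_enable(ce);			/* clear lo[1] so faults are reported */
	context_set_present(ce);			/* lo[0], set last */
}
#endif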
289
290 /*
291  * This domain is a static identity mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
301
302 /*
303  * This is a DMA domain allocated through the iommu domain allocation
304  * interface. But one or more devices belonging to this domain have
305  * been chosen to use a private domain. We should avoid using the
306  * map/unmap/iova_to_phys APIs on it.
307  */
308 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
309
310 /*
311  * When VT-d works in the scalable mode, it allows DMA translation to
312  * happen through either first level or second level page table. This
313  * bit marks that the DMA translation for the domain goes through the
314  * first level page table, otherwise, it goes through the second level.
315  */
316 #define DOMAIN_FLAG_USE_FIRST_LEVEL             BIT(2)
317
318 /*
319  * Domain represents a virtual machine which demands iommu nested
320  * translation mode support.
321  */
322 #define DOMAIN_FLAG_NESTING_MODE                BIT(3)
323
324 #define for_each_domain_iommu(idx, domain)                      \
325         for (idx = 0; idx < g_num_of_iommus; idx++)             \
326                 if (domain->iommu_refcnt[idx])
327
328 struct dmar_rmrr_unit {
329         struct list_head list;          /* list of rmrr units   */
330         struct acpi_dmar_header *hdr;   /* ACPI header          */
331         u64     base_address;           /* reserved base address*/
332         u64     end_address;            /* reserved end address */
333         struct dmar_dev_scope *devices; /* target devices */
334         int     devices_cnt;            /* target device count */
335 };
336
337 struct dmar_atsr_unit {
338         struct list_head list;          /* list of ATSR units */
339         struct acpi_dmar_header *hdr;   /* ACPI header */
340         struct dmar_dev_scope *devices; /* target devices */
341         int devices_cnt;                /* target device count */
342         u8 include_all:1;               /* include all ports */
343 };
344
345 static LIST_HEAD(dmar_atsr_units);
346 static LIST_HEAD(dmar_rmrr_units);
347
348 #define for_each_rmrr_units(rmrr) \
349         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
350
351 /* number of IOMMUs in the system, used to size g_iommus */
352 static int g_num_of_iommus;
353
354 static void domain_exit(struct dmar_domain *domain);
355 static void domain_remove_dev_info(struct dmar_domain *domain);
356 static void dmar_remove_one_dev_info(struct device *dev);
357 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
358 static int intel_iommu_attach_device(struct iommu_domain *domain,
359                                      struct device *dev);
360 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
361                                             dma_addr_t iova);
362
363 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
364 int dmar_disabled = 0;
365 #else
366 int dmar_disabled = 1;
367 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
368
369 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
370 int intel_iommu_sm = 1;
371 #else
372 int intel_iommu_sm;
373 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
374
375 int intel_iommu_enabled = 0;
376 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
377
378 static int dmar_map_gfx = 1;
379 static int dmar_forcedac;
380 static int intel_iommu_strict;
381 static int intel_iommu_superpage = 1;
382 static int iommu_identity_mapping;
383 static int intel_no_bounce;
384
385 #define IDENTMAP_GFX            2
386 #define IDENTMAP_AZALIA         4
387
388 int intel_iommu_gfx_mapped;
389 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
390
391 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
392 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
393 DEFINE_SPINLOCK(device_domain_lock);
394 static LIST_HEAD(device_domain_list);
395
396 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
397                                 to_pci_dev(d)->untrusted)
398
399 /*
400  * Iterate over elements in device_domain_list and call the specified
401  * callback @fn against each element.
402  */
403 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
404                                      void *data), void *data)
405 {
406         int ret = 0;
407         unsigned long flags;
408         struct device_domain_info *info;
409
410         spin_lock_irqsave(&device_domain_lock, flags);
411         list_for_each_entry(info, &device_domain_list, global) {
412                 ret = fn(info, data);
413                 if (ret) {
414                         spin_unlock_irqrestore(&device_domain_lock, flags);
415                         return ret;
416                 }
417         }
418         spin_unlock_irqrestore(&device_domain_lock, flags);
419
420         return 0;
421 }
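
/*
 * Usage sketch (illustrative only) for the iterator above; the callback
 * and counter names are hypothetical.  A non-zero return value from the
 * callback stops the walk and is propagated to the caller.
 */
#if 0
static int example_count_one(struct device_domain_info *info, void *data)
{
	(*(int *)data)++;
	return 0;
}

static int example_count_devices(void)
{
	int count = 0;

	for_each_device_domain(example_count_one, &count);
	return count;
}
#endif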
422
423 const struct iommu_ops intel_iommu_ops;
424
425 static bool translation_pre_enabled(struct intel_iommu *iommu)
426 {
427         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
428 }
429
430 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
431 {
432         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
433 }
434
435 static void init_translation_status(struct intel_iommu *iommu)
436 {
437         u32 gsts;
438
439         gsts = readl(iommu->reg + DMAR_GSTS_REG);
440         if (gsts & DMA_GSTS_TES)
441                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
442 }
443
444 /* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
445 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
446 {
447         return container_of(dom, struct dmar_domain, domain);
448 }
449
450 static int __init intel_iommu_setup(char *str)
451 {
452         if (!str)
453                 return -EINVAL;
454         while (*str) {
455                 if (!strncmp(str, "on", 2)) {
456                         dmar_disabled = 0;
457                         pr_info("IOMMU enabled\n");
458                 } else if (!strncmp(str, "off", 3)) {
459                         dmar_disabled = 1;
460                         no_platform_optin = 1;
461                         pr_info("IOMMU disabled\n");
462                 } else if (!strncmp(str, "igfx_off", 8)) {
463                         dmar_map_gfx = 0;
464                         pr_info("Disable GFX device mapping\n");
465                 } else if (!strncmp(str, "forcedac", 8)) {
466                         pr_info("Forcing DAC for PCI devices\n");
467                         dmar_forcedac = 1;
468                 } else if (!strncmp(str, "strict", 6)) {
469                         pr_info("Disable batched IOTLB flush\n");
470                         intel_iommu_strict = 1;
471                 } else if (!strncmp(str, "sp_off", 6)) {
472                         pr_info("Disable supported super page\n");
473                         intel_iommu_superpage = 0;
474                 } else if (!strncmp(str, "sm_on", 5)) {
475                         pr_info("Intel-IOMMU: scalable mode supported\n");
476                         intel_iommu_sm = 1;
477                 } else if (!strncmp(str, "tboot_noforce", 13)) {
478                         printk(KERN_INFO
479                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
480                         intel_iommu_tboot_noforce = 1;
481                 } else if (!strncmp(str, "nobounce", 8)) {
482                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
483                         intel_no_bounce = 1;
484                 }
485
486                 str += strcspn(str, ",");
487                 while (*str == ',')
488                         str++;
489         }
490         return 0;
491 }
492 __setup("intel_iommu=", intel_iommu_setup);
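
/*
 * Example (illustrative only): the parser above takes a comma separated
 * list on the kernel command line, e.g.
 *
 *	intel_iommu=on,sm_on,strict
 *
 * which enables the IOMMU, turns on scalable mode and disables batched
 * IOTLB flushing.
 */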
493
494 static struct kmem_cache *iommu_domain_cache;
495 static struct kmem_cache *iommu_devinfo_cache;
496
497 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
498 {
499         struct dmar_domain **domains;
500         int idx = did >> 8;
501
502         domains = iommu->domains[idx];
503         if (!domains)
504                 return NULL;
505
506         return domains[did & 0xff];
507 }
508
509 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
510                              struct dmar_domain *domain)
511 {
512         struct dmar_domain **domains;
513         int idx = did >> 8;
514
515         if (!iommu->domains[idx]) {
516                 size_t size = 256 * sizeof(struct dmar_domain *);
517                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
518         }
519
520         domains = iommu->domains[idx];
521         if (WARN_ON(!domains))
522                 return;
523         else
524                 domains[did & 0xff] = domain;
525 }
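
/*
 * Worked example (illustrative only): domain ID 0x1234 is looked up as
 * iommu->domains[0x12][0x34], i.e. the table is a two-level array of
 * 256-entry pages that set_iommu_domain() allocates on demand.
 */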
526
527 void *alloc_pgtable_page(int node)
528 {
529         struct page *page;
530         void *vaddr = NULL;
531
532         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
533         if (page)
534                 vaddr = page_address(page);
535         return vaddr;
536 }
537
538 void free_pgtable_page(void *vaddr)
539 {
540         free_page((unsigned long)vaddr);
541 }
542
543 static inline void *alloc_domain_mem(void)
544 {
545         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
546 }
547
548 static void free_domain_mem(void *vaddr)
549 {
550         kmem_cache_free(iommu_domain_cache, vaddr);
551 }
552
553 static inline void *alloc_devinfo_mem(void)
554 {
555         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
556 }
557
558 static inline void free_devinfo_mem(void *vaddr)
559 {
560         kmem_cache_free(iommu_devinfo_cache, vaddr);
561 }
562
563 static inline int domain_type_is_si(struct dmar_domain *domain)
564 {
565         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
566 }
567
568 static inline bool domain_use_first_level(struct dmar_domain *domain)
569 {
570         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
571 }
572
573 static inline int domain_pfn_supported(struct dmar_domain *domain,
574                                        unsigned long pfn)
575 {
576         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
577
578         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
579 }
580
581 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
582 {
583         unsigned long sagaw;
584         int agaw = -1;
585
586         sagaw = cap_sagaw(iommu->cap);
587         for (agaw = width_to_agaw(max_gaw);
588              agaw >= 0; agaw--) {
589                 if (test_bit(agaw, &sagaw))
590                         break;
591         }
592
593         return agaw;
594 }
595
596 /*
597  * Calculate max SAGAW for each iommu.
598  */
599 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
600 {
601         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
602 }
603
604 /*
605  * Calculate agaw for each iommu.
606  * "SAGAW" may be different across iommus; use a default agaw, and
607  * get a smaller supported agaw for iommus that don't support the default agaw.
608  */
609 int iommu_calculate_agaw(struct intel_iommu *iommu)
610 {
611         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
612 }
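
/*
 * Worked example (illustrative only): with cap_sagaw() == 0x4 (only
 * bit 2 set, i.e. only 4-level tables supported), a request for the
 * 57-bit default width starts at agaw 3 and walks down until it finds
 * a set bit, so iommu_calculate_agaw() returns 2 (a 48-bit AGAW).
 */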
613
614 /* This function only returns a single iommu in a domain */
615 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
616 {
617         int iommu_id;
618
619         /* si_domain and vm domain should not get here. */
620         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
621                 return NULL;
622
623         for_each_domain_iommu(iommu_id, domain)
624                 break;
625
626         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
627                 return NULL;
628
629         return g_iommus[iommu_id];
630 }
631
632 static void domain_update_iommu_coherency(struct dmar_domain *domain)
633 {
634         struct dmar_drhd_unit *drhd;
635         struct intel_iommu *iommu;
636         bool found = false;
637         int i;
638
639         domain->iommu_coherency = 1;
640
641         for_each_domain_iommu(i, domain) {
642                 found = true;
643                 if (!ecap_coherent(g_iommus[i]->ecap)) {
644                         domain->iommu_coherency = 0;
645                         break;
646                 }
647         }
648         if (found)
649                 return;
650
651         /* No hardware attached; use lowest common denominator */
652         rcu_read_lock();
653         for_each_active_iommu(iommu, drhd) {
654                 if (!ecap_coherent(iommu->ecap)) {
655                         domain->iommu_coherency = 0;
656                         break;
657                 }
658         }
659         rcu_read_unlock();
660 }
661
662 static int domain_update_iommu_snooping(struct intel_iommu *skip)
663 {
664         struct dmar_drhd_unit *drhd;
665         struct intel_iommu *iommu;
666         int ret = 1;
667
668         rcu_read_lock();
669         for_each_active_iommu(iommu, drhd) {
670                 if (iommu != skip) {
671                         if (!ecap_sc_support(iommu->ecap)) {
672                                 ret = 0;
673                                 break;
674                         }
675                 }
676         }
677         rcu_read_unlock();
678
679         return ret;
680 }
681
682 static int domain_update_iommu_superpage(struct dmar_domain *domain,
683                                          struct intel_iommu *skip)
684 {
685         struct dmar_drhd_unit *drhd;
686         struct intel_iommu *iommu;
687         int mask = 0x3;
688
689         if (!intel_iommu_superpage) {
690                 return 0;
691         }
692
693         /* set iommu_superpage to the smallest common denominator */
694         rcu_read_lock();
695         for_each_active_iommu(iommu, drhd) {
696                 if (iommu != skip) {
697                         if (domain && domain_use_first_level(domain)) {
698                                 if (!cap_fl1gp_support(iommu->cap))
699                                         mask = 0x1;
700                         } else {
701                                 mask &= cap_super_page_val(iommu->cap);
702                         }
703
704                         if (!mask)
705                                 break;
706                 }
707         }
708         rcu_read_unlock();
709
710         return fls(mask);
711 }
712
713 /* Some capabilities may be different across iommus */
714 static void domain_update_iommu_cap(struct dmar_domain *domain)
715 {
716         domain_update_iommu_coherency(domain);
717         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
718         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
719 }
720
721 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
722                                          u8 devfn, int alloc)
723 {
724         struct root_entry *root = &iommu->root_entry[bus];
725         struct context_entry *context;
726         u64 *entry;
727
728         entry = &root->lo;
729         if (sm_supported(iommu)) {
730                 if (devfn >= 0x80) {
731                         devfn -= 0x80;
732                         entry = &root->hi;
733                 }
734                 devfn *= 2;
735         }
736         if (*entry & 1)
737                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
738         else {
739                 unsigned long phy_addr;
740                 if (!alloc)
741                         return NULL;
742
743                 context = alloc_pgtable_page(iommu->node);
744                 if (!context)
745                         return NULL;
746
747                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
748                 phy_addr = virt_to_phys((void *)context);
749                 *entry = phy_addr | 1;
750                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
751         }
752         return &context[devfn];
753 }
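
/*
 * Worked example (illustrative only): in scalable mode, devfn 0x85 uses
 * the upper half of the root entry (root->hi) and, after the devfn is
 * rebased and doubled, selects context entry (0x85 - 0x80) * 2 == 0x0a
 * in that table; in legacy mode the same device would use root->lo and
 * context entry 0x85.
 */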
754
755 static int iommu_dummy(struct device *dev)
756 {
757         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
758 }
759
760 static bool attach_deferred(struct device *dev)
761 {
762         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
763 }
764
765 /**
766  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
767  *                               sub-hierarchy of a candidate PCI-PCI bridge
768  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
769  * @bridge: the candidate PCI-PCI bridge
770  *
771  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
772  */
773 static bool
774 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
775 {
776         struct pci_dev *pdev, *pbridge;
777
778         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
779                 return false;
780
781         pdev = to_pci_dev(dev);
782         pbridge = to_pci_dev(bridge);
783
784         if (pbridge->subordinate &&
785             pbridge->subordinate->number <= pdev->bus->number &&
786             pbridge->subordinate->busn_res.end >= pdev->bus->number)
787                 return true;
788
789         return false;
790 }
791
792 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
793 {
794         struct dmar_drhd_unit *drhd = NULL;
795         struct intel_iommu *iommu;
796         struct device *tmp;
797         struct pci_dev *pdev = NULL;
798         u16 segment = 0;
799         int i;
800
801         if (iommu_dummy(dev))
802                 return NULL;
803
804         if (dev_is_pci(dev)) {
805                 struct pci_dev *pf_pdev;
806
807                 pdev = pci_real_dma_dev(to_pci_dev(dev));
808
809                 /* VFs aren't listed in scope tables; we need to look up
810                  * the PF instead to find the IOMMU. */
811                 pf_pdev = pci_physfn(pdev);
812                 dev = &pf_pdev->dev;
813                 segment = pci_domain_nr(pdev->bus);
814         } else if (has_acpi_companion(dev))
815                 dev = &ACPI_COMPANION(dev)->dev;
816
817         rcu_read_lock();
818         for_each_active_iommu(iommu, drhd) {
819                 if (pdev && segment != drhd->segment)
820                         continue;
821
822                 for_each_active_dev_scope(drhd->devices,
823                                           drhd->devices_cnt, i, tmp) {
824                         if (tmp == dev) {
825                                 /* For a VF use its original BDF# not that of the PF
826                                  * which we used for the IOMMU lookup. Strictly speaking
827                                  * we could do this for all PCI devices; we only need to
828                                  * get the BDF# from the scope table for ACPI matches. */
829                                 if (pdev && pdev->is_virtfn)
830                                         goto got_pdev;
831
832                                 *bus = drhd->devices[i].bus;
833                                 *devfn = drhd->devices[i].devfn;
834                                 goto out;
835                         }
836
837                         if (is_downstream_to_pci_bridge(dev, tmp))
838                                 goto got_pdev;
839                 }
840
841                 if (pdev && drhd->include_all) {
842                 got_pdev:
843                         *bus = pdev->bus->number;
844                         *devfn = pdev->devfn;
845                         goto out;
846                 }
847         }
848         iommu = NULL;
849  out:
850         rcu_read_unlock();
851
852         return iommu;
853 }
854
855 static void domain_flush_cache(struct dmar_domain *domain,
856                                void *addr, int size)
857 {
858         if (!domain->iommu_coherency)
859                 clflush_cache_range(addr, size);
860 }
861
862 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
863 {
864         struct context_entry *context;
865         int ret = 0;
866         unsigned long flags;
867
868         spin_lock_irqsave(&iommu->lock, flags);
869         context = iommu_context_addr(iommu, bus, devfn, 0);
870         if (context)
871                 ret = context_present(context);
872         spin_unlock_irqrestore(&iommu->lock, flags);
873         return ret;
874 }
875
876 static void free_context_table(struct intel_iommu *iommu)
877 {
878         int i;
879         unsigned long flags;
880         struct context_entry *context;
881
882         spin_lock_irqsave(&iommu->lock, flags);
883         if (!iommu->root_entry) {
884                 goto out;
885         }
886         for (i = 0; i < ROOT_ENTRY_NR; i++) {
887                 context = iommu_context_addr(iommu, i, 0, 0);
888                 if (context)
889                         free_pgtable_page(context);
890
891                 if (!sm_supported(iommu))
892                         continue;
893
894                 context = iommu_context_addr(iommu, i, 0x80, 0);
895                 if (context)
896                         free_pgtable_page(context);
897
898         }
899         free_pgtable_page(iommu->root_entry);
900         iommu->root_entry = NULL;
901 out:
902         spin_unlock_irqrestore(&iommu->lock, flags);
903 }
904
905 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
906                                       unsigned long pfn, int *target_level)
907 {
908         struct dma_pte *parent, *pte;
909         int level = agaw_to_level(domain->agaw);
910         int offset;
911
912         BUG_ON(!domain->pgd);
913
914         if (!domain_pfn_supported(domain, pfn))
915                 /* Address beyond IOMMU's addressing capabilities. */
916                 return NULL;
917
918         parent = domain->pgd;
919
920         while (1) {
921                 void *tmp_page;
922
923                 offset = pfn_level_offset(pfn, level);
924                 pte = &parent[offset];
925                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
926                         break;
927                 if (level == *target_level)
928                         break;
929
930                 if (!dma_pte_present(pte)) {
931                         uint64_t pteval;
932
933                         tmp_page = alloc_pgtable_page(domain->nid);
934
935                         if (!tmp_page)
936                                 return NULL;
937
938                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
939                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
940                         if (domain_use_first_level(domain))
941                                 pteval |= DMA_FL_PTE_XD;
942                         if (cmpxchg64(&pte->val, 0ULL, pteval))
943                                 /* Someone else set it while we were thinking; use theirs. */
944                                 free_pgtable_page(tmp_page);
945                         else
946                                 domain_flush_cache(domain, pte, sizeof(*pte));
947                 }
948                 if (level == 1)
949                         break;
950
951                 parent = phys_to_virt(dma_pte_addr(pte));
952                 level--;
953         }
954
955         if (!*target_level)
956                 *target_level = level;
957
958         return pte;
959 }
960
961 /* return address's pte at specific level */
962 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
963                                          unsigned long pfn,
964                                          int level, int *large_page)
965 {
966         struct dma_pte *parent, *pte;
967         int total = agaw_to_level(domain->agaw);
968         int offset;
969
970         parent = domain->pgd;
971         while (level <= total) {
972                 offset = pfn_level_offset(pfn, total);
973                 pte = &parent[offset];
974                 if (level == total)
975                         return pte;
976
977                 if (!dma_pte_present(pte)) {
978                         *large_page = total;
979                         break;
980                 }
981
982                 if (dma_pte_superpage(pte)) {
983                         *large_page = total;
984                         return pte;
985                 }
986
987                 parent = phys_to_virt(dma_pte_addr(pte));
988                 total--;
989         }
990         return NULL;
991 }
992
993 /* clear last level pte, a tlb flush should be followed */
994 static void dma_pte_clear_range(struct dmar_domain *domain,
995                                 unsigned long start_pfn,
996                                 unsigned long last_pfn)
997 {
998         unsigned int large_page;
999         struct dma_pte *first_pte, *pte;
1000
1001         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1002         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1003         BUG_ON(start_pfn > last_pfn);
1004
1005         /* we don't need lock here; nobody else touches the iova range */
1006         do {
1007                 large_page = 1;
1008                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1009                 if (!pte) {
1010                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1011                         continue;
1012                 }
1013                 do {
1014                         dma_clear_pte(pte);
1015                         start_pfn += lvl_to_nr_pages(large_page);
1016                         pte++;
1017                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1018
1019                 domain_flush_cache(domain, first_pte,
1020                                    (void *)pte - (void *)first_pte);
1021
1022         } while (start_pfn && start_pfn <= last_pfn);
1023 }
1024
1025 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1026                                int retain_level, struct dma_pte *pte,
1027                                unsigned long pfn, unsigned long start_pfn,
1028                                unsigned long last_pfn)
1029 {
1030         pfn = max(start_pfn, pfn);
1031         pte = &pte[pfn_level_offset(pfn, level)];
1032
1033         do {
1034                 unsigned long level_pfn;
1035                 struct dma_pte *level_pte;
1036
1037                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1038                         goto next;
1039
1040                 level_pfn = pfn & level_mask(level);
1041                 level_pte = phys_to_virt(dma_pte_addr(pte));
1042
1043                 if (level > 2) {
1044                         dma_pte_free_level(domain, level - 1, retain_level,
1045                                            level_pte, level_pfn, start_pfn,
1046                                            last_pfn);
1047                 }
1048
1049                 /*
1050                  * Free the page table if we're below the level we want to
1051                  * retain and the range covers the entire table.
1052                  */
1053                 if (level < retain_level && !(start_pfn > level_pfn ||
1054                       last_pfn < level_pfn + level_size(level) - 1)) {
1055                         dma_clear_pte(pte);
1056                         domain_flush_cache(domain, pte, sizeof(*pte));
1057                         free_pgtable_page(level_pte);
1058                 }
1059 next:
1060                 pfn += level_size(level);
1061         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1062 }
1063
1064 /*
1065  * clear last level (leaf) ptes and free page table pages below the
1066  * level we wish to keep intact.
1067  */
1068 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1069                                    unsigned long start_pfn,
1070                                    unsigned long last_pfn,
1071                                    int retain_level)
1072 {
1073         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1074         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1075         BUG_ON(start_pfn > last_pfn);
1076
1077         dma_pte_clear_range(domain, start_pfn, last_pfn);
1078
1079         /* We don't need lock here; nobody else touches the iova range */
1080         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1081                            domain->pgd, 0, start_pfn, last_pfn);
1082
1083         /* free pgd */
1084         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1085                 free_pgtable_page(domain->pgd);
1086                 domain->pgd = NULL;
1087         }
1088 }
1089
1090 /* When a page at a given level is being unlinked from its parent, we don't
1091    need to *modify* it at all. All we need to do is make a list of all the
1092    pages which can be freed just as soon as we've flushed the IOTLB and we
1093    know the hardware page-walk will no longer touch them.
1094    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1095    be freed. */
1096 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1097                                             int level, struct dma_pte *pte,
1098                                             struct page *freelist)
1099 {
1100         struct page *pg;
1101
1102         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1103         pg->freelist = freelist;
1104         freelist = pg;
1105
1106         if (level == 1)
1107                 return freelist;
1108
1109         pte = page_address(pg);
1110         do {
1111                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1112                         freelist = dma_pte_list_pagetables(domain, level - 1,
1113                                                            pte, freelist);
1114                 pte++;
1115         } while (!first_pte_in_page(pte));
1116
1117         return freelist;
1118 }
1119
1120 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1121                                         struct dma_pte *pte, unsigned long pfn,
1122                                         unsigned long start_pfn,
1123                                         unsigned long last_pfn,
1124                                         struct page *freelist)
1125 {
1126         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1127
1128         pfn = max(start_pfn, pfn);
1129         pte = &pte[pfn_level_offset(pfn, level)];
1130
1131         do {
1132                 unsigned long level_pfn;
1133
1134                 if (!dma_pte_present(pte))
1135                         goto next;
1136
1137                 level_pfn = pfn & level_mask(level);
1138
1139                 /* If range covers entire pagetable, free it */
1140                 if (start_pfn <= level_pfn &&
1141                     last_pfn >= level_pfn + level_size(level) - 1) {
1142                         /* These subordinate page tables are going away entirely. Don't
1143                            bother to clear them; we're just going to *free* them. */
1144                         if (level > 1 && !dma_pte_superpage(pte))
1145                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1146
1147                         dma_clear_pte(pte);
1148                         if (!first_pte)
1149                                 first_pte = pte;
1150                         last_pte = pte;
1151                 } else if (level > 1) {
1152                         /* Recurse down into a level that isn't *entirely* obsolete */
1153                         freelist = dma_pte_clear_level(domain, level - 1,
1154                                                        phys_to_virt(dma_pte_addr(pte)),
1155                                                        level_pfn, start_pfn, last_pfn,
1156                                                        freelist);
1157                 }
1158 next:
1159                 pfn += level_size(level);
1160         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1161
1162         if (first_pte)
1163                 domain_flush_cache(domain, first_pte,
1164                                    (void *)++last_pte - (void *)first_pte);
1165
1166         return freelist;
1167 }
1168
1169 /* We can't just free the pages because the IOMMU may still be walking
1170    the page tables, and may have cached the intermediate levels. The
1171    pages can only be freed after the IOTLB flush has been done. */
1172 static struct page *domain_unmap(struct dmar_domain *domain,
1173                                  unsigned long start_pfn,
1174                                  unsigned long last_pfn)
1175 {
1176         struct page *freelist;
1177
1178         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1179         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1180         BUG_ON(start_pfn > last_pfn);
1181
1182         /* we don't need lock here; nobody else touches the iova range */
1183         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1184                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1185
1186         /* free pgd */
1187         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1188                 struct page *pgd_page = virt_to_page(domain->pgd);
1189                 pgd_page->freelist = freelist;
1190                 freelist = pgd_page;
1191
1192                 domain->pgd = NULL;
1193         }
1194
1195         return freelist;
1196 }
1197
1198 static void dma_free_pagelist(struct page *freelist)
1199 {
1200         struct page *pg;
1201
1202         while ((pg = freelist)) {
1203                 freelist = pg->freelist;
1204                 free_pgtable_page(page_address(pg));
1205         }
1206 }
1207
1208 static void iova_entry_free(unsigned long data)
1209 {
1210         struct page *freelist = (struct page *)data;
1211
1212         dma_free_pagelist(freelist);
1213 }
1214
1215 /* iommu handling */
1216 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1217 {
1218         struct root_entry *root;
1219         unsigned long flags;
1220
1221         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1222         if (!root) {
1223                 pr_err("Allocating root entry for %s failed\n",
1224                         iommu->name);
1225                 return -ENOMEM;
1226         }
1227
1228         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1229
1230         spin_lock_irqsave(&iommu->lock, flags);
1231         iommu->root_entry = root;
1232         spin_unlock_irqrestore(&iommu->lock, flags);
1233
1234         return 0;
1235 }
1236
1237 static void iommu_set_root_entry(struct intel_iommu *iommu)
1238 {
1239         u64 addr;
1240         u32 sts;
1241         unsigned long flag;
1242
1243         addr = virt_to_phys(iommu->root_entry);
1244         if (sm_supported(iommu))
1245                 addr |= DMA_RTADDR_SMT;
1246
1247         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1248         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1249
1250         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1251
1252         /* Make sure hardware completes it */
1253         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1254                       readl, (sts & DMA_GSTS_RTPS), sts);
1255
1256         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1257 }
1258
1259 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1260 {
1261         u32 val;
1262         unsigned long flag;
1263
1264         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1265                 return;
1266
1267         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1268         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1269
1270         /* Make sure hardware completes it */
1271         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1272                       readl, (!(val & DMA_GSTS_WBFS)), val);
1273
1274         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1275 }
1276
1277 /* return value determines if we need a write buffer flush */
1278 static void __iommu_flush_context(struct intel_iommu *iommu,
1279                                   u16 did, u16 source_id, u8 function_mask,
1280                                   u64 type)
1281 {
1282         u64 val = 0;
1283         unsigned long flag;
1284
1285         switch (type) {
1286         case DMA_CCMD_GLOBAL_INVL:
1287                 val = DMA_CCMD_GLOBAL_INVL;
1288                 break;
1289         case DMA_CCMD_DOMAIN_INVL:
1290                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1291                 break;
1292         case DMA_CCMD_DEVICE_INVL:
1293                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1294                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1295                 break;
1296         default:
1297                 BUG();
1298         }
1299         val |= DMA_CCMD_ICC;
1300
1301         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1302         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1303
1304         /* Make sure hardware completes it */
1305         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1306                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1307
1308         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1309 }
1310
1311 /* return value determines if we need a write buffer flush */
1312 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1313                                 u64 addr, unsigned int size_order, u64 type)
1314 {
1315         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1316         u64 val = 0, val_iva = 0;
1317         unsigned long flag;
1318
1319         switch (type) {
1320         case DMA_TLB_GLOBAL_FLUSH:
1321                 /* global flush doesn't need to set IVA_REG */
1322                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1323                 break;
1324         case DMA_TLB_DSI_FLUSH:
1325                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1326                 break;
1327         case DMA_TLB_PSI_FLUSH:
1328                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1329                 /* IH bit is passed in as part of address */
1330                 val_iva = size_order | addr;
1331                 break;
1332         default:
1333                 BUG();
1334         }
1335         /* Note: set drain read/write */
1336 #if 0
1337         /*
1338          * This is probably just to be super secure. Looks like we can
1339          * ignore it without any impact.
1340          */
1341         if (cap_read_drain(iommu->cap))
1342                 val |= DMA_TLB_READ_DRAIN;
1343 #endif
1344         if (cap_write_drain(iommu->cap))
1345                 val |= DMA_TLB_WRITE_DRAIN;
1346
1347         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1348         /* Note: Only uses first TLB reg currently */
1349         if (val_iva)
1350                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1351         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1352
1353         /* Make sure hardware completes it */
1354         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1355                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1356
1357         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1358
1359         /* check IOTLB invalidation granularity */
1360         if (DMA_TLB_IAIG(val) == 0)
1361                 pr_err("Flush IOTLB failed\n");
1362         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1363                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1364                         (unsigned long long)DMA_TLB_IIRG(type),
1365                         (unsigned long long)DMA_TLB_IAIG(val));
1366 }
1367
1368 static struct device_domain_info *
1369 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1370                          u8 bus, u8 devfn)
1371 {
1372         struct device_domain_info *info;
1373
1374         assert_spin_locked(&device_domain_lock);
1375
1376         if (!iommu->qi)
1377                 return NULL;
1378
1379         list_for_each_entry(info, &domain->devices, link)
1380                 if (info->iommu == iommu && info->bus == bus &&
1381                     info->devfn == devfn) {
1382                         if (info->ats_supported && info->dev)
1383                                 return info;
1384                         break;
1385                 }
1386
1387         return NULL;
1388 }
1389
1390 static void domain_update_iotlb(struct dmar_domain *domain)
1391 {
1392         struct device_domain_info *info;
1393         bool has_iotlb_device = false;
1394
1395         assert_spin_locked(&device_domain_lock);
1396
1397         list_for_each_entry(info, &domain->devices, link) {
1398                 struct pci_dev *pdev;
1399
1400                 if (!info->dev || !dev_is_pci(info->dev))
1401                         continue;
1402
1403                 pdev = to_pci_dev(info->dev);
1404                 if (pdev->ats_enabled) {
1405                         has_iotlb_device = true;
1406                         break;
1407                 }
1408         }
1409
1410         domain->has_iotlb_device = has_iotlb_device;
1411 }
1412
1413 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1414 {
1415         struct pci_dev *pdev;
1416
1417         assert_spin_locked(&device_domain_lock);
1418
1419         if (!info || !dev_is_pci(info->dev))
1420                 return;
1421
1422         pdev = to_pci_dev(info->dev);
1423         /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1424          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1425          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1426          * reserved, which should be set to 0.
1427          */
1428         if (!ecap_dit(info->iommu->ecap))
1429                 info->pfsid = 0;
1430         else {
1431                 struct pci_dev *pf_pdev;
1432
1433                 /* pdev will be returned if the device is not a VF */
1434                 pf_pdev = pci_physfn(pdev);
1435                 info->pfsid = pci_dev_id(pf_pdev);
1436         }
1437
1438 #ifdef CONFIG_INTEL_IOMMU_SVM
1439         /* The PCIe spec, in its wisdom, declares that the behaviour of
1440            the device if you enable PASID support after ATS support is
1441            undefined. So always enable PASID support on devices which
1442            have it, even if we can't yet know if we're ever going to
1443            use it. */
1444         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1445                 info->pasid_enabled = 1;
1446
1447         if (info->pri_supported &&
1448             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1449             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1450                 info->pri_enabled = 1;
1451 #endif
1452         if (!pdev->untrusted && info->ats_supported &&
1453             pci_ats_page_aligned(pdev) &&
1454             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1455                 info->ats_enabled = 1;
1456                 domain_update_iotlb(info->domain);
1457                 info->ats_qdep = pci_ats_queue_depth(pdev);
1458         }
1459 }
1460
1461 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1462 {
1463         struct pci_dev *pdev;
1464
1465         assert_spin_locked(&device_domain_lock);
1466
1467         if (!dev_is_pci(info->dev))
1468                 return;
1469
1470         pdev = to_pci_dev(info->dev);
1471
1472         if (info->ats_enabled) {
1473                 pci_disable_ats(pdev);
1474                 info->ats_enabled = 0;
1475                 domain_update_iotlb(info->domain);
1476         }
1477 #ifdef CONFIG_INTEL_IOMMU_SVM
1478         if (info->pri_enabled) {
1479                 pci_disable_pri(pdev);
1480                 info->pri_enabled = 0;
1481         }
1482         if (info->pasid_enabled) {
1483                 pci_disable_pasid(pdev);
1484                 info->pasid_enabled = 0;
1485         }
1486 #endif
1487 }
1488
1489 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1490                                   u64 addr, unsigned mask)
1491 {
1492         u16 sid, qdep;
1493         unsigned long flags;
1494         struct device_domain_info *info;
1495
1496         if (!domain->has_iotlb_device)
1497                 return;
1498
1499         spin_lock_irqsave(&device_domain_lock, flags);
1500         list_for_each_entry(info, &domain->devices, link) {
1501                 if (!info->ats_enabled)
1502                         continue;
1503
1504                 sid = info->bus << 8 | info->devfn;
1505                 qdep = info->ats_qdep;
1506                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1507                                 qdep, addr, mask);
1508         }
1509         spin_unlock_irqrestore(&device_domain_lock, flags);
1510 }
1511
1512 static void domain_flush_piotlb(struct intel_iommu *iommu,
1513                                 struct dmar_domain *domain,
1514                                 u64 addr, unsigned long npages, bool ih)
1515 {
1516         u16 did = domain->iommu_did[iommu->seq_id];
1517
1518         if (domain->default_pasid)
1519                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1520                                 addr, npages, ih);
1521
1522         if (!list_empty(&domain->devices))
1523                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1524 }
1525
1526 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1527                                   struct dmar_domain *domain,
1528                                   unsigned long pfn, unsigned int pages,
1529                                   int ih, int map)
1530 {
1531         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1532         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1533         u16 did = domain->iommu_did[iommu->seq_id];
1534
1535         BUG_ON(pages == 0);
1536
1537         if (ih)
1538                 ih = 1 << 6;
1539
1540         if (domain_use_first_level(domain)) {
1541                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1542         } else {
1543                 /*
1544                  * Fallback to domain selective flush if no PSI support or
1545                  * the size is too big. PSI requires page size to be 2 ^ x,
1546                  * and the base address is naturally aligned to the size.
1547                  */
1548                 if (!cap_pgsel_inv(iommu->cap) ||
1549                     mask > cap_max_amask_val(iommu->cap))
1550                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1551                                                         DMA_TLB_DSI_FLUSH);
1552                 else
1553                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1554                                                         DMA_TLB_PSI_FLUSH);
1555         }
1556
1557         /*
1558          * In caching mode, changes of pages from non-present to present require
1559          * a flush. However, the device IOTLB does not need to be flushed in this case.
1560          */
1561         if (!cap_caching_mode(iommu->cap) || !map)
1562                 iommu_flush_dev_iotlb(domain, addr, mask);
1563 }
1564
1565 /* Notification for newly created mappings */
1566 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1567                                         struct dmar_domain *domain,
1568                                         unsigned long pfn, unsigned int pages)
1569 {
1570         /*
1571          * It's a non-present to present mapping. Only flush if caching mode
1572          * is in use and the domain uses second-level translation.
1573          */
1574         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1575                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1576         else
1577                 iommu_flush_write_buffer(iommu);
1578 }
1579
1580 static void iommu_flush_iova(struct iova_domain *iovad)
1581 {
1582         struct dmar_domain *domain;
1583         int idx;
1584
1585         domain = container_of(iovad, struct dmar_domain, iovad);
1586
1587         for_each_domain_iommu(idx, domain) {
1588                 struct intel_iommu *iommu = g_iommus[idx];
1589                 u16 did = domain->iommu_did[iommu->seq_id];
1590
1591                 if (domain_use_first_level(domain))
1592                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1593                 else
1594                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1595                                                  DMA_TLB_DSI_FLUSH);
1596
1597                 if (!cap_caching_mode(iommu->cap))
1598                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1599                                               0, MAX_AGAW_PFN_WIDTH);
1600         }
1601 }
1602
1603 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1604 {
1605         u32 pmen;
1606         unsigned long flags;
1607
1608         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1609                 return;
1610
1611         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1612         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1613         pmen &= ~DMA_PMEN_EPM;
1614         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1615
1616         /* wait for the protected region status bit to clear */
1617         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1618                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1619
1620         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1621 }
1622
1623 static void iommu_enable_translation(struct intel_iommu *iommu)
1624 {
1625         u32 sts;
1626         unsigned long flags;
1627
1628         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1629         iommu->gcmd |= DMA_GCMD_TE;
1630         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1631
1632         /* Make sure the hardware completes it */
1633         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1634                       readl, (sts & DMA_GSTS_TES), sts);
1635
1636         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1637 }
1638
1639 static void iommu_disable_translation(struct intel_iommu *iommu)
1640 {
1641         u32 sts;
1642         unsigned long flag;
1643
1644         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1645         iommu->gcmd &= ~DMA_GCMD_TE;
1646         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1647
1648         /* Make sure the hardware completes it */
1649         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1650                       readl, (!(sts & DMA_GSTS_TES)), sts);
1651
1652         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1653 }
1654
1655 static int iommu_init_domains(struct intel_iommu *iommu)
1656 {
1657         u32 ndomains, nlongs;
1658         size_t size;
1659
1660         ndomains = cap_ndoms(iommu->cap);
1661         pr_debug("%s: Number of Domains supported <%d>\n",
1662                  iommu->name, ndomains);
1663         nlongs = BITS_TO_LONGS(ndomains);
1664
1665         spin_lock_init(&iommu->lock);
1666
1667         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1668         if (!iommu->domain_ids) {
1669                 pr_err("%s: Allocating domain id array failed\n",
1670                        iommu->name);
1671                 return -ENOMEM;
1672         }
1673
1674         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1675         iommu->domains = kzalloc(size, GFP_KERNEL);
1676
1677         if (iommu->domains) {
1678                 size = 256 * sizeof(struct dmar_domain *);
1679                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1680         }
1681
1682         if (!iommu->domains || !iommu->domains[0]) {
1683                 pr_err("%s: Allocating domain array failed\n",
1684                        iommu->name);
1685                 kfree(iommu->domain_ids);
1686                 kfree(iommu->domains);
1687                 iommu->domain_ids = NULL;
1688                 iommu->domains    = NULL;
1689                 return -ENOMEM;
1690         }
1691
1692         /*
1693          * If Caching mode is set, then invalid translations are tagged
1694          * with domain-id 0, hence we need to pre-allocate it. We also
1695          * use domain-id 0 as a marker for non-allocated domain-id, so
1696          * make sure it is not used for a real domain.
1697          */
1698         set_bit(0, iommu->domain_ids);
1699
1700         /*
1701          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1702          * entry for first-level or pass-through translation modes be
1703          * programmed with a domain id different from those used for
1704          * second-level or nested translation. We reserve a domain id for
1705          * this purpose.
1706          */
1707         if (sm_supported(iommu))
1708                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1709
1710         return 0;
1711 }
1712
1713 static void disable_dmar_iommu(struct intel_iommu *iommu)
1714 {
1715         struct device_domain_info *info, *tmp;
1716         unsigned long flags;
1717
1718         if (!iommu->domains || !iommu->domain_ids)
1719                 return;
1720
1721         spin_lock_irqsave(&device_domain_lock, flags);
1722         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1723                 if (info->iommu != iommu)
1724                         continue;
1725
1726                 if (!info->dev || !info->domain)
1727                         continue;
1728
1729                 __dmar_remove_one_dev_info(info);
1730         }
1731         spin_unlock_irqrestore(&device_domain_lock, flags);
1732
1733         if (iommu->gcmd & DMA_GCMD_TE)
1734                 iommu_disable_translation(iommu);
1735 }
1736
1737 static void free_dmar_iommu(struct intel_iommu *iommu)
1738 {
1739         if ((iommu->domains) && (iommu->domain_ids)) {
1740                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1741                 int i;
1742
1743                 for (i = 0; i < elems; i++)
1744                         kfree(iommu->domains[i]);
1745                 kfree(iommu->domains);
1746                 kfree(iommu->domain_ids);
1747                 iommu->domains = NULL;
1748                 iommu->domain_ids = NULL;
1749         }
1750
1751         g_iommus[iommu->seq_id] = NULL;
1752
1753         /* free context mapping */
1754         free_context_table(iommu);
1755
1756 #ifdef CONFIG_INTEL_IOMMU_SVM
1757         if (pasid_supported(iommu)) {
1758                 if (ecap_prs(iommu->ecap))
1759                         intel_svm_finish_prq(iommu);
1760         }
1761 #endif
1762 }
1763
1764 /*
1765  * Check and return whether first-level translation is used by default
1766  * for DMA translation.
1767  */
1768 static bool first_level_by_default(void)
1769 {
1770         struct dmar_drhd_unit *drhd;
1771         struct intel_iommu *iommu;
1772         static int first_level_support = -1;
1773
1774         if (likely(first_level_support != -1))
1775                 return first_level_support;
1776
1777         first_level_support = 1;
1778
1779         rcu_read_lock();
1780         for_each_active_iommu(iommu, drhd) {
1781                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1782                         first_level_support = 0;
1783                         break;
1784                 }
1785         }
1786         rcu_read_unlock();
1787
1788         return first_level_support;
1789 }
1790
1791 static struct dmar_domain *alloc_domain(int flags)
1792 {
1793         struct dmar_domain *domain;
1794
1795         domain = alloc_domain_mem();
1796         if (!domain)
1797                 return NULL;
1798
1799         memset(domain, 0, sizeof(*domain));
1800         domain->nid = NUMA_NO_NODE;
1801         domain->flags = flags;
1802         if (first_level_by_default())
1803                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1804         domain->has_iotlb_device = false;
1805         INIT_LIST_HEAD(&domain->devices);
1806
1807         return domain;
1808 }
1809
1810 /* Must be called with device_domain_lock and iommu->lock held */
1811 static int domain_attach_iommu(struct dmar_domain *domain,
1812                                struct intel_iommu *iommu)
1813 {
1814         unsigned long ndomains;
1815         int num;
1816
1817         assert_spin_locked(&device_domain_lock);
1818         assert_spin_locked(&iommu->lock);
1819
1820         domain->iommu_refcnt[iommu->seq_id] += 1;
1821         domain->iommu_count += 1;
1822         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1823                 ndomains = cap_ndoms(iommu->cap);
1824                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1825
1826                 if (num >= ndomains) {
1827                         pr_err("%s: No free domain ids\n", iommu->name);
1828                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1829                         domain->iommu_count -= 1;
1830                         return -ENOSPC;
1831                 }
1832
1833                 set_bit(num, iommu->domain_ids);
1834                 set_iommu_domain(iommu, num, domain);
1835
1836                 domain->iommu_did[iommu->seq_id] = num;
1837                 domain->nid                      = iommu->node;
1838
1839                 domain_update_iommu_cap(domain);
1840         }
1841
1842         return 0;
1843 }
1844
1845 static int domain_detach_iommu(struct dmar_domain *domain,
1846                                struct intel_iommu *iommu)
1847 {
1848         int num, count;
1849
1850         assert_spin_locked(&device_domain_lock);
1851         assert_spin_locked(&iommu->lock);
1852
1853         domain->iommu_refcnt[iommu->seq_id] -= 1;
1854         count = --domain->iommu_count;
1855         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1856                 num = domain->iommu_did[iommu->seq_id];
1857                 clear_bit(num, iommu->domain_ids);
1858                 set_iommu_domain(iommu, num, NULL);
1859
1860                 domain_update_iommu_cap(domain);
1861                 domain->iommu_did[iommu->seq_id] = 0;
1862         }
1863
1864         return count;
1865 }
1866
1867 static struct iova_domain reserved_iova_list;
1868 static struct lock_class_key reserved_rbtree_key;
1869
1870 static int dmar_init_reserved_ranges(void)
1871 {
1872         struct pci_dev *pdev = NULL;
1873         struct iova *iova;
1874         int i;
1875
1876         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1877
1878         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1879                 &reserved_rbtree_key);
1880
1881         /* IOAPIC ranges shouldn't be accessed by DMA */
1882         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1883                 IOVA_PFN(IOAPIC_RANGE_END));
1884         if (!iova) {
1885                 pr_err("Reserve IOAPIC range failed\n");
1886                 return -ENODEV;
1887         }
1888
1889         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1890         for_each_pci_dev(pdev) {
1891                 struct resource *r;
1892
1893                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1894                         r = &pdev->resource[i];
1895                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1896                                 continue;
1897                         iova = reserve_iova(&reserved_iova_list,
1898                                             IOVA_PFN(r->start),
1899                                             IOVA_PFN(r->end));
1900                         if (!iova) {
1901                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1902                                 return -ENODEV;
1903                         }
1904                 }
1905         }
1906         return 0;
1907 }
1908
1909 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1910 {
1911         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1912 }
1913
1914 static inline int guestwidth_to_adjustwidth(int gaw)
1915 {
1916         int agaw;
1917         int r = (gaw - 12) % 9;
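        /*
         * Round gaw up to 12 + 9*n bits: e.g. gaw = 40 gives agaw = 48
         * (12 offset bits plus four 9-bit levels), capped at 64.
         */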
1918
1919         if (r == 0)
1920                 agaw = gaw;
1921         else
1922                 agaw = gaw + 9 - r;
1923         if (agaw > 64)
1924                 agaw = 64;
1925         return agaw;
1926 }
1927
1928 static void domain_exit(struct dmar_domain *domain)
1929 {
1930
1931         /* Remove associated devices and clear attached or cached domains */
1932         domain_remove_dev_info(domain);
1933
1934         /* Destroy the iova domain */
1935         put_iova_domain(&domain->iovad);
1936
1937         if (domain->pgd) {
1938                 struct page *freelist;
1939
1940                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1941                 dma_free_pagelist(freelist);
1942         }
1943
1944         free_domain_mem(domain);
1945 }
1946
1947 /*
1948  * Get the PASID directory size for scalable mode context entry.
1949  * A value of X in the PDTS field of a scalable mode context entry
1950  * indicates a PASID directory with 2^(X + 7) entries.
1951  */
1952 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1953 {
1954         int pds, max_pde;
1955
1956         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1957         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
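        /*
         * Example: a max_pde of 2^14 has its lowest set bit at position 14,
         * so pds = 14 and the value encoded below is 14 - 7 = 7.
         */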
1958         if (pds < 7)
1959                 return 0;
1960
1961         return pds - 7;
1962 }
1963
1964 /*
1965  * Set the RID_PASID field of a scalable mode context entry. The
1966  * IOMMU hardware will use the PASID value set in this field for
1967  * DMA translations of DMA requests without PASID.
1968  */
1969 static inline void
1970 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1971 {
1972         context->hi |= pasid & ((1 << 20) - 1);
1973         context->hi |= (1 << 20);
1974 }
1975
1976 /*
1977  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1978  * entry.
1979  */
1980 static inline void context_set_sm_dte(struct context_entry *context)
1981 {
1982         context->lo |= (1 << 2);
1983 }
1984
1985 /*
1986  * Set the PRE(Page Request Enable) field of a scalable mode context
1987  * entry.
1988  */
1989 static inline void context_set_sm_pre(struct context_entry *context)
1990 {
1991         context->lo |= (1 << 4);
1992 }
1993
1994 /* Convert value to context PASID directory size field coding. */
1995 #define context_pdts(pds)       (((pds) & 0x7) << 9)
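/* The 3-bit PDTS value lands in bits 11:9 of the low half of the context entry. */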
1996
1997 static int domain_context_mapping_one(struct dmar_domain *domain,
1998                                       struct intel_iommu *iommu,
1999                                       struct pasid_table *table,
2000                                       u8 bus, u8 devfn)
2001 {
2002         u16 did = domain->iommu_did[iommu->seq_id];
2003         int translation = CONTEXT_TT_MULTI_LEVEL;
2004         struct device_domain_info *info = NULL;
2005         struct context_entry *context;
2006         unsigned long flags;
2007         int ret;
2008
2009         WARN_ON(did == 0);
2010
2011         if (hw_pass_through && domain_type_is_si(domain))
2012                 translation = CONTEXT_TT_PASS_THROUGH;
2013
2014         pr_debug("Set context mapping for %02x:%02x.%d\n",
2015                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2016
2017         BUG_ON(!domain->pgd);
2018
2019         spin_lock_irqsave(&device_domain_lock, flags);
2020         spin_lock(&iommu->lock);
2021
2022         ret = -ENOMEM;
2023         context = iommu_context_addr(iommu, bus, devfn, 1);
2024         if (!context)
2025                 goto out_unlock;
2026
2027         ret = 0;
2028         if (context_present(context))
2029                 goto out_unlock;
2030
2031         /*
2032          * For kdump cases, old valid entries may be cached due to
2033          * in-flight DMA and the copied page table, and there is no
2034          * unmapping path for them, so the newly-mapped device needs an
2035          * explicit cache flush. At this point the device is expected to
2036          * have finished its reset during driver probe, so no in-flight
2037          * DMA will exist and there is nothing more to worry about
2038          * hereafter.
2039          */
2040         if (context_copied(context)) {
2041                 u16 did_old = context_domain_id(context);
2042
2043                 if (did_old < cap_ndoms(iommu->cap)) {
2044                         iommu->flush.flush_context(iommu, did_old,
2045                                                    (((u16)bus) << 8) | devfn,
2046                                                    DMA_CCMD_MASK_NOBIT,
2047                                                    DMA_CCMD_DEVICE_INVL);
2048                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2049                                                  DMA_TLB_DSI_FLUSH);
2050                 }
2051         }
2052
2053         context_clear_entry(context);
2054
2055         if (sm_supported(iommu)) {
2056                 unsigned long pds;
2057
2058                 WARN_ON(!table);
2059
2060                 /* Set up the PASID directory pointer: */
2061                 pds = context_get_sm_pds(table);
2062                 context->lo = (u64)virt_to_phys(table->table) |
2063                                 context_pdts(pds);
2064
2065                 /* Set up the RID_PASID field: */
2066                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2067
2068                 /*
2069                  * Set up the Device-TLB Enable bit and the Page Request
2070                  * Enable bit:
2071                  */
2072                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2073                 if (info && info->ats_supported)
2074                         context_set_sm_dte(context);
2075                 if (info && info->pri_supported)
2076                         context_set_sm_pre(context);
2077         } else {
2078                 struct dma_pte *pgd = domain->pgd;
2079                 int agaw;
2080
2081                 context_set_domain_id(context, did);
2082
2083                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2084                         /*
2085                          * Skip top levels of page tables for an IOMMU whose agaw
2086                          * is less than the domain's. Unnecessary for PT mode.
2087                          */
2088                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2089                                 ret = -ENOMEM;
2090                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2091                                 if (!dma_pte_present(pgd))
2092                                         goto out_unlock;
2093                         }
2094
2095                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2096                         if (info && info->ats_supported)
2097                                 translation = CONTEXT_TT_DEV_IOTLB;
2098                         else
2099                                 translation = CONTEXT_TT_MULTI_LEVEL;
2100
2101                         context_set_address_root(context, virt_to_phys(pgd));
2102                         context_set_address_width(context, agaw);
2103                 } else {
2104                         /*
2105                          * In pass-through mode, AW must be programmed to
2106                          * indicate the largest AGAW value supported by the
2107                          * hardware; ASR is ignored by the hardware.
2108                          */
2109                         context_set_address_width(context, iommu->msagaw);
2110                 }
2111
2112                 context_set_translation_type(context, translation);
2113         }
2114
2115         context_set_fault_enable(context);
2116         context_set_present(context);
2117         domain_flush_cache(domain, context, sizeof(*context));
2118
2119         /*
2120          * It's a non-present to present mapping. If the hardware doesn't cache
2121          * non-present entries we only need to flush the write buffer. If it
2122          * _does_ cache non-present entries, then it does so in the special
2123          * domain #0, which we have to flush:
2124          */
2125         if (cap_caching_mode(iommu->cap)) {
2126                 iommu->flush.flush_context(iommu, 0,
2127                                            (((u16)bus) << 8) | devfn,
2128                                            DMA_CCMD_MASK_NOBIT,
2129                                            DMA_CCMD_DEVICE_INVL);
2130                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2131         } else {
2132                 iommu_flush_write_buffer(iommu);
2133         }
2134         iommu_enable_dev_iotlb(info);
2135
2136         ret = 0;
2137
2138 out_unlock:
2139         spin_unlock(&iommu->lock);
2140         spin_unlock_irqrestore(&device_domain_lock, flags);
2141
2142         return ret;
2143 }
2144
2145 struct domain_context_mapping_data {
2146         struct dmar_domain *domain;
2147         struct intel_iommu *iommu;
2148         struct pasid_table *table;
2149 };
2150
2151 static int domain_context_mapping_cb(struct pci_dev *pdev,
2152                                      u16 alias, void *opaque)
2153 {
2154         struct domain_context_mapping_data *data = opaque;
2155
2156         return domain_context_mapping_one(data->domain, data->iommu,
2157                                           data->table, PCI_BUS_NUM(alias),
2158                                           alias & 0xff);
2159 }
2160
2161 static int
2162 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2163 {
2164         struct domain_context_mapping_data data;
2165         struct pasid_table *table;
2166         struct intel_iommu *iommu;
2167         u8 bus, devfn;
2168
2169         iommu = device_to_iommu(dev, &bus, &devfn);
2170         if (!iommu)
2171                 return -ENODEV;
2172
2173         table = intel_pasid_get_table(dev);
2174
2175         if (!dev_is_pci(dev))
2176                 return domain_context_mapping_one(domain, iommu, table,
2177                                                   bus, devfn);
2178
2179         data.domain = domain;
2180         data.iommu = iommu;
2181         data.table = table;
2182
2183         return pci_for_each_dma_alias(to_pci_dev(dev),
2184                                       &domain_context_mapping_cb, &data);
2185 }
2186
2187 static int domain_context_mapped_cb(struct pci_dev *pdev,
2188                                     u16 alias, void *opaque)
2189 {
2190         struct intel_iommu *iommu = opaque;
2191
2192         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2193 }
2194
2195 static int domain_context_mapped(struct device *dev)
2196 {
2197         struct intel_iommu *iommu;
2198         u8 bus, devfn;
2199
2200         iommu = device_to_iommu(dev, &bus, &devfn);
2201         if (!iommu)
2202                 return -ENODEV;
2203
2204         if (!dev_is_pci(dev))
2205                 return device_context_mapped(iommu, bus, devfn);
2206
2207         return !pci_for_each_dma_alias(to_pci_dev(dev),
2208                                        domain_context_mapped_cb, iommu);
2209 }
2210
2211 /* Returns the number of VT-d pages required, aligned to the MM page size */
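/*
 * Example: with 4KiB pages, a host_addr offset of 0x3 and a size of 0x2000
 * round up to 0x3000, i.e. 3 VT-d pages.
 */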
2212 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2213                                             size_t size)
2214 {
2215         host_addr &= ~PAGE_MASK;
2216         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2217 }
2218
2219 /* Return largest possible superpage level for a given mapping */
2220 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2221                                           unsigned long iov_pfn,
2222                                           unsigned long phy_pfn,
2223                                           unsigned long pages)
2224 {
2225         int support, level = 1;
2226         unsigned long pfnmerge;
2227
2228         support = domain->iommu_superpage;
2229
2230         /* To use a large page, the virtual *and* physical addresses
2231            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2232            of them will mean we have to use smaller pages. So just
2233            merge them and check both at once. */
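        /*
         * Example: if iov_pfn and phy_pfn are both multiples of 512 and at
         * least 512 pages remain, level 2 (a 2MiB superpage) can be used,
         * provided the hardware advertises superpage support.
         */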
2234         pfnmerge = iov_pfn | phy_pfn;
2235
2236         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2237                 pages >>= VTD_STRIDE_SHIFT;
2238                 if (!pages)
2239                         break;
2240                 pfnmerge >>= VTD_STRIDE_SHIFT;
2241                 level++;
2242                 support--;
2243         }
2244         return level;
2245 }
2246
2247 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2248                             struct scatterlist *sg, unsigned long phys_pfn,
2249                             unsigned long nr_pages, int prot)
2250 {
2251         struct dma_pte *first_pte = NULL, *pte = NULL;
2252         phys_addr_t uninitialized_var(pteval);
2253         unsigned long sg_res = 0;
2254         unsigned int largepage_lvl = 0;
2255         unsigned long lvl_pages = 0;
2256         u64 attr;
2257
2258         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2259
2260         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2261                 return -EINVAL;
2262
2263         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
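        /*
         * First-level (MMU-format) PTEs need an explicit present bit and are
         * marked execute-disable (XD), since these are DMA-only mappings.
         */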
2264         if (domain_use_first_level(domain))
2265                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
2266
2267         if (!sg) {
2268                 sg_res = nr_pages;
2269                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2270         }
2271
2272         while (nr_pages > 0) {
2273                 uint64_t tmp;
2274
2275                 if (!sg_res) {
2276                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2277
2278                         sg_res = aligned_nrpages(sg->offset, sg->length);
2279                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2280                         sg->dma_length = sg->length;
2281                         pteval = (sg_phys(sg) - pgoff) | attr;
2282                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2283                 }
2284
2285                 if (!pte) {
2286                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2287
2288                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2289                         if (!pte)
2290                                 return -ENOMEM;
2291                         /* It is a large page */
2292                         if (largepage_lvl > 1) {
2293                                 unsigned long nr_superpages, end_pfn;
2294
2295                                 pteval |= DMA_PTE_LARGE_PAGE;
2296                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2297
2298                                 nr_superpages = sg_res / lvl_pages;
2299                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2300
2301                                 /*
2302                                  * Ensure that old small page tables are
2303                                  * removed to make room for superpage(s).
2304                                  * We're adding new large pages, so make sure
2305                                  * we don't remove their parent tables.
2306                                  */
2307                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2308                                                        largepage_lvl + 1);
2309                         } else {
2310                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2311                         }
2312
2313                 }
2314                 /* We don't need a lock here; nobody else
2315                  * touches this iova range.
2316                  */
2317                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2318                 if (tmp) {
2319                         static int dumps = 5;
2320                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2321                                 iov_pfn, tmp, (unsigned long long)pteval);
2322                         if (dumps) {
2323                                 dumps--;
2324                                 debug_dma_dump_mappings(NULL);
2325                         }
2326                         WARN_ON(1);
2327                 }
2328
2329                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2330
2331                 BUG_ON(nr_pages < lvl_pages);
2332                 BUG_ON(sg_res < lvl_pages);
2333
2334                 nr_pages -= lvl_pages;
2335                 iov_pfn += lvl_pages;
2336                 phys_pfn += lvl_pages;
2337                 pteval += lvl_pages * VTD_PAGE_SIZE;
2338                 sg_res -= lvl_pages;
2339
2340                 /* If the next PTE would be the first in a new page, then we
2341                    need to flush the cache on the entries we've just written.
2342                    And then we'll need to recalculate 'pte', so clear it and
2343                    let it get set again in the if (!pte) block above.
2344
2345                    If we're done (!nr_pages) we need to flush the cache too.
2346
2347                    Also if we've been setting superpages, we may need to
2348                    recalculate 'pte' and switch back to smaller pages for the
2349                    end of the mapping, if the trailing size is not enough to
2350                    use another superpage (i.e. sg_res < lvl_pages). */
2351                 pte++;
2352                 if (!nr_pages || first_pte_in_page(pte) ||
2353                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2354                         domain_flush_cache(domain, first_pte,
2355                                            (void *)pte - (void *)first_pte);
2356                         pte = NULL;
2357                 }
2358
2359                 if (!sg_res && nr_pages)
2360                         sg = sg_next(sg);
2361         }
2362         return 0;
2363 }
2364
2365 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2366                           struct scatterlist *sg, unsigned long phys_pfn,
2367                           unsigned long nr_pages, int prot)
2368 {
2369         int iommu_id, ret;
2370         struct intel_iommu *iommu;
2371
2372         /* Do the real mapping first */
2373         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2374         if (ret)
2375                 return ret;
2376
2377         for_each_domain_iommu(iommu_id, domain) {
2378                 iommu = g_iommus[iommu_id];
2379                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2380         }
2381
2382         return 0;
2383 }
2384
2385 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2386                                     struct scatterlist *sg, unsigned long nr_pages,
2387                                     int prot)
2388 {
2389         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2390 }
2391
2392 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2393                                      unsigned long phys_pfn, unsigned long nr_pages,
2394                                      int prot)
2395 {
2396         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2397 }
2398
2399 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2400 {
2401         unsigned long flags;
2402         struct context_entry *context;
2403         u16 did_old;
2404
2405         if (!iommu)
2406                 return;
2407
2408         spin_lock_irqsave(&iommu->lock, flags);
2409         context = iommu_context_addr(iommu, bus, devfn, 0);
2410         if (!context) {
2411                 spin_unlock_irqrestore(&iommu->lock, flags);
2412                 return;
2413         }
2414         did_old = context_domain_id(context);
2415         context_clear_entry(context);
2416         __iommu_flush_cache(iommu, context, sizeof(*context));
2417         spin_unlock_irqrestore(&iommu->lock, flags);
2418         iommu->flush.flush_context(iommu,
2419                                    did_old,
2420                                    (((u16)bus) << 8) | devfn,
2421                                    DMA_CCMD_MASK_NOBIT,
2422                                    DMA_CCMD_DEVICE_INVL);
2423         iommu->flush.flush_iotlb(iommu,
2424                                  did_old,
2425                                  0,
2426                                  0,
2427                                  DMA_TLB_DSI_FLUSH);
2428 }
2429
2430 static inline void unlink_domain_info(struct device_domain_info *info)
2431 {
2432         assert_spin_locked(&device_domain_lock);
2433         list_del(&info->link);
2434         list_del(&info->global);
2435         if (info->dev)
2436                 info->dev->archdata.iommu = NULL;
2437 }
2438
2439 static void domain_remove_dev_info(struct dmar_domain *domain)
2440 {
2441         struct device_domain_info *info, *tmp;
2442         unsigned long flags;
2443
2444         spin_lock_irqsave(&device_domain_lock, flags);
2445         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2446                 __dmar_remove_one_dev_info(info);
2447         spin_unlock_irqrestore(&device_domain_lock, flags);
2448 }
2449
2450 struct dmar_domain *find_domain(struct device *dev)
2451 {
2452         struct device_domain_info *info;
2453
2454         if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2455                 return NULL;
2456
2457         if (dev_is_pci(dev))
2458                 dev = &pci_real_dma_dev(to_pci_dev(dev))->dev;
2459
2460         /* No lock here, assumes no domain exit in normal case */
2461         info = dev->archdata.iommu;
2462         if (likely(info))
2463                 return info->domain;
2464
2465         return NULL;
2466 }
2467
2468 static void do_deferred_attach(struct device *dev)
2469 {
2470         struct iommu_domain *domain;
2471
2472         dev->archdata.iommu = NULL;
2473         domain = iommu_get_domain_for_dev(dev);
2474         if (domain)
2475                 intel_iommu_attach_device(domain, dev);
2476 }
2477
2478 static inline struct device_domain_info *
2479 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2480 {
2481         struct device_domain_info *info;
2482
2483         list_for_each_entry(info, &device_domain_list, global)
2484                 if (info->iommu->segment == segment && info->bus == bus &&
2485                     info->devfn == devfn)
2486                         return info;
2487
2488         return NULL;
2489 }
2490
2491 static int domain_setup_first_level(struct intel_iommu *iommu,
2492                                     struct dmar_domain *domain,
2493                                     struct device *dev,
2494                                     int pasid)
2495 {
2496         int flags = PASID_FLAG_SUPERVISOR_MODE;
2497         struct dma_pte *pgd = domain->pgd;
2498         int agaw, level;
2499
2500         /*
2501          * Skip top levels of page tables for an IOMMU whose agaw
2502          * is less than the domain's. Unnecessary for PT mode.
2503          */
2504         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2505                 pgd = phys_to_virt(dma_pte_addr(pgd));
2506                 if (!dma_pte_present(pgd))
2507                         return -ENOMEM;
2508         }
2509
2510         level = agaw_to_level(agaw);
2511         if (level != 4 && level != 5)
2512                 return -EINVAL;
2513
2514         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2515
2516         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2517                                              domain->iommu_did[iommu->seq_id],
2518                                              flags);
2519 }
2520
2521 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2522                                                     int bus, int devfn,
2523                                                     struct device *dev,
2524                                                     struct dmar_domain *domain)
2525 {
2526         struct dmar_domain *found = NULL;
2527         struct device_domain_info *info;
2528         unsigned long flags;
2529         int ret;
2530
2531         info = alloc_devinfo_mem();
2532         if (!info)
2533                 return NULL;
2534
2535         info->bus = bus;
2536         info->devfn = devfn;
2537         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2538         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2539         info->ats_qdep = 0;
2540         info->dev = dev;
2541         info->domain = domain;
2542         info->iommu = iommu;
2543         info->pasid_table = NULL;
2544         info->auxd_enabled = 0;
2545         INIT_LIST_HEAD(&info->auxiliary_domains);
2546
2547         if (dev && dev_is_pci(dev)) {
2548                 struct pci_dev *pdev = to_pci_dev(info->dev);
2549
2550                 if (!pdev->untrusted &&
2551                     !pci_ats_disabled() &&
2552                     ecap_dev_iotlb_support(iommu->ecap) &&
2553                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2554                     dmar_find_matched_atsr_unit(pdev))
2555                         info->ats_supported = 1;
2556
2557                 if (sm_supported(iommu)) {
2558                         if (pasid_supported(iommu)) {
2559                                 int features = pci_pasid_features(pdev);
2560                                 if (features >= 0)
2561                                         info->pasid_supported = features | 1;
2562                         }
2563
2564                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2565                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2566                                 info->pri_supported = 1;
2567                 }
2568         }
2569
2570         spin_lock_irqsave(&device_domain_lock, flags);
2571         if (dev)
2572                 found = find_domain(dev);
2573
2574         if (!found) {
2575                 struct device_domain_info *info2;
2576                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2577                 if (info2) {
2578                         found      = info2->domain;
2579                         info2->dev = dev;
2580                 }
2581         }
2582
2583         if (found) {
2584                 spin_unlock_irqrestore(&device_domain_lock, flags);
2585                 free_devinfo_mem(info);
2586                 /* Caller must free the original domain */
2587                 return found;
2588         }
2589
2590         spin_lock(&iommu->lock);
2591         ret = domain_attach_iommu(domain, iommu);
2592         spin_unlock(&iommu->lock);
2593
2594         if (ret) {
2595                 spin_unlock_irqrestore(&device_domain_lock, flags);
2596                 free_devinfo_mem(info);
2597                 return NULL;
2598         }
2599
2600         list_add(&info->link, &domain->devices);
2601         list_add(&info->global, &device_domain_list);
2602         if (dev)
2603                 dev->archdata.iommu = info;
2604         spin_unlock_irqrestore(&device_domain_lock, flags);
2605
2606         /* PASID table is mandatory for a PCI device in scalable mode. */
2607         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2608                 ret = intel_pasid_alloc_table(dev);
2609                 if (ret) {
2610                         dev_err(dev, "PASID table allocation failed\n");
2611                         dmar_remove_one_dev_info(dev);
2612                         return NULL;
2613                 }
2614
2615                 /* Set up the PASID entry for requests without PASID: */
2616                 spin_lock(&iommu->lock);
2617                 if (hw_pass_through && domain_type_is_si(domain))
2618                         ret = intel_pasid_setup_pass_through(iommu, domain,
2619                                         dev, PASID_RID2PASID);
2620                 else if (domain_use_first_level(domain))
2621                         ret = domain_setup_first_level(iommu, domain, dev,
2622                                         PASID_RID2PASID);
2623                 else
2624                         ret = intel_pasid_setup_second_level(iommu, domain,
2625                                         dev, PASID_RID2PASID);
2626                 spin_unlock(&iommu->lock);
2627                 if (ret) {
2628                         dev_err(dev, "Setup RID2PASID failed\n");
2629                         dmar_remove_one_dev_info(dev);
2630                         return NULL;
2631                 }
2632         }
2633
2634         if (dev && domain_context_mapping(domain, dev)) {
2635                 dev_err(dev, "Domain context map failed\n");
2636                 dmar_remove_one_dev_info(dev);
2637                 return NULL;
2638         }
2639
2640         return domain;
2641 }
2642
2643 static int iommu_domain_identity_map(struct dmar_domain *domain,
2644                                      unsigned long long start,
2645                                      unsigned long long end)
2646 {
2647         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2648         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2649
2650         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2651                           dma_to_mm_pfn(last_vpfn))) {
2652                 pr_err("Reserving iova failed\n");
2653                 return -ENOMEM;
2654         }
2655
2656         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2657         /*
2658          * The RMRR range might overlap with a physical memory range,
2659          * so clear it first.
2660          */
2661         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2662
2663         return __domain_mapping(domain, first_vpfn, NULL,
2664                                 first_vpfn, last_vpfn - first_vpfn + 1,
2665                                 DMA_PTE_READ|DMA_PTE_WRITE);
2666 }
2667
2668 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2669
2670 static int __init si_domain_init(int hw)
2671 {
2672         struct dmar_rmrr_unit *rmrr;
2673         struct device *dev;
2674         int i, nid, ret;
2675
2676         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2677         if (!si_domain)
2678                 return -EFAULT;
2679
2680         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2681                 domain_exit(si_domain);
2682                 return -EFAULT;
2683         }
2684
2685         if (hw)
2686                 return 0;
2687
2688         for_each_online_node(nid) {
2689                 unsigned long start_pfn, end_pfn;
2690                 int i;
2691
2692                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2693                         ret = iommu_domain_identity_map(si_domain,
2694                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2695                         if (ret)
2696                                 return ret;
2697                 }
2698         }
2699
2700         /*
2701          * Identity map the RMRRs so that devices with RMRRs can also use
2702          * the si_domain.
2703          */
2704         for_each_rmrr_units(rmrr) {
2705                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2706                                           i, dev) {
2707                         unsigned long long start = rmrr->base_address;
2708                         unsigned long long end = rmrr->end_address;
2709
2710                         if (WARN_ON(end < start ||
2711                                     end >> agaw_to_width(si_domain->agaw)))
2712                                 continue;
2713
2714                         ret = iommu_domain_identity_map(si_domain, start, end);
2715                         if (ret)
2716                                 return ret;
2717                 }
2718         }
2719
2720         return 0;
2721 }
2722
2723 static int identity_mapping(struct device *dev)
2724 {
2725         struct device_domain_info *info;
2726
2727         info = dev->archdata.iommu;
2728         if (info)
2729                 return (info->domain == si_domain);
2730
2731         return 0;
2732 }
2733
2734 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2735 {
2736         struct dmar_domain *ndomain;
2737         struct intel_iommu *iommu;
2738         u8 bus, devfn;
2739
2740         iommu = device_to_iommu(dev, &bus, &devfn);
2741         if (!iommu)
2742                 return -ENODEV;
2743
2744         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2745         if (ndomain != domain)
2746                 return -EBUSY;
2747
2748         return 0;
2749 }
2750
2751 static bool device_has_rmrr(struct device *dev)
2752 {
2753         struct dmar_rmrr_unit *rmrr;
2754         struct device *tmp;
2755         int i;
2756
2757         rcu_read_lock();
2758         for_each_rmrr_units(rmrr) {
2759                 /*
2760                  * Return TRUE if this RMRR contains the device that
2761                  * is passed in.
2762                  */
2763                 for_each_active_dev_scope(rmrr->devices,
2764                                           rmrr->devices_cnt, i, tmp)
2765                         if (tmp == dev ||
2766                             is_downstream_to_pci_bridge(dev, tmp)) {
2767                                 rcu_read_unlock();
2768                                 return true;
2769                         }
2770         }
2771         rcu_read_unlock();
2772         return false;
2773 }
2774
2775 /**
2776  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2777  * is relaxable (i.e. is allowed not to be enforced under some conditions)
2778  * @dev: device handle
2779  *
2780  * We assume that PCI USB devices with RMRRs have them largely
2781  * for historical reasons and that the RMRR space is not actively used post
2782  * boot.  This exclusion may change if vendors begin to abuse it.
2783  *
2784  * The same exception is made for graphics devices, with the requirement that
2785  * any use of the RMRR regions will be torn down before assigning the device
2786  * to a guest.
2787  *
2788  * Return: true if the RMRR is relaxable, false otherwise
2789  */
2790 static bool device_rmrr_is_relaxable(struct device *dev)
2791 {
2792         struct pci_dev *pdev;
2793
2794         if (!dev_is_pci(dev))
2795                 return false;
2796
2797         pdev = to_pci_dev(dev);
2798         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2799                 return true;
2800         else
2801                 return false;
2802 }
2803
2804 /*
2805  * There are a couple cases where we need to restrict the functionality of
2806  * devices associated with RMRRs.  The first is when evaluating a device for
2807  * identity mapping because problems exist when devices are moved in and out
2808  * of domains and their respective RMRR information is lost.  This means that
2809  * a device with associated RMRRs will never be in a "passthrough" domain.
2810  * The second is use of the device through the IOMMU API.  This interface
2811  * expects to have full control of the IOVA space for the device.  We cannot
2812  * satisfy both the requirement that RMRR access is maintained and have an
2813  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2814  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2815  * We therefore prevent devices associated with an RMRR from participating in
2816  * the IOMMU API, which eliminates them from device assignment.
2817  *
2818  * In both cases, devices which have relaxable RMRRs are not concerned by this
2819  * restriction. See device_rmrr_is_relaxable comment.
2820  */
2821 static bool device_is_rmrr_locked(struct device *dev)
2822 {
2823         if (!device_has_rmrr(dev))
2824                 return false;
2825
2826         if (device_rmrr_is_relaxable(dev))
2827                 return false;
2828
2829         return true;
2830 }
2831
2832 /*
2833  * Return the required default domain type for a specific device.
2834  *
2835  * @dev: the device being queried
2837  *
2838  * Returns:
2839  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2840  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2841  *  - 0: both identity and dynamic domains work for this device
2842  */
2843 static int device_def_domain_type(struct device *dev)
2844 {
2845         if (dev_is_pci(dev)) {
2846                 struct pci_dev *pdev = to_pci_dev(dev);
2847
2848                 /*
2849                  * Prevent any device marked as untrusted from getting
2850                  * placed into the static identity mapping domain.
2851                  */
2852                 if (pdev->untrusted)
2853                         return IOMMU_DOMAIN_DMA;
2854
2855                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2856                         return IOMMU_DOMAIN_IDENTITY;
2857
2858                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2859                         return IOMMU_DOMAIN_IDENTITY;
2860
2861                 /*
2862                  * We want to start off with all devices in the 1:1 domain, and
2863                  * take them out later if we find they can't access all of memory.
2864                  *
2865                  * However, we can't do this for PCI devices behind bridges,
2866                  * because all PCI devices behind the same bridge will end up
2867                  * with the same source-id on their transactions.
2868                  *
2869                  * Practically speaking, we can't change things around for these
2870                  * devices at run-time, because we can't be sure there'll be no
2871                  * DMA transactions in flight for any of their siblings.
2872                  *
2873                  * So PCI devices (unless they're on the root bus) as well as
2874                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2875                  * the 1:1 domain, just in _case_ one of their siblings turns out
2876                  * not to be able to map all of memory.
2877                  */
2878                 if (!pci_is_pcie(pdev)) {
2879                         if (!pci_is_root_bus(pdev->bus))
2880                                 return IOMMU_DOMAIN_DMA;
2881                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2882                                 return IOMMU_DOMAIN_DMA;
2883                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2884                         return IOMMU_DOMAIN_DMA;
2885         }
2886
2887         return 0;
2888 }
2889
2890 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2891 {
2892         /*
2893          * Start from a sane iommu hardware state.
2894          * If queued invalidation was already initialized by us
2895          * (for example, while enabling interrupt remapping) then
2896          * things are already rolling from a sane state.
2897          */
2898         if (!iommu->qi) {
2899                 /*
2900                  * Clear any previous faults.
2901                  */
2902                 dmar_fault(-1, iommu);
2903                 /*
2904                  * Disable queued invalidation if supported and already enabled
2905                  * before OS handover.
2906                  */
2907                 dmar_disable_qi(iommu);
2908         }
2909
2910         if (dmar_enable_qi(iommu)) {
2911                 /*
2912                  * Queued invalidation is not enabled; use register-based invalidation.
2913                  */
2914                 iommu->flush.flush_context = __iommu_flush_context;
2915                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2916                 pr_info("%s: Using Register based invalidation\n",
2917                         iommu->name);
2918         } else {
2919                 iommu->flush.flush_context = qi_flush_context;
2920                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2921                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2922         }
2923 }
2924
2925 static int copy_context_table(struct intel_iommu *iommu,
2926                               struct root_entry *old_re,
2927                               struct context_entry **tbl,
2928                               int bus, bool ext)
2929 {
2930         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2931         struct context_entry *new_ce = NULL, ce;
2932         struct context_entry *old_ce = NULL;
2933         struct root_entry re;
2934         phys_addr_t old_ce_phys;
2935
2936         tbl_idx = ext ? bus * 2 : bus;
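        /*
         * Extended context entries are twice the size of legacy ones, so a
         * bus's 256 devfns span two context tables (lower and upper half),
         * hence the "* 2".
         */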
2937         memcpy(&re, old_re, sizeof(re));
2938
2939         for (devfn = 0; devfn < 256; devfn++) {
2940                 /* First calculate the correct index */
2941                 idx = (ext ? devfn * 2 : devfn) % 256;
2942
2943                 if (idx == 0) {
2944                         /* First save what we may have and clean up */
2945                         if (new_ce) {
2946                                 tbl[tbl_idx] = new_ce;
2947                                 __iommu_flush_cache(iommu, new_ce,
2948                                                     VTD_PAGE_SIZE);
2949                                 pos = 1;
2950                         }
2951
2952                         if (old_ce)
2953                                 memunmap(old_ce);
2954
2955                         ret = 0;
2956                         if (devfn < 0x80)
2957                                 old_ce_phys = root_entry_lctp(&re);
2958                         else
2959                                 old_ce_phys = root_entry_uctp(&re);
2960
2961                         if (!old_ce_phys) {
2962                                 if (ext && devfn == 0) {
2963                                         /* No LCTP, try UCTP */
2964                                         devfn = 0x7f;
2965                                         continue;
2966                                 } else {
2967                                         goto out;
2968                                 }
2969                         }
2970
2971                         ret = -ENOMEM;
2972                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2973                                         MEMREMAP_WB);
2974                         if (!old_ce)
2975                                 goto out;
2976
2977                         new_ce = alloc_pgtable_page(iommu->node);
2978                         if (!new_ce)
2979                                 goto out_unmap;
2980
2981                         ret = 0;
2982                 }
2983
2984                 /* Now copy the context entry */
2985                 memcpy(&ce, old_ce + idx, sizeof(ce));
2986
2987                 if (!__context_present(&ce))
2988                         continue;
2989
2990                 did = context_domain_id(&ce);
2991                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2992                         set_bit(did, iommu->domain_ids);
2993
2994                 /*
2995                  * We need a marker for copied context entries. This
2996                  * marker needs to work for the old format as well as
2997                  * for extended context entries.
2998                  *
2999                  * Bit 67 of the context entry is used. In the old
3000                  * format this bit is available to software, in the
3001                  * extended format it is the PGE bit, but PGE is ignored
3002                  * by HW if PASIDs are disabled (and thus still
3003                  * available).
3004                  *
3005                  * So disable PASIDs first and then mark the entry
3006                  * copied. This means that we don't copy PASID
3007                  * translations from the old kernel, but this is fine as
3008                  * faults there are not fatal.
3009                  */
3010                 context_clear_pasid_enable(&ce);
3011                 context_set_copied(&ce);
3012
3013                 new_ce[idx] = ce;
3014         }
3015
3016         tbl[tbl_idx + pos] = new_ce;
3017
3018         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3019
3020 out_unmap:
3021         memunmap(old_ce);
3022
3023 out:
3024         return ret;
3025 }
3026
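/*
 * Copy the translation structures left over by the previous kernel
 * (typically the kdump case): read the old root table address from
 * DMAR_RTADDR_REG, copy each bus's context tables, then install the copies
 * in the new root entry table under iommu->lock and flush. Bail out if the
 * old and new root table formats (RTT/extended) differ, since changing RTT
 * would require disabling translation.
 */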
3027 static int copy_translation_tables(struct intel_iommu *iommu)
3028 {
3029         struct context_entry **ctxt_tbls;
3030         struct root_entry *old_rt;
3031         phys_addr_t old_rt_phys;
3032         int ctxt_table_entries;
3033         unsigned long flags;
3034         u64 rtaddr_reg;
3035         int bus, ret;
3036         bool new_ext, ext;
3037
3038         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3039         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3040         new_ext    = !!ecap_ecs(iommu->ecap);
3041
3042         /*
3043          * The RTT bit can only be changed when translation is disabled,
3044          * but disabling translation would open a window for data
3045          * corruption. So bail out and don't copy anything if we would
3046          * have to change the bit.
3047          */
3048         if (new_ext != ext)
3049                 return -EINVAL;
3050
3051         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3052         if (!old_rt_phys)
3053                 return -EINVAL;
3054
3055         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3056         if (!old_rt)
3057                 return -ENOMEM;
3058
3059         /* This is too big for the stack - allocate it from slab */
3060         ctxt_table_entries = ext ? 512 : 256;
3061         ret = -ENOMEM;
3062         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3063         if (!ctxt_tbls)
3064                 goto out_unmap;
3065
3066         for (bus = 0; bus < 256; bus++) {
3067                 ret = copy_context_table(iommu, &old_rt[bus],
3068                                          ctxt_tbls, bus, ext);
3069                 if (ret) {
3070                         pr_err("%s: Failed to copy context table for bus %d\n",
3071                                 iommu->name, bus);
3072                         continue;
3073                 }
3074         }
3075
3076         spin_lock_irqsave(&iommu->lock, flags);
3077
3078         /* Context tables are copied, now write them to the root_entry table */
3079         for (bus = 0; bus < 256; bus++) {
3080                 int idx = ext ? bus * 2 : bus;
3081                 u64 val;
3082
3083                 if (ctxt_tbls[idx]) {
3084                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3085                         iommu->root_entry[bus].lo = val;
3086                 }
3087
3088                 if (!ext || !ctxt_tbls[idx + 1])
3089                         continue;
3090
3091                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3092                 iommu->root_entry[bus].hi = val;
3093         }
3094
3095         spin_unlock_irqrestore(&iommu->lock, flags);
3096
3097         kfree(ctxt_tbls);
3098
3099         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3100
3101         ret = 0;
3102
3103 out_unmap:
3104         memunmap(old_rt);
3105
3106         return ret;
3107 }
3108
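/*
 * One-time boot initialization of all DMAR units: allocate the global
 * iommu array, set up domains, root entries and the invalidation interface
 * for each IOMMU, copy translation tables from the previous kernel when
 * translation is pre-enabled (kdump), initialize the static identity
 * domain and finally set up the DMAR fault interrupt.
 */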
3109 static int __init init_dmars(void)
3110 {
3111         struct dmar_drhd_unit *drhd;
3112         struct intel_iommu *iommu;
3113         int ret;
3114
3115         /*
3116          * for each drhd
3117          *    allocate root
3118          *    initialize and program root entry to not present
3119          * endfor
3120          */
3121         for_each_drhd_unit(drhd) {
3122                 /*
3123                  * No lock needed: this is only incremented in the
3124                  * single-threaded kernel __init code path; all other
3125                  * accesses are read-only.
3126                  */
3127                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3128                         g_num_of_iommus++;
3129                         continue;
3130                 }
3131                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3132         }
3133
3134         /* Preallocate enough resources for IOMMU hot-addition */
3135         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3136                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3137
3138         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3139                         GFP_KERNEL);
3140         if (!g_iommus) {
3141                 pr_err("Allocating global iommu array failed\n");
3142                 ret = -ENOMEM;
3143                 goto error;
3144         }
3145
3146         for_each_iommu(iommu, drhd) {
3147                 if (drhd->ignored) {
3148                         iommu_disable_translation(iommu);
3149                         continue;
3150                 }
3151
3152                 /*
3153                  * Find the smallest PASID capability of all IOMMUs in
3154                  * the system; the system-wide PASID table must be no
3155                  * bigger than the smallest supported size.
3156                  */
3157                 if (pasid_supported(iommu)) {
3158                         u32 temp = 2 << ecap_pss(iommu->ecap);
3159
3160                         intel_pasid_max_id = min_t(u32, temp,
3161                                                    intel_pasid_max_id);
3162                 }
3163
3164                 g_iommus[iommu->seq_id] = iommu;
3165
3166                 intel_iommu_init_qi(iommu);
3167
3168                 ret = iommu_init_domains(iommu);
3169                 if (ret)
3170                         goto free_iommu;
3171
3172                 init_translation_status(iommu);
3173
3174                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3175                         iommu_disable_translation(iommu);
3176                         clear_translation_pre_enabled(iommu);
3177                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3178                                 iommu->name);
3179                 }
3180
3181                 /*
3182                  * TBD:
3183                  * we could share the same root & context tables
3184                  * among all IOMMUs; needs to be split out later.
3185                  */
3186                 ret = iommu_alloc_root_entry(iommu);
3187                 if (ret)
3188                         goto free_iommu;
3189
3190                 if (translation_pre_enabled(iommu)) {
3191                         pr_info("Translation already enabled - trying to copy translation structures\n");
3192
3193                         ret = copy_translation_tables(iommu);
3194                         if (ret) {
3195                                 /*
3196                                  * We found the IOMMU with translation
3197                                  * enabled - but failed to copy over the
3198                                  * old root-entry table. Try to proceed
3199                                  * by disabling translation now and
3200                                  * allocating a clean root-entry table.
3201                                  * This might cause DMAR faults, but
3202                                  * probably the dump will still succeed.
3203                                  */
3204                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3205                                        iommu->name);
3206                                 iommu_disable_translation(iommu);
3207                                 clear_translation_pre_enabled(iommu);
3208                         } else {
3209                                 pr_info("Copied translation tables from previous kernel for %s\n",
3210                                         iommu->name);
3211                         }
3212                 }
3213
3214                 if (!ecap_pass_through(iommu->ecap))
3215                         hw_pass_through = 0;
3216                 intel_svm_check(iommu);
3217         }
3218
3219         /*
3220          * Now that qi is enabled on all iommus, set the root entry and flush
3221          * caches. This is required on some Intel X58 chipsets, otherwise the
3222          * flush_context function will loop forever and the boot hangs.
3223          */
3224         for_each_active_iommu(iommu, drhd) {
3225                 iommu_flush_write_buffer(iommu);
3226                 iommu_set_root_entry(iommu);
3227                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3228                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3229         }
3230
3231 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3232         dmar_map_gfx = 0;
3233 #endif
3234
3235         if (!dmar_map_gfx)
3236                 iommu_identity_mapping |= IDENTMAP_GFX;
3237
3238         check_tylersburg_isoch();
3239
3240         ret = si_domain_init(hw_pass_through);
3241         if (ret)
3242                 goto free_iommu;
3243
3244         /*
3245          * for each drhd
3246          *   enable fault log
3247          *   global invalidate context cache
3248          *   global invalidate iotlb
3249          *   enable translation
3250          */
3251         for_each_iommu(iommu, drhd) {
3252                 if (drhd->ignored) {
3253                         /*
3254                          * we always have to disable PMRs or DMA may fail on
3255                          * this device
3256                          */
3257                         if (force_on)
3258                                 iommu_disable_protect_mem_regions(iommu);
3259                         continue;
3260                 }
3261
3262                 iommu_flush_write_buffer(iommu);
3263
3264 #ifdef CONFIG_INTEL_IOMMU_SVM
3265                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3266                         /*
3267                          * Calling dmar_alloc_hwirq() with dmar_global_lock
3268                          * held could cause a lock race, so drop the lock here.
3269                          */
3270                         up_write(&dmar_global_lock);
3271                         ret = intel_svm_enable_prq(iommu);
3272                         down_write(&dmar_global_lock);
3273                         if (ret)
3274                                 goto free_iommu;
3275                 }
3276 #endif
3277                 ret = dmar_set_interrupt(iommu);
3278                 if (ret)
3279                         goto free_iommu;
3280         }
3281
3282         return 0;
3283
3284 free_iommu:
3285         for_each_active_iommu(iommu, drhd) {
3286                 disable_dmar_iommu(iommu);
3287                 free_dmar_iommu(iommu);
3288         }
3289
3290         kfree(g_iommus);
3291
3292 error:
3293         return ret;
3294 }
3295
3296 /* This takes a number of _MM_ pages, not VTD pages */
3297 static unsigned long intel_alloc_iova(struct device *dev,
3298                                      struct dmar_domain *domain,
3299                                      unsigned long nrpages, uint64_t dma_mask)
3300 {
3301         unsigned long iova_pfn;
3302
3303         /*
3304          * Restrict dma_mask to the width that the iommu can handle.
3305          * First-level translation restricts the input-address to a
3306          * canonical address (i.e., address bits 63:N have the same
3307          * value as address bit [N-1], where N is 48-bits with 4-level
3308          * paging and 57-bits with 5-level paging). Hence, skip bit
3309          * [N-1].
3310          */
3311         if (domain_use_first_level(domain))
3312                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3313                                  dma_mask);
3314         else
3315                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3316                                  dma_mask);
3317
3318         /* Ensure we reserve the whole size-aligned region */
3319         nrpages = __roundup_pow_of_two(nrpages);
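        /*
         * For example, a 3-page request is rounded up to 4 pages so that
         * alloc_iova_fast() returns a size-aligned IOVA range.
         */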
3320
3321         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3322                 /*
3323                  * First try to allocate an IO virtual address below
3324                  * DMA_BIT_MASK(32); if that fails, then try allocating
3325                  * from the higher range.
3326                  */
3327                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3328                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3329                 if (iova_pfn)
3330                         return iova_pfn;
3331         }
3332         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3333                                    IOVA_PFN(dma_mask), true);
3334         if (unlikely(!iova_pfn)) {
3335                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3336                              nrpages);
3337                 return 0;
3338         }
3339
3340         return iova_pfn;
3341 }
3342
3343 /* Check whether the device needs to go through the non-identity map/unmap process. */
3344 static bool iommu_need_mapping(struct device *dev)
3345 {
3346         if (iommu_dummy(dev))
3347                 return false;
3348
3349         if (unlikely(attach_deferred(dev)))
3350                 do_deferred_attach(dev);
3351
3352         return !identity_mapping(dev);
3353 }
3354
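/*
 * Map a physically contiguous buffer for streaming DMA: allocate an IOVA
 * window covering the (page-aligned) buffer, derive read/write permissions
 * from the DMA direction, install the page table entries and return the
 * IOVA plus the original sub-page offset.
 */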
3355 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3356                                      size_t size, int dir, u64 dma_mask)
3357 {
3358         struct dmar_domain *domain;
3359         phys_addr_t start_paddr;
3360         unsigned long iova_pfn;
3361         int prot = 0;
3362         int ret;
3363         struct intel_iommu *iommu;
3364         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3365
3366         BUG_ON(dir == DMA_NONE);
3367
3368         domain = find_domain(dev);
3369         if (!domain)
3370                 return DMA_MAPPING_ERROR;
3371
3372         iommu = domain_get_iommu(domain);
3373         size = aligned_nrpages(paddr, size);
3374
3375         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3376         if (!iova_pfn)
3377                 goto error;
3378
3379         /*
3380          * Check if DMAR supports zero-length reads on write-only
3381          * mappings.
3382          */
3383         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3384                         !cap_zlr(iommu->cap))
3385                 prot |= DMA_PTE_READ;
3386         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3387                 prot |= DMA_PTE_WRITE;
3388         /*
3389          * paddr .. paddr + size might cover only part of a page; we must map
3390          * the whole page.  Note: if two parts of one page are mapped
3391          * separately, we might end up with two IOVAs mapping to the same
3392          * host paddr, but this is not a big problem.
3393          */
3394         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3395                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3396         if (ret)
3397                 goto error;
3398
3399         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3400         start_paddr += paddr & ~PAGE_MASK;
3401
3402         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3403
3404         return start_paddr;
3405
3406 error:
3407         if (iova_pfn)
3408                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3409         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3410                 size, (unsigned long long)paddr, dir);
3411         return DMA_MAPPING_ERROR;
3412 }
3413
3414 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3415                                  unsigned long offset, size_t size,
3416                                  enum dma_data_direction dir,
3417                                  unsigned long attrs)
3418 {
3419         if (iommu_need_mapping(dev))
3420                 return __intel_map_single(dev, page_to_phys(page) + offset,
3421                                 size, dir, *dev->dma_mask);
3422         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3423 }
3424
3425 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3426                                      size_t size, enum dma_data_direction dir,
3427                                      unsigned long attrs)
3428 {
3429         if (iommu_need_mapping(dev))
3430                 return __intel_map_single(dev, phys_addr, size, dir,
3431                                 *dev->dma_mask);
3432         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3433 }
3434
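/*
 * Tear down a streaming DMA mapping. In strict mode, for untrusted
 * devices, or when the domain has no IOVA flush queue, the IOTLB is
 * flushed synchronously and the IOVA and page table pages are freed
 * immediately; otherwise the release is queued for a deferred, batched
 * flush.
 */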
3435 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3436 {
3437         struct dmar_domain *domain;
3438         unsigned long start_pfn, last_pfn;
3439         unsigned long nrpages;
3440         unsigned long iova_pfn;
3441         struct intel_iommu *iommu;
3442         struct page *freelist;
3443         struct pci_dev *pdev = NULL;
3444
3445         domain = find_domain(dev);
3446         BUG_ON(!domain);
3447
3448         iommu = domain_get_iommu(domain);
3449
3450         iova_pfn = IOVA_PFN(dev_addr);
3451
3452         nrpages = aligned_nrpages(dev_addr, size);
3453         start_pfn = mm_to_dma_pfn(iova_pfn);
3454         last_pfn = start_pfn + nrpages - 1;
3455
3456         if (dev_is_pci(dev))
3457                 pdev = to_pci_dev(dev);
3458
3459         freelist = domain_unmap(domain, start_pfn, last_pfn);
3460         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3461                         !has_iova_flush_queue(&domain->iovad)) {
3462                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3463                                       nrpages, !freelist, 0);
3464                 /* free iova */
3465                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3466                 dma_free_pagelist(freelist);
3467         } else {
3468                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3469                            (unsigned long)freelist);
3470                 /*
3471                  * Queue up the release of the unmap to save the roughly 1/6th
3472                  * of the CPU time otherwise spent on the IOTLB flush operation.
3473                  */
3474         }
3475
3476         trace_unmap_single(dev, dev_addr, size);
3477 }
3478
3479 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3480                              size_t size, enum dma_data_direction dir,
3481                              unsigned long attrs)
3482 {
3483         if (iommu_need_mapping(dev))
3484                 intel_unmap(dev, dev_addr, size);
3485         else
3486                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3487 }
3488
3489 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3490                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3491 {
3492         if (iommu_need_mapping(dev))
3493                 intel_unmap(dev, dev_addr, size);
3494 }
3495
3496 static void *intel_alloc_coherent(struct device *dev, size_t size,
3497                                   dma_addr_t *dma_handle, gfp_t flags,
3498                                   unsigned long attrs)
3499 {
3500         struct page *page = NULL;
3501         int order;
3502
3503         if (!iommu_need_mapping(dev))
3504                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3505
3506         size = PAGE_ALIGN(size);
3507         order = get_order(size);
3508
3509         if (gfpflags_allow_blocking(flags)) {
3510                 unsigned int count = size >> PAGE_SHIFT;
3511
3512                 page = dma_alloc_from_contiguous(dev, count, order,
3513                                                  flags & __GFP_NOWARN);
3514         }
3515
3516         if (!page)
3517                 page = alloc_pages(flags, order);
3518         if (!page)
3519                 return NULL;
3520         memset(page_address(page), 0, size);
3521
3522         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3523                                          DMA_BIDIRECTIONAL,
3524                                          dev->coherent_dma_mask);
3525         if (*dma_handle != DMA_MAPPING_ERROR)
3526                 return page_address(page);
3527         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3528                 __free_pages(page, order);
3529
3530         return NULL;
3531 }
3532
3533 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3534                                 dma_addr_t dma_handle, unsigned long attrs)
3535 {
3536         int order;
3537         struct page *page = virt_to_page(vaddr);
3538
3539         if (!iommu_need_mapping(dev))
3540                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3541
3542         size = PAGE_ALIGN(size);
3543         order = get_order(size);
3544
3545         intel_unmap(dev, dma_handle, size);
3546         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3547                 __free_pages(page, order);
3548 }
3549
3550 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3551                            int nelems, enum dma_data_direction dir,
3552                            unsigned long attrs)
3553 {
3554         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3555         unsigned long nrpages = 0;
3556         struct scatterlist *sg;
3557         int i;
3558
3559         if (!iommu_need_mapping(dev))
3560                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3561
3562         for_each_sg(sglist, sg, nelems, i) {
3563                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3564         }
3565
3566         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3567
3568         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3569 }
3570
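/*
 * Map a scatterlist: sum up the page count of all entries, allocate a
 * single IOVA window for the whole list and map the entries back to back.
 * On failure the partially built page tables and the IOVA are freed and 0
 * is returned.
 */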
3571 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3572                         enum dma_data_direction dir, unsigned long attrs)
3573 {
3574         int i;
3575         struct dmar_domain *domain;
3576         size_t size = 0;
3577         int prot = 0;
3578         unsigned long iova_pfn;
3579         int ret;
3580         struct scatterlist *sg;
3581         unsigned long start_vpfn;
3582         struct intel_iommu *iommu;
3583
3584         BUG_ON(dir == DMA_NONE);
3585         if (!iommu_need_mapping(dev))
3586                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3587
3588         domain = find_domain(dev);
3589         if (!domain)
3590                 return 0;
3591
3592         iommu = domain_get_iommu(domain);
3593
3594         for_each_sg(sglist, sg, nelems, i)
3595                 size += aligned_nrpages(sg->offset, sg->length);
3596
3597         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3598                                 *dev->dma_mask);
3599         if (!iova_pfn) {
3600                 sglist->dma_length = 0;
3601                 return 0;
3602         }
3603
3604         /*
3605          * Check if DMAR supports zero-length reads on write-only
3606          * mappings.
3607          */
3608         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3609                         !cap_zlr(iommu->cap))
3610                 prot |= DMA_PTE_READ;
3611         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3612                 prot |= DMA_PTE_WRITE;
3613
3614         start_vpfn = mm_to_dma_pfn(iova_pfn);
3615
3616         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3617         if (unlikely(ret)) {
3618                 dma_pte_free_pagetable(domain, start_vpfn,
3619                                        start_vpfn + size - 1,
3620                                        agaw_to_level(domain->agaw) + 1);
3621                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3622                 return 0;
3623         }
3624
3625         for_each_sg(sglist, sg, nelems, i)
3626                 trace_map_sg(dev, i + 1, nelems, sg);
3627
3628         return nelems;
3629 }
3630
3631 static u64 intel_get_required_mask(struct device *dev)
3632 {
3633         if (!iommu_need_mapping(dev))
3634                 return dma_direct_get_required_mask(dev);
3635         return DMA_BIT_MASK(32);
3636 }
3637
3638 static const struct dma_map_ops intel_dma_ops = {
3639         .alloc = intel_alloc_coherent,
3640         .free = intel_free_coherent,
3641         .map_sg = intel_map_sg,
3642         .unmap_sg = intel_unmap_sg,
3643         .map_page = intel_map_page,
3644         .unmap_page = intel_unmap_page,
3645         .map_resource = intel_map_resource,
3646         .unmap_resource = intel_unmap_resource,
3647         .dma_supported = dma_direct_supported,
3648         .mmap = dma_common_mmap,
3649         .get_sgtable = dma_common_get_sgtable,
3650         .get_required_mask = intel_get_required_mask,
3651 };
3652
3653 static void
3654 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3655                    enum dma_data_direction dir, enum dma_sync_target target)
3656 {
3657         struct dmar_domain *domain;
3658         phys_addr_t tlb_addr;
3659
3660         domain = find_domain(dev);
3661         if (WARN_ON(!domain))
3662                 return;
3663
3664         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3665         if (is_swiotlb_buffer(tlb_addr))
3666                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3667 }
3668
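/*
 * Bounce-buffered variant of the map path. A buffer that does not start
 * and end on VTD_PAGE_SIZE boundaries is first copied into a swiotlb slot
 * (with the unused padding zeroed) and the bounce page is mapped instead,
 * so the device never gets IOMMU-mapped access to unrelated data sharing
 * the first or last IOMMU page of the buffer. Page-aligned buffers are
 * mapped in place.
 */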
3669 static dma_addr_t
3670 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3671                   enum dma_data_direction dir, unsigned long attrs,
3672                   u64 dma_mask)
3673 {
3674         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3675         struct dmar_domain *domain;
3676         struct intel_iommu *iommu;
3677         unsigned long iova_pfn;
3678         unsigned long nrpages;
3679         phys_addr_t tlb_addr;
3680         int prot = 0;
3681         int ret;
3682
3683         if (unlikely(attach_deferred(dev)))
3684                 do_deferred_attach(dev);
3685
3686         domain = find_domain(dev);
3687
3688         if (WARN_ON(dir == DMA_NONE || !domain))
3689                 return DMA_MAPPING_ERROR;
3690
3691         iommu = domain_get_iommu(domain);
3692         if (WARN_ON(!iommu))
3693                 return DMA_MAPPING_ERROR;
3694
3695         nrpages = aligned_nrpages(0, size);
3696         iova_pfn = intel_alloc_iova(dev, domain,
3697                                     dma_to_mm_pfn(nrpages), dma_mask);
3698         if (!iova_pfn)
3699                 return DMA_MAPPING_ERROR;
3700
3701         /*
3702          * Check if DMAR supports zero-length reads on write-only
3703          * mappings.
3704          */
3705         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3706                         !cap_zlr(iommu->cap))
3707                 prot |= DMA_PTE_READ;
3708         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3709                 prot |= DMA_PTE_WRITE;
3710
3711         /*
3712          * If both the physical buffer start address and size are
3713          * page aligned, we don't need to use a bounce page.
3714          */
3715         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3716                 tlb_addr = swiotlb_tbl_map_single(dev,
3717                                 __phys_to_dma(dev, io_tlb_start),
3718                                 paddr, size, aligned_size, dir, attrs);
3719                 if (tlb_addr == DMA_MAPPING_ERROR) {
3720                         goto swiotlb_error;
3721                 } else {
3722                         /* Cleanup the padding area. */
3723                         void *padding_start = phys_to_virt(tlb_addr);
3724                         size_t padding_size = aligned_size;
3725
3726                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3727                             (dir == DMA_TO_DEVICE ||
3728                              dir == DMA_BIDIRECTIONAL)) {
3729                                 padding_start += size;
3730                                 padding_size -= size;
3731                         }
3732
3733                         memset(padding_start, 0, padding_size);
3734                 }
3735         } else {
3736                 tlb_addr = paddr;
3737         }
3738
3739         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3740                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3741         if (ret)
3742                 goto mapping_error;
3743
3744         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3745
3746         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3747
3748 mapping_error:
3749         if (is_swiotlb_buffer(tlb_addr))
3750                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3751                                          aligned_size, dir, attrs);
3752 swiotlb_error:
3753         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3754         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3755                 size, (unsigned long long)paddr, dir);
3756
3757         return DMA_MAPPING_ERROR;
3758 }
3759
3760 static void
3761 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3762                     enum dma_data_direction dir, unsigned long attrs)
3763 {
3764         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3765         struct dmar_domain *domain;
3766         phys_addr_t tlb_addr;
3767
3768         domain = find_domain(dev);
3769         if (WARN_ON(!domain))
3770                 return;
3771
3772         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3773         if (WARN_ON(!tlb_addr))
3774                 return;
3775
3776         intel_unmap(dev, dev_addr, size);
3777         if (is_swiotlb_buffer(tlb_addr))
3778                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3779                                          aligned_size, dir, attrs);
3780
3781         trace_bounce_unmap_single(dev, dev_addr, size);
3782 }
3783
3784 static dma_addr_t
3785 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3786                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3787 {
3788         return bounce_map_single(dev, page_to_phys(page) + offset,
3789                                  size, dir, attrs, *dev->dma_mask);
3790 }
3791
3792 static dma_addr_t
3793 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3794                     enum dma_data_direction dir, unsigned long attrs)
3795 {
3796         return bounce_map_single(dev, phys_addr, size,
3797                                  dir, attrs, *dev->dma_mask);
3798 }
3799
3800 static void
3801 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3802                   enum dma_data_direction dir, unsigned long attrs)
3803 {
3804         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3805 }
3806
3807 static void
3808 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3809                       enum dma_data_direction dir, unsigned long attrs)
3810 {
3811         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3812 }
3813
3814 static void
3815 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3816                 enum dma_data_direction dir, unsigned long attrs)
3817 {
3818         struct scatterlist *sg;
3819         int i;
3820
3821         for_each_sg(sglist, sg, nelems, i)
3822                 bounce_unmap_page(dev, sg->dma_address,
3823                                   sg_dma_len(sg), dir, attrs);
3824 }
3825
3826 static int
3827 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3828               enum dma_data_direction dir, unsigned long attrs)
3829 {
3830         int i;
3831         struct scatterlist *sg;
3832
3833         for_each_sg(sglist, sg, nelems, i) {
3834                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3835                                                   sg->offset, sg->length,
3836                                                   dir, attrs);
3837                 if (sg->dma_address == DMA_MAPPING_ERROR)
3838                         goto out_unmap;
3839                 sg_dma_len(sg) = sg->length;
3840         }
3841
3842         for_each_sg(sglist, sg, nelems, i)
3843                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3844
3845         return nelems;
3846
3847 out_unmap:
3848         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3849         return 0;
3850 }
3851
3852 static void
3853 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3854                            size_t size, enum dma_data_direction dir)
3855 {
3856         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3857 }
3858
3859 static void
3860 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3861                               size_t size, enum dma_data_direction dir)
3862 {
3863         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3864 }
3865
3866 static void
3867 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3868                        int nelems, enum dma_data_direction dir)
3869 {
3870         struct scatterlist *sg;
3871         int i;
3872
3873         for_each_sg(sglist, sg, nelems, i)
3874                 bounce_sync_single(dev, sg_dma_address(sg),
3875                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
3876 }
3877
3878 static void
3879 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3880                           int nelems, enum dma_data_direction dir)
3881 {
3882         struct scatterlist *sg;
3883         int i;
3884
3885         for_each_sg(sglist, sg, nelems, i)
3886                 bounce_sync_single(dev, sg_dma_address(sg),
3887                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3888 }
3889
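/*
 * DMA ops that route sub-page-aligned buffers through swiotlb bounce pages
 * (see bounce_map_single() above), so a device is never granted
 * IOMMU-mapped access beyond its own buffer; intended for devices such as
 * untrusted PCI devices.
 */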
3890 static const struct dma_map_ops bounce_dma_ops = {
3891         .alloc                  = intel_alloc_coherent,
3892         .free                   = intel_free_coherent,
3893         .map_sg                 = bounce_map_sg,
3894         .unmap_sg               = bounce_unmap_sg,
3895         .map_page               = bounce_map_page,
3896         .unmap_page             = bounce_unmap_page,
3897         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
3898         .sync_single_for_device = bounce_sync_single_for_device,
3899         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
3900         .sync_sg_for_device     = bounce_sync_sg_for_device,
3901         .map_resource           = bounce_map_resource,
3902         .unmap_resource         = bounce_unmap_resource,
3903         .dma_supported          = dma_direct_supported,
3904 };
3905
3906 static inline int iommu_domain_cache_init(void)
3907 {
3908         int ret = 0;
3909
3910         iommu_domain_cache = kmem_cache_create("iommu_domain",
3911                                          sizeof(struct dmar_domain),
3912                                          0,
3913                                          SLAB_HWCACHE_ALIGN,
3915                                          NULL);
3916         if (!iommu_domain_cache) {
3917                 pr_err("Couldn't create iommu_domain cache\n");
3918                 ret = -ENOMEM;
3919         }
3920
3921         return ret;
3922 }
3923
3924 static inline int iommu_devinfo_cache_init(void)
3925 {
3926         int ret = 0;
3927
3928         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3929                                          sizeof(struct device_domain_info),
3930                                          0,
3931                                          SLAB_HWCACHE_ALIGN,
3932                                          NULL);
3933         if (!iommu_devinfo_cache) {
3934                 pr_err("Couldn't create devinfo cache\n");
3935                 ret = -ENOMEM;
3936         }
3937
3938         return ret;
3939 }
3940
3941 static int __init iommu_init_mempool(void)
3942 {
3943         int ret;
3944         ret = iova_cache_get();
3945         if (ret)
3946                 return ret;
3947
3948         ret = iommu_domain_cache_init();
3949         if (ret)
3950                 goto domain_error;
3951
3952         ret = iommu_devinfo_cache_init();
3953         if (!ret)
3954                 return ret;
3955
3956         kmem_cache_destroy(iommu_domain_cache);
3957 domain_error:
3958         iova_cache_put();
3959
3960         return -ENOMEM;
3961 }
3962
3963 static void __init iommu_exit_mempool(void)
3964 {
3965         kmem_cache_destroy(iommu_devinfo_cache);
3966         kmem_cache_destroy(iommu_domain_cache);
3967         iova_cache_put();
3968 }
3969
3970 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3971 {
3972         struct dmar_drhd_unit *drhd;
3973         u32 vtbar;
3974         int rc;
3975
3976         /* We know that this device on this chipset has its own IOMMU.
3977          * If we find it under a different IOMMU, then the BIOS is lying
3978          * to us. Hope that the IOMMU for this device is actually
3979          * disabled, and it needs no translation...
3980          */
3981         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3982         if (rc) {
3983                 /* "can't" happen */
3984                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3985                 return;
3986         }
3987         vtbar &= 0xffff0000;
3988
3989         /* we know that this iommu should be at offset 0xa000 from vtbar */
3990         drhd = dmar_find_matched_drhd_unit(pdev);
3991         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
3992                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
3993                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3994                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3995         }
3996 }
3997 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3998
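/*
 * Mark DMAR units that can be skipped entirely: units whose device scope
 * is empty and, when gfx mapping is disabled, units that cover only
 * graphics devices; the devices of the latter are flagged with
 * DUMMY_DEVICE_DOMAIN_INFO so they bypass the IOMMU.
 */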
3999 static void __init init_no_remapping_devices(void)
4000 {
4001         struct dmar_drhd_unit *drhd;
4002         struct device *dev;
4003         int i;
4004
4005         for_each_drhd_unit(drhd) {
4006                 if (!drhd->include_all) {
4007                         for_each_active_dev_scope(drhd->devices,
4008                                                   drhd->devices_cnt, i, dev)
4009                                 break;
4010                         /* ignore DMAR unit if no devices exist */
4011                         if (i == drhd->devices_cnt)
4012                                 drhd->ignored = 1;
4013                 }
4014         }
4015
4016         for_each_active_drhd_unit(drhd) {
4017                 if (drhd->include_all)
4018                         continue;
4019
4020                 for_each_active_dev_scope(drhd->devices,
4021                                           drhd->devices_cnt, i, dev)
4022                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4023                                 break;
4024                 if (i < drhd->devices_cnt)
4025                         continue;
4026
4027                 /* This IOMMU has *only* gfx devices. If gfx mapping is
4028                    disabled, bypass it and mark its devices as dummy. */
4029                 if (!dmar_map_gfx) {
4030                         drhd->ignored = 1;
4031                         for_each_active_dev_scope(drhd->devices,
4032                                                   drhd->devices_cnt, i, dev)
4033                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4034                 }
4035         }
4036 }
4037
4038 #ifdef CONFIG_SUSPEND
4039 static int init_iommu_hw(void)
4040 {
4041         struct dmar_drhd_unit *drhd;
4042         struct intel_iommu *iommu = NULL;
4043
4044         for_each_active_iommu(iommu, drhd)
4045                 if (iommu->qi)
4046                         dmar_reenable_qi(iommu);
4047
4048         for_each_iommu(iommu, drhd) {
4049                 if (drhd->ignored) {
4050                         /*
4051                          * we always have to disable PMRs or DMA may fail on
4052                          * this device
4053                          */
4054                         if (force_on)
4055                                 iommu_disable_protect_mem_regions(iommu);
4056                         continue;
4057                 }
4058
4059                 iommu_flush_write_buffer(iommu);
4060
4061                 iommu_set_root_entry(iommu);
4062
4063                 iommu->flush.flush_context(iommu, 0, 0, 0,
4064                                            DMA_CCMD_GLOBAL_INVL);
4065                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4066                 iommu_enable_translation(iommu);
4067                 iommu_disable_protect_mem_regions(iommu);
4068         }
4069
4070         return 0;
4071 }
4072
4073 static void iommu_flush_all(void)
4074 {
4075         struct dmar_drhd_unit *drhd;
4076         struct intel_iommu *iommu;
4077
4078         for_each_active_iommu(iommu, drhd) {
4079                 iommu->flush.flush_context(iommu, 0, 0, 0,
4080                                            DMA_CCMD_GLOBAL_INVL);
4081                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4082                                          DMA_TLB_GLOBAL_FLUSH);
4083         }
4084 }
4085
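/*
 * On suspend, flush all caches, disable translation and save the fault
 * event registers (FECTL/FEDATA/FEADDR/FEUADDR) of every active IOMMU;
 * iommu_resume() restores them after re-enabling the hardware through
 * init_iommu_hw().
 */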
4086 static int iommu_suspend(void)
4087 {
4088         struct dmar_drhd_unit *drhd;
4089         struct intel_iommu *iommu = NULL;
4090         unsigned long flag;
4091
4092         for_each_active_iommu(iommu, drhd) {
4093                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4094                                                  GFP_ATOMIC);
4095                 if (!iommu->iommu_state)
4096                         goto nomem;
4097         }
4098
4099         iommu_flush_all();
4100
4101         for_each_active_iommu(iommu, drhd) {
4102                 iommu_disable_translation(iommu);
4103
4104                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4105
4106                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4107                         readl(iommu->reg + DMAR_FECTL_REG);
4108                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4109                         readl(iommu->reg + DMAR_FEDATA_REG);
4110                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4111                         readl(iommu->reg + DMAR_FEADDR_REG);
4112                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4113                         readl(iommu->reg + DMAR_FEUADDR_REG);
4114
4115                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4116         }
4117         return 0;
4118
4119 nomem:
4120         for_each_active_iommu(iommu, drhd)
4121                 kfree(iommu->iommu_state);
4122
4123         return -ENOMEM;
4124 }
4125
4126 static void iommu_resume(void)
4127 {
4128         struct dmar_drhd_unit *drhd;
4129         struct intel_iommu *iommu = NULL;
4130         unsigned long flag;
4131
4132         if (init_iommu_hw()) {
4133                 if (force_on)
4134                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4135                 else
4136                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4137                 return;
4138         }
4139
4140         for_each_active_iommu(iommu, drhd) {
4141
4142                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4143
4144                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4145                         iommu->reg + DMAR_FECTL_REG);
4146                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4147                         iommu->reg + DMAR_FEDATA_REG);
4148                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4149                         iommu->reg + DMAR_FEADDR_REG);
4150                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4151                         iommu->reg + DMAR_FEUADDR_REG);
4152
4153                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4154         }
4155
4156         for_each_active_iommu(iommu, drhd)
4157                 kfree(iommu->iommu_state);
4158 }
4159
4160 static struct syscore_ops iommu_syscore_ops = {
4161         .resume         = iommu_resume,
4162         .suspend        = iommu_suspend,
4163 };
4164
4165 static void __init init_iommu_pm_ops(void)
4166 {
4167         register_syscore_ops(&iommu_syscore_ops);
4168 }
4169
4170 #else
4171 static inline void init_iommu_pm_ops(void) {}
4172 #endif  /* CONFIG_SUSPEND */
4173
4174 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4175 {
4176         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4177             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4178             rmrr->end_address <= rmrr->base_address ||
4179             arch_rmrr_sanity_check(rmrr))
4180                 return -EINVAL;
4181
4182         return 0;
4183 }
4184
4185 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4186 {
4187         struct acpi_dmar_reserved_memory *rmrr;
4188         struct dmar_rmrr_unit *rmrru;
4189
4190         rmrr = (struct acpi_dmar_reserved_memory *)header;
4191         if (rmrr_sanity_check(rmrr)) {
4192                 pr_warn(FW_BUG
4193                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4194                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4195                            rmrr->base_address, rmrr->end_address,
4196                            dmi_get_system_info(DMI_BIOS_VENDOR),
4197                            dmi_get_system_info(DMI_BIOS_VERSION),
4198                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4199                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4200         }
4201
4202         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4203         if (!rmrru)
4204                 goto out;
4205
4206         rmrru->hdr = header;
4207
4208         rmrru->base_address = rmrr->base_address;
4209         rmrru->end_address = rmrr->end_address;
4210
4211         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4212                                 ((void *)rmrr) + rmrr->header.length,
4213                                 &rmrru->devices_cnt);
4214         if (rmrru->devices_cnt && rmrru->devices == NULL)
4215                 goto free_rmrru;
4216
4217         list_add(&rmrru->list, &dmar_rmrr_units);
4218
4219         return 0;
4220 free_rmrru:
4221         kfree(rmrru);
4222 out:
4223         return -ENOMEM;
4224 }
4225
4226 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4227 {
4228         struct dmar_atsr_unit *atsru;
4229         struct acpi_dmar_atsr *tmp;
4230
4231         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4232                                 dmar_rcu_check()) {
4233                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4234                 if (atsr->segment != tmp->segment)
4235                         continue;
4236                 if (atsr->header.length != tmp->header.length)
4237                         continue;
4238                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4239                         return atsru;
4240         }
4241
4242         return NULL;
4243 }
4244
4245 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4246 {
4247         struct acpi_dmar_atsr *atsr;
4248         struct dmar_atsr_unit *atsru;
4249
4250         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4251                 return 0;
4252
4253         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4254         atsru = dmar_find_atsr(atsr);
4255         if (atsru)
4256                 return 0;
4257
4258         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4259         if (!atsru)
4260                 return -ENOMEM;
4261
4262         /*
4263          * If the memory was allocated from slab by the ACPI _DSM method, we
4264          * need to copy its content because the buffer will be freed on
4265          * return.
4266          */
4267         atsru->hdr = (void *)(atsru + 1);
4268         memcpy(atsru->hdr, hdr, hdr->length);
4269         atsru->include_all = atsr->flags & 0x1;
4270         if (!atsru->include_all) {
4271                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4272                                 (void *)atsr + atsr->header.length,
4273                                 &atsru->devices_cnt);
4274                 if (atsru->devices_cnt && atsru->devices == NULL) {
4275                         kfree(atsru);
4276                         return -ENOMEM;
4277                 }
4278         }
4279
4280         list_add_rcu(&atsru->list, &dmar_atsr_units);
4281
4282         return 0;
4283 }
4284
4285 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4286 {
4287         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4288         kfree(atsru);
4289 }
4290
4291 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4292 {
4293         struct acpi_dmar_atsr *atsr;
4294         struct dmar_atsr_unit *atsru;
4295
4296         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4297         atsru = dmar_find_atsr(atsr);
4298         if (atsru) {
4299                 list_del_rcu(&atsru->list);
4300                 synchronize_rcu();
4301                 intel_iommu_free_atsr(atsru);
4302         }
4303
4304         return 0;
4305 }
4306
4307 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4308 {
4309         int i;
4310         struct device *dev;
4311         struct acpi_dmar_atsr *atsr;
4312         struct dmar_atsr_unit *atsru;
4313
4314         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4315         atsru = dmar_find_atsr(atsr);
4316         if (!atsru)
4317                 return 0;
4318
4319         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4320                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4321                                           i, dev)
4322                         return -EBUSY;
4323         }
4324
4325         return 0;
4326 }
4327
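/*
 * Bring up a hot-added DMAR unit: check that its capabilities are
 * compatible with the settings already chosen for the running system
 * (pass-through, snooping, superpage sizes), then allocate domains and a
 * root entry, enable queued invalidation and interrupts, and finally turn
 * on translation.
 */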
4328 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4329 {
4330         int sp, ret;
4331         struct intel_iommu *iommu = dmaru->iommu;
4332
4333         if (g_iommus[iommu->seq_id])
4334                 return 0;
4335
4336         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4337                 pr_warn("%s: Doesn't support hardware pass through.\n",
4338                         iommu->name);
4339                 return -ENXIO;
4340         }
4341         if (!ecap_sc_support(iommu->ecap) &&
4342             domain_update_iommu_snooping(iommu)) {
4343                 pr_warn("%s: Doesn't support snooping.\n",
4344                         iommu->name);
4345                 return -ENXIO;
4346         }
4347         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4348         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4349                 pr_warn("%s: Doesn't support large page.\n",
4350                         iommu->name);
4351                 return -ENXIO;
4352         }
4353
4354         /*
4355          * Disable translation if already enabled prior to OS handover.
4356          */
4357         if (iommu->gcmd & DMA_GCMD_TE)
4358                 iommu_disable_translation(iommu);
4359
4360         g_iommus[iommu->seq_id] = iommu;
4361         ret = iommu_init_domains(iommu);
4362         if (ret == 0)
4363                 ret = iommu_alloc_root_entry(iommu);
4364         if (ret)
4365                 goto out;
4366
4367         intel_svm_check(iommu);
4368
4369         if (dmaru->ignored) {
4370                 /*
4371                  * we always have to disable PMRs or DMA may fail on this device
4372                  */
4373                 if (force_on)
4374                         iommu_disable_protect_mem_regions(iommu);
4375                 return 0;
4376         }
4377
4378         intel_iommu_init_qi(iommu);
4379         iommu_flush_write_buffer(iommu);
4380
4381 #ifdef CONFIG_INTEL_IOMMU_SVM
4382         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4383                 ret = intel_svm_enable_prq(iommu);
4384                 if (ret)
4385                         goto disable_iommu;
4386         }
4387 #endif
4388         ret = dmar_set_interrupt(iommu);
4389         if (ret)
4390                 goto disable_iommu;
4391
4392         iommu_set_root_entry(iommu);
4393         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4394         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4395         iommu_enable_translation(iommu);
4396
4397         iommu_disable_protect_mem_regions(iommu);
4398         return 0;
4399
4400 disable_iommu:
4401         disable_dmar_iommu(iommu);
4402 out:
4403         free_dmar_iommu(iommu);
4404         return ret;
4405 }
4406
4407 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4408 {
4409         int ret = 0;
4410         struct intel_iommu *iommu = dmaru->iommu;
4411
4412         if (!intel_iommu_enabled)
4413                 return 0;
4414         if (iommu == NULL)
4415                 return -EINVAL;
4416
4417         if (insert) {
4418                 ret = intel_iommu_add(dmaru);
4419         } else {
4420                 disable_dmar_iommu(iommu);
4421                 free_dmar_iommu(iommu);
4422         }
4423
4424         return ret;
4425 }
4426
4427 static void intel_iommu_free_dmars(void)
4428 {
4429         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4430         struct dmar_atsr_unit *atsru, *atsr_n;
4431
4432         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4433                 list_del(&rmrru->list);
4434                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4435                 kfree(rmrru);
4436         }
4437
4438         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4439                 list_del(&atsru->list);
4440                 intel_iommu_free_atsr(atsru);
4441         }
4442 }
4443
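/*
 * Decide whether ATS may be used for @dev by walking up to its root
 * port: integrated devices (no upstream bridge) are always allowed,
 * devices reached through conventional PCI are never allowed, and
 * otherwise the root port must be covered by an ATSR unit on the same
 * segment, either listed in its device scope or via INCLUDE_ALL.
 */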
4444 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4445 {
4446         int i, ret = 1;
4447         struct pci_bus *bus;
4448         struct pci_dev *bridge = NULL;
4449         struct device *tmp;
4450         struct acpi_dmar_atsr *atsr;
4451         struct dmar_atsr_unit *atsru;
4452
4453         dev = pci_physfn(dev);
4454         for (bus = dev->bus; bus; bus = bus->parent) {
4455                 bridge = bus->self;
4456                 /* If it's an integrated device, allow ATS */
4457                 if (!bridge)
4458                         return 1;
4459                 /* Connected via non-PCIe: no ATS */
4460                 if (!pci_is_pcie(bridge) ||
4461                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4462                         return 0;
4463                 /* If we found the root port, look it up in the ATSR */
4464                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4465                         break;
4466         }
4467
4468         rcu_read_lock();
4469         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4470                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4471                 if (atsr->segment != pci_domain_nr(dev->bus))
4472                         continue;
4473
4474                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4475                         if (tmp == &bridge->dev)
4476                                 goto out;
4477
4478                 if (atsru->include_all)
4479                         goto out;
4480         }
4481         ret = 0;
4482 out:
4483         rcu_read_unlock();
4484
4485         return ret;
4486 }
4487
4488 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4489 {
4490         int ret;
4491         struct dmar_rmrr_unit *rmrru;
4492         struct dmar_atsr_unit *atsru;
4493         struct acpi_dmar_atsr *atsr;
4494         struct acpi_dmar_reserved_memory *rmrr;
4495
4496         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4497                 return 0;
4498
4499         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4500                 rmrr = container_of(rmrru->hdr,
4501                                     struct acpi_dmar_reserved_memory, header);
4502                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4503                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4504                                 ((void *)rmrr) + rmrr->header.length,
4505                                 rmrr->segment, rmrru->devices,
4506                                 rmrru->devices_cnt);
4507                         if (ret < 0)
4508                                 return ret;
4509                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4510                         dmar_remove_dev_scope(info, rmrr->segment,
4511                                 rmrru->devices, rmrru->devices_cnt);
4512                 }
4513         }
4514
4515         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4516                 if (atsru->include_all)
4517                         continue;
4518
4519                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4520                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4521                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4522                                         (void *)atsr + atsr->header.length,
4523                                         atsr->segment, atsru->devices,
4524                                         atsru->devices_cnt);
4525                         if (ret > 0)
4526                                 break;
4527                         else if (ret < 0)
4528                                 return ret;
4529                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4530                         if (dmar_remove_dev_scope(info, atsr->segment,
4531                                         atsru->devices, atsru->devices_cnt))
4532                                 break;
4533                 }
4534         }
4535
4536         return 0;
4537 }
4538
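/*
 * Memory hotplug notifier for the static identity (si) domain: ranges
 * going online are added to the 1:1 map, while ranges going offline
 * have their IOVAs split out, unmapped and flushed on every active
 * IOMMU before the page-table pages are freed.
 */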
4539 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4540                                        unsigned long val, void *v)
4541 {
4542         struct memory_notify *mhp = v;
4543         unsigned long long start, end;
4544         unsigned long start_vpfn, last_vpfn;
4545
4546         switch (val) {
4547         case MEM_GOING_ONLINE:
4548                 start = mhp->start_pfn << PAGE_SHIFT;
4549                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4550                 if (iommu_domain_identity_map(si_domain, start, end)) {
4551                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4552                                 start, end);
4553                         return NOTIFY_BAD;
4554                 }
4555                 break;
4556
4557         case MEM_OFFLINE:
4558         case MEM_CANCEL_ONLINE:
4559                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4560                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4561                 while (start_vpfn <= last_vpfn) {
4562                         struct iova *iova;
4563                         struct dmar_drhd_unit *drhd;
4564                         struct intel_iommu *iommu;
4565                         struct page *freelist;
4566
4567                         iova = find_iova(&si_domain->iovad, start_vpfn);
4568                         if (iova == NULL) {
4569                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4570                                          start_vpfn);
4571                                 break;
4572                         }
4573
4574                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4575                                                      start_vpfn, last_vpfn);
4576                         if (iova == NULL) {
4577                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4578                                         start_vpfn, last_vpfn);
4579                                 return NOTIFY_BAD;
4580                         }
4581
4582                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4583                                                iova->pfn_hi);
4584
4585                         rcu_read_lock();
4586                         for_each_active_iommu(iommu, drhd)
4587                                 iommu_flush_iotlb_psi(iommu, si_domain,
4588                                         iova->pfn_lo, iova_size(iova),
4589                                         !freelist, 0);
4590                         rcu_read_unlock();
4591                         dma_free_pagelist(freelist);
4592
4593                         start_vpfn = iova->pfn_hi + 1;
4594                         free_iova_mem(iova);
4595                 }
4596                 break;
4597         }
4598
4599         return NOTIFY_OK;
4600 }
4601
4602 static struct notifier_block intel_iommu_memory_nb = {
4603         .notifier_call = intel_iommu_memory_notifier,
4604         .priority = 0
4605 };
4606
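/*
 * Purge the dead CPU's per-CPU IOVA caches for every domain on every
 * IOMMU so that the cached ranges are not stranded.
 */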
4607 static void free_all_cpu_cached_iovas(unsigned int cpu)
4608 {
4609         int i;
4610
4611         for (i = 0; i < g_num_of_iommus; i++) {
4612                 struct intel_iommu *iommu = g_iommus[i];
4613                 struct dmar_domain *domain;
4614                 int did;
4615
4616                 if (!iommu)
4617                         continue;
4618
4619                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4620                         domain = get_iommu_domain(iommu, (u16)did);
4621
4622                         if (!domain)
4623                                 continue;
4624                         free_cpu_cached_iovas(cpu, &domain->iovad);
4625                 }
4626         }
4627 }
4628
4629 static int intel_iommu_cpu_dead(unsigned int cpu)
4630 {
4631         free_all_cpu_cached_iovas(cpu);
4632         return 0;
4633 }
4634
4635 static void intel_disable_iommus(void)
4636 {
4637         struct intel_iommu *iommu = NULL;
4638         struct dmar_drhd_unit *drhd;
4639
4640         for_each_iommu(iommu, drhd)
4641                 iommu_disable_translation(iommu);
4642 }
4643
4644 void intel_iommu_shutdown(void)
4645 {
4646         struct dmar_drhd_unit *drhd;
4647         struct intel_iommu *iommu = NULL;
4648
4649         if (no_iommu || dmar_disabled)
4650                 return;
4651
4652         down_write(&dmar_global_lock);
4653
4654         /* Disable PMRs explicitly here. */
4655         for_each_iommu(iommu, drhd)
4656                 iommu_disable_protect_mem_regions(iommu);
4657
4658         /* Make sure the IOMMUs are switched off */
4659         intel_disable_iommus();
4660
4661         up_write(&dmar_global_lock);
4662 }
4663
4664 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4665 {
4666         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4667
4668         return container_of(iommu_dev, struct intel_iommu, iommu);
4669 }
4670
4671 static ssize_t intel_iommu_show_version(struct device *dev,
4672                                         struct device_attribute *attr,
4673                                         char *buf)
4674 {
4675         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4676         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4677         return sprintf(buf, "%d:%d\n",
4678                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4679 }
4680 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4681
4682 static ssize_t intel_iommu_show_address(struct device *dev,
4683                                         struct device_attribute *attr,
4684                                         char *buf)
4685 {
4686         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4687         return sprintf(buf, "%llx\n", iommu->reg_phys);
4688 }
4689 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4690
4691 static ssize_t intel_iommu_show_cap(struct device *dev,
4692                                     struct device_attribute *attr,
4693                                     char *buf)
4694 {
4695         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4696         return sprintf(buf, "%llx\n", iommu->cap);
4697 }
4698 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4699
4700 static ssize_t intel_iommu_show_ecap(struct device *dev,
4701                                     struct device_attribute *attr,
4702                                     char *buf)
4703 {
4704         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4705         return sprintf(buf, "%llx\n", iommu->ecap);
4706 }
4707 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4708
4709 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4710                                       struct device_attribute *attr,
4711                                       char *buf)
4712 {
4713         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4714         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4715 }
4716 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4717
4718 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4719                                            struct device_attribute *attr,
4720                                            char *buf)
4721 {
4722         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4723         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4724                                                   cap_ndoms(iommu->cap)));
4725 }
4726 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4727
4728 static struct attribute *intel_iommu_attrs[] = {
4729         &dev_attr_version.attr,
4730         &dev_attr_address.attr,
4731         &dev_attr_cap.attr,
4732         &dev_attr_ecap.attr,
4733         &dev_attr_domains_supported.attr,
4734         &dev_attr_domains_used.attr,
4735         NULL,
4736 };
4737
4738 static struct attribute_group intel_iommu_group = {
4739         .name = "intel-iommu",
4740         .attrs = intel_iommu_attrs,
4741 };
4742
4743 const struct attribute_group *intel_iommu_groups[] = {
4744         &intel_iommu_group,
4745         NULL,
4746 };
4747
4748 static inline bool has_untrusted_dev(void)
4749 {
4750         struct pci_dev *pdev = NULL;
4751
4752         for_each_pci_dev(pdev)
4753                 if (pdev->untrusted)
4754                         return true;
4755
4756         return false;
4757 }
4758
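/*
 * Honor the DMAR platform opt-in: if the firmware requests DMA
 * protection and an untrusted device is present, force the IOMMU on
 * even if it was disabled on the command line.  Returns 1 when the
 * IOMMU has been force enabled, 0 otherwise.
 */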
4759 static int __init platform_optin_force_iommu(void)
4760 {
4761         if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4762                 return 0;
4763
4764         if (no_iommu || dmar_disabled)
4765                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4766
4767         /*
4768          * If Intel-IOMMU is disabled by default, we will apply identity
4769          * map for all devices except those marked as being untrusted.
4770          */
4771         if (dmar_disabled)
4772                 iommu_set_default_passthrough(false);
4773
4774         dmar_disabled = 0;
4775         no_iommu = 0;
4776
4777         return 1;
4778 }
4779
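/*
 * Walk the ACPI namespace devices in each DRHD device scope and run
 * iommu_probe_device() on their physical companion devices, unless
 * they already belong to an IOMMU group.
 */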
4780 static int __init probe_acpi_namespace_devices(void)
4781 {
4782         struct dmar_drhd_unit *drhd;
4783         /* To avoid a -Wunused-but-set-variable warning. */
4784         struct intel_iommu *iommu __maybe_unused;
4785         struct device *dev;
4786         int i, ret = 0;
4787
4788         for_each_active_iommu(iommu, drhd) {
4789                 for_each_active_dev_scope(drhd->devices,
4790                                           drhd->devices_cnt, i, dev) {
4791                         struct acpi_device_physical_node *pn;
4792                         struct iommu_group *group;
4793                         struct acpi_device *adev;
4794
4795                         if (dev->bus != &acpi_bus_type)
4796                                 continue;
4797
4798                         adev = to_acpi_device(dev);
4799                         mutex_lock(&adev->physical_node_lock);
4800                         list_for_each_entry(pn,
4801                                             &adev->physical_node_list, node) {
4802                                 group = iommu_group_get(pn->dev);
4803                                 if (group) {
4804                                         iommu_group_put(group);
4805                                         continue;
4806                                 }
4807
4808                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4809                                 ret = iommu_probe_device(pn->dev);
4810                                 if (ret)
4811                                         break;
4812                         }
4813                         mutex_unlock(&adev->physical_node_lock);
4814
4815                         if (ret)
4816                                 return ret;
4817                 }
4818         }
4819
4820         return 0;
4821 }
4822
4823 int __init intel_iommu_init(void)
4824 {
4825         int ret = -ENODEV;
4826         struct dmar_drhd_unit *drhd;
4827         struct intel_iommu *iommu;
4828
4829         /*
4830          * Intel IOMMU is required for a TXT/tboot launch or platform
4831          * opt in, so enforce that.
4832          */
4833         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4834
4835         if (iommu_init_mempool()) {
4836                 if (force_on)
4837                         panic("tboot: Failed to initialize iommu memory\n");
4838                 return -ENOMEM;
4839         }
4840
4841         down_write(&dmar_global_lock);
4842         if (dmar_table_init()) {
4843                 if (force_on)
4844                         panic("tboot: Failed to initialize DMAR table\n");
4845                 goto out_free_dmar;
4846         }
4847
4848         if (dmar_dev_scope_init() < 0) {
4849                 if (force_on)
4850                         panic("tboot: Failed to initialize DMAR device scope\n");
4851                 goto out_free_dmar;
4852         }
4853
4854         up_write(&dmar_global_lock);
4855
4856         /*
4857          * The bus notifier takes the dmar_global_lock, so lockdep will
4858          * complain later when we register it under the lock.
4859          */
4860         dmar_register_bus_notifier();
4861
4862         down_write(&dmar_global_lock);
4863
4864         if (!no_iommu)
4865                 intel_iommu_debugfs_init();
4866
4867         if (no_iommu || dmar_disabled) {
4868                 /*
4869                  * We exit the function here to ensure the IOMMU's remapping
4870                  * and mempool aren't set up, which means the IOMMU's PMRs
4871                  * won't be disabled via the call to init_dmars(). So disable
4872                  * them explicitly here. The PMRs were set up by tboot prior
4873                  * to calling SENTER, but the kernel is expected to reset/tear
4874                  * them down.
4875                  */
4876                 if (intel_iommu_tboot_noforce) {
4877                         for_each_iommu(iommu, drhd)
4878                                 iommu_disable_protect_mem_regions(iommu);
4879                 }
4880
4881                 /*
4882                  * Make sure the IOMMUs are switched off, even when we
4883                  * boot into a kexec kernel and the previous kernel left
4884                  * them enabled
4885                  */
4886                 intel_disable_iommus();
4887                 goto out_free_dmar;
4888         }
4889
4890         if (list_empty(&dmar_rmrr_units))
4891                 pr_info("No RMRR found\n");
4892
4893         if (list_empty(&dmar_atsr_units))
4894                 pr_info("No ATSR found\n");
4895
4896         if (dmar_init_reserved_ranges()) {
4897                 if (force_on)
4898                         panic("tboot: Failed to reserve iommu ranges\n");
4899                 goto out_free_reserved_range;
4900         }
4901
4902         if (dmar_map_gfx)
4903                 intel_iommu_gfx_mapped = 1;
4904
4905         init_no_remapping_devices();
4906
4907         ret = init_dmars();
4908         if (ret) {
4909                 if (force_on)
4910                         panic("tboot: Failed to initialize DMARs\n");
4911                 pr_err("Initialization failed\n");
4912                 goto out_free_reserved_range;
4913         }
4914         up_write(&dmar_global_lock);
4915
4916         dma_ops = &intel_dma_ops;
4917
4918         init_iommu_pm_ops();
4919
4920         down_read(&dmar_global_lock);
4921         for_each_active_iommu(iommu, drhd) {
4922                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4923                                        intel_iommu_groups,
4924                                        "%s", iommu->name);
4925                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4926                 iommu_device_register(&iommu->iommu);
4927         }
4928         up_read(&dmar_global_lock);
4929
4930         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4931         if (si_domain && !hw_pass_through)
4932                 register_memory_notifier(&intel_iommu_memory_nb);
4933         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4934                           intel_iommu_cpu_dead);
4935
4936         down_read(&dmar_global_lock);
4937         if (probe_acpi_namespace_devices())
4938                 pr_warn("ACPI name space devices didn't probe correctly\n");
4939
4940         /* Finally, we enable the DMA remapping hardware. */
4941         for_each_iommu(iommu, drhd) {
4942                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4943                         iommu_enable_translation(iommu);
4944
4945                 iommu_disable_protect_mem_regions(iommu);
4946         }
4947         up_read(&dmar_global_lock);
4948
4949         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4950
4951         intel_iommu_enabled = 1;
4952
4953         return 0;
4954
4955 out_free_reserved_range:
4956         put_iova_domain(&reserved_iova_list);
4957 out_free_dmar:
4958         intel_iommu_free_dmars();
4959         up_write(&dmar_global_lock);
4960         iommu_exit_mempool();
4961         return ret;
4962 }
4963
4964 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4965 {
4966         struct intel_iommu *iommu = opaque;
4967
4968         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4969         return 0;
4970 }
4971
4972 /*
4973  * NB - intel-iommu lacks any sort of reference counting for the users of
4974  * dependent devices.  If multiple endpoints have intersecting dependent
4975  * devices, unbinding the driver from any one of them will possibly leave
4976  * the others unable to operate.
4977  */
4978 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4979 {
4980         if (!iommu || !dev || !dev_is_pci(dev))
4981                 return;
4982
4983         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4984 }
4985
4986 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4987 {
4988         struct dmar_domain *domain;
4989         struct intel_iommu *iommu;
4990         unsigned long flags;
4991
4992         assert_spin_locked(&device_domain_lock);
4993
4994         if (WARN_ON(!info))
4995                 return;
4996
4997         iommu = info->iommu;
4998         domain = info->domain;
4999
5000         if (info->dev) {
5001                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5002                         intel_pasid_tear_down_entry(iommu, info->dev,
5003                                         PASID_RID2PASID);
5004
5005                 iommu_disable_dev_iotlb(info);
5006                 domain_context_clear(iommu, info->dev);
5007                 intel_pasid_free_table(info->dev);
5008         }
5009
5010         unlink_domain_info(info);
5011
5012         spin_lock_irqsave(&iommu->lock, flags);
5013         domain_detach_iommu(domain, iommu);
5014         spin_unlock_irqrestore(&iommu->lock, flags);
5015
5016         free_devinfo_mem(info);
5017 }
5018
5019 static void dmar_remove_one_dev_info(struct device *dev)
5020 {
5021         struct device_domain_info *info;
5022         unsigned long flags;
5023
5024         spin_lock_irqsave(&device_domain_lock, flags);
5025         info = dev->archdata.iommu;
5026         if (info && info != DEFER_DEVICE_DOMAIN_INFO
5027             && info != DUMMY_DEVICE_DOMAIN_INFO)
5028                 __dmar_remove_one_dev_info(info);
5029         spin_unlock_irqrestore(&device_domain_lock, flags);
5030 }
5031
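/*
 * Initialize a domain allocated through the IOMMU API: set up its IOVA
 * allocator, reserve the special ranges, derive the AGAW from the
 * requested guest address width and allocate the top-level page
 * directory.
 */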
5032 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5033 {
5034         int adjust_width;
5035
5036         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5037         domain_reserve_special_ranges(domain);
5038
5039         /* calculate AGAW */
5040         domain->gaw = guest_width;
5041         adjust_width = guestwidth_to_adjustwidth(guest_width);
5042         domain->agaw = width_to_agaw(adjust_width);
5043
5044         domain->iommu_coherency = 0;
5045         domain->iommu_snooping = 0;
5046         domain->iommu_superpage = 0;
5047         domain->max_addr = 0;
5048
5049         /* always allocate the top pgd */
5050         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5051         if (!domain->pgd)
5052                 return -ENOMEM;
5053         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5054         return 0;
5055 }
5056
5057 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5058 {
5059         struct dmar_domain *dmar_domain;
5060         struct iommu_domain *domain;
5061         int ret;
5062
5063         switch (type) {
5064         case IOMMU_DOMAIN_DMA:
5065         /* fallthrough */
5066         case IOMMU_DOMAIN_UNMANAGED:
5067                 dmar_domain = alloc_domain(0);
5068                 if (!dmar_domain) {
5069                         pr_err("Can't allocate dmar_domain\n");
5070                         return NULL;
5071                 }
5072                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5073                         pr_err("Domain initialization failed\n");
5074                         domain_exit(dmar_domain);
5075                         return NULL;
5076                 }
5077
5078                 if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
5079                         ret = init_iova_flush_queue(&dmar_domain->iovad,
5080                                                     iommu_flush_iova,
5081                                                     iova_entry_free);
5082                         if (ret)
5083                                 pr_info("iova flush queue initialization failed\n");
5084                 }
5085
5086                 domain_update_iommu_cap(dmar_domain);
5087
5088                 domain = &dmar_domain->domain;
5089                 domain->geometry.aperture_start = 0;
5090                 domain->geometry.aperture_end   =
5091                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5092                 domain->geometry.force_aperture = true;
5093
5094                 return domain;
5095         case IOMMU_DOMAIN_IDENTITY:
5096                 return &si_domain->domain;
5097         default:
5098                 return NULL;
5099         }
5100
5101         return NULL;
5102 }
5103
5104 static void intel_iommu_domain_free(struct iommu_domain *domain)
5105 {
5106         if (domain != &si_domain->domain)
5107                 domain_exit(to_dmar_domain(domain));
5108 }
5109
5110 /*
5111  * Check whether a @domain could be attached to the @dev through the
5112  * aux-domain attach/detach APIs.
5113  */
5114 static inline bool
5115 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5116 {
5117         struct device_domain_info *info = dev->archdata.iommu;
5118
5119         return info && info->auxd_enabled &&
5120                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5121 }
5122
5123 static void auxiliary_link_device(struct dmar_domain *domain,
5124                                   struct device *dev)
5125 {
5126         struct device_domain_info *info = dev->archdata.iommu;
5127
5128         assert_spin_locked(&device_domain_lock);
5129         if (WARN_ON(!info))
5130                 return;
5131
5132         domain->auxd_refcnt++;
5133         list_add(&domain->auxd, &info->auxiliary_domains);
5134 }
5135
5136 static void auxiliary_unlink_device(struct dmar_domain *domain,
5137                                     struct device *dev)
5138 {
5139         struct device_domain_info *info = dev->archdata.iommu;
5140
5141         assert_spin_locked(&device_domain_lock);
5142         if (WARN_ON(!info))
5143                 return;
5144
5145         list_del(&domain->auxd);
5146         domain->auxd_refcnt--;
5147
5148         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5149                 ioasid_free(domain->default_pasid);
5150 }
5151
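/*
 * Attach @domain to @dev as an auxiliary domain: allocate the domain's
 * default PASID on first use, attach the domain to the device's IOMMU,
 * install a first- or second-level PASID entry for that PASID, and
 * finally link the device into the domain's auxiliary list.
 */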
5152 static int aux_domain_add_dev(struct dmar_domain *domain,
5153                               struct device *dev)
5154 {
5155         int ret;
5156         u8 bus, devfn;
5157         unsigned long flags;
5158         struct intel_iommu *iommu;
5159
5160         iommu = device_to_iommu(dev, &bus, &devfn);
5161         if (!iommu)
5162                 return -ENODEV;
5163
5164         if (domain->default_pasid <= 0) {
5165                 int pasid;
5166
5167                 /* No private data needed for the default pasid */
5168                 pasid = ioasid_alloc(NULL, PASID_MIN,
5169                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5170                                      NULL);
5171                 if (pasid == INVALID_IOASID) {
5172                         pr_err("Can't allocate default pasid\n");
5173                         return -ENODEV;
5174                 }
5175                 domain->default_pasid = pasid;
5176         }
5177
5178         spin_lock_irqsave(&device_domain_lock, flags);
5179         /*
5180          * iommu->lock must be held to attach domain to iommu and setup the
5181          * pasid entry for second level translation.
5182          */
5183         spin_lock(&iommu->lock);
5184         ret = domain_attach_iommu(domain, iommu);
5185         if (ret)
5186                 goto attach_failed;
5187
5188         /* Setup the PASID entry for mediated devices: */
5189         if (domain_use_first_level(domain))
5190                 ret = domain_setup_first_level(iommu, domain, dev,
5191                                                domain->default_pasid);
5192         else
5193                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5194                                                      domain->default_pasid);
5195         if (ret)
5196                 goto table_failed;
5197         spin_unlock(&iommu->lock);
5198
5199         auxiliary_link_device(domain, dev);
5200
5201         spin_unlock_irqrestore(&device_domain_lock, flags);
5202
5203         return 0;
5204
5205 table_failed:
5206         domain_detach_iommu(domain, iommu);
5207 attach_failed:
5208         spin_unlock(&iommu->lock);
5209         spin_unlock_irqrestore(&device_domain_lock, flags);
5210         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5211                 ioasid_free(domain->default_pasid);
5212
5213         return ret;
5214 }
5215
5216 static void aux_domain_remove_dev(struct dmar_domain *domain,
5217                                   struct device *dev)
5218 {
5219         struct device_domain_info *info;
5220         struct intel_iommu *iommu;
5221         unsigned long flags;
5222
5223         if (!is_aux_domain(dev, &domain->domain))
5224                 return;
5225
5226         spin_lock_irqsave(&device_domain_lock, flags);
5227         info = dev->archdata.iommu;
5228         iommu = info->iommu;
5229
5230         auxiliary_unlink_device(domain, dev);
5231
5232         spin_lock(&iommu->lock);
5233         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5234         domain_detach_iommu(domain, iommu);
5235         spin_unlock(&iommu->lock);
5236
5237         spin_unlock_irqrestore(&device_domain_lock, flags);
5238 }
5239
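/*
 * Common checks before attaching @dev to @domain: make sure the IOMMU's
 * address width covers everything already mapped in the domain, and
 * knock out extra page-table levels if the domain was built with a
 * larger AGAW than this IOMMU supports.
 */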
5240 static int prepare_domain_attach_device(struct iommu_domain *domain,
5241                                         struct device *dev)
5242 {
5243         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5244         struct intel_iommu *iommu;
5245         int addr_width;
5246         u8 bus, devfn;
5247
5248         iommu = device_to_iommu(dev, &bus, &devfn);
5249         if (!iommu)
5250                 return -ENODEV;
5251
5252         /* check if this iommu agaw is sufficient for max mapped address */
5253         addr_width = agaw_to_width(iommu->agaw);
5254         if (addr_width > cap_mgaw(iommu->cap))
5255                 addr_width = cap_mgaw(iommu->cap);
5256
5257         if (dmar_domain->max_addr > (1LL << addr_width)) {
5258                 dev_err(dev,
5259                         "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5260                         __func__, addr_width, dmar_domain->max_addr);
5261                 return -EFAULT;
5262         }
5263         dmar_domain->gaw = addr_width;
5264
5265         /*
5266          * Knock out extra levels of page tables if necessary
5267          */
5268         while (iommu->agaw < dmar_domain->agaw) {
5269                 struct dma_pte *pte;
5270
5271                 pte = dmar_domain->pgd;
5272                 if (dma_pte_present(pte)) {
5273                         dmar_domain->pgd = (struct dma_pte *)
5274                                 phys_to_virt(dma_pte_addr(pte));
5275                         free_pgtable_page(pte);
5276                 }
5277                 dmar_domain->agaw--;
5278         }
5279
5280         return 0;
5281 }
5282
5283 static int intel_iommu_attach_device(struct iommu_domain *domain,
5284                                      struct device *dev)
5285 {
5286         int ret;
5287
5288         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5289             device_is_rmrr_locked(dev)) {
5290                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5291                 return -EPERM;
5292         }
5293
5294         if (is_aux_domain(dev, domain))
5295                 return -EPERM;
5296
5297         /* normally dev is not mapped */
5298         if (unlikely(domain_context_mapped(dev))) {
5299                 struct dmar_domain *old_domain;
5300
5301                 old_domain = find_domain(dev);
5302                 if (old_domain)
5303                         dmar_remove_one_dev_info(dev);
5304         }
5305
5306         ret = prepare_domain_attach_device(domain, dev);
5307         if (ret)
5308                 return ret;
5309
5310         return domain_add_dev_info(to_dmar_domain(domain), dev);
5311 }
5312
5313 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5314                                          struct device *dev)
5315 {
5316         int ret;
5317
5318         if (!is_aux_domain(dev, domain))
5319                 return -EPERM;
5320
5321         ret = prepare_domain_attach_device(domain, dev);
5322         if (ret)
5323                 return ret;
5324
5325         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5326 }
5327
5328 static void intel_iommu_detach_device(struct iommu_domain *domain,
5329                                       struct device *dev)
5330 {
5331         dmar_remove_one_dev_info(dev);
5332 }
5333
5334 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5335                                           struct device *dev)
5336 {
5337         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5338 }
5339
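/*
 * iommu_ops->map callback: translate IOMMU_READ/WRITE/CACHE into
 * DMA_PTE_* bits, grow the domain's max_addr (rejecting addresses the
 * domain's address width cannot cover), and install the page-aligned
 * range with domain_pfn_mapping().
 */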
5340 static int intel_iommu_map(struct iommu_domain *domain,
5341                            unsigned long iova, phys_addr_t hpa,
5342                            size_t size, int iommu_prot, gfp_t gfp)
5343 {
5344         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5345         u64 max_addr;
5346         int prot = 0;
5347         int ret;
5348
5349         if (iommu_prot & IOMMU_READ)
5350                 prot |= DMA_PTE_READ;
5351         if (iommu_prot & IOMMU_WRITE)
5352                 prot |= DMA_PTE_WRITE;
5353         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5354                 prot |= DMA_PTE_SNP;
5355
5356         max_addr = iova + size;
5357         if (dmar_domain->max_addr < max_addr) {
5358                 u64 end;
5359
5360                 /* check if minimum agaw is sufficient for mapped address */
5361                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5362                 if (end < max_addr) {
5363                         pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5364                                __func__, dmar_domain->gaw,
5365                                max_addr);
5366                         return -EFAULT;
5367                 }
5368                 dmar_domain->max_addr = max_addr;
5369         }
5370         /* Round up size to next multiple of PAGE_SIZE, if it and
5371            the low bits of hpa would take us onto the next page */
5372         size = aligned_nrpages(hpa, size);
5373         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5374                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5375         return ret;
5376 }
5377
5378 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5379                                 unsigned long iova, size_t size,
5380                                 struct iommu_iotlb_gather *gather)
5381 {
5382         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5383         struct page *freelist = NULL;
5384         unsigned long start_pfn, last_pfn;
5385         unsigned int npages;
5386         int iommu_id, level = 0;
5387
5388         /* Cope with horrid API which requires us to unmap more than the
5389            size argument if it happens to be a large-page mapping. */
5390         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5391
5392         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5393                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5394
5395         start_pfn = iova >> VTD_PAGE_SHIFT;
5396         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5397
5398         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5399
5400         npages = last_pfn - start_pfn + 1;
5401
5402         for_each_domain_iommu(iommu_id, dmar_domain)
5403                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5404                                       start_pfn, npages, !freelist, 0);
5405
5406         dma_free_pagelist(freelist);
5407
5408         if (dmar_domain->max_addr == iova + size)
5409                 dmar_domain->max_addr = iova;
5410
5411         return size;
5412 }
5413
5414 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5415                                             dma_addr_t iova)
5416 {
5417         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5418         struct dma_pte *pte;
5419         int level = 0;
5420         u64 phys = 0;
5421
5422         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5423         if (pte && dma_pte_present(pte))
5424                 phys = dma_pte_addr(pte) +
5425                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5426                                                 VTD_PAGE_SHIFT) - 1));
5427
5428         return phys;
5429 }
5430
5431 static inline bool scalable_mode_support(void)
5432 {
5433         struct dmar_drhd_unit *drhd;
5434         struct intel_iommu *iommu;
5435         bool ret = true;
5436
5437         rcu_read_lock();
5438         for_each_active_iommu(iommu, drhd) {
5439                 if (!sm_supported(iommu)) {
5440                         ret = false;
5441                         break;
5442                 }
5443         }
5444         rcu_read_unlock();
5445
5446         return ret;
5447 }
5448
5449 static inline bool iommu_pasid_support(void)
5450 {
5451         struct dmar_drhd_unit *drhd;
5452         struct intel_iommu *iommu;
5453         bool ret = true;
5454
5455         rcu_read_lock();
5456         for_each_active_iommu(iommu, drhd) {
5457                 if (!pasid_supported(iommu)) {
5458                         ret = false;
5459                         break;
5460                 }
5461         }
5462         rcu_read_unlock();
5463
5464         return ret;
5465 }
5466
5467 static inline bool nested_mode_support(void)
5468 {
5469         struct dmar_drhd_unit *drhd;
5470         struct intel_iommu *iommu;
5471         bool ret = true;
5472
5473         rcu_read_lock();
5474         for_each_active_iommu(iommu, drhd) {
5475                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5476                         ret = false;
5477                         break;
5478                 }
5479         }
5480         rcu_read_unlock();
5481
5482         return ret;
5483 }
5484
5485 static bool intel_iommu_capable(enum iommu_cap cap)
5486 {
5487         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5488                 return domain_update_iommu_snooping(NULL) == 1;
5489         if (cap == IOMMU_CAP_INTR_REMAP)
5490                 return irq_remapping_enabled == 1;
5491
5492         return false;
5493 }
5494
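/*
 * iommu_ops->probe_device callback: look up the IOMMU serving the
 * device, mark the device for deferred attachment if translation was
 * pre-enabled on that IOMMU, and install the bounce-page dma_ops for
 * devices that need bounce buffering.
 */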
5495 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5496 {
5497         struct intel_iommu *iommu;
5498         u8 bus, devfn;
5499
5500         iommu = device_to_iommu(dev, &bus, &devfn);
5501         if (!iommu)
5502                 return ERR_PTR(-ENODEV);
5503
5504         if (translation_pre_enabled(iommu))
5505                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5506
5507         if (device_needs_bounce(dev)) {
5508                 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5509                 set_dma_ops(dev, &bounce_dma_ops);
5510         }
5511
5512         return &iommu->iommu;
5513 }
5514
5515 static void intel_iommu_release_device(struct device *dev)
5516 {
5517         struct intel_iommu *iommu;
5518         u8 bus, devfn;
5519
5520         iommu = device_to_iommu(dev, &bus, &devfn);
5521         if (!iommu)
5522                 return;
5523
5524         dmar_remove_one_dev_info(dev);
5525
5526         if (device_needs_bounce(dev))
5527                 set_dma_ops(dev, NULL);
5528 }
5529
5530 static void intel_iommu_get_resv_regions(struct device *device,
5531                                          struct list_head *head)
5532 {
5533         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5534         struct iommu_resv_region *reg;
5535         struct dmar_rmrr_unit *rmrr;
5536         struct device *i_dev;
5537         int i;
5538
5539         down_read(&dmar_global_lock);
5540         for_each_rmrr_units(rmrr) {
5541                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5542                                           i, i_dev) {
5543                         struct iommu_resv_region *resv;
5544                         enum iommu_resv_type type;
5545                         size_t length;
5546
5547                         if (i_dev != device &&
5548                             !is_downstream_to_pci_bridge(device, i_dev))
5549                                 continue;
5550
5551                         length = rmrr->end_address - rmrr->base_address + 1;
5552
5553                         type = device_rmrr_is_relaxable(device) ?
5554                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5555
5556                         resv = iommu_alloc_resv_region(rmrr->base_address,
5557                                                        length, prot, type);
5558                         if (!resv)
5559                                 break;
5560
5561                         list_add_tail(&resv->list, head);
5562                 }
5563         }
5564         up_read(&dmar_global_lock);
5565
5566 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5567         if (dev_is_pci(device)) {
5568                 struct pci_dev *pdev = to_pci_dev(device);
5569
5570                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5571                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5572                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5573                         if (reg)
5574                                 list_add_tail(&reg->list, head);
5575                 }
5576         }
5577 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5578
5579         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5580                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5581                                       0, IOMMU_RESV_MSI);
5582         if (!reg)
5583                 return;
5584         list_add_tail(&reg->list, head);
5585 }
5586
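/*
 * Enable PASID support for @dev behind @iommu: set the PASID-enable bit
 * in the device's context entry (flushing the context cache when the
 * bit is newly set) and enable the device-side PASID/ATS features via
 * iommu_enable_dev_iotlb().
 */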
5587 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5588 {
5589         struct device_domain_info *info;
5590         struct context_entry *context;
5591         struct dmar_domain *domain;
5592         unsigned long flags;
5593         u64 ctx_lo;
5594         int ret;
5595
5596         domain = find_domain(dev);
5597         if (!domain)
5598                 return -EINVAL;
5599
5600         spin_lock_irqsave(&device_domain_lock, flags);
5601         spin_lock(&iommu->lock);
5602
5603         ret = -EINVAL;
5604         info = dev->archdata.iommu;
5605         if (!info || !info->pasid_supported)
5606                 goto out;
5607
5608         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5609         if (WARN_ON(!context))
5610                 goto out;
5611
5612         ctx_lo = context[0].lo;
5613
5614         if (!(ctx_lo & CONTEXT_PASIDE)) {
5615                 ctx_lo |= CONTEXT_PASIDE;
5616                 context[0].lo = ctx_lo;
5617                 wmb();
5618                 iommu->flush.flush_context(iommu,
5619                                            domain->iommu_did[iommu->seq_id],
5620                                            PCI_DEVID(info->bus, info->devfn),
5621                                            DMA_CCMD_MASK_NOBIT,
5622                                            DMA_CCMD_DEVICE_INVL);
5623         }
5624
5625         /* Enable PASID support in the device, if it wasn't already */
5626         if (!info->pasid_enabled)
5627                 iommu_enable_dev_iotlb(info);
5628
5629         ret = 0;
5630
5631  out:
5632         spin_unlock(&iommu->lock);
5633         spin_unlock_irqrestore(&device_domain_lock, flags);
5634
5635         return ret;
5636 }
5637
5638 static void intel_iommu_apply_resv_region(struct device *dev,
5639                                           struct iommu_domain *domain,
5640                                           struct iommu_resv_region *region)
5641 {
5642         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5643         unsigned long start, end;
5644
5645         start = IOVA_PFN(region->start);
5646         end   = IOVA_PFN(region->start + region->length - 1);
5647
5648         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5649 }
5650
5651 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5652 {
5653         if (dev_is_pci(dev))
5654                 return pci_device_group(dev);
5655         return generic_device_group(dev);
5656 }
5657
5658 #ifdef CONFIG_INTEL_IOMMU_SVM
5659 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5660 {
5661         struct intel_iommu *iommu;
5662         u8 bus, devfn;
5663
5664         if (iommu_dummy(dev)) {
5665                 dev_warn(dev,
5666                          "No IOMMU translation for device; cannot enable SVM\n");
5667                 return NULL;
5668         }
5669
5670         iommu = device_to_iommu(dev, &bus, &devfn);
5671         if (!iommu) {
5672                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5673                 return NULL;
5674         }
5675
5676         return iommu;
5677 }
5678 #endif /* CONFIG_INTEL_IOMMU_SVM */
5679
5680 static int intel_iommu_enable_auxd(struct device *dev)
5681 {
5682         struct device_domain_info *info;
5683         struct intel_iommu *iommu;
5684         unsigned long flags;
5685         u8 bus, devfn;
5686         int ret;
5687
5688         iommu = device_to_iommu(dev, &bus, &devfn);
5689         if (!iommu || dmar_disabled)
5690                 return -EINVAL;
5691
5692         if (!sm_supported(iommu) || !pasid_supported(iommu))
5693                 return -EINVAL;
5694
5695         ret = intel_iommu_enable_pasid(iommu, dev);
5696         if (ret)
5697                 return -ENODEV;
5698
5699         spin_lock_irqsave(&device_domain_lock, flags);
5700         info = dev->archdata.iommu;
5701         info->auxd_enabled = 1;
5702         spin_unlock_irqrestore(&device_domain_lock, flags);
5703
5704         return 0;
5705 }
5706
5707 static int intel_iommu_disable_auxd(struct device *dev)
5708 {
5709         struct device_domain_info *info;
5710         unsigned long flags;
5711
5712         spin_lock_irqsave(&device_domain_lock, flags);
5713         info = dev->archdata.iommu;
5714         if (!WARN_ON(!info))
5715                 info->auxd_enabled = 0;
5716         spin_unlock_irqrestore(&device_domain_lock, flags);
5717
5718         return 0;
5719 }
5720
5721 /*
5722  * A PCI express designated vendor specific extended capability is defined
5723  * in the section 3.7 of Intel scalable I/O virtualization technical spec
5724  * for system software and tools to detect endpoint devices supporting the
5725  * Intel scalable IO virtualization without host driver dependency.
5726  *
5727  * Returns the address of the matching extended capability structure within
5728  * the device's PCI configuration space or 0 if the device does not support
5729  * it.
5730  */
5731 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5732 {
5733         int pos;
5734         u16 vendor, id;
5735
5736         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5737         while (pos) {
5738                 pci_read_config_word(pdev, pos + 4, &vendor);
5739                 pci_read_config_word(pdev, pos + 8, &id);
5740                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5741                         return pos;
5742
5743                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5744         }
5745
5746         return 0;
5747 }
5748
5749 static bool
5750 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5751 {
5752         if (feat == IOMMU_DEV_FEAT_AUX) {
5753                 int ret;
5754
5755                 if (!dev_is_pci(dev) || dmar_disabled ||
5756                     !scalable_mode_support() || !iommu_pasid_support())
5757                         return false;
5758
5759                 ret = pci_pasid_features(to_pci_dev(dev));
5760                 if (ret < 0)
5761                         return false;
5762
5763                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5764         }
5765
5766         return false;
5767 }
5768
5769 static int
5770 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5771 {
5772         if (feat == IOMMU_DEV_FEAT_AUX)
5773                 return intel_iommu_enable_auxd(dev);
5774
5775         return -ENODEV;
5776 }
5777
5778 static int
5779 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5780 {
5781         if (feat == IOMMU_DEV_FEAT_AUX)
5782                 return intel_iommu_disable_auxd(dev);
5783
5784         return -ENODEV;
5785 }
5786
5787 static bool
5788 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5789 {
5790         struct device_domain_info *info = dev->archdata.iommu;
5791
5792         if (feat == IOMMU_DEV_FEAT_AUX)
5793                 return scalable_mode_support() && info && info->auxd_enabled;
5794
5795         return false;
5796 }
5797
5798 static int
5799 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5800 {
5801         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5802
5803         return dmar_domain->default_pasid > 0 ?
5804                         dmar_domain->default_pasid : -EINVAL;
5805 }
5806
5807 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5808                                            struct device *dev)
5809 {
5810         return attach_deferred(dev);
5811 }
5812
5813 static int
5814 intel_iommu_domain_set_attr(struct iommu_domain *domain,
5815                             enum iommu_attr attr, void *data)
5816 {
5817         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5818         unsigned long flags;
5819         int ret = 0;
5820
5821         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
5822                 return -EINVAL;
5823
5824         switch (attr) {
5825         case DOMAIN_ATTR_NESTING:
5826                 spin_lock_irqsave(&device_domain_lock, flags);
5827                 if (nested_mode_support() &&
5828                     list_empty(&dmar_domain->devices)) {
5829                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5830                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5831                 } else {
5832                         ret = -ENODEV;
5833                 }
5834                 spin_unlock_irqrestore(&device_domain_lock, flags);
5835                 break;
5836         default:
5837                 ret = -EINVAL;
5838                 break;
5839         }
5840
5841         return ret;
5842 }
5843
5844 const struct iommu_ops intel_iommu_ops = {
5845         .capable                = intel_iommu_capable,
5846         .domain_alloc           = intel_iommu_domain_alloc,
5847         .domain_free            = intel_iommu_domain_free,
5848         .domain_set_attr        = intel_iommu_domain_set_attr,
5849         .attach_dev             = intel_iommu_attach_device,
5850         .detach_dev             = intel_iommu_detach_device,
5851         .aux_attach_dev         = intel_iommu_aux_attach_device,
5852         .aux_detach_dev         = intel_iommu_aux_detach_device,
5853         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5854         .map                    = intel_iommu_map,
5855         .unmap                  = intel_iommu_unmap,
5856         .iova_to_phys           = intel_iommu_iova_to_phys,
5857         .probe_device           = intel_iommu_probe_device,
5858         .release_device         = intel_iommu_release_device,
5859         .get_resv_regions       = intel_iommu_get_resv_regions,
5860         .put_resv_regions       = generic_iommu_put_resv_regions,
5861         .apply_resv_region      = intel_iommu_apply_resv_region,
5862         .device_group           = intel_iommu_device_group,
5863         .dev_has_feat           = intel_iommu_dev_has_feat,
5864         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5865         .dev_enable_feat        = intel_iommu_dev_enable_feat,
5866         .dev_disable_feat       = intel_iommu_dev_disable_feat,
5867         .is_attach_deferred     = intel_iommu_is_attach_deferred,
5868         .def_domain_type        = device_def_domain_type,
5869         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5870 };
5871
5872 static void quirk_iommu_igfx(struct pci_dev *dev)
5873 {
5874         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5875         dmar_map_gfx = 0;
5876 }
5877
5878 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5879 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5880 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5881 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5882 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5883 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5884 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5885 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5886
5887 /* Broadwell igfx malfunctions with dmar */
5888 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5889 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5890 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5891 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5892 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5893 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5894 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5895 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5896 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5897 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5898 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5899 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5900 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5901 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5902 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5903 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5904 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5905 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5906 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5907 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5908 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5909 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5910 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5911 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5912
5913 static void quirk_iommu_rwbf(struct pci_dev *dev)
5914 {
5915         /*
5916          * Mobile 4 Series Chipset neglects to set RWBF capability,
5917          * but needs it. Same seems to hold for the desktop versions.
5918          */
5919         pci_info(dev, "Forcing write-buffer flush capability\n");
5920         rwbf_quirk = 1;
5921 }
5922
5923 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5924 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5925 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5926 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5927 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5928 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5929 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
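/*
 * RWBF is the VT-d "Required Write-Buffer Flushing" capability: when it is
 * set, software must flush the chipset write buffer before assuming that
 * invalidations have taken effect.  rwbf_quirk forces that behaviour on
 * chipsets whose capability register fails to advertise it; roughly, the
 * flush paths elsewhere in this file boil down to:
 *
 *	if (rwbf_quirk || cap_rwbf(iommu->cap))
 *		iommu_flush_write_buffer(iommu);
 */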
5930
5931 #define GGC 0x52
5932 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5933 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5934 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5935 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5936 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5937 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5938 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5939 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
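/*
 * GGC is the graphics control register in the host bridge's config space
 * (offset 0x52 on the parts handled below); the masked field reports how
 * much stolen memory the BIOS set aside for the GTT and whether a VT-d
 * aware (shadow GTT) layout was enabled.  A minimal sketch of the decode
 * the quirk below relies on (ggc_vt_enabled() is illustrative only, not a
 * helper in this driver):
 *
 *	static bool ggc_vt_enabled(u16 ggc)
 *	{
 *		return (ggc & GGC_MEMORY_VT_ENABLED) != 0;
 *	}
 */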
5940
5941 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5942 {
5943         unsigned short ggc;
5944
5945         if (pci_read_config_word(dev, GGC, &ggc))
5946                 return;
5947
5948         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5949                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5950                 dmar_map_gfx = 0;
5951         } else if (dmar_map_gfx) {
5952                 /* we have to ensure the gfx device is idle before we flush */
5953                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5954                 intel_iommu_strict = 1;
5955         }
5956 }
5957 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5958 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5959 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5960 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
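/*
 * Setting intel_iommu_strict above means IOTLB invalidations are issued
 * synchronously on every unmap instead of being batched and deferred,
 * which is what the Ironlake graphics hardware needs here.  The same
 * behaviour can also be requested globally from the kernel command line
 * via the intel_iommu= option, e.g.:
 *
 *	intel_iommu=on,strict
 */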
5961
5962 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5963    ISOCH DMAR unit for the Azalia sound device, but not give it any
5964    TLB entries, which causes it to deadlock. Check for that.  We do
5965    this in a function called from init_dmars(), instead of in a PCI
5966    quirk, because we don't want to print the obnoxious "BIOS broken"
5967    message if VT-d is actually disabled.
5968 */
5969 static void __init check_tylersburg_isoch(void)
5970 {
5971         struct pci_dev *pdev;
5972         uint32_t vtisochctrl;
5973
5974         /* If there's no Azalia in the system anyway, forget it. */
5975         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5976         if (!pdev)
5977                 return;
5978         pci_dev_put(pdev);
5979
5980         /* System Management Registers. Might be hidden, in which case
5981            we can't do the sanity check. But that's OK, because the
5982            known-broken BIOSes _don't_ actually hide it, so far. */
5983         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5984         if (!pdev)
5985                 return;
5986
5987         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5988                 pci_dev_put(pdev);
5989                 return;
5990         }
5991
5992         pci_dev_put(pdev);
5993
5994         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5995         if (vtisochctrl & 1)
5996                 return;
5997
5998         /* Drop all bits other than the number of TLB entries */
5999         vtisochctrl &= 0x1c;
6000
6001         /* If we have the recommended number of TLB entries (16), fine. */
6002         if (vtisochctrl == 0x10)
6003                 return;
6004
6005         /* Zero TLB entries? You get to ride the short bus to school. */
6006         if (!vtisochctrl) {
6007                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6008                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6009                      dmi_get_system_info(DMI_BIOS_VENDOR),
6010                      dmi_get_system_info(DMI_BIOS_VERSION),
6011                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6012                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6013                 return;
6014         }
6015
6016         pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6017                vtisochctrl);
6018 }
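/*
 * IDENTMAP_AZALIA set above is consumed later, when default domain types
 * are chosen: an Azalia device on an affected system is forced into an
 * identity (pass-through) domain so its broken ISOCH DMAR unit is never
 * asked to translate.  Roughly, in device_def_domain_type():
 *
 *	if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
 *		return IOMMU_DOMAIN_IDENTITY;
 */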