drivers/iommu/intel-iommu.c (linux-2.6-microblaze.git)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size was a power-of-two multiple of 4KiB and
96  * that the mapping had natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are a power-of-two multiple of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
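/*
 * Editor's illustration (not part of the driver): ~0xFFFUL clears bits
 * 0-11 and sets every bit from 12 upward, so bit 12 (4KiB), bit 13
 * (8KiB), bit 21 (2MiB), bit 30 (1GiB), ... are all advertised; i.e.
 * every power-of-two size that is a multiple of 4KiB.
 */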
105
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
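/*
 * Worked example (editor's illustration, not upstream code): with
 * LEVEL_STRIDE = 9,
 *
 *   width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2
 *   agaw_to_level(2)  = 4              (4-level page table)
 *   agaw_to_width(2)  = 30 + 2 * 9     = 48
 *
 * and DEFAULT_DOMAIN_ADDRESS_WIDTH (57) gives agaw 3, i.e. a 5-level table.
 */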
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline unsigned long level_mask(int level)
132 {
133         return -1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long level_size(int level)
137 {
138         return 1UL << level_to_offset_bits(level);
139 }
140
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
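/*
 * Worked example (editor's illustration): with LEVEL_STRIDE = 9 and 4KiB
 * VT-d pages,
 *
 *   level_to_offset_bits(2) = 9,   level_size(2) = 512
 *   lvl_to_nr_pages(2) = 512       (a level-2 superpage covers 2MiB)
 *   lvl_to_nr_pages(3) = 262144    (a level-3 superpage covers 1GiB)
 */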
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
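/*
 * Editor's note (illustration only): on x86 both PAGE_SHIFT and
 * VTD_PAGE_SHIFT are 12, so the shift above is zero and these helpers are
 * identity conversions.  With hypothetical 64KiB MM pages (PAGE_SHIFT = 16)
 * one MM pfn would span 16 VT-d pfns, e.g. mm_to_dma_pfn(5) = 5 << 4 = 80.
 */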
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic kernel if can't successfully enable VT-d
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
289
290 /*
291  * This domain is a static identity mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
301
302 /*
303  * This is a DMA domain allocated through the iommu domain allocation
304  * interface. But one or more devices belonging to this domain have
305  * been chosen to use a private domain. We should avoid to use the
306  * map/unmap/iova_to_phys APIs on it.
307  */
308 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
309
310 /*
311  * When VT-d works in the scalable mode, it allows DMA translation to
312  * happen through either first level or second level page table. This
313  * bit marks that the DMA translation for the domain goes through the
314  * first level page table, otherwise, it goes through the second level.
315  */
316 #define DOMAIN_FLAG_USE_FIRST_LEVEL             BIT(2)
317
318 /*
319  * Domain represents a virtual machine which demands iommu nested
320  * translation mode support.
321  */
322 #define DOMAIN_FLAG_NESTING_MODE                BIT(3)
323
324 #define for_each_domain_iommu(idx, domain)                      \
325         for (idx = 0; idx < g_num_of_iommus; idx++)             \
326                 if (domain->iommu_refcnt[idx])
327
328 struct dmar_rmrr_unit {
329         struct list_head list;          /* list of rmrr units   */
330         struct acpi_dmar_header *hdr;   /* ACPI header          */
331         u64     base_address;           /* reserved base address*/
332         u64     end_address;            /* reserved end address */
333         struct dmar_dev_scope *devices; /* target devices */
334         int     devices_cnt;            /* target device count */
335 };
336
337 struct dmar_atsr_unit {
338         struct list_head list;          /* list of ATSR units */
339         struct acpi_dmar_header *hdr;   /* ACPI header */
340         struct dmar_dev_scope *devices; /* target devices */
341         int devices_cnt;                /* target device count */
342         u8 include_all:1;               /* include all ports */
343 };
344
345 static LIST_HEAD(dmar_atsr_units);
346 static LIST_HEAD(dmar_rmrr_units);
347
348 #define for_each_rmrr_units(rmrr) \
349         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
350
351 /* number of IOMMUs, used to index g_iommus[] */
352 static int g_num_of_iommus;
353
354 static void domain_exit(struct dmar_domain *domain);
355 static void domain_remove_dev_info(struct dmar_domain *domain);
356 static void dmar_remove_one_dev_info(struct device *dev);
357 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
358 static void domain_context_clear(struct intel_iommu *iommu,
359                                  struct device *dev);
360 static int domain_detach_iommu(struct dmar_domain *domain,
361                                struct intel_iommu *iommu);
362 static bool device_is_rmrr_locked(struct device *dev);
363 static int intel_iommu_attach_device(struct iommu_domain *domain,
364                                      struct device *dev);
365 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
366                                             dma_addr_t iova);
367
368 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
369 int dmar_disabled = 0;
370 #else
371 int dmar_disabled = 1;
372 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
373
374 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
375 int intel_iommu_sm = 1;
376 #else
377 int intel_iommu_sm;
378 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
379
380 int intel_iommu_enabled = 0;
381 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
382
383 static int dmar_map_gfx = 1;
384 static int dmar_forcedac;
385 static int intel_iommu_strict;
386 static int intel_iommu_superpage = 1;
387 static int iommu_identity_mapping;
388 static int intel_no_bounce;
389
390 #define IDENTMAP_GFX            2
391 #define IDENTMAP_AZALIA         4
392
393 int intel_iommu_gfx_mapped;
394 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
395
396 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
397 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
398 DEFINE_SPINLOCK(device_domain_lock);
399 static LIST_HEAD(device_domain_list);
400
401 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
402                                 to_pci_dev(d)->untrusted)
403
404 /*
405  * Iterate over elements in device_domain_list and call the specified
406  * callback @fn against each element.
407  */
408 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
409                                      void *data), void *data)
410 {
411         int ret = 0;
412         unsigned long flags;
413         struct device_domain_info *info;
414
415         spin_lock_irqsave(&device_domain_lock, flags);
416         list_for_each_entry(info, &device_domain_list, global) {
417                 ret = fn(info, data);
418                 if (ret) {
419                         spin_unlock_irqrestore(&device_domain_lock, flags);
420                         return ret;
421                 }
422         }
423         spin_unlock_irqrestore(&device_domain_lock, flags);
424
425         return 0;
426 }
427
428 const struct iommu_ops intel_iommu_ops;
429
430 static bool translation_pre_enabled(struct intel_iommu *iommu)
431 {
432         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
433 }
434
435 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
436 {
437         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
438 }
439
440 static void init_translation_status(struct intel_iommu *iommu)
441 {
442         u32 gsts;
443
444         gsts = readl(iommu->reg + DMAR_GSTS_REG);
445         if (gsts & DMA_GSTS_TES)
446                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
447 }
448
449 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
450 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
451 {
452         return container_of(dom, struct dmar_domain, domain);
453 }
454
455 static int __init intel_iommu_setup(char *str)
456 {
457         if (!str)
458                 return -EINVAL;
459         while (*str) {
460                 if (!strncmp(str, "on", 2)) {
461                         dmar_disabled = 0;
462                         pr_info("IOMMU enabled\n");
463                 } else if (!strncmp(str, "off", 3)) {
464                         dmar_disabled = 1;
465                         no_platform_optin = 1;
466                         pr_info("IOMMU disabled\n");
467                 } else if (!strncmp(str, "igfx_off", 8)) {
468                         dmar_map_gfx = 0;
469                         pr_info("Disable GFX device mapping\n");
470                 } else if (!strncmp(str, "forcedac", 8)) {
471                         pr_info("Forcing DAC for PCI devices\n");
472                         dmar_forcedac = 1;
473                 } else if (!strncmp(str, "strict", 6)) {
474                         pr_info("Disable batched IOTLB flush\n");
475                         intel_iommu_strict = 1;
476                 } else if (!strncmp(str, "sp_off", 6)) {
477                         pr_info("Disable supported super page\n");
478                         intel_iommu_superpage = 0;
479                 } else if (!strncmp(str, "sm_on", 5)) {
480                         pr_info("Intel-IOMMU: scalable mode supported\n");
481                         intel_iommu_sm = 1;
482                 } else if (!strncmp(str, "tboot_noforce", 13)) {
483                         printk(KERN_INFO
484                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
485                         intel_iommu_tboot_noforce = 1;
486                 } else if (!strncmp(str, "nobounce", 8)) {
487                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
488                         intel_no_bounce = 1;
489                 }
490
491                 str += strcspn(str, ",");
492                 while (*str == ',')
493                         str++;
494         }
495         return 0;
496 }
497 __setup("intel_iommu=", intel_iommu_setup);
498
499 static struct kmem_cache *iommu_domain_cache;
500 static struct kmem_cache *iommu_devinfo_cache;
501
502 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
503 {
504         struct dmar_domain **domains;
505         int idx = did >> 8;
506
507         domains = iommu->domains[idx];
508         if (!domains)
509                 return NULL;
510
511         return domains[did & 0xff];
512 }
513
514 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
515                              struct dmar_domain *domain)
516 {
517         struct dmar_domain **domains;
518         int idx = did >> 8;
519
520         if (!iommu->domains[idx]) {
521                 size_t size = 256 * sizeof(struct dmar_domain *);
522                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
523         }
524
525         domains = iommu->domains[idx];
526         if (WARN_ON(!domains))
527                 return;
528         else
529                 domains[did & 0xff] = domain;
530 }
531
532 void *alloc_pgtable_page(int node)
533 {
534         struct page *page;
535         void *vaddr = NULL;
536
537         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
538         if (page)
539                 vaddr = page_address(page);
540         return vaddr;
541 }
542
543 void free_pgtable_page(void *vaddr)
544 {
545         free_page((unsigned long)vaddr);
546 }
547
548 static inline void *alloc_domain_mem(void)
549 {
550         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
551 }
552
553 static void free_domain_mem(void *vaddr)
554 {
555         kmem_cache_free(iommu_domain_cache, vaddr);
556 }
557
558 static inline void * alloc_devinfo_mem(void)
559 {
560         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
561 }
562
563 static inline void free_devinfo_mem(void *vaddr)
564 {
565         kmem_cache_free(iommu_devinfo_cache, vaddr);
566 }
567
568 static inline int domain_type_is_si(struct dmar_domain *domain)
569 {
570         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
571 }
572
573 static inline bool domain_use_first_level(struct dmar_domain *domain)
574 {
575         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
576 }
577
578 static inline int domain_pfn_supported(struct dmar_domain *domain,
579                                        unsigned long pfn)
580 {
581         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
582
583         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
584 }
585
586 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
587 {
588         unsigned long sagaw;
589         int agaw = -1;
590
591         sagaw = cap_sagaw(iommu->cap);
592         for (agaw = width_to_agaw(max_gaw);
593              agaw >= 0; agaw--) {
594                 if (test_bit(agaw, &sagaw))
595                         break;
596         }
597
598         return agaw;
599 }
600
601 /*
602  * Calculate max SAGAW for each iommu.
603  */
604 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
605 {
606         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
607 }
608
609 /*
610  * Calculate the agaw for each iommu.
611  * "SAGAW" may differ across iommus, so use a default agaw and fall back
612  * to a smaller supported agaw for iommus that don't support the default.
613  */
614 int iommu_calculate_agaw(struct intel_iommu *iommu)
615 {
616         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
617 }
618
619 /* This function only returns a single iommu in a domain */
620 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
621 {
622         int iommu_id;
623
624         /* si_domain and vm domain should not get here. */
625         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
626                 return NULL;
627
628         for_each_domain_iommu(iommu_id, domain)
629                 break;
630
631         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
632                 return NULL;
633
634         return g_iommus[iommu_id];
635 }
636
637 static void domain_update_iommu_coherency(struct dmar_domain *domain)
638 {
639         struct dmar_drhd_unit *drhd;
640         struct intel_iommu *iommu;
641         bool found = false;
642         int i;
643
644         domain->iommu_coherency = 1;
645
646         for_each_domain_iommu(i, domain) {
647                 found = true;
648                 if (!ecap_coherent(g_iommus[i]->ecap)) {
649                         domain->iommu_coherency = 0;
650                         break;
651                 }
652         }
653         if (found)
654                 return;
655
656         /* No hardware attached; use lowest common denominator */
657         rcu_read_lock();
658         for_each_active_iommu(iommu, drhd) {
659                 if (!ecap_coherent(iommu->ecap)) {
660                         domain->iommu_coherency = 0;
661                         break;
662                 }
663         }
664         rcu_read_unlock();
665 }
666
667 static int domain_update_iommu_snooping(struct intel_iommu *skip)
668 {
669         struct dmar_drhd_unit *drhd;
670         struct intel_iommu *iommu;
671         int ret = 1;
672
673         rcu_read_lock();
674         for_each_active_iommu(iommu, drhd) {
675                 if (iommu != skip) {
676                         if (!ecap_sc_support(iommu->ecap)) {
677                                 ret = 0;
678                                 break;
679                         }
680                 }
681         }
682         rcu_read_unlock();
683
684         return ret;
685 }
686
687 static int domain_update_iommu_superpage(struct dmar_domain *domain,
688                                          struct intel_iommu *skip)
689 {
690         struct dmar_drhd_unit *drhd;
691         struct intel_iommu *iommu;
692         int mask = 0x3;
693
694         if (!intel_iommu_superpage) {
695                 return 0;
696         }
697
698         /* set iommu_superpage to the smallest common denominator */
699         rcu_read_lock();
700         for_each_active_iommu(iommu, drhd) {
701                 if (iommu != skip) {
702                         if (domain && domain_use_first_level(domain)) {
703                                 if (!cap_fl1gp_support(iommu->cap))
704                                         mask = 0x1;
705                         } else {
706                                 mask &= cap_super_page_val(iommu->cap);
707                         }
708
709                         if (!mask)
710                                 break;
711                 }
712         }
713         rcu_read_unlock();
714
715         return fls(mask);
716 }
717
718 /* Some capabilities may be different across iommus */
719 static void domain_update_iommu_cap(struct dmar_domain *domain)
720 {
721         domain_update_iommu_coherency(domain);
722         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
723         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
724 }
725
726 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
727                                          u8 devfn, int alloc)
728 {
729         struct root_entry *root = &iommu->root_entry[bus];
730         struct context_entry *context;
731         u64 *entry;
732
733         entry = &root->lo;
734         if (sm_supported(iommu)) {
735                 if (devfn >= 0x80) {
736                         devfn -= 0x80;
737                         entry = &root->hi;
738                 }
739                 devfn *= 2;
740         }
741         if (*entry & 1)
742                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
743         else {
744                 unsigned long phy_addr;
745                 if (!alloc)
746                         return NULL;
747
748                 context = alloc_pgtable_page(iommu->node);
749                 if (!context)
750                         return NULL;
751
752                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
753                 phy_addr = virt_to_phys((void *)context);
754                 *entry = phy_addr | 1;
755                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
756         }
757         return &context[devfn];
758 }
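/*
 * Editor's note (illustration, not an upstream comment): in scalable mode
 * each root entry covers one bus; the lower half (root->lo) points to the
 * context table for devfn 0x00-0x7f and the upper half (root->hi) to the
 * table for devfn 0x80-0xff, and each scalable-mode context entry occupies
 * two legacy-sized slots, hence the "devfn *= 2" above.  For example, bus 0
 * devfn 0x82: devfn >= 0x80 selects root->hi and leaves devfn = 0x02, then
 * devfn *= 2 yields index 4 into that context table.
 */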
759
760 static int iommu_dummy(struct device *dev)
761 {
762         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
763 }
764
765 static bool attach_deferred(struct device *dev)
766 {
767         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
768 }
769
770 /**
771  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
772  *                               sub-hierarchy of a candidate PCI-PCI bridge
773  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
774  * @bridge: the candidate PCI-PCI bridge
775  *
776  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
777  */
778 static bool
779 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
780 {
781         struct pci_dev *pdev, *pbridge;
782
783         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
784                 return false;
785
786         pdev = to_pci_dev(dev);
787         pbridge = to_pci_dev(bridge);
788
789         if (pbridge->subordinate &&
790             pbridge->subordinate->number <= pdev->bus->number &&
791             pbridge->subordinate->busn_res.end >= pdev->bus->number)
792                 return true;
793
794         return false;
795 }
796
797 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
798 {
799         struct dmar_drhd_unit *drhd = NULL;
800         struct intel_iommu *iommu;
801         struct device *tmp;
802         struct pci_dev *pdev = NULL;
803         u16 segment = 0;
804         int i;
805
806         if (iommu_dummy(dev))
807                 return NULL;
808
809         if (dev_is_pci(dev)) {
810                 struct pci_dev *pf_pdev;
811
812                 pdev = pci_real_dma_dev(to_pci_dev(dev));
813
814                 /* VFs aren't listed in scope tables; we need to look up
815                  * the PF instead to find the IOMMU. */
816                 pf_pdev = pci_physfn(pdev);
817                 dev = &pf_pdev->dev;
818                 segment = pci_domain_nr(pdev->bus);
819         } else if (has_acpi_companion(dev))
820                 dev = &ACPI_COMPANION(dev)->dev;
821
822         rcu_read_lock();
823         for_each_active_iommu(iommu, drhd) {
824                 if (pdev && segment != drhd->segment)
825                         continue;
826
827                 for_each_active_dev_scope(drhd->devices,
828                                           drhd->devices_cnt, i, tmp) {
829                         if (tmp == dev) {
830                                 /* For a VF use its original BDF# not that of the PF
831                                  * which we used for the IOMMU lookup. Strictly speaking
832                                  * we could do this for all PCI devices; we only need to
833                                  * get the BDF# from the scope table for ACPI matches. */
834                                 if (pdev && pdev->is_virtfn)
835                                         goto got_pdev;
836
837                                 *bus = drhd->devices[i].bus;
838                                 *devfn = drhd->devices[i].devfn;
839                                 goto out;
840                         }
841
842                         if (is_downstream_to_pci_bridge(dev, tmp))
843                                 goto got_pdev;
844                 }
845
846                 if (pdev && drhd->include_all) {
847                 got_pdev:
848                         *bus = pdev->bus->number;
849                         *devfn = pdev->devfn;
850                         goto out;
851                 }
852         }
853         iommu = NULL;
854  out:
855         rcu_read_unlock();
856
857         return iommu;
858 }
859
860 static void domain_flush_cache(struct dmar_domain *domain,
861                                void *addr, int size)
862 {
863         if (!domain->iommu_coherency)
864                 clflush_cache_range(addr, size);
865 }
866
867 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
868 {
869         struct context_entry *context;
870         int ret = 0;
871         unsigned long flags;
872
873         spin_lock_irqsave(&iommu->lock, flags);
874         context = iommu_context_addr(iommu, bus, devfn, 0);
875         if (context)
876                 ret = context_present(context);
877         spin_unlock_irqrestore(&iommu->lock, flags);
878         return ret;
879 }
880
881 static void free_context_table(struct intel_iommu *iommu)
882 {
883         int i;
884         unsigned long flags;
885         struct context_entry *context;
886
887         spin_lock_irqsave(&iommu->lock, flags);
888         if (!iommu->root_entry) {
889                 goto out;
890         }
891         for (i = 0; i < ROOT_ENTRY_NR; i++) {
892                 context = iommu_context_addr(iommu, i, 0, 0);
893                 if (context)
894                         free_pgtable_page(context);
895
896                 if (!sm_supported(iommu))
897                         continue;
898
899                 context = iommu_context_addr(iommu, i, 0x80, 0);
900                 if (context)
901                         free_pgtable_page(context);
902
903         }
904         free_pgtable_page(iommu->root_entry);
905         iommu->root_entry = NULL;
906 out:
907         spin_unlock_irqrestore(&iommu->lock, flags);
908 }
909
910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911                                       unsigned long pfn, int *target_level)
912 {
913         struct dma_pte *parent, *pte;
914         int level = agaw_to_level(domain->agaw);
915         int offset;
916
917         BUG_ON(!domain->pgd);
918
919         if (!domain_pfn_supported(domain, pfn))
920                 /* Address beyond IOMMU's addressing capabilities. */
921                 return NULL;
922
923         parent = domain->pgd;
924
925         while (1) {
926                 void *tmp_page;
927
928                 offset = pfn_level_offset(pfn, level);
929                 pte = &parent[offset];
930                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
931                         break;
932                 if (level == *target_level)
933                         break;
934
935                 if (!dma_pte_present(pte)) {
936                         uint64_t pteval;
937
938                         tmp_page = alloc_pgtable_page(domain->nid);
939
940                         if (!tmp_page)
941                                 return NULL;
942
943                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
944                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
945                         if (domain_use_first_level(domain))
946                                 pteval |= DMA_FL_PTE_XD;
947                         if (cmpxchg64(&pte->val, 0ULL, pteval))
948                                 /* Someone else set it while we were thinking; use theirs. */
949                                 free_pgtable_page(tmp_page);
950                         else
951                                 domain_flush_cache(domain, pte, sizeof(*pte));
952                 }
953                 if (level == 1)
954                         break;
955
956                 parent = phys_to_virt(dma_pte_addr(pte));
957                 level--;
958         }
959
960         if (!*target_level)
961                 *target_level = level;
962
963         return pte;
964 }
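/*
 * Editor's note (illustration only): callers use *target_level to pick the
 * leaf they want.  *target_level == 1 asks for a 4KiB leaf, 2 for a 2MiB
 * superpage slot, and 0 means "don't allocate to a specific level, just
 * walk to the existing leaf"; in that last case the walk stops at the first
 * superpage or non-present entry and the level reached is reported back
 * through *target_level.
 */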
965
966 /* return address's pte at specific level */
967 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
968                                          unsigned long pfn,
969                                          int level, int *large_page)
970 {
971         struct dma_pte *parent, *pte;
972         int total = agaw_to_level(domain->agaw);
973         int offset;
974
975         parent = domain->pgd;
976         while (level <= total) {
977                 offset = pfn_level_offset(pfn, total);
978                 pte = &parent[offset];
979                 if (level == total)
980                         return pte;
981
982                 if (!dma_pte_present(pte)) {
983                         *large_page = total;
984                         break;
985                 }
986
987                 if (dma_pte_superpage(pte)) {
988                         *large_page = total;
989                         return pte;
990                 }
991
992                 parent = phys_to_virt(dma_pte_addr(pte));
993                 total--;
994         }
995         return NULL;
996 }
997
998 /* clear last level pte, a tlb flush should be followed */
999 static void dma_pte_clear_range(struct dmar_domain *domain,
1000                                 unsigned long start_pfn,
1001                                 unsigned long last_pfn)
1002 {
1003         unsigned int large_page;
1004         struct dma_pte *first_pte, *pte;
1005
1006         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1007         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1008         BUG_ON(start_pfn > last_pfn);
1009
1010         /* we don't need lock here; nobody else touches the iova range */
1011         do {
1012                 large_page = 1;
1013                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1014                 if (!pte) {
1015                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1016                         continue;
1017                 }
1018                 do {
1019                         dma_clear_pte(pte);
1020                         start_pfn += lvl_to_nr_pages(large_page);
1021                         pte++;
1022                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1023
1024                 domain_flush_cache(domain, first_pte,
1025                                    (void *)pte - (void *)first_pte);
1026
1027         } while (start_pfn && start_pfn <= last_pfn);
1028 }
1029
1030 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1031                                int retain_level, struct dma_pte *pte,
1032                                unsigned long pfn, unsigned long start_pfn,
1033                                unsigned long last_pfn)
1034 {
1035         pfn = max(start_pfn, pfn);
1036         pte = &pte[pfn_level_offset(pfn, level)];
1037
1038         do {
1039                 unsigned long level_pfn;
1040                 struct dma_pte *level_pte;
1041
1042                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1043                         goto next;
1044
1045                 level_pfn = pfn & level_mask(level);
1046                 level_pte = phys_to_virt(dma_pte_addr(pte));
1047
1048                 if (level > 2) {
1049                         dma_pte_free_level(domain, level - 1, retain_level,
1050                                            level_pte, level_pfn, start_pfn,
1051                                            last_pfn);
1052                 }
1053
1054                 /*
1055                  * Free the page table if we're below the level we want to
1056                  * retain and the range covers the entire table.
1057                  */
1058                 if (level < retain_level && !(start_pfn > level_pfn ||
1059                       last_pfn < level_pfn + level_size(level) - 1)) {
1060                         dma_clear_pte(pte);
1061                         domain_flush_cache(domain, pte, sizeof(*pte));
1062                         free_pgtable_page(level_pte);
1063                 }
1064 next:
1065                 pfn += level_size(level);
1066         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1067 }
1068
1069 /*
1070  * clear last level (leaf) ptes and free page table pages below the
1071  * level we wish to keep intact.
1072  */
1073 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1074                                    unsigned long start_pfn,
1075                                    unsigned long last_pfn,
1076                                    int retain_level)
1077 {
1078         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1079         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1080         BUG_ON(start_pfn > last_pfn);
1081
1082         dma_pte_clear_range(domain, start_pfn, last_pfn);
1083
1084         /* We don't need lock here; nobody else touches the iova range */
1085         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1086                            domain->pgd, 0, start_pfn, last_pfn);
1087
1088         /* free pgd */
1089         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1090                 free_pgtable_page(domain->pgd);
1091                 domain->pgd = NULL;
1092         }
1093 }
1094
1095 /* When a page at a given level is being unlinked from its parent, we don't
1096    need to *modify* it at all. All we need to do is make a list of all the
1097    pages which can be freed just as soon as we've flushed the IOTLB and we
1098    know the hardware page-walk will no longer touch them.
1099    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1100    be freed. */
1101 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1102                                             int level, struct dma_pte *pte,
1103                                             struct page *freelist)
1104 {
1105         struct page *pg;
1106
1107         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1108         pg->freelist = freelist;
1109         freelist = pg;
1110
1111         if (level == 1)
1112                 return freelist;
1113
1114         pte = page_address(pg);
1115         do {
1116                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117                         freelist = dma_pte_list_pagetables(domain, level - 1,
1118                                                            pte, freelist);
1119                 pte++;
1120         } while (!first_pte_in_page(pte));
1121
1122         return freelist;
1123 }
1124
1125 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1126                                         struct dma_pte *pte, unsigned long pfn,
1127                                         unsigned long start_pfn,
1128                                         unsigned long last_pfn,
1129                                         struct page *freelist)
1130 {
1131         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1132
1133         pfn = max(start_pfn, pfn);
1134         pte = &pte[pfn_level_offset(pfn, level)];
1135
1136         do {
1137                 unsigned long level_pfn;
1138
1139                 if (!dma_pte_present(pte))
1140                         goto next;
1141
1142                 level_pfn = pfn & level_mask(level);
1143
1144                 /* If range covers entire pagetable, free it */
1145                 if (start_pfn <= level_pfn &&
1146                     last_pfn >= level_pfn + level_size(level) - 1) {
1147                         /* These subordinate page tables are going away entirely. Don't
1148                            bother to clear them; we're just going to *free* them. */
1149                         if (level > 1 && !dma_pte_superpage(pte))
1150                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1151
1152                         dma_clear_pte(pte);
1153                         if (!first_pte)
1154                                 first_pte = pte;
1155                         last_pte = pte;
1156                 } else if (level > 1) {
1157                         /* Recurse down into a level that isn't *entirely* obsolete */
1158                         freelist = dma_pte_clear_level(domain, level - 1,
1159                                                        phys_to_virt(dma_pte_addr(pte)),
1160                                                        level_pfn, start_pfn, last_pfn,
1161                                                        freelist);
1162                 }
1163 next:
1164                 pfn += level_size(level);
1165         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1166
1167         if (first_pte)
1168                 domain_flush_cache(domain, first_pte,
1169                                    (void *)++last_pte - (void *)first_pte);
1170
1171         return freelist;
1172 }
1173
1174 /* We can't just free the pages because the IOMMU may still be walking
1175    the page tables, and may have cached the intermediate levels. The
1176    pages can only be freed after the IOTLB flush has been done. */
1177 static struct page *domain_unmap(struct dmar_domain *domain,
1178                                  unsigned long start_pfn,
1179                                  unsigned long last_pfn)
1180 {
1181         struct page *freelist;
1182
1183         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1184         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1185         BUG_ON(start_pfn > last_pfn);
1186
1187         /* we don't need lock here; nobody else touches the iova range */
1188         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1189                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1190
1191         /* free pgd */
1192         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1193                 struct page *pgd_page = virt_to_page(domain->pgd);
1194                 pgd_page->freelist = freelist;
1195                 freelist = pgd_page;
1196
1197                 domain->pgd = NULL;
1198         }
1199
1200         return freelist;
1201 }
1202
1203 static void dma_free_pagelist(struct page *freelist)
1204 {
1205         struct page *pg;
1206
1207         while ((pg = freelist)) {
1208                 freelist = pg->freelist;
1209                 free_pgtable_page(page_address(pg));
1210         }
1211 }
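/*
 * Usage sketch (editor's illustration of the pattern callers follow later
 * in this driver): collect the page-table pages, flush the IOTLB, and only
 * then free the list, e.g.
 *
 *   struct page *freelist = domain_unmap(domain, start_pfn, last_pfn);
 *   iommu_flush_iotlb_psi(iommu, domain, start_pfn, nrpages, 0, 0);
 *   dma_free_pagelist(freelist);
 *
 * where nrpages = last_pfn - start_pfn + 1.
 */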
1212
1213 static void iova_entry_free(unsigned long data)
1214 {
1215         struct page *freelist = (struct page *)data;
1216
1217         dma_free_pagelist(freelist);
1218 }
1219
1220 /* iommu handling */
1221 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1222 {
1223         struct root_entry *root;
1224         unsigned long flags;
1225
1226         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1227         if (!root) {
1228                 pr_err("Allocating root entry for %s failed\n",
1229                         iommu->name);
1230                 return -ENOMEM;
1231         }
1232
1233         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1234
1235         spin_lock_irqsave(&iommu->lock, flags);
1236         iommu->root_entry = root;
1237         spin_unlock_irqrestore(&iommu->lock, flags);
1238
1239         return 0;
1240 }
1241
1242 static void iommu_set_root_entry(struct intel_iommu *iommu)
1243 {
1244         u64 addr;
1245         u32 sts;
1246         unsigned long flag;
1247
1248         addr = virt_to_phys(iommu->root_entry);
1249         if (sm_supported(iommu))
1250                 addr |= DMA_RTADDR_SMT;
1251
1252         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1253         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1254
1255         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1256
1257         /* Make sure hardware completes it */
1258         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1259                       readl, (sts & DMA_GSTS_RTPS), sts);
1260
1261         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1262 }
1263
1264 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1265 {
1266         u32 val;
1267         unsigned long flag;
1268
1269         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1270                 return;
1271
1272         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1273         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1274
1275         /* Make sure hardware completes it */
1276         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1277                       readl, (!(val & DMA_GSTS_WBFS)), val);
1278
1279         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1280 }
1281
1282 /* return value determines whether we need a write buffer flush */
1283 static void __iommu_flush_context(struct intel_iommu *iommu,
1284                                   u16 did, u16 source_id, u8 function_mask,
1285                                   u64 type)
1286 {
1287         u64 val = 0;
1288         unsigned long flag;
1289
1290         switch (type) {
1291         case DMA_CCMD_GLOBAL_INVL:
1292                 val = DMA_CCMD_GLOBAL_INVL;
1293                 break;
1294         case DMA_CCMD_DOMAIN_INVL:
1295                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1296                 break;
1297         case DMA_CCMD_DEVICE_INVL:
1298                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1299                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1300                 break;
1301         default:
1302                 BUG();
1303         }
1304         val |= DMA_CCMD_ICC;
1305
1306         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1307         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1308
1309         /* Make sure hardware completes it */
1310         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1311                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1312
1313         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1314 }
1315
1316 /* return value determines whether we need a write buffer flush */
1317 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1318                                 u64 addr, unsigned int size_order, u64 type)
1319 {
1320         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1321         u64 val = 0, val_iva = 0;
1322         unsigned long flag;
1323
1324         switch (type) {
1325         case DMA_TLB_GLOBAL_FLUSH:
1326                 /* global flush doesn't need to set IVA_REG */
1327                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1328                 break;
1329         case DMA_TLB_DSI_FLUSH:
1330                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1331                 break;
1332         case DMA_TLB_PSI_FLUSH:
1333                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1334                 /* IH bit is passed in as part of address */
1335                 val_iva = size_order | addr;
1336                 break;
1337         default:
1338                 BUG();
1339         }
1340         /* Note: set drain read/write */
1341 #if 0
1342         /*
1343          * This is probably only here to be extra safe. Looks like we can
1344          * ignore it without any impact.
1345          */
1346         if (cap_read_drain(iommu->cap))
1347                 val |= DMA_TLB_READ_DRAIN;
1348 #endif
1349         if (cap_write_drain(iommu->cap))
1350                 val |= DMA_TLB_WRITE_DRAIN;
1351
1352         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1353         /* Note: Only uses first TLB reg currently */
1354         if (val_iva)
1355                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1356         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1357
1358         /* Make sure hardware completes it */
1359         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1360                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1361
1362         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1363
1364         /* check IOTLB invalidation granularity */
1365         if (DMA_TLB_IAIG(val) == 0)
1366                 pr_err("Flush IOTLB failed\n");
1367         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1368                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1369                         (unsigned long long)DMA_TLB_IIRG(type),
1370                         (unsigned long long)DMA_TLB_IAIG(val));
1371 }
1372
1373 static struct device_domain_info *
1374 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1375                          u8 bus, u8 devfn)
1376 {
1377         struct device_domain_info *info;
1378
1379         assert_spin_locked(&device_domain_lock);
1380
1381         if (!iommu->qi)
1382                 return NULL;
1383
1384         list_for_each_entry(info, &domain->devices, link)
1385                 if (info->iommu == iommu && info->bus == bus &&
1386                     info->devfn == devfn) {
1387                         if (info->ats_supported && info->dev)
1388                                 return info;
1389                         break;
1390                 }
1391
1392         return NULL;
1393 }
1394
1395 static void domain_update_iotlb(struct dmar_domain *domain)
1396 {
1397         struct device_domain_info *info;
1398         bool has_iotlb_device = false;
1399
1400         assert_spin_locked(&device_domain_lock);
1401
1402         list_for_each_entry(info, &domain->devices, link) {
1403                 struct pci_dev *pdev;
1404
1405                 if (!info->dev || !dev_is_pci(info->dev))
1406                         continue;
1407
1408                 pdev = to_pci_dev(info->dev);
1409                 if (pdev->ats_enabled) {
1410                         has_iotlb_device = true;
1411                         break;
1412                 }
1413         }
1414
1415         domain->has_iotlb_device = has_iotlb_device;
1416 }
1417
1418 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1419 {
1420         struct pci_dev *pdev;
1421
1422         assert_spin_locked(&device_domain_lock);
1423
1424         if (!info || !dev_is_pci(info->dev))
1425                 return;
1426
1427         pdev = to_pci_dev(info->dev);
1428         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1429          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1430          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1431          * reserved, which should be set to 0.
1432          */
1433         if (!ecap_dit(info->iommu->ecap))
1434                 info->pfsid = 0;
1435         else {
1436                 struct pci_dev *pf_pdev;
1437
1438                 /* pdev will be returned if device is not a vf */
1439                 pf_pdev = pci_physfn(pdev);
1440                 info->pfsid = pci_dev_id(pf_pdev);
1441         }
1442
1443 #ifdef CONFIG_INTEL_IOMMU_SVM
1444         /* The PCIe spec, in its wisdom, declares that the behaviour of
1445            the device if you enable PASID support after ATS support is
1446            undefined. So always enable PASID support on devices which
1447            have it, even if we can't yet know if we're ever going to
1448            use it. */
1449         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1450                 info->pasid_enabled = 1;
1451
1452         if (info->pri_supported &&
1453             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1454             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1455                 info->pri_enabled = 1;
1456 #endif
1457         if (!pdev->untrusted && info->ats_supported &&
1458             pci_ats_page_aligned(pdev) &&
1459             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1460                 info->ats_enabled = 1;
1461                 domain_update_iotlb(info->domain);
1462                 info->ats_qdep = pci_ats_queue_depth(pdev);
1463         }
1464 }
1465
1466 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1467 {
1468         struct pci_dev *pdev;
1469
1470         assert_spin_locked(&device_domain_lock);
1471
1472         if (!dev_is_pci(info->dev))
1473                 return;
1474
1475         pdev = to_pci_dev(info->dev);
1476
1477         if (info->ats_enabled) {
1478                 pci_disable_ats(pdev);
1479                 info->ats_enabled = 0;
1480                 domain_update_iotlb(info->domain);
1481         }
1482 #ifdef CONFIG_INTEL_IOMMU_SVM
1483         if (info->pri_enabled) {
1484                 pci_disable_pri(pdev);
1485                 info->pri_enabled = 0;
1486         }
1487         if (info->pasid_enabled) {
1488                 pci_disable_pasid(pdev);
1489                 info->pasid_enabled = 0;
1490         }
1491 #endif
1492 }
1493
1494 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1495                                   u64 addr, unsigned mask)
1496 {
1497         u16 sid, qdep;
1498         unsigned long flags;
1499         struct device_domain_info *info;
1500
1501         if (!domain->has_iotlb_device)
1502                 return;
1503
1504         spin_lock_irqsave(&device_domain_lock, flags);
1505         list_for_each_entry(info, &domain->devices, link) {
1506                 if (!info->ats_enabled)
1507                         continue;
1508
1509                 sid = info->bus << 8 | info->devfn;
1510                 qdep = info->ats_qdep;
1511                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1512                                 qdep, addr, mask);
1513         }
1514         spin_unlock_irqrestore(&device_domain_lock, flags);
1515 }
1516
1517 static void domain_flush_piotlb(struct intel_iommu *iommu,
1518                                 struct dmar_domain *domain,
1519                                 u64 addr, unsigned long npages, bool ih)
1520 {
1521         u16 did = domain->iommu_did[iommu->seq_id];
1522
1523         if (domain->default_pasid)
1524                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1525                                 addr, npages, ih);
1526
1527         if (!list_empty(&domain->devices))
1528                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1529 }
1530
1531 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1532                                   struct dmar_domain *domain,
1533                                   unsigned long pfn, unsigned int pages,
1534                                   int ih, int map)
1535 {
1536         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1537         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1538         u16 did = domain->iommu_did[iommu->seq_id];
1539
1540         BUG_ON(pages == 0);
1541
1542         if (ih)
1543                 ih = 1 << 6;
1544
1545         if (domain_use_first_level(domain)) {
1546                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1547         } else {
1548                 /*
1549                  * Fall back to domain-selective flush if there is no PSI support
1550                  * or the size is too big. PSI requires the invalidation range to
1551                  * cover a power-of-two number of pages, naturally aligned to that size.
1552                  */
1553                 if (!cap_pgsel_inv(iommu->cap) ||
1554                     mask > cap_max_amask_val(iommu->cap))
1555                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1556                                                         DMA_TLB_DSI_FLUSH);
1557                 else
1558                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1559                                                         DMA_TLB_PSI_FLUSH);
1560         }
1561
1562         /*
1563          * In caching mode, changes of pages from non-present to present require
1564          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1565          */
1566         if (!cap_caching_mode(iommu->cap) || !map)
1567                 iommu_flush_dev_iotlb(domain, addr, mask);
1568 }
1569
1570 /* Notification for newly created mappings */
1571 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1572                                         struct dmar_domain *domain,
1573                                         unsigned long pfn, unsigned int pages)
1574 {
1575         /*
1576          * It's a non-present to present mapping. Only flush if caching mode
1577          * and second level.
1578          */
1579         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1580                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1581         else
1582                 iommu_flush_write_buffer(iommu);
1583 }
1584
1585 static void iommu_flush_iova(struct iova_domain *iovad)
1586 {
1587         struct dmar_domain *domain;
1588         int idx;
1589
1590         domain = container_of(iovad, struct dmar_domain, iovad);
1591
1592         for_each_domain_iommu(idx, domain) {
1593                 struct intel_iommu *iommu = g_iommus[idx];
1594                 u16 did = domain->iommu_did[iommu->seq_id];
1595
1596                 if (domain_use_first_level(domain))
1597                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1598                 else
1599                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1600                                                  DMA_TLB_DSI_FLUSH);
1601
1602                 if (!cap_caching_mode(iommu->cap))
1603                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1604                                               0, MAX_AGAW_PFN_WIDTH);
1605         }
1606 }
1607
1608 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1609 {
1610         u32 pmen;
1611         unsigned long flags;
1612
1613         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1614                 return;
1615
1616         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1617         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1618         pmen &= ~DMA_PMEN_EPM;
1619         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1620
1621         /* wait for the protected region status bit to clear */
1622         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1623                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1624
1625         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1626 }
1627
1628 static void iommu_enable_translation(struct intel_iommu *iommu)
1629 {
1630         u32 sts;
1631         unsigned long flags;
1632
1633         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1634         iommu->gcmd |= DMA_GCMD_TE;
1635         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1636
1637         /* Make sure the hardware completes it */
1638         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1639                       readl, (sts & DMA_GSTS_TES), sts);
1640
1641         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1642 }
1643
1644 static void iommu_disable_translation(struct intel_iommu *iommu)
1645 {
1646         u32 sts;
1647         unsigned long flag;
1648
1649         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1650         iommu->gcmd &= ~DMA_GCMD_TE;
1651         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1652
1653         /* Make sure the hardware completes it */
1654         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1655                       readl, (!(sts & DMA_GSTS_TES)), sts);
1656
1657         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1658 }
1659
1660 static int iommu_init_domains(struct intel_iommu *iommu)
1661 {
1662         u32 ndomains, nlongs;
1663         size_t size;
1664
1665         ndomains = cap_ndoms(iommu->cap);
1666         pr_debug("%s: Number of Domains supported <%d>\n",
1667                  iommu->name, ndomains);
1668         nlongs = BITS_TO_LONGS(ndomains);
1669
1670         spin_lock_init(&iommu->lock);
1671
1672         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1673         if (!iommu->domain_ids) {
1674                 pr_err("%s: Allocating domain id array failed\n",
1675                        iommu->name);
1676                 return -ENOMEM;
1677         }
1678
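             /*
              * iommu->domains is kept as a two-level table: an outer array
              * with one slot per 256 domain-ids, each slot pointing at a
              * 256-entry array of dmar_domain pointers.  Illustration
              * (assumed value): with cap_ndoms() == 65536 the outer array
              * has 256 slots; only slot 0 is allocated here, the remaining
              * chunks are allocated on demand when a domain-id in their
              * range is first used.
              */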
1679         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1680         iommu->domains = kzalloc(size, GFP_KERNEL);
1681
1682         if (iommu->domains) {
1683                 size = 256 * sizeof(struct dmar_domain *);
1684                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1685         }
1686
1687         if (!iommu->domains || !iommu->domains[0]) {
1688                 pr_err("%s: Allocating domain array failed\n",
1689                        iommu->name);
1690                 kfree(iommu->domain_ids);
1691                 kfree(iommu->domains);
1692                 iommu->domain_ids = NULL;
1693                 iommu->domains    = NULL;
1694                 return -ENOMEM;
1695         }
1696
1697         /*
1698          * If Caching mode is set, then invalid translations are tagged
1699          * with domain-id 0, hence we need to pre-allocate it. We also
1700          * use domain-id 0 as a marker for non-allocated domain-id, so
1701          * make sure it is not used for a real domain.
1702          */
1703         set_bit(0, iommu->domain_ids);
1704
1705         /*
1706          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1707          * entry for first-level or pass-through translation modes should
1708          * be programmed with a domain id different from those used for
1709          * second-level or nested translation. We reserve a domain id for
1710          * this purpose.
1711          */
1712         if (sm_supported(iommu))
1713                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1714
1715         return 0;
1716 }
1717
1718 static void disable_dmar_iommu(struct intel_iommu *iommu)
1719 {
1720         struct device_domain_info *info, *tmp;
1721         unsigned long flags;
1722
1723         if (!iommu->domains || !iommu->domain_ids)
1724                 return;
1725
1726         spin_lock_irqsave(&device_domain_lock, flags);
1727         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1728                 if (info->iommu != iommu)
1729                         continue;
1730
1731                 if (!info->dev || !info->domain)
1732                         continue;
1733
1734                 __dmar_remove_one_dev_info(info);
1735         }
1736         spin_unlock_irqrestore(&device_domain_lock, flags);
1737
1738         if (iommu->gcmd & DMA_GCMD_TE)
1739                 iommu_disable_translation(iommu);
1740 }
1741
1742 static void free_dmar_iommu(struct intel_iommu *iommu)
1743 {
1744         if ((iommu->domains) && (iommu->domain_ids)) {
1745                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1746                 int i;
1747
1748                 for (i = 0; i < elems; i++)
1749                         kfree(iommu->domains[i]);
1750                 kfree(iommu->domains);
1751                 kfree(iommu->domain_ids);
1752                 iommu->domains = NULL;
1753                 iommu->domain_ids = NULL;
1754         }
1755
1756         g_iommus[iommu->seq_id] = NULL;
1757
1758         /* free context mapping */
1759         free_context_table(iommu);
1760
1761 #ifdef CONFIG_INTEL_IOMMU_SVM
1762         if (pasid_supported(iommu)) {
1763                 if (ecap_prs(iommu->ecap))
1764                         intel_svm_finish_prq(iommu);
1765         }
1766 #endif
1767 }
1768
1769 /*
1770  * Check and return whether first level is used by default for
1771  * DMA translation.
1772  */
1773 static bool first_level_by_default(void)
1774 {
1775         struct dmar_drhd_unit *drhd;
1776         struct intel_iommu *iommu;
1777         static int first_level_support = -1;
1778
1779         if (likely(first_level_support != -1))
1780                 return first_level_support;
1781
1782         first_level_support = 1;
1783
1784         rcu_read_lock();
1785         for_each_active_iommu(iommu, drhd) {
1786                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1787                         first_level_support = 0;
1788                         break;
1789                 }
1790         }
1791         rcu_read_unlock();
1792
1793         return first_level_support;
1794 }
1795
1796 static struct dmar_domain *alloc_domain(int flags)
1797 {
1798         struct dmar_domain *domain;
1799
1800         domain = alloc_domain_mem();
1801         if (!domain)
1802                 return NULL;
1803
1804         memset(domain, 0, sizeof(*domain));
1805         domain->nid = NUMA_NO_NODE;
1806         domain->flags = flags;
1807         if (first_level_by_default())
1808                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1809         domain->has_iotlb_device = false;
1810         INIT_LIST_HEAD(&domain->devices);
1811
1812         return domain;
1813 }
1814
1815 /* Must be called with device_domain_lock and iommu->lock held */
1816 static int domain_attach_iommu(struct dmar_domain *domain,
1817                                struct intel_iommu *iommu)
1818 {
1819         unsigned long ndomains;
1820         int num;
1821
1822         assert_spin_locked(&device_domain_lock);
1823         assert_spin_locked(&iommu->lock);
1824
1825         domain->iommu_refcnt[iommu->seq_id] += 1;
1826         domain->iommu_count += 1;
1827         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1828                 ndomains = cap_ndoms(iommu->cap);
1829                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1830
1831                 if (num >= ndomains) {
1832                         pr_err("%s: No free domain ids\n", iommu->name);
1833                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1834                         domain->iommu_count -= 1;
1835                         return -ENOSPC;
1836                 }
1837
1838                 set_bit(num, iommu->domain_ids);
1839                 set_iommu_domain(iommu, num, domain);
1840
1841                 domain->iommu_did[iommu->seq_id] = num;
1842                 domain->nid                      = iommu->node;
1843
1844                 domain_update_iommu_cap(domain);
1845         }
1846
1847         return 0;
1848 }
1849
1850 static int domain_detach_iommu(struct dmar_domain *domain,
1851                                struct intel_iommu *iommu)
1852 {
1853         int num, count;
1854
1855         assert_spin_locked(&device_domain_lock);
1856         assert_spin_locked(&iommu->lock);
1857
1858         domain->iommu_refcnt[iommu->seq_id] -= 1;
1859         count = --domain->iommu_count;
1860         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1861                 num = domain->iommu_did[iommu->seq_id];
1862                 clear_bit(num, iommu->domain_ids);
1863                 set_iommu_domain(iommu, num, NULL);
1864
1865                 domain_update_iommu_cap(domain);
1866                 domain->iommu_did[iommu->seq_id] = 0;
1867         }
1868
1869         return count;
1870 }
1871
1872 static struct iova_domain reserved_iova_list;
1873 static struct lock_class_key reserved_rbtree_key;
1874
1875 static int dmar_init_reserved_ranges(void)
1876 {
1877         struct pci_dev *pdev = NULL;
1878         struct iova *iova;
1879         int i;
1880
1881         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1882
1883         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1884                 &reserved_rbtree_key);
1885
1886         /* IOAPIC ranges shouldn't be accessed by DMA */
1887         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1888                 IOVA_PFN(IOAPIC_RANGE_END));
1889         if (!iova) {
1890                 pr_err("Reserve IOAPIC range failed\n");
1891                 return -ENODEV;
1892         }
1893
1894         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1895         for_each_pci_dev(pdev) {
1896                 struct resource *r;
1897
1898                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1899                         r = &pdev->resource[i];
1900                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1901                                 continue;
1902                         iova = reserve_iova(&reserved_iova_list,
1903                                             IOVA_PFN(r->start),
1904                                             IOVA_PFN(r->end));
1905                         if (!iova) {
1906                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1907                                 return -ENODEV;
1908                         }
1909                 }
1910         }
1911         return 0;
1912 }
1913
1914 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1915 {
1916         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1917 }
1918
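     /*
      * Round a guest address width up to the next width that maps onto a
      * whole number of 9-bit page-table levels above the 12-bit page
      * offset.  Worked examples (values assumed): gaw 48 is already
      * 12 + 4 * 9, so it stays 48; gaw 40 becomes 48; gaw 36 becomes 39.
      * The result is capped at 64 bits.
      */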
1919 static inline int guestwidth_to_adjustwidth(int gaw)
1920 {
1921         int agaw;
1922         int r = (gaw - 12) % 9;
1923
1924         if (r == 0)
1925                 agaw = gaw;
1926         else
1927                 agaw = gaw + 9 - r;
1928         if (agaw > 64)
1929                 agaw = 64;
1930         return agaw;
1931 }
1932
1933 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1934                        int guest_width)
1935 {
1936         int adjust_width, agaw;
1937         unsigned long sagaw;
1938         int ret;
1939
1940         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1941
1942         if (!intel_iommu_strict) {
1943                 ret = init_iova_flush_queue(&domain->iovad,
1944                                             iommu_flush_iova, iova_entry_free);
1945                 if (ret)
1946                         pr_info("iova flush queue initialization failed\n");
1947         }
1948
1949         domain_reserve_special_ranges(domain);
1950
1951         /* calculate AGAW */
1952         if (guest_width > cap_mgaw(iommu->cap))
1953                 guest_width = cap_mgaw(iommu->cap);
1954         domain->gaw = guest_width;
1955         adjust_width = guestwidth_to_adjustwidth(guest_width);
1956         agaw = width_to_agaw(adjust_width);
1957         sagaw = cap_sagaw(iommu->cap);
1958         if (!test_bit(agaw, &sagaw)) {
1959                 /* hardware doesn't support it, choose a bigger one */
1960                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1961                 agaw = find_next_bit(&sagaw, 5, agaw);
1962                 if (agaw >= 5)
1963                         return -ENODEV;
1964         }
1965         domain->agaw = agaw;
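             /*
              * Illustration (assumed capabilities): a 48-bit guest width
              * gives agaw 2, i.e. a 4-level table, with the usual SAGAW
              * encoding (bit 1 = 3-level/39-bit, bit 2 = 4-level/48-bit,
              * bit 3 = 5-level/57-bit).  If the hardware only advertised
              * 4-level tables, a 39-bit request (agaw 1) would have been
              * bumped up to agaw 2 by the find_next_bit() fallback above.
              */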
1966
1967         if (ecap_coherent(iommu->ecap))
1968                 domain->iommu_coherency = 1;
1969         else
1970                 domain->iommu_coherency = 0;
1971
1972         if (ecap_sc_support(iommu->ecap))
1973                 domain->iommu_snooping = 1;
1974         else
1975                 domain->iommu_snooping = 0;
1976
1977         if (intel_iommu_superpage)
1978                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1979         else
1980                 domain->iommu_superpage = 0;
1981
1982         domain->nid = iommu->node;
1983
1984         /* always allocate the top pgd */
1985         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1986         if (!domain->pgd)
1987                 return -ENOMEM;
1988         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1989         return 0;
1990 }
1991
1992 static void domain_exit(struct dmar_domain *domain)
1993 {
1994
1995         /* Remove associated devices and clear attached or cached domains */
1996         domain_remove_dev_info(domain);
1997
1998         /* destroy iovas */
1999         put_iova_domain(&domain->iovad);
2000
2001         if (domain->pgd) {
2002                 struct page *freelist;
2003
2004                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2005                 dma_free_pagelist(freelist);
2006         }
2007
2008         free_domain_mem(domain);
2009 }
2010
2011 /*
2012  * Get the PASID directory size for a scalable mode context entry.
2013  * A value of X in the PDTS field of a scalable mode context entry
2014  * indicates a PASID directory with 2^(X + 7) entries.
2015  */
2016 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2017 {
2018         int pds, max_pde;
2019
2020         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2021         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2022         if (pds < 7)
2023                 return 0;
2024
2025         return pds - 7;
2026 }
2027
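     /*
      * Worked example for context_get_sm_pds() above (values assumed,
      * including PASID_PDE_SHIFT == 6, i.e. 64 PASIDs per directory
      * entry): max_pasid == 1 << 20 gives max_pde == 1 << 14, so
      * find_first_bit() returns 14 and the function returns 7, which the
      * PDTS encoding turns back into a 2^(7 + 7) == 16384-entry directory.
      */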
2028 /*
2029  * Set the RID_PASID field of a scalable mode context entry. The
2030  * IOMMU hardware will use the PASID value set in this field for
2031  * DMA translations of DMA requests without PASID.
2032  */
2033 static inline void
2034 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2035 {
2036         context->hi |= pasid & ((1 << 20) - 1);
2037         context->hi |= (1 << 20);
2038 }
2039
2040 /*
2041  * Set the DTE (Device-TLB Enable) field of a scalable mode context
2042  * entry.
2043  */
2044 static inline void context_set_sm_dte(struct context_entry *context)
2045 {
2046         context->lo |= (1 << 2);
2047 }
2048
2049 /*
2050  * Set the PRE (Page Request Enable) field of a scalable mode context
2051  * entry.
2052  */
2053 static inline void context_set_sm_pre(struct context_entry *context)
2054 {
2055         context->lo |= (1 << 4);
2056 }
2057
2058 /* Convert value to context PASID directory size field coding. */
2059 #define context_pdts(pds)       (((pds) & 0x7) << 9)
2060
2061 static int domain_context_mapping_one(struct dmar_domain *domain,
2062                                       struct intel_iommu *iommu,
2063                                       struct pasid_table *table,
2064                                       u8 bus, u8 devfn)
2065 {
2066         u16 did = domain->iommu_did[iommu->seq_id];
2067         int translation = CONTEXT_TT_MULTI_LEVEL;
2068         struct device_domain_info *info = NULL;
2069         struct context_entry *context;
2070         unsigned long flags;
2071         int ret;
2072
2073         WARN_ON(did == 0);
2074
2075         if (hw_pass_through && domain_type_is_si(domain))
2076                 translation = CONTEXT_TT_PASS_THROUGH;
2077
2078         pr_debug("Set context mapping for %02x:%02x.%d\n",
2079                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2080
2081         BUG_ON(!domain->pgd);
2082
2083         spin_lock_irqsave(&device_domain_lock, flags);
2084         spin_lock(&iommu->lock);
2085
2086         ret = -ENOMEM;
2087         context = iommu_context_addr(iommu, bus, devfn, 1);
2088         if (!context)
2089                 goto out_unlock;
2090
2091         ret = 0;
2092         if (context_present(context))
2093                 goto out_unlock;
2094
2095         /*
2096          * For kdump cases, old valid entries may be cached due to the
2097          * in-flight DMA and copied pgtable, but there is no unmapping
2098          * behaviour for them, thus we need an explicit cache flush for
2099          * the newly-mapped device. For kdump, at this point, the device
2100          * is supposed to finish reset at its driver probe stage, so no
2101          * in-flight DMA will exist, and we don't need to worry about it
2102          * hereafter.
2103          */
2104         if (context_copied(context)) {
2105                 u16 did_old = context_domain_id(context);
2106
2107                 if (did_old < cap_ndoms(iommu->cap)) {
2108                         iommu->flush.flush_context(iommu, did_old,
2109                                                    (((u16)bus) << 8) | devfn,
2110                                                    DMA_CCMD_MASK_NOBIT,
2111                                                    DMA_CCMD_DEVICE_INVL);
2112                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2113                                                  DMA_TLB_DSI_FLUSH);
2114                 }
2115         }
2116
2117         context_clear_entry(context);
2118
2119         if (sm_supported(iommu)) {
2120                 unsigned long pds;
2121
2122                 WARN_ON(!table);
2123
2124                 /* Setup the PASID DIR pointer: */
2125                 pds = context_get_sm_pds(table);
2126                 context->lo = (u64)virt_to_phys(table->table) |
2127                                 context_pdts(pds);
2128
2129                 /* Setup the RID_PASID field: */
2130                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2131
2132                 /*
2133                  * Setup the Device-TLB enable bit and Page request
2134                  * Enable bit:
2135                  */
2136                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2137                 if (info && info->ats_supported)
2138                         context_set_sm_dte(context);
2139                 if (info && info->pri_supported)
2140                         context_set_sm_pre(context);
2141         } else {
2142                 struct dma_pte *pgd = domain->pgd;
2143                 int agaw;
2144
2145                 context_set_domain_id(context, did);
2146
2147                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2148                         /*
2149                          * Skip top levels of page tables for iommu which has
2150                          * less agaw than default. Unnecessary for PT mode.
2151                          */
2152                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2153                                 ret = -ENOMEM;
2154                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2155                                 if (!dma_pte_present(pgd))
2156                                         goto out_unlock;
2157                         }
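                             /*
                              * Example (assumed widths): a domain built
                              * with a 5-level table (agaw 3) on an IOMMU
                              * that only supports 4-level (agaw 2) walks
                              * down one level here, so the context entry
                              * ends up pointing at the 4-level table.
                              */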
2158
2159                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2160                         if (info && info->ats_supported)
2161                                 translation = CONTEXT_TT_DEV_IOTLB;
2162                         else
2163                                 translation = CONTEXT_TT_MULTI_LEVEL;
2164
2165                         context_set_address_root(context, virt_to_phys(pgd));
2166                         context_set_address_width(context, agaw);
2167                 } else {
2168                         /*
2169                          * In pass-through mode, AW must be programmed to
2170                          * indicate the largest AGAW value supported by
2171                          * hardware, and ASR is ignored by hardware.
2172                          */
2173                         context_set_address_width(context, iommu->msagaw);
2174                 }
2175
2176                 context_set_translation_type(context, translation);
2177         }
2178
2179         context_set_fault_enable(context);
2180         context_set_present(context);
2181         domain_flush_cache(domain, context, sizeof(*context));
2182
2183         /*
2184          * It's a non-present to present mapping. If the hardware doesn't cache
2185          * non-present entries we only need to flush the write-buffer. If it
2186          * _does_ cache non-present entries, then it does so in the special
2187          * domain #0, which we have to flush:
2188          */
2189         if (cap_caching_mode(iommu->cap)) {
2190                 iommu->flush.flush_context(iommu, 0,
2191                                            (((u16)bus) << 8) | devfn,
2192                                            DMA_CCMD_MASK_NOBIT,
2193                                            DMA_CCMD_DEVICE_INVL);
2194                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2195         } else {
2196                 iommu_flush_write_buffer(iommu);
2197         }
2198         iommu_enable_dev_iotlb(info);
2199
2200         ret = 0;
2201
2202 out_unlock:
2203         spin_unlock(&iommu->lock);
2204         spin_unlock_irqrestore(&device_domain_lock, flags);
2205
2206         return ret;
2207 }
2208
2209 struct domain_context_mapping_data {
2210         struct dmar_domain *domain;
2211         struct intel_iommu *iommu;
2212         struct pasid_table *table;
2213 };
2214
2215 static int domain_context_mapping_cb(struct pci_dev *pdev,
2216                                      u16 alias, void *opaque)
2217 {
2218         struct domain_context_mapping_data *data = opaque;
2219
2220         return domain_context_mapping_one(data->domain, data->iommu,
2221                                           data->table, PCI_BUS_NUM(alias),
2222                                           alias & 0xff);
2223 }
2224
2225 static int
2226 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2227 {
2228         struct domain_context_mapping_data data;
2229         struct pasid_table *table;
2230         struct intel_iommu *iommu;
2231         u8 bus, devfn;
2232
2233         iommu = device_to_iommu(dev, &bus, &devfn);
2234         if (!iommu)
2235                 return -ENODEV;
2236
2237         table = intel_pasid_get_table(dev);
2238
2239         if (!dev_is_pci(dev))
2240                 return domain_context_mapping_one(domain, iommu, table,
2241                                                   bus, devfn);
2242
2243         data.domain = domain;
2244         data.iommu = iommu;
2245         data.table = table;
2246
2247         return pci_for_each_dma_alias(to_pci_dev(dev),
2248                                       &domain_context_mapping_cb, &data);
2249 }
2250
2251 static int domain_context_mapped_cb(struct pci_dev *pdev,
2252                                     u16 alias, void *opaque)
2253 {
2254         struct intel_iommu *iommu = opaque;
2255
2256         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2257 }
2258
2259 static int domain_context_mapped(struct device *dev)
2260 {
2261         struct intel_iommu *iommu;
2262         u8 bus, devfn;
2263
2264         iommu = device_to_iommu(dev, &bus, &devfn);
2265         if (!iommu)
2266                 return -ENODEV;
2267
2268         if (!dev_is_pci(dev))
2269                 return device_context_mapped(iommu, bus, devfn);
2270
2271         return !pci_for_each_dma_alias(to_pci_dev(dev),
2272                                        domain_context_mapped_cb, iommu);
2273 }
2274
2275 /* Returns the number of VT-d pages, but aligned to the MM page size */
2276 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2277                                             size_t size)
2278 {
2279         host_addr &= ~PAGE_MASK;
2280         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2281 }
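     /*
      * Example (assuming 4 KiB MM and VT-d pages): host_addr 0x1234 with
      * size 0x2000 keeps offset 0x234 within its page, PAGE_ALIGN(0x2234)
      * is 0x3000, so aligned_nrpages() returns 3.
      */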
2282
2283 /* Return largest possible superpage level for a given mapping */
2284 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2285                                           unsigned long iov_pfn,
2286                                           unsigned long phy_pfn,
2287                                           unsigned long pages)
2288 {
2289         int support, level = 1;
2290         unsigned long pfnmerge;
2291
2292         support = domain->iommu_superpage;
2293
2294         /* To use a large page, the virtual *and* physical addresses
2295            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2296            of them will mean we have to use smaller pages. So just
2297            merge them and check both at once. */
2298         pfnmerge = iov_pfn | phy_pfn;
2299
2300         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2301                 pages >>= VTD_STRIDE_SHIFT;
2302                 if (!pages)
2303                         break;
2304                 pfnmerge >>= VTD_STRIDE_SHIFT;
2305                 level++;
2306                 support--;
2307         }
2308         return level;
2309 }
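     /*
      * Illustration (assumed values): with one level of superpage support
      * (2 MiB), iov_pfn and phy_pfn both multiples of 512 and at least
      * 512 pages to map, the merged pfn has its low 9 bits clear, so the
      * loop above returns level 2 and the caller can use a 2 MiB page.
      */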
2310
2311 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2312                             struct scatterlist *sg, unsigned long phys_pfn,
2313                             unsigned long nr_pages, int prot)
2314 {
2315         struct dma_pte *first_pte = NULL, *pte = NULL;
2316         phys_addr_t uninitialized_var(pteval);
2317         unsigned long sg_res = 0;
2318         unsigned int largepage_lvl = 0;
2319         unsigned long lvl_pages = 0;
2320         u64 attr;
2321
2322         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2323
2324         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2325                 return -EINVAL;
2326
2327         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2328         if (domain_use_first_level(domain))
2329                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
2330
2331         if (!sg) {
2332                 sg_res = nr_pages;
2333                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2334         }
2335
2336         while (nr_pages > 0) {
2337                 uint64_t tmp;
2338
2339                 if (!sg_res) {
2340                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2341
2342                         sg_res = aligned_nrpages(sg->offset, sg->length);
2343                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2344                         sg->dma_length = sg->length;
2345                         pteval = (sg_phys(sg) - pgoff) | attr;
2346                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2347                 }
2348
2349                 if (!pte) {
2350                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2351
2352                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2353                         if (!pte)
2354                                 return -ENOMEM;
2355                         /* It is a large page */
2356                         if (largepage_lvl > 1) {
2357                                 unsigned long nr_superpages, end_pfn;
2358
2359                                 pteval |= DMA_PTE_LARGE_PAGE;
2360                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2361
2362                                 nr_superpages = sg_res / lvl_pages;
2363                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2364
2365                                 /*
2366                                  * Ensure that old small page tables are
2367                                  * removed to make room for superpage(s).
2368                                  * We're adding new large pages, so make sure
2369                                  * we don't remove their parent tables.
2370                                  */
2371                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2372                                                        largepage_lvl + 1);
2373                         } else {
2374                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2375                         }
2376
2377                 }
2378                 /* We don't need a lock here; nobody else
2379                  * touches this iova range
2380                  */
2381                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2382                 if (tmp) {
2383                         static int dumps = 5;
2384                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2385                                 iov_pfn, tmp, (unsigned long long)pteval);
2386                         if (dumps) {
2387                                 dumps--;
2388                                 debug_dma_dump_mappings(NULL);
2389                         }
2390                         WARN_ON(1);
2391                 }
2392
2393                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2394
2395                 BUG_ON(nr_pages < lvl_pages);
2396                 BUG_ON(sg_res < lvl_pages);
2397
2398                 nr_pages -= lvl_pages;
2399                 iov_pfn += lvl_pages;
2400                 phys_pfn += lvl_pages;
2401                 pteval += lvl_pages * VTD_PAGE_SIZE;
2402                 sg_res -= lvl_pages;
2403
2404                 /* If the next PTE would be the first in a new page, then we
2405                    need to flush the cache on the entries we've just written.
2406                    And then we'll need to recalculate 'pte', so clear it and
2407                    let it get set again in the if (!pte) block above.
2408
2409                    If we're done (!nr_pages) we need to flush the cache too.
2410
2411                    Also if we've been setting superpages, we may need to
2412                    recalculate 'pte' and switch back to smaller pages for the
2413                    end of the mapping, if the trailing size is not enough to
2414                    use another superpage (i.e. sg_res < lvl_pages). */
2415                 pte++;
2416                 if (!nr_pages || first_pte_in_page(pte) ||
2417                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2418                         domain_flush_cache(domain, first_pte,
2419                                            (void *)pte - (void *)first_pte);
2420                         pte = NULL;
2421                 }
2422
2423                 if (!sg_res && nr_pages)
2424                         sg = sg_next(sg);
2425         }
2426         return 0;
2427 }
2428
2429 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2430                           struct scatterlist *sg, unsigned long phys_pfn,
2431                           unsigned long nr_pages, int prot)
2432 {
2433         int iommu_id, ret;
2434         struct intel_iommu *iommu;
2435
2436         /* Do the real mapping first */
2437         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2438         if (ret)
2439                 return ret;
2440
2441         for_each_domain_iommu(iommu_id, domain) {
2442                 iommu = g_iommus[iommu_id];
2443                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2444         }
2445
2446         return 0;
2447 }
2448
2449 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2450                                     struct scatterlist *sg, unsigned long nr_pages,
2451                                     int prot)
2452 {
2453         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2454 }
2455
2456 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2457                                      unsigned long phys_pfn, unsigned long nr_pages,
2458                                      int prot)
2459 {
2460         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2461 }
2462
2463 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2464 {
2465         unsigned long flags;
2466         struct context_entry *context;
2467         u16 did_old;
2468
2469         if (!iommu)
2470                 return;
2471
2472         spin_lock_irqsave(&iommu->lock, flags);
2473         context = iommu_context_addr(iommu, bus, devfn, 0);
2474         if (!context) {
2475                 spin_unlock_irqrestore(&iommu->lock, flags);
2476                 return;
2477         }
2478         did_old = context_domain_id(context);
2479         context_clear_entry(context);
2480         __iommu_flush_cache(iommu, context, sizeof(*context));
2481         spin_unlock_irqrestore(&iommu->lock, flags);
2482         iommu->flush.flush_context(iommu,
2483                                    did_old,
2484                                    (((u16)bus) << 8) | devfn,
2485                                    DMA_CCMD_MASK_NOBIT,
2486                                    DMA_CCMD_DEVICE_INVL);
2487         iommu->flush.flush_iotlb(iommu,
2488                                  did_old,
2489                                  0,
2490                                  0,
2491                                  DMA_TLB_DSI_FLUSH);
2492 }
2493
2494 static inline void unlink_domain_info(struct device_domain_info *info)
2495 {
2496         assert_spin_locked(&device_domain_lock);
2497         list_del(&info->link);
2498         list_del(&info->global);
2499         if (info->dev)
2500                 info->dev->archdata.iommu = NULL;
2501 }
2502
2503 static void domain_remove_dev_info(struct dmar_domain *domain)
2504 {
2505         struct device_domain_info *info, *tmp;
2506         unsigned long flags;
2507
2508         spin_lock_irqsave(&device_domain_lock, flags);
2509         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2510                 __dmar_remove_one_dev_info(info);
2511         spin_unlock_irqrestore(&device_domain_lock, flags);
2512 }
2513
2514 struct dmar_domain *find_domain(struct device *dev)
2515 {
2516         struct device_domain_info *info;
2517
2518         if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2519                 return NULL;
2520
2521         if (dev_is_pci(dev))
2522                 dev = &pci_real_dma_dev(to_pci_dev(dev))->dev;
2523
2524         /* No lock here, assumes no domain exit in normal case */
2525         info = dev->archdata.iommu;
2526         if (likely(info))
2527                 return info->domain;
2528
2529         return NULL;
2530 }
2531
2532 static void do_deferred_attach(struct device *dev)
2533 {
2534         struct iommu_domain *domain;
2535
2536         dev->archdata.iommu = NULL;
2537         domain = iommu_get_domain_for_dev(dev);
2538         if (domain)
2539                 intel_iommu_attach_device(domain, dev);
2540 }
2541
2542 static inline struct device_domain_info *
2543 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2544 {
2545         struct device_domain_info *info;
2546
2547         list_for_each_entry(info, &device_domain_list, global)
2548                 if (info->iommu->segment == segment && info->bus == bus &&
2549                     info->devfn == devfn)
2550                         return info;
2551
2552         return NULL;
2553 }
2554
2555 static int domain_setup_first_level(struct intel_iommu *iommu,
2556                                     struct dmar_domain *domain,
2557                                     struct device *dev,
2558                                     int pasid)
2559 {
2560         int flags = PASID_FLAG_SUPERVISOR_MODE;
2561         struct dma_pte *pgd = domain->pgd;
2562         int agaw, level;
2563
2564         /*
2565          * Skip top levels of page tables for iommu which has
2566          * less agaw than default. Unnecessary for PT mode.
2567          */
2568         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2569                 pgd = phys_to_virt(dma_pte_addr(pgd));
2570                 if (!dma_pte_present(pgd))
2571                         return -ENOMEM;
2572         }
2573
2574         level = agaw_to_level(agaw);
2575         if (level != 4 && level != 5)
2576                 return -EINVAL;
2577
2578         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2579
2580         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2581                                              domain->iommu_did[iommu->seq_id],
2582                                              flags);
2583 }
2584
2585 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2586                                                     int bus, int devfn,
2587                                                     struct device *dev,
2588                                                     struct dmar_domain *domain)
2589 {
2590         struct dmar_domain *found = NULL;
2591         struct device_domain_info *info;
2592         unsigned long flags;
2593         int ret;
2594
2595         info = alloc_devinfo_mem();
2596         if (!info)
2597                 return NULL;
2598
2599         info->bus = bus;
2600         info->devfn = devfn;
2601         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2602         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2603         info->ats_qdep = 0;
2604         info->dev = dev;
2605         info->domain = domain;
2606         info->iommu = iommu;
2607         info->pasid_table = NULL;
2608         info->auxd_enabled = 0;
2609         INIT_LIST_HEAD(&info->auxiliary_domains);
2610
2611         if (dev && dev_is_pci(dev)) {
2612                 struct pci_dev *pdev = to_pci_dev(info->dev);
2613
2614                 if (!pdev->untrusted &&
2615                     !pci_ats_disabled() &&
2616                     ecap_dev_iotlb_support(iommu->ecap) &&
2617                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2618                     dmar_find_matched_atsr_unit(pdev))
2619                         info->ats_supported = 1;
2620
2621                 if (sm_supported(iommu)) {
2622                         if (pasid_supported(iommu)) {
2623                                 int features = pci_pasid_features(pdev);
2624                                 if (features >= 0)
2625                                         info->pasid_supported = features | 1;
2626                         }
2627
2628                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2629                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2630                                 info->pri_supported = 1;
2631                 }
2632         }
2633
2634         spin_lock_irqsave(&device_domain_lock, flags);
2635         if (dev)
2636                 found = find_domain(dev);
2637
2638         if (!found) {
2639                 struct device_domain_info *info2;
2640                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2641                 if (info2) {
2642                         found      = info2->domain;
2643                         info2->dev = dev;
2644                 }
2645         }
2646
2647         if (found) {
2648                 spin_unlock_irqrestore(&device_domain_lock, flags);
2649                 free_devinfo_mem(info);
2650                 /* Caller must free the original domain */
2651                 return found;
2652         }
2653
2654         spin_lock(&iommu->lock);
2655         ret = domain_attach_iommu(domain, iommu);
2656         spin_unlock(&iommu->lock);
2657
2658         if (ret) {
2659                 spin_unlock_irqrestore(&device_domain_lock, flags);
2660                 free_devinfo_mem(info);
2661                 return NULL;
2662         }
2663
2664         list_add(&info->link, &domain->devices);
2665         list_add(&info->global, &device_domain_list);
2666         if (dev)
2667                 dev->archdata.iommu = info;
2668         spin_unlock_irqrestore(&device_domain_lock, flags);
2669
2670         /* PASID table is mandatory for a PCI device in scalable mode. */
2671         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2672                 ret = intel_pasid_alloc_table(dev);
2673                 if (ret) {
2674                         dev_err(dev, "PASID table allocation failed\n");
2675                         dmar_remove_one_dev_info(dev);
2676                         return NULL;
2677                 }
2678
2679                 /* Setup the PASID entry for requests without PASID: */
2680                 spin_lock(&iommu->lock);
2681                 if (hw_pass_through && domain_type_is_si(domain))
2682                         ret = intel_pasid_setup_pass_through(iommu, domain,
2683                                         dev, PASID_RID2PASID);
2684                 else if (domain_use_first_level(domain))
2685                         ret = domain_setup_first_level(iommu, domain, dev,
2686                                         PASID_RID2PASID);
2687                 else
2688                         ret = intel_pasid_setup_second_level(iommu, domain,
2689                                         dev, PASID_RID2PASID);
2690                 spin_unlock(&iommu->lock);
2691                 if (ret) {
2692                         dev_err(dev, "Setup RID2PASID failed\n");
2693                         dmar_remove_one_dev_info(dev);
2694                         return NULL;
2695                 }
2696         }
2697
2698         if (dev && domain_context_mapping(domain, dev)) {
2699                 dev_err(dev, "Domain context map failed\n");
2700                 dmar_remove_one_dev_info(dev);
2701                 return NULL;
2702         }
2703
2704         return domain;
2705 }
2706
2707 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2708 {
2709         *(u16 *)opaque = alias;
2710         return 0;
2711 }
2712
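     /*
      * Find the domain already used by the device's last DMA alias, or
      * allocate a fresh one.  Example (assumed topology): a conventional
      * PCI device behind a PCIe-to-PCI bridge may be seen by the IOMMU
      * with the bridge's requester id, so every device behind that bridge
      * ends up sharing a single domain.
      */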
2713 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2714 {
2715         struct device_domain_info *info;
2716         struct dmar_domain *domain = NULL;
2717         struct intel_iommu *iommu;
2718         u16 dma_alias;
2719         unsigned long flags;
2720         u8 bus, devfn;
2721
2722         iommu = device_to_iommu(dev, &bus, &devfn);
2723         if (!iommu)
2724                 return NULL;
2725
2726         if (dev_is_pci(dev)) {
2727                 struct pci_dev *pdev = to_pci_dev(dev);
2728
2729                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2730
2731                 spin_lock_irqsave(&device_domain_lock, flags);
2732                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2733                                                       PCI_BUS_NUM(dma_alias),
2734                                                       dma_alias & 0xff);
2735                 if (info) {
2736                         iommu = info->iommu;
2737                         domain = info->domain;
2738                 }
2739                 spin_unlock_irqrestore(&device_domain_lock, flags);
2740
2741                 /* DMA alias already has a domain, use it */
2742                 if (info)
2743                         goto out;
2744         }
2745
2746         /* Allocate and initialize a new domain for the device */
2747         domain = alloc_domain(0);
2748         if (!domain)
2749                 return NULL;
2750         if (domain_init(domain, iommu, gaw)) {
2751                 domain_exit(domain);
2752                 return NULL;
2753         }
2754
2755 out:
2756         return domain;
2757 }
2758
2759 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2760                                               struct dmar_domain *domain)
2761 {
2762         struct intel_iommu *iommu;
2763         struct dmar_domain *tmp;
2764         u16 req_id, dma_alias;
2765         u8 bus, devfn;
2766
2767         iommu = device_to_iommu(dev, &bus, &devfn);
2768         if (!iommu)
2769                 return NULL;
2770
2771         req_id = ((u16)bus << 8) | devfn;
2772
2773         if (dev_is_pci(dev)) {
2774                 struct pci_dev *pdev = to_pci_dev(dev);
2775
2776                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2777
2778                 /* register PCI DMA alias device */
2779                 if (req_id != dma_alias) {
2780                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2781                                         dma_alias & 0xff, NULL, domain);
2782
2783                         if (!tmp || tmp != domain)
2784                                 return tmp;
2785                 }
2786         }
2787
2788         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2789         if (!tmp || tmp != domain)
2790                 return tmp;
2791
2792         return domain;
2793 }
2794
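     /*
      * Identity-map [start, end] into the domain, i.e. IOVA == physical
      * address for the whole range.  Illustration (assumed values): start
      * 0x1000000 and end 0x1003fff cover vPFNs 0x1000-0x1003, so four
      * 4 KiB pages are mapped 1:1 once the IOVA range has been reserved.
      */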
2795 static int iommu_domain_identity_map(struct dmar_domain *domain,
2796                                      unsigned long long start,
2797                                      unsigned long long end)
2798 {
2799         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2800         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2801
2802         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2803                           dma_to_mm_pfn(last_vpfn))) {
2804                 pr_err("Reserving iova failed\n");
2805                 return -ENOMEM;
2806         }
2807
2808         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2809         /*
2810          * RMRR range might have overlap with physical memory range,
2811          * clear it first
2812          */
2813         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2814
2815         return __domain_mapping(domain, first_vpfn, NULL,
2816                                 first_vpfn, last_vpfn - first_vpfn + 1,
2817                                 DMA_PTE_READ|DMA_PTE_WRITE);
2818 }
2819
2820 static int domain_prepare_identity_map(struct device *dev,
2821                                        struct dmar_domain *domain,
2822                                        unsigned long long start,
2823                                        unsigned long long end)
2824 {
2825         /* For _hardware_ passthrough, don't bother. But for software
2826            passthrough, we do it anyway -- it may indicate a memory
2827            range which is reserved in E820 and so didn't get set
2828            up to start with in the si_domain */
2829         if (domain == si_domain && hw_pass_through) {
2830                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2831                          start, end);
2832                 return 0;
2833         }
2834
2835         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2836
2837         if (end < start) {
2838                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2839                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2840                         dmi_get_system_info(DMI_BIOS_VENDOR),
2841                         dmi_get_system_info(DMI_BIOS_VERSION),
2842                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2843                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2844         }
2845
2846         if (end >> agaw_to_width(domain->agaw)) {
2847                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2848                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2849                      agaw_to_width(domain->agaw),
2850                      dmi_get_system_info(DMI_BIOS_VENDOR),
2851                      dmi_get_system_info(DMI_BIOS_VERSION),
2852                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2853                 return -EIO;
2854         }
2855
2856         return iommu_domain_identity_map(domain, start, end);
2857 }
2858
2859 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2860
2861 static int __init si_domain_init(int hw)
2862 {
2863         struct dmar_rmrr_unit *rmrr;
2864         struct device *dev;
2865         int i, nid, ret;
2866
2867         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2868         if (!si_domain)
2869                 return -EFAULT;
2870
2871         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2872                 domain_exit(si_domain);
2873                 return -EFAULT;
2874         }
2875
2876         if (hw)
2877                 return 0;
2878
2879         for_each_online_node(nid) {
2880                 unsigned long start_pfn, end_pfn;
2881                 int i;
2882
2883                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2884                         ret = iommu_domain_identity_map(si_domain,
2885                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2886                         if (ret)
2887                                 return ret;
2888                 }
2889         }
2890
2891         /*
2892          * Identity map the RMRRs so that devices with RMRRs can also use
2893          * the si_domain.
2894          */
2895         for_each_rmrr_units(rmrr) {
2896                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2897                                           i, dev) {
2898                         unsigned long long start = rmrr->base_address;
2899                         unsigned long long end = rmrr->end_address;
2900
2901                         if (WARN_ON(end < start ||
2902                                     end >> agaw_to_width(si_domain->agaw)))
2903                                 continue;
2904
2905                         ret = iommu_domain_identity_map(si_domain, start, end);
2906                         if (ret)
2907                                 return ret;
2908                 }
2909         }
2910
2911         return 0;
2912 }
2913
2914 static int identity_mapping(struct device *dev)
2915 {
2916         struct device_domain_info *info;
2917
2918         info = dev->archdata.iommu;
2919         if (info)
2920                 return (info->domain == si_domain);
2921
2922         return 0;
2923 }
2924
2925 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2926 {
2927         struct dmar_domain *ndomain;
2928         struct intel_iommu *iommu;
2929         u8 bus, devfn;
2930
2931         iommu = device_to_iommu(dev, &bus, &devfn);
2932         if (!iommu)
2933                 return -ENODEV;
2934
2935         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2936         if (ndomain != domain)
2937                 return -EBUSY;
2938
2939         return 0;
2940 }
2941
2942 static bool device_has_rmrr(struct device *dev)
2943 {
2944         struct dmar_rmrr_unit *rmrr;
2945         struct device *tmp;
2946         int i;
2947
2948         rcu_read_lock();
2949         for_each_rmrr_units(rmrr) {
2950                 /*
2951                  * Return TRUE if this RMRR contains the device that
2952                  * is passed in.
2953                  */
2954                 for_each_active_dev_scope(rmrr->devices,
2955                                           rmrr->devices_cnt, i, tmp)
2956                         if (tmp == dev ||
2957                             is_downstream_to_pci_bridge(dev, tmp)) {
2958                                 rcu_read_unlock();
2959                                 return true;
2960                         }
2961         }
2962         rcu_read_unlock();
2963         return false;
2964 }
2965
2966 /**
2967  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2968  * is relaxable (i.e. allowed not to be enforced under some conditions)
2969  * @dev: device handle
2970  *
2971  * We assume that PCI USB devices with RMRRs have them largely
2972  * for historical reasons and that the RMRR space is not actively used post
2973  * boot.  This exclusion may change if vendors begin to abuse it.
2974  *
2975  * The same exception is made for graphics devices, with the requirement that
2976  * any use of the RMRR regions will be torn down before assigning the device
2977  * to a guest.
2978  *
2979  * Return: true if the RMRR is relaxable, false otherwise
2980  */
2981 static bool device_rmrr_is_relaxable(struct device *dev)
2982 {
2983         struct pci_dev *pdev;
2984
2985         if (!dev_is_pci(dev))
2986                 return false;
2987
2988         pdev = to_pci_dev(dev);
2989         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2990                 return true;
2991         else
2992                 return false;
2993 }
2994
2995 /*
2996  * There are a couple of cases where we need to restrict the functionality of
2997  * devices associated with RMRRs.  The first is when evaluating a device for
2998  * identity mapping because problems exist when devices are moved in and out
2999  * of domains and their respective RMRR information is lost.  This means that
3000  * a device with associated RMRRs will never be in a "passthrough" domain.
3001  * The second is use of the device through the IOMMU API.  This interface
3002  * expects to have full control of the IOVA space for the device.  We cannot
3003  * satisfy both the requirement that RMRR access is maintained and have an
3004  * unencumbered IOVA space.  We also have no ability to quiesce the device's
3005  * use of the RMRR space or even inform the IOMMU API user of the restriction.
3006  * We therefore prevent devices associated with an RMRR from participating in
3007  * the IOMMU API, which eliminates them from device assignment.
3008  *
3009  * In both cases, devices which have relaxable RMRRs are not concerned by this
3010  * restriction. See device_rmrr_is_relaxable comment.
3011  */
3012 static bool device_is_rmrr_locked(struct device *dev)
3013 {
3014         if (!device_has_rmrr(dev))
3015                 return false;
3016
3017         if (device_rmrr_is_relaxable(dev))
3018                 return false;
3019
3020         return true;
3021 }
3022
3023 /*
3024  * Return the required default domain type for a specific device.
3025  *
3026  * @dev: the device to query
3028  *
3029  * Returns:
3030  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3031  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
3032  *  - 0: both identity and dynamic domains work for this device
3033  */
3034 static int device_def_domain_type(struct device *dev)
3035 {
3036         if (dev_is_pci(dev)) {
3037                 struct pci_dev *pdev = to_pci_dev(dev);
3038
3039                 /*
3040                  * Prevent any device marked as untrusted from getting
3041                  * placed into the static identity mapping domain.
3042                  */
3043                 if (pdev->untrusted)
3044                         return IOMMU_DOMAIN_DMA;
3045
3046                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3047                         return IOMMU_DOMAIN_IDENTITY;
3048
3049                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3050                         return IOMMU_DOMAIN_IDENTITY;
3051
3052                 /*
3053                  * We want to start off with all devices in the 1:1 domain, and
3054                  * take them out later if we find they can't access all of memory.
3055                  *
3056                  * However, we can't do this for PCI devices behind bridges,
3057                  * because all PCI devices behind the same bridge will end up
3058                  * with the same source-id on their transactions.
3059                  *
3060                  * Practically speaking, we can't change things around for these
3061                  * devices at run-time, because we can't be sure there'll be no
3062                  * DMA transactions in flight for any of their siblings.
3063                  *
3064                  * So PCI devices (unless they're on the root bus) as well as
3065                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
3066                  * the 1:1 domain, just in _case_ one of their siblings turns out
3067                  * not to be able to map all of memory.
3068                  */
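                     /*
                      * Illustrative example (not from the original source): two
                      * conventional PCI cards behind one PCIe-to-PCI bridge are
                      * both seen with the bridge's requester ID, so granting a
                      * 1:1 mapping to one of them would implicitly grant it to
                      * its sibling as well.
                      */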
3069                 if (!pci_is_pcie(pdev)) {
3070                         if (!pci_is_root_bus(pdev->bus))
3071                                 return IOMMU_DOMAIN_DMA;
3072                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
3073                                 return IOMMU_DOMAIN_DMA;
3074                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
3075                         return IOMMU_DOMAIN_DMA;
3076         }
3077
3078         return 0;
3079 }
3080
3081 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3082 {
3083         /*
3084          * Start from a sane IOMMU hardware state.
3085          * If queued invalidation was already initialized by us
3086          * (for example, while enabling interrupt remapping), then
3087          * things are already rolling from a sane state.
3088          */
3089         if (!iommu->qi) {
3090                 /*
3091                  * Clear any previous faults.
3092                  */
3093                 dmar_fault(-1, iommu);
3094                 /*
3095                  * Disable queued invalidation if supported and already enabled
3096                  * before OS handover.
3097                  */
3098                 dmar_disable_qi(iommu);
3099         }
3100
3101         if (dmar_enable_qi(iommu)) {
3102                 /*
3103                  * Queued invalidation not enabled, use register-based invalidation
3104                  */
3105                 iommu->flush.flush_context = __iommu_flush_context;
3106                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3107                 pr_info("%s: Using Register based invalidation\n",
3108                         iommu->name);
3109         } else {
3110                 iommu->flush.flush_context = qi_flush_context;
3111                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3112                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3113         }
3114 }
3115
3116 static int copy_context_table(struct intel_iommu *iommu,
3117                               struct root_entry *old_re,
3118                               struct context_entry **tbl,
3119                               int bus, bool ext)
3120 {
3121         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3122         struct context_entry *new_ce = NULL, ce;
3123         struct context_entry *old_ce = NULL;
3124         struct root_entry re;
3125         phys_addr_t old_ce_phys;
3126
3127         tbl_idx = ext ? bus * 2 : bus;
3128         memcpy(&re, old_re, sizeof(re));
3129
3130         for (devfn = 0; devfn < 256; devfn++) {
3131                 /* First calculate the correct index */
3132                 idx = (ext ? devfn * 2 : devfn) % 256;
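                     /*
                      * Worked example (illustrative): with ext == true, devfn 0x05
                      * lands at idx (0x05 * 2) % 256 = 0x0a in the lower half's
                      * table, while devfn 0x85 wraps to the same idx 0x0a in the
                      * upper half's table (selected below once devfn reaches 0x80).
                      */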
3133
3134                 if (idx == 0) {
3135                         /* First save what we may have and clean up */
3136                         if (new_ce) {
3137                                 tbl[tbl_idx] = new_ce;
3138                                 __iommu_flush_cache(iommu, new_ce,
3139                                                     VTD_PAGE_SIZE);
3140                                 pos = 1;
3141                         }
3142
3143                         if (old_ce)
3144                                 memunmap(old_ce);
3145
3146                         ret = 0;
3147                         if (devfn < 0x80)
3148                                 old_ce_phys = root_entry_lctp(&re);
3149                         else
3150                                 old_ce_phys = root_entry_uctp(&re);
3151
3152                         if (!old_ce_phys) {
3153                                 if (ext && devfn == 0) {
3154                                         /* No LCTP, try UCTP */
3155                                         devfn = 0x7f;
3156                                         continue;
3157                                 } else {
3158                                         goto out;
3159                                 }
3160                         }
3161
3162                         ret = -ENOMEM;
3163                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3164                                         MEMREMAP_WB);
3165                         if (!old_ce)
3166                                 goto out;
3167
3168                         new_ce = alloc_pgtable_page(iommu->node);
3169                         if (!new_ce)
3170                                 goto out_unmap;
3171
3172                         ret = 0;
3173                 }
3174
3175                 /* Now copy the context entry */
3176                 memcpy(&ce, old_ce + idx, sizeof(ce));
3177
3178                 if (!__context_present(&ce))
3179                         continue;
3180
3181                 did = context_domain_id(&ce);
3182                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3183                         set_bit(did, iommu->domain_ids);
3184
3185                 /*
3186                  * We need a marker for copied context entries. This
3187                  * marker needs to work for the old format as well as
3188                  * for extended context entries.
3189                  *
3190                  * Bit 67 of the context entry is used. In the old
3191                  * format this bit is available to software, in the
3192                  * extended format it is the PGE bit, but PGE is ignored
3193                  * by HW if PASIDs are disabled (and thus still
3194                  * available).
3195                  *
3196                  * So disable PASIDs first and then mark the entry
3197                  * copied. This means that we don't copy PASID
3198                  * translations from the old kernel, but this is fine as
3199                  * faults there are not fatal.
3200                  */
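                     /*
                      * Note (added for clarity): bit 67 lives in the upper 64-bit
                      * word of the entry, i.e. the copied marker is bit 67 - 64 = 3
                      * of ce.hi, which context_set_copied() sets below.
                      */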
3201                 context_clear_pasid_enable(&ce);
3202                 context_set_copied(&ce);
3203
3204                 new_ce[idx] = ce;
3205         }
3206
3207         tbl[tbl_idx + pos] = new_ce;
3208
3209         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3210
3211 out_unmap:
3212         memunmap(old_ce);
3213
3214 out:
3215         return ret;
3216 }
3217
3218 static int copy_translation_tables(struct intel_iommu *iommu)
3219 {
3220         struct context_entry **ctxt_tbls;
3221         struct root_entry *old_rt;
3222         phys_addr_t old_rt_phys;
3223         int ctxt_table_entries;
3224         unsigned long flags;
3225         u64 rtaddr_reg;
3226         int bus, ret;
3227         bool new_ext, ext;
3228
3229         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3230         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3231         new_ext    = !!ecap_ecs(iommu->ecap);
3232
3233         /*
3234          * The RTT bit can only be changed when translation is disabled,
3235          * but disabling translation would open a window for data
3236          * corruption. So bail out and don't copy anything if we would
3237          * have to change the bit.
3238          */
3239         if (new_ext != ext)
3240                 return -EINVAL;
3241
3242         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3243         if (!old_rt_phys)
3244                 return -EINVAL;
3245
3246         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3247         if (!old_rt)
3248                 return -ENOMEM;
3249
3250         /* This is too big for the stack - allocate it from slab */
3251         ctxt_table_entries = ext ? 512 : 256;
3252         ret = -ENOMEM;
3253         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3254         if (!ctxt_tbls)
3255                 goto out_unmap;
3256
3257         for (bus = 0; bus < 256; bus++) {
3258                 ret = copy_context_table(iommu, &old_rt[bus],
3259                                          ctxt_tbls, bus, ext);
3260                 if (ret) {
3261                         pr_err("%s: Failed to copy context table for bus %d\n",
3262                                 iommu->name, bus);
3263                         continue;
3264                 }
3265         }
3266
3267         spin_lock_irqsave(&iommu->lock, flags);
3268
3269         /* Context tables are copied, now write them to the root_entry table */
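             /*
              * Worked example (illustrative): in extended mode, bus 3 takes its
              * lower context table from ctxt_tbls[6] and its upper one from
              * ctxt_tbls[7]; ORing the table's physical address with 1 below
              * marks that half of the root entry present.
              */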
3270         for (bus = 0; bus < 256; bus++) {
3271                 int idx = ext ? bus * 2 : bus;
3272                 u64 val;
3273
3274                 if (ctxt_tbls[idx]) {
3275                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3276                         iommu->root_entry[bus].lo = val;
3277                 }
3278
3279                 if (!ext || !ctxt_tbls[idx + 1])
3280                         continue;
3281
3282                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3283                 iommu->root_entry[bus].hi = val;
3284         }
3285
3286         spin_unlock_irqrestore(&iommu->lock, flags);
3287
3288         kfree(ctxt_tbls);
3289
3290         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3291
3292         ret = 0;
3293
3294 out_unmap:
3295         memunmap(old_rt);
3296
3297         return ret;
3298 }
3299
3300 static int __init init_dmars(void)
3301 {
3302         struct dmar_drhd_unit *drhd;
3303         struct intel_iommu *iommu;
3304         int ret;
3305
3306         /*
3307          * for each drhd
3308          *    allocate root
3309          *    initialize and program root entry to not present
3310          * endfor
3311          */
3312         for_each_drhd_unit(drhd) {
3313                 /*
3314                  * Lock not needed as this is only incremented in the
3315                  * single-threaded kernel __init code path; all other
3316                  * accesses are read-only.
3317                  */
3318                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3319                         g_num_of_iommus++;
3320                         continue;
3321                 }
3322                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3323         }
3324
3325         /* Preallocate enough resources for IOMMU hot-addition */
3326         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3327                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3328
3329         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3330                         GFP_KERNEL);
3331         if (!g_iommus) {
3332                 pr_err("Allocating global iommu array failed\n");
3333                 ret = -ENOMEM;
3334                 goto error;
3335         }
3336
3337         for_each_iommu(iommu, drhd) {
3338                 if (drhd->ignored) {
3339                         iommu_disable_translation(iommu);
3340                         continue;
3341                 }
3342
3343                 /*
3344                  * Find the max PASID size of all IOMMUs in the system.
3345                  * We need to ensure the system pasid table is no bigger
3346                  * than the smallest supported.
3347                  */
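                     /*
                      * For instance (illustrative numbers): an IOMMU reporting
                      * ecap PSS == 19 yields 2 << 19 = 2^20, i.e. 20-bit PASIDs,
                      * and intel_pasid_max_id is clamped down to the smallest
                      * such value across all IOMMUs.
                      */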
3348                 if (pasid_supported(iommu)) {
3349                         u32 temp = 2 << ecap_pss(iommu->ecap);
3350
3351                         intel_pasid_max_id = min_t(u32, temp,
3352                                                    intel_pasid_max_id);
3353                 }
3354
3355                 g_iommus[iommu->seq_id] = iommu;
3356
3357                 intel_iommu_init_qi(iommu);
3358
3359                 ret = iommu_init_domains(iommu);
3360                 if (ret)
3361                         goto free_iommu;
3362
3363                 init_translation_status(iommu);
3364
3365                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3366                         iommu_disable_translation(iommu);
3367                         clear_translation_pre_enabled(iommu);
3368                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3369                                 iommu->name);
3370                 }
3371
3372                 /*
3373                  * TBD:
3374                  * we could share the same root & context tables
3375                  * among all IOMMUs. Need to split this out later.
3376                  */
3377                 ret = iommu_alloc_root_entry(iommu);
3378                 if (ret)
3379                         goto free_iommu;
3380
3381                 if (translation_pre_enabled(iommu)) {
3382                         pr_info("Translation already enabled - trying to copy translation structures\n");
3383
3384                         ret = copy_translation_tables(iommu);
3385                         if (ret) {
3386                                 /*
3387                                  * We found the IOMMU with translation
3388                                  * enabled - but failed to copy over the
3389                                  * old root-entry table. Try to proceed
3390                                  * by disabling translation now and
3391                                  * allocating a clean root-entry table.
3392                                  * This might cause DMAR faults, but
3393                                  * probably the dump will still succeed.
3394                                  */
3395                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3396                                        iommu->name);
3397                                 iommu_disable_translation(iommu);
3398                                 clear_translation_pre_enabled(iommu);
3399                         } else {
3400                                 pr_info("Copied translation tables from previous kernel for %s\n",
3401                                         iommu->name);
3402                         }
3403                 }
3404
3405                 if (!ecap_pass_through(iommu->ecap))
3406                         hw_pass_through = 0;
3407                 intel_svm_check(iommu);
3408         }
3409
3410         /*
3411          * Now that QI is enabled on all IOMMUs, set the root entry and flush
3412          * caches. This is required on some Intel X58 chipsets; otherwise the
3413          * flush_context function will loop forever and the boot hangs.
3414          */
3415         for_each_active_iommu(iommu, drhd) {
3416                 iommu_flush_write_buffer(iommu);
3417                 iommu_set_root_entry(iommu);
3418                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3419                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3420         }
3421
3422 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3423         dmar_map_gfx = 0;
3424 #endif
3425
3426         if (!dmar_map_gfx)
3427                 iommu_identity_mapping |= IDENTMAP_GFX;
3428
3429         check_tylersburg_isoch();
3430
3431         ret = si_domain_init(hw_pass_through);
3432         if (ret)
3433                 goto free_iommu;
3434
3435         /*
3436          * for each drhd
3437          *   enable fault log
3438          *   global invalidate context cache
3439          *   global invalidate iotlb
3440          *   enable translation
3441          */
3442         for_each_iommu(iommu, drhd) {
3443                 if (drhd->ignored) {
3444                         /*
3445                          * we always have to disable PMRs or DMA may fail on
3446                          * this device
3447                          */
3448                         if (force_on)
3449                                 iommu_disable_protect_mem_regions(iommu);
3450                         continue;
3451                 }
3452
3453                 iommu_flush_write_buffer(iommu);
3454
3455 #ifdef CONFIG_INTEL_IOMMU_SVM
3456                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3457                         /*
3458                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3459                          * could cause a lock race, so drop the lock around it.
3460                          */
3461                         up_write(&dmar_global_lock);
3462                         ret = intel_svm_enable_prq(iommu);
3463                         down_write(&dmar_global_lock);
3464                         if (ret)
3465                                 goto free_iommu;
3466                 }
3467 #endif
3468                 ret = dmar_set_interrupt(iommu);
3469                 if (ret)
3470                         goto free_iommu;
3471         }
3472
3473         return 0;
3474
3475 free_iommu:
3476         for_each_active_iommu(iommu, drhd) {
3477                 disable_dmar_iommu(iommu);
3478                 free_dmar_iommu(iommu);
3479         }
3480
3481         kfree(g_iommus);
3482
3483 error:
3484         return ret;
3485 }
3486
3487 /* This takes a number of _MM_ pages, not VTD pages */
3488 static unsigned long intel_alloc_iova(struct device *dev,
3489                                      struct dmar_domain *domain,
3490                                      unsigned long nrpages, uint64_t dma_mask)
3491 {
3492         unsigned long iova_pfn;
3493
3494         /*
3495          * Restrict dma_mask to the width that the iommu can handle.
3496          * First-level translation restricts the input-address to a
3497          * canonical address (i.e., address bits 63:N have the same
3498          * value as address bit [N-1], where N is 48-bits with 4-level
3499          * paging and 57-bits with 5-level paging). Hence, skip bit
3500          * [N-1].
3501          */
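             /*
              * For example (illustrative): a first-level domain with gaw == 48
              * (4-level paging) has its dma_mask capped at DOMAIN_MAX_ADDR(47),
              * i.e. just below 1 << 47, so allocated IOVAs never set bit 47 and
              * stay canonical.
              */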
3502         if (domain_use_first_level(domain))
3503                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3504                                  dma_mask);
3505         else
3506                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3507                                  dma_mask);
3508
3509         /* Ensure we reserve the whole size-aligned region */
3510         nrpages = __roundup_pow_of_two(nrpages);
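             /*
              * E.g. (illustrative): a 5-page request is rounded up to 8 pages so
              * the reservation stays naturally size-aligned in the IOVA space.
              */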
3511
3512         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3513                 /*
3514                  * First try to allocate an IO virtual address in
3515                  * DMA_BIT_MASK(32); if that fails, then try allocating
3516                  * from the higher range.
3517                  */
3518                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3519                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3520                 if (iova_pfn)
3521                         return iova_pfn;
3522         }
3523         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3524                                    IOVA_PFN(dma_mask), true);
3525         if (unlikely(!iova_pfn)) {
3526                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3527                              nrpages);
3528                 return 0;
3529         }
3530
3531         return iova_pfn;
3532 }
3533
3534 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3535 {
3536         struct dmar_domain *domain, *tmp;
3537         struct dmar_rmrr_unit *rmrr;
3538         struct device *i_dev;
3539         int i, ret;
3540
3541         /* Device shouldn't be attached to any domain yet. */
3542         domain = find_domain(dev);
3543         if (domain)
3544                 return NULL;
3545
3546         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3547         if (!domain)
3548                 goto out;
3549
3550         /* We have a new domain - set up possible RMRRs for the device */
3551         rcu_read_lock();
3552         for_each_rmrr_units(rmrr) {
3553                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3554                                           i, i_dev) {
3555                         if (i_dev != dev)
3556                                 continue;
3557
3558                         ret = domain_prepare_identity_map(dev, domain,
3559                                                           rmrr->base_address,
3560                                                           rmrr->end_address);
3561                         if (ret)
3562                                 dev_err(dev, "Mapping reserved region failed\n");
3563                 }
3564         }
3565         rcu_read_unlock();
3566
3567         tmp = set_domain_for_dev(dev, domain);
3568         if (!tmp || domain != tmp) {
3569                 domain_exit(domain);
3570                 domain = tmp;
3571         }
3572
3573 out:
3574         if (!domain)
3575                 dev_err(dev, "Allocating domain failed\n");
3576         else
3577                 domain->domain.type = IOMMU_DOMAIN_DMA;
3578
3579         return domain;
3580 }
3581
3582 /* Check if the dev needs to go through the non-identity map and unmap process. */
3583 static bool iommu_need_mapping(struct device *dev)
3584 {
3585         int ret;
3586
3587         if (iommu_dummy(dev))
3588                 return false;
3589
3590         if (unlikely(attach_deferred(dev)))
3591                 do_deferred_attach(dev);
3592
3593         ret = identity_mapping(dev);
3594         if (ret) {
3595                 u64 dma_mask = *dev->dma_mask;
3596
3597                 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3598                         dma_mask = dev->coherent_dma_mask;
3599
3600                 if (dma_mask >= dma_direct_get_required_mask(dev))
3601                         return false;
3602
3603                 /*
3604                  * The device is limited to 32-bit DMA, so remove it from
3605                  * si_domain and fall back to a non-identity mapping.
3606                  */
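                     /*
                      * Example (hypothetical device): a NIC with a 32-bit DMA mask
                      * on a host with memory above 4 GiB cannot reach all of memory
                      * through the identity map, so it is removed from si_domain
                      * here and given a dynamic DMA domain instead.
                      */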
3607                 dmar_remove_one_dev_info(dev);
3608                 ret = iommu_request_dma_domain_for_dev(dev);
3609                 if (ret) {
3610                         struct iommu_domain *domain;
3611                         struct dmar_domain *dmar_domain;
3612
3613                         domain = iommu_get_domain_for_dev(dev);
3614                         if (domain) {
3615                                 dmar_domain = to_dmar_domain(domain);
3616                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3617                         }
3618                         dmar_remove_one_dev_info(dev);
3619                         get_private_domain_for_dev(dev);
3620                 }
3621
3622                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3623         }
3624
3625         return true;
3626 }
3627
3628 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3629                                      size_t size, int dir, u64 dma_mask)
3630 {
3631         struct dmar_domain *domain;
3632         phys_addr_t start_paddr;
3633         unsigned long iova_pfn;
3634         int prot = 0;
3635         int ret;
3636         struct intel_iommu *iommu;
3637         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3638
3639         BUG_ON(dir == DMA_NONE);
3640
3641         domain = find_domain(dev);
3642         if (!domain)
3643                 return DMA_MAPPING_ERROR;
3644
3645         iommu = domain_get_iommu(domain);
3646         size = aligned_nrpages(paddr, size);
3647
3648         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3649         if (!iova_pfn)
3650                 goto error;
3651
3652         /*
3653          * Check if DMAR supports zero-length reads on write-only
3654          * mappings.
3655          */
3656         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3657                         !cap_zlr(iommu->cap))
3658                 prot |= DMA_PTE_READ;
3659         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3660                 prot |= DMA_PTE_WRITE;
3661         /*
3662          * The range paddr .. paddr + size might cover only part of a page,
3663          * but we must map whole pages.  Note: if two parts of one page are
3664          * mapped separately, two guest addresses may map to the same host
3665          * paddr, but this is not a big problem.
3666          */
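             /*
              * For instance (illustrative numbers): mapping 100 bytes at
              * paddr 0x1ffc0 touches two 4KiB pages (0x1f000 and 0x20000), so
              * aligned_nrpages() above returned 2 and both full pages get mapped.
              */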
3667         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3668                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3669         if (ret)
3670                 goto error;
3671
3672         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3673         start_paddr += paddr & ~PAGE_MASK;
3674
3675         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3676
3677         return start_paddr;
3678
3679 error:
3680         if (iova_pfn)
3681                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3682         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3683                 size, (unsigned long long)paddr, dir);
3684         return DMA_MAPPING_ERROR;
3685 }
3686
3687 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3688                                  unsigned long offset, size_t size,
3689                                  enum dma_data_direction dir,
3690                                  unsigned long attrs)
3691 {
3692         if (iommu_need_mapping(dev))
3693                 return __intel_map_single(dev, page_to_phys(page) + offset,
3694                                 size, dir, *dev->dma_mask);
3695         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3696 }
3697
3698 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3699                                      size_t size, enum dma_data_direction dir,
3700                                      unsigned long attrs)
3701 {
3702         if (iommu_need_mapping(dev))
3703                 return __intel_map_single(dev, phys_addr, size, dir,
3704                                 *dev->dma_mask);
3705         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3706 }
3707
3708 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3709 {
3710         struct dmar_domain *domain;
3711         unsigned long start_pfn, last_pfn;
3712         unsigned long nrpages;
3713         unsigned long iova_pfn;
3714         struct intel_iommu *iommu;
3715         struct page *freelist;
3716         struct pci_dev *pdev = NULL;
3717
3718         domain = find_domain(dev);
3719         BUG_ON(!domain);
3720
3721         iommu = domain_get_iommu(domain);
3722
3723         iova_pfn = IOVA_PFN(dev_addr);
3724
3725         nrpages = aligned_nrpages(dev_addr, size);
3726         start_pfn = mm_to_dma_pfn(iova_pfn);
3727         last_pfn = start_pfn + nrpages - 1;
3728
3729         if (dev_is_pci(dev))
3730                 pdev = to_pci_dev(dev);
3731
3732         freelist = domain_unmap(domain, start_pfn, last_pfn);
3733         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3734                         !has_iova_flush_queue(&domain->iovad)) {
3735                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3736                                       nrpages, !freelist, 0);
3737                 /* free iova */
3738                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3739                 dma_free_pagelist(freelist);
3740         } else {
3741                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3742                            (unsigned long)freelist);
3743                 /*
3744                  * Queue up the release of the unmapped IOVAs to save roughly
3745                  * 1/6th of the CPU time otherwise spent on the IOTLB flush.
3746                  */
3747         }
3748
3749         trace_unmap_single(dev, dev_addr, size);
3750 }
3751
3752 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3753                              size_t size, enum dma_data_direction dir,
3754                              unsigned long attrs)
3755 {
3756         if (iommu_need_mapping(dev))
3757                 intel_unmap(dev, dev_addr, size);
3758         else
3759                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3760 }
3761
3762 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3763                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3764 {
3765         if (iommu_need_mapping(dev))
3766                 intel_unmap(dev, dev_addr, size);
3767 }
3768
3769 static void *intel_alloc_coherent(struct device *dev, size_t size,
3770                                   dma_addr_t *dma_handle, gfp_t flags,
3771                                   unsigned long attrs)
3772 {
3773         struct page *page = NULL;
3774         int order;
3775
3776         if (!iommu_need_mapping(dev))
3777                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3778
3779         size = PAGE_ALIGN(size);
3780         order = get_order(size);
3781
3782         if (gfpflags_allow_blocking(flags)) {
3783                 unsigned int count = size >> PAGE_SHIFT;
3784
3785                 page = dma_alloc_from_contiguous(dev, count, order,
3786                                                  flags & __GFP_NOWARN);
3787         }
3788
3789         if (!page)
3790                 page = alloc_pages(flags, order);
3791         if (!page)
3792                 return NULL;
3793         memset(page_address(page), 0, size);
3794
3795         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3796                                          DMA_BIDIRECTIONAL,
3797                                          dev->coherent_dma_mask);
3798         if (*dma_handle != DMA_MAPPING_ERROR)
3799                 return page_address(page);
3800         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3801                 __free_pages(page, order);
3802
3803         return NULL;
3804 }
3805
3806 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3807                                 dma_addr_t dma_handle, unsigned long attrs)
3808 {
3809         int order;
3810         struct page *page = virt_to_page(vaddr);
3811
3812         if (!iommu_need_mapping(dev))
3813                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3814
3815         size = PAGE_ALIGN(size);
3816         order = get_order(size);
3817
3818         intel_unmap(dev, dma_handle, size);
3819         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3820                 __free_pages(page, order);
3821 }
3822
3823 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3824                            int nelems, enum dma_data_direction dir,
3825                            unsigned long attrs)
3826 {
3827         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3828         unsigned long nrpages = 0;
3829         struct scatterlist *sg;
3830         int i;
3831
3832         if (!iommu_need_mapping(dev))
3833                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3834
3835         for_each_sg(sglist, sg, nelems, i) {
3836                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3837         }
3838
3839         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3840
3841         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3842 }
3843
3844 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3845                         enum dma_data_direction dir, unsigned long attrs)
3846 {
3847         int i;
3848         struct dmar_domain *domain;
3849         size_t size = 0;
3850         int prot = 0;
3851         unsigned long iova_pfn;
3852         int ret;
3853         struct scatterlist *sg;
3854         unsigned long start_vpfn;
3855         struct intel_iommu *iommu;
3856
3857         BUG_ON(dir == DMA_NONE);
3858         if (!iommu_need_mapping(dev))
3859                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3860
3861         domain = find_domain(dev);
3862         if (!domain)
3863                 return 0;
3864
3865         iommu = domain_get_iommu(domain);
3866
3867         for_each_sg(sglist, sg, nelems, i)
3868                 size += aligned_nrpages(sg->offset, sg->length);
3869
3870         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3871                                 *dev->dma_mask);
3872         if (!iova_pfn) {
3873                 sglist->dma_length = 0;
3874                 return 0;
3875         }
3876
3877         /*
3878          * Check if DMAR supports zero-length reads on write-only
3879          * mappings.
3880          */
3881         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3882                         !cap_zlr(iommu->cap))
3883                 prot |= DMA_PTE_READ;
3884         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3885                 prot |= DMA_PTE_WRITE;
3886
3887         start_vpfn = mm_to_dma_pfn(iova_pfn);
3888
3889         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3890         if (unlikely(ret)) {
3891                 dma_pte_free_pagetable(domain, start_vpfn,
3892                                        start_vpfn + size - 1,
3893                                        agaw_to_level(domain->agaw) + 1);
3894                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3895                 return 0;
3896         }
3897
3898         for_each_sg(sglist, sg, nelems, i)
3899                 trace_map_sg(dev, i + 1, nelems, sg);
3900
3901         return nelems;
3902 }
3903
3904 static u64 intel_get_required_mask(struct device *dev)
3905 {
3906         if (!iommu_need_mapping(dev))
3907                 return dma_direct_get_required_mask(dev);
3908         return DMA_BIT_MASK(32);
3909 }
3910
3911 static const struct dma_map_ops intel_dma_ops = {
3912         .alloc = intel_alloc_coherent,
3913         .free = intel_free_coherent,
3914         .map_sg = intel_map_sg,
3915         .unmap_sg = intel_unmap_sg,
3916         .map_page = intel_map_page,
3917         .unmap_page = intel_unmap_page,
3918         .map_resource = intel_map_resource,
3919         .unmap_resource = intel_unmap_resource,
3920         .dma_supported = dma_direct_supported,
3921         .mmap = dma_common_mmap,
3922         .get_sgtable = dma_common_get_sgtable,
3923         .get_required_mask = intel_get_required_mask,
3924 };
3925
3926 static void
3927 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3928                    enum dma_data_direction dir, enum dma_sync_target target)
3929 {
3930         struct dmar_domain *domain;
3931         phys_addr_t tlb_addr;
3932
3933         domain = find_domain(dev);
3934         if (WARN_ON(!domain))
3935                 return;
3936
3937         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3938         if (is_swiotlb_buffer(tlb_addr))
3939                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3940 }
3941
3942 static dma_addr_t
3943 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3944                   enum dma_data_direction dir, unsigned long attrs,
3945                   u64 dma_mask)
3946 {
3947         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3948         struct dmar_domain *domain;
3949         struct intel_iommu *iommu;
3950         unsigned long iova_pfn;
3951         unsigned long nrpages;
3952         phys_addr_t tlb_addr;
3953         int prot = 0;
3954         int ret;
3955
3956         if (unlikely(attach_deferred(dev)))
3957                 do_deferred_attach(dev);
3958
3959         domain = find_domain(dev);
3960
3961         if (WARN_ON(dir == DMA_NONE || !domain))
3962                 return DMA_MAPPING_ERROR;
3963
3964         iommu = domain_get_iommu(domain);
3965         if (WARN_ON(!iommu))
3966                 return DMA_MAPPING_ERROR;
3967
3968         nrpages = aligned_nrpages(0, size);
3969         iova_pfn = intel_alloc_iova(dev, domain,
3970                                     dma_to_mm_pfn(nrpages), dma_mask);
3971         if (!iova_pfn)
3972                 return DMA_MAPPING_ERROR;
3973
3974         /*
3975          * Check if DMAR supports zero-length reads on write-only
3976          * mappings.
3977          */
3978         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3979                         !cap_zlr(iommu->cap))
3980                 prot |= DMA_PTE_READ;
3981         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3982                 prot |= DMA_PTE_WRITE;
3983
3984         /*
3985          * If both the physical buffer start address and size are
3986          * page aligned, we don't need to use a bounce page.
3987          */
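             /*
              * For instance (illustrative sizes): a 0x1800-byte buffer starting
              * mid-page is bounced into a VTD_PAGE-aligned swiotlb slot of
              * aligned_size == 0x2000 bytes; for DMA_TO_DEVICE the trailing
              * 0x800 padding bytes are zeroed below so stale swiotlb data never
              * reaches the device.
              */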
3988         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3989                 tlb_addr = swiotlb_tbl_map_single(dev,
3990                                 __phys_to_dma(dev, io_tlb_start),
3991                                 paddr, size, aligned_size, dir, attrs);
3992                 if (tlb_addr == DMA_MAPPING_ERROR) {
3993                         goto swiotlb_error;
3994                 } else {
3995                         /* Clean up the padding area. */
3996                         void *padding_start = phys_to_virt(tlb_addr);
3997                         size_t padding_size = aligned_size;
3998
3999                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
4000                             (dir == DMA_TO_DEVICE ||
4001                              dir == DMA_BIDIRECTIONAL)) {
4002                                 padding_start += size;
4003                                 padding_size -= size;
4004                         }
4005
4006                         memset(padding_start, 0, padding_size);
4007                 }
4008         } else {
4009                 tlb_addr = paddr;
4010         }
4011
4012         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
4013                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
4014         if (ret)
4015                 goto mapping_error;
4016
4017         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
4018
4019         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
4020
4021 mapping_error:
4022         if (is_swiotlb_buffer(tlb_addr))
4023                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4024                                          aligned_size, dir, attrs);
4025 swiotlb_error:
4026         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
4027         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
4028                 size, (unsigned long long)paddr, dir);
4029
4030         return DMA_MAPPING_ERROR;
4031 }
4032
4033 static void
4034 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
4035                     enum dma_data_direction dir, unsigned long attrs)
4036 {
4037         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
4038         struct dmar_domain *domain;
4039         phys_addr_t tlb_addr;
4040
4041         domain = find_domain(dev);
4042         if (WARN_ON(!domain))
4043                 return;
4044
4045         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
4046         if (WARN_ON(!tlb_addr))
4047                 return;
4048
4049         intel_unmap(dev, dev_addr, size);
4050         if (is_swiotlb_buffer(tlb_addr))
4051                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4052                                          aligned_size, dir, attrs);
4053
4054         trace_bounce_unmap_single(dev, dev_addr, size);
4055 }
4056
4057 static dma_addr_t
4058 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
4059                 size_t size, enum dma_data_direction dir, unsigned long attrs)
4060 {
4061         return bounce_map_single(dev, page_to_phys(page) + offset,
4062                                  size, dir, attrs, *dev->dma_mask);
4063 }
4064
4065 static dma_addr_t
4066 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
4067                     enum dma_data_direction dir, unsigned long attrs)
4068 {
4069         return bounce_map_single(dev, phys_addr, size,
4070                                  dir, attrs, *dev->dma_mask);
4071 }
4072
4073 static void
4074 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4075                   enum dma_data_direction dir, unsigned long attrs)
4076 {
4077         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4078 }
4079
4080 static void
4081 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4082                       enum dma_data_direction dir, unsigned long attrs)
4083 {
4084         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4085 }
4086
4087 static void
4088 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4089                 enum dma_data_direction dir, unsigned long attrs)
4090 {
4091         struct scatterlist *sg;
4092         int i;
4093
4094         for_each_sg(sglist, sg, nelems, i)
4095                 bounce_unmap_page(dev, sg->dma_address,
4096                                   sg_dma_len(sg), dir, attrs);
4097 }
4098
4099 static int
4100 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4101               enum dma_data_direction dir, unsigned long attrs)
4102 {
4103         int i;
4104         struct scatterlist *sg;
4105
4106         for_each_sg(sglist, sg, nelems, i) {
4107                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
4108                                                   sg->offset, sg->length,
4109                                                   dir, attrs);
4110                 if (sg->dma_address == DMA_MAPPING_ERROR)
4111                         goto out_unmap;
4112                 sg_dma_len(sg) = sg->length;
4113         }
4114
4115         for_each_sg(sglist, sg, nelems, i)
4116                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
4117
4118         return nelems;
4119
4120 out_unmap:
4121         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4122         return 0;
4123 }
4124
4125 static void
4126 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4127                            size_t size, enum dma_data_direction dir)
4128 {
4129         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4130 }
4131
4132 static void
4133 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4134                               size_t size, enum dma_data_direction dir)
4135 {
4136         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4137 }
4138
4139 static void
4140 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4141                        int nelems, enum dma_data_direction dir)
4142 {
4143         struct scatterlist *sg;
4144         int i;
4145
4146         for_each_sg(sglist, sg, nelems, i)
4147                 bounce_sync_single(dev, sg_dma_address(sg),
4148                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
4149 }
4150
4151 static void
4152 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4153                           int nelems, enum dma_data_direction dir)
4154 {
4155         struct scatterlist *sg;
4156         int i;
4157
4158         for_each_sg(sglist, sg, nelems, i)
4159                 bounce_sync_single(dev, sg_dma_address(sg),
4160                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4161 }
4162
4163 static const struct dma_map_ops bounce_dma_ops = {
4164         .alloc                  = intel_alloc_coherent,
4165         .free                   = intel_free_coherent,
4166         .map_sg                 = bounce_map_sg,
4167         .unmap_sg               = bounce_unmap_sg,
4168         .map_page               = bounce_map_page,
4169         .unmap_page             = bounce_unmap_page,
4170         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
4171         .sync_single_for_device = bounce_sync_single_for_device,
4172         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4173         .sync_sg_for_device     = bounce_sync_sg_for_device,
4174         .map_resource           = bounce_map_resource,
4175         .unmap_resource         = bounce_unmap_resource,
4176         .dma_supported          = dma_direct_supported,
4177 };
4178
4179 static inline int iommu_domain_cache_init(void)
4180 {
4181         int ret = 0;
4182
4183         iommu_domain_cache = kmem_cache_create("iommu_domain",
4184                                          sizeof(struct dmar_domain),
4185                                          0,
4186                                          SLAB_HWCACHE_ALIGN,
4188                                          NULL);
4189         if (!iommu_domain_cache) {
4190                 pr_err("Couldn't create iommu_domain cache\n");
4191                 ret = -ENOMEM;
4192         }
4193
4194         return ret;
4195 }
4196
4197 static inline int iommu_devinfo_cache_init(void)
4198 {
4199         int ret = 0;
4200
4201         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4202                                          sizeof(struct device_domain_info),
4203                                          0,
4204                                          SLAB_HWCACHE_ALIGN,
4205                                          NULL);
4206         if (!iommu_devinfo_cache) {
4207                 pr_err("Couldn't create devinfo cache\n");
4208                 ret = -ENOMEM;
4209         }
4210
4211         return ret;
4212 }
4213
4214 static int __init iommu_init_mempool(void)
4215 {
4216         int ret;
4217         ret = iova_cache_get();
4218         if (ret)
4219                 return ret;
4220
4221         ret = iommu_domain_cache_init();
4222         if (ret)
4223                 goto domain_error;
4224
4225         ret = iommu_devinfo_cache_init();
4226         if (!ret)
4227                 return ret;
4228
4229         kmem_cache_destroy(iommu_domain_cache);
4230 domain_error:
4231         iova_cache_put();
4232
4233         return -ENOMEM;
4234 }
4235
4236 static void __init iommu_exit_mempool(void)
4237 {
4238         kmem_cache_destroy(iommu_devinfo_cache);
4239         kmem_cache_destroy(iommu_domain_cache);
4240         iova_cache_put();
4241 }
4242
4243 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4244 {
4245         struct dmar_drhd_unit *drhd;
4246         u32 vtbar;
4247         int rc;
4248
4249         /* We know that this device on this chipset has its own IOMMU.
4250          * If we find it under a different IOMMU, then the BIOS is lying
4251          * to us. Hope that the IOMMU for this device is actually
4252          * disabled, and it needs no translation...
4253          */
4254         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4255         if (rc) {
4256                 /* "can't" happen */
4257                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4258                 return;
4259         }
4260         vtbar &= 0xffff0000;
4261
4262         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
4263         drhd = dmar_find_matched_drhd_unit(pdev);
4264         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4265                             TAINT_FIRMWARE_WORKAROUND,
4266                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4267                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4268 }
4269 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4270
4271 static void __init init_no_remapping_devices(void)
4272 {
4273         struct dmar_drhd_unit *drhd;
4274         struct device *dev;
4275         int i;
4276
4277         for_each_drhd_unit(drhd) {
4278                 if (!drhd->include_all) {
4279                         for_each_active_dev_scope(drhd->devices,
4280                                                   drhd->devices_cnt, i, dev)
4281                                 break;
4282                         /* ignore DMAR unit if no devices exist */
4283                         if (i == drhd->devices_cnt)
4284                                 drhd->ignored = 1;
4285                 }
4286         }
4287
4288         for_each_active_drhd_unit(drhd) {
4289                 if (drhd->include_all)
4290                         continue;
4291
4292                 for_each_active_dev_scope(drhd->devices,
4293                                           drhd->devices_cnt, i, dev)
4294                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4295                                 break;
4296                 if (i < drhd->devices_cnt)
4297                         continue;
4298
4299                 /* This IOMMU has *only* gfx devices. If gfx devices are not
4300                    being remapped, bypass this IOMMU entirely. */
4301                 if (!dmar_map_gfx) {
4302                         drhd->ignored = 1;
4303                         for_each_active_dev_scope(drhd->devices,
4304                                                   drhd->devices_cnt, i, dev)
4305                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4306                 }
4307         }
4308 }
4309
4310 #ifdef CONFIG_SUSPEND
4311 static int init_iommu_hw(void)
4312 {
4313         struct dmar_drhd_unit *drhd;
4314         struct intel_iommu *iommu = NULL;
4315
4316         for_each_active_iommu(iommu, drhd)
4317                 if (iommu->qi)
4318                         dmar_reenable_qi(iommu);
4319
4320         for_each_iommu(iommu, drhd) {
4321                 if (drhd->ignored) {
4322                         /*
4323                          * we always have to disable PMRs or DMA may fail on
4324                          * this device
4325                          */
4326                         if (force_on)
4327                                 iommu_disable_protect_mem_regions(iommu);
4328                         continue;
4329                 }
4330
4331                 iommu_flush_write_buffer(iommu);
4332
4333                 iommu_set_root_entry(iommu);
4334
4335                 iommu->flush.flush_context(iommu, 0, 0, 0,
4336                                            DMA_CCMD_GLOBAL_INVL);
4337                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4338                 iommu_enable_translation(iommu);
4339                 iommu_disable_protect_mem_regions(iommu);
4340         }
4341
4342         return 0;
4343 }
4344
4345 static void iommu_flush_all(void)
4346 {
4347         struct dmar_drhd_unit *drhd;
4348         struct intel_iommu *iommu;
4349
4350         for_each_active_iommu(iommu, drhd) {
4351                 iommu->flush.flush_context(iommu, 0, 0, 0,
4352                                            DMA_CCMD_GLOBAL_INVL);
4353                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4354                                          DMA_TLB_GLOBAL_FLUSH);
4355         }
4356 }
4357
4358 static int iommu_suspend(void)
4359 {
4360         struct dmar_drhd_unit *drhd;
4361         struct intel_iommu *iommu = NULL;
4362         unsigned long flag;
4363
4364         for_each_active_iommu(iommu, drhd) {
4365                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4366                                                  GFP_ATOMIC);
4367                 if (!iommu->iommu_state)
4368                         goto nomem;
4369         }
4370
4371         iommu_flush_all();
4372
4373         for_each_active_iommu(iommu, drhd) {
4374                 iommu_disable_translation(iommu);
4375
4376                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4377
4378                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4379                         readl(iommu->reg + DMAR_FECTL_REG);
4380                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4381                         readl(iommu->reg + DMAR_FEDATA_REG);
4382                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4383                         readl(iommu->reg + DMAR_FEADDR_REG);
4384                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4385                         readl(iommu->reg + DMAR_FEUADDR_REG);
4386
4387                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4388         }
4389         return 0;
4390
4391 nomem:
4392         for_each_active_iommu(iommu, drhd)
4393                 kfree(iommu->iommu_state);
4394
4395         return -ENOMEM;
4396 }
4397
4398 static void iommu_resume(void)
4399 {
4400         struct dmar_drhd_unit *drhd;
4401         struct intel_iommu *iommu = NULL;
4402         unsigned long flag;
4403
4404         if (init_iommu_hw()) {
4405                 if (force_on)
4406                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4407                 else
4408                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4409                 return;
4410         }
4411
4412         for_each_active_iommu(iommu, drhd) {
4413
4414                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4415
4416                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4417                         iommu->reg + DMAR_FECTL_REG);
4418                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4419                         iommu->reg + DMAR_FEDATA_REG);
4420                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4421                         iommu->reg + DMAR_FEADDR_REG);
4422                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4423                         iommu->reg + DMAR_FEUADDR_REG);
4424
4425                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4426         }
4427
4428         for_each_active_iommu(iommu, drhd)
4429                 kfree(iommu->iommu_state);
4430 }
4431
4432 static struct syscore_ops iommu_syscore_ops = {
4433         .resume         = iommu_resume,
4434         .suspend        = iommu_suspend,
4435 };
4436
4437 static void __init init_iommu_pm_ops(void)
4438 {
4439         register_syscore_ops(&iommu_syscore_ops);
4440 }
4441
4442 #else
4443 static inline void init_iommu_pm_ops(void) {}
4444 #endif  /* CONFIG_PM */
4445
4446 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4447 {
4448         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4449             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4450             rmrr->end_address <= rmrr->base_address ||
4451             arch_rmrr_sanity_check(rmrr))
4452                 return -EINVAL;
4453
4454         return 0;
4455 }
4456
4457 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4458 {
4459         struct acpi_dmar_reserved_memory *rmrr;
4460         struct dmar_rmrr_unit *rmrru;
4461
4462         rmrr = (struct acpi_dmar_reserved_memory *)header;
4463         if (rmrr_sanity_check(rmrr))
4464                 WARN_TAINT(1, TAINT_FIRMWARE_WORKAROUND,
4465                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4466                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4467                            rmrr->base_address, rmrr->end_address,
4468                            dmi_get_system_info(DMI_BIOS_VENDOR),
4469                            dmi_get_system_info(DMI_BIOS_VERSION),
4470                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4471
4472         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4473         if (!rmrru)
4474                 goto out;
4475
4476         rmrru->hdr = header;
4477
4478         rmrru->base_address = rmrr->base_address;
4479         rmrru->end_address = rmrr->end_address;
4480
4481         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4482                                 ((void *)rmrr) + rmrr->header.length,
4483                                 &rmrru->devices_cnt);
4484         if (rmrru->devices_cnt && rmrru->devices == NULL)
4485                 goto free_rmrru;
4486
4487         list_add(&rmrru->list, &dmar_rmrr_units);
4488
4489         return 0;
4490 free_rmrru:
4491         kfree(rmrru);
4492 out:
4493         return -ENOMEM;
4494 }
4495
4496 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4497 {
4498         struct dmar_atsr_unit *atsru;
4499         struct acpi_dmar_atsr *tmp;
4500
4501         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4502                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4503                 if (atsr->segment != tmp->segment)
4504                         continue;
4505                 if (atsr->header.length != tmp->header.length)
4506                         continue;
4507                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4508                         return atsru;
4509         }
4510
4511         return NULL;
4512 }
4513
4514 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4515 {
4516         struct acpi_dmar_atsr *atsr;
4517         struct dmar_atsr_unit *atsru;
4518
4519         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4520                 return 0;
4521
4522         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4523         atsru = dmar_find_atsr(atsr);
4524         if (atsru)
4525                 return 0;
4526
4527         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4528         if (!atsru)
4529                 return -ENOMEM;
4530
4531         /*
4532          * If memory is allocated from slab by ACPI _DSM method, we need to
4533          * copy the memory content because the memory buffer will be freed
4534          * on return.
4535          */
4536         atsru->hdr = (void *)(atsru + 1);
4537         memcpy(atsru->hdr, hdr, hdr->length);
4538         atsru->include_all = atsr->flags & 0x1;
4539         if (!atsru->include_all) {
4540                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4541                                 (void *)atsr + atsr->header.length,
4542                                 &atsru->devices_cnt);
4543                 if (atsru->devices_cnt && atsru->devices == NULL) {
4544                         kfree(atsru);
4545                         return -ENOMEM;
4546                 }
4547         }
4548
4549         list_add_rcu(&atsru->list, &dmar_atsr_units);
4550
4551         return 0;
4552 }
4553
4554 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4555 {
4556         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4557         kfree(atsru);
4558 }
4559
4560 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4561 {
4562         struct acpi_dmar_atsr *atsr;
4563         struct dmar_atsr_unit *atsru;
4564
4565         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4566         atsru = dmar_find_atsr(atsr);
4567         if (atsru) {
4568                 list_del_rcu(&atsru->list);
4569                 synchronize_rcu();
4570                 intel_iommu_free_atsr(atsru);
4571         }
4572
4573         return 0;
4574 }
4575
4576 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4577 {
4578         int i;
4579         struct device *dev;
4580         struct acpi_dmar_atsr *atsr;
4581         struct dmar_atsr_unit *atsru;
4582
4583         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4584         atsru = dmar_find_atsr(atsr);
4585         if (!atsru)
4586                 return 0;
4587
4588         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4589                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4590                                           i, dev)
4591                         return -EBUSY;
4592         }
4593
4594         return 0;
4595 }
4596
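/*
 * Bring a hot-added DMAR unit on line: check that it provides the
 * capabilities the running system already relies on (pass-through,
 * snooping, super-pages), allocate its domain IDs and root entry, then
 * enable queued invalidation, the fault interrupt and translation,
 * mirroring the boot-time init_dmars() path.
 */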
4597 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4598 {
4599         int sp, ret;
4600         struct intel_iommu *iommu = dmaru->iommu;
4601
4602         if (g_iommus[iommu->seq_id])
4603                 return 0;
4604
4605         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4606                 pr_warn("%s: Doesn't support hardware pass through.\n",
4607                         iommu->name);
4608                 return -ENXIO;
4609         }
4610         if (!ecap_sc_support(iommu->ecap) &&
4611             domain_update_iommu_snooping(iommu)) {
4612                 pr_warn("%s: Doesn't support snooping.\n",
4613                         iommu->name);
4614                 return -ENXIO;
4615         }
4616         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4617         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4618                 pr_warn("%s: Doesn't support large page.\n",
4619                         iommu->name);
4620                 return -ENXIO;
4621         }
4622
4623         /*
4624          * Disable translation if already enabled prior to OS handover.
4625          */
4626         if (iommu->gcmd & DMA_GCMD_TE)
4627                 iommu_disable_translation(iommu);
4628
4629         g_iommus[iommu->seq_id] = iommu;
4630         ret = iommu_init_domains(iommu);
4631         if (ret == 0)
4632                 ret = iommu_alloc_root_entry(iommu);
4633         if (ret)
4634                 goto out;
4635
4636         intel_svm_check(iommu);
4637
4638         if (dmaru->ignored) {
4639                 /*
4640                  * we always have to disable PMRs or DMA may fail on this device
4641                  */
4642                 if (force_on)
4643                         iommu_disable_protect_mem_regions(iommu);
4644                 return 0;
4645         }
4646
4647         intel_iommu_init_qi(iommu);
4648         iommu_flush_write_buffer(iommu);
4649
4650 #ifdef CONFIG_INTEL_IOMMU_SVM
4651         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4652                 ret = intel_svm_enable_prq(iommu);
4653                 if (ret)
4654                         goto disable_iommu;
4655         }
4656 #endif
4657         ret = dmar_set_interrupt(iommu);
4658         if (ret)
4659                 goto disable_iommu;
4660
4661         iommu_set_root_entry(iommu);
4662         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4663         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4664         iommu_enable_translation(iommu);
4665
4666         iommu_disable_protect_mem_regions(iommu);
4667         return 0;
4668
4669 disable_iommu:
4670         disable_dmar_iommu(iommu);
4671 out:
4672         free_dmar_iommu(iommu);
4673         return ret;
4674 }
4675
4676 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4677 {
4678         int ret = 0;
4679         struct intel_iommu *iommu = dmaru->iommu;
4680
4681         if (!intel_iommu_enabled)
4682                 return 0;
4683         if (iommu == NULL)
4684                 return -EINVAL;
4685
4686         if (insert) {
4687                 ret = intel_iommu_add(dmaru);
4688         } else {
4689                 disable_dmar_iommu(iommu);
4690                 free_dmar_iommu(iommu);
4691         }
4692
4693         return ret;
4694 }
4695
4696 static void intel_iommu_free_dmars(void)
4697 {
4698         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4699         struct dmar_atsr_unit *atsru, *atsr_n;
4700
4701         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4702                 list_del(&rmrru->list);
4703                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4704                 kfree(rmrru);
4705         }
4706
4707         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4708                 list_del(&atsru->list);
4709                 intel_iommu_free_atsr(atsru);
4710         }
4711 }
4712
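/*
 * Decide whether ATS is allowed for @dev: root-complex integrated devices
 * always qualify; otherwise the device's root port must be covered by an
 * ATSR unit for its PCI segment (either listed explicitly or via an
 * "include all" ATSR).  Returns 1 if ATS may be used, 0 otherwise.
 */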
4713 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4714 {
4715         int i, ret = 1;
4716         struct pci_bus *bus;
4717         struct pci_dev *bridge = NULL;
4718         struct device *tmp;
4719         struct acpi_dmar_atsr *atsr;
4720         struct dmar_atsr_unit *atsru;
4721
4722         dev = pci_physfn(dev);
4723         for (bus = dev->bus; bus; bus = bus->parent) {
4724                 bridge = bus->self;
4725                 /* If it's an integrated device, allow ATS */
4726                 if (!bridge)
4727                         return 1;
4728                 /* Connected via non-PCIe: no ATS */
4729                 if (!pci_is_pcie(bridge) ||
4730                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4731                         return 0;
4732                 /* If we found the root port, look it up in the ATSR */
4733                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4734                         break;
4735         }
4736
4737         rcu_read_lock();
4738         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4739                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4740                 if (atsr->segment != pci_domain_nr(dev->bus))
4741                         continue;
4742
4743                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4744                         if (tmp == &bridge->dev)
4745                                 goto out;
4746
4747                 if (atsru->include_all)
4748                         goto out;
4749         }
4750         ret = 0;
4751 out:
4752         rcu_read_unlock();
4753
4754         return ret;
4755 }
4756
4757 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4758 {
4759         int ret;
4760         struct dmar_rmrr_unit *rmrru;
4761         struct dmar_atsr_unit *atsru;
4762         struct acpi_dmar_atsr *atsr;
4763         struct acpi_dmar_reserved_memory *rmrr;
4764
4765         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4766                 return 0;
4767
4768         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4769                 rmrr = container_of(rmrru->hdr,
4770                                     struct acpi_dmar_reserved_memory, header);
4771                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4772                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4773                                 ((void *)rmrr) + rmrr->header.length,
4774                                 rmrr->segment, rmrru->devices,
4775                                 rmrru->devices_cnt);
4776                         if (ret < 0)
4777                                 return ret;
4778                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4779                         dmar_remove_dev_scope(info, rmrr->segment,
4780                                 rmrru->devices, rmrru->devices_cnt);
4781                 }
4782         }
4783
4784         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4785                 if (atsru->include_all)
4786                         continue;
4787
4788                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4789                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4790                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4791                                         (void *)atsr + atsr->header.length,
4792                                         atsr->segment, atsru->devices,
4793                                         atsru->devices_cnt);
4794                         if (ret > 0)
4795                                 break;
4796                         else if (ret < 0)
4797                                 return ret;
4798                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4799                         if (dmar_remove_dev_scope(info, atsr->segment,
4800                                         atsru->devices, atsru->devices_cnt))
4801                                 break;
4802                 }
4803         }
4804
4805         return 0;
4806 }
4807
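/*
 * Memory hotplug notifier for the static identity domain: extend the
 * identity map when a memory block goes online, and unmap, flush and free
 * the corresponding IOVA range when it goes offline again.
 */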
4808 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4809                                        unsigned long val, void *v)
4810 {
4811         struct memory_notify *mhp = v;
4812         unsigned long long start, end;
4813         unsigned long start_vpfn, last_vpfn;
4814
4815         switch (val) {
4816         case MEM_GOING_ONLINE:
4817                 start = mhp->start_pfn << PAGE_SHIFT;
4818                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4819                 if (iommu_domain_identity_map(si_domain, start, end)) {
4820                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4821                                 start, end);
4822                         return NOTIFY_BAD;
4823                 }
4824                 break;
4825
4826         case MEM_OFFLINE:
4827         case MEM_CANCEL_ONLINE:
4828                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4829                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4830                 while (start_vpfn <= last_vpfn) {
4831                         struct iova *iova;
4832                         struct dmar_drhd_unit *drhd;
4833                         struct intel_iommu *iommu;
4834                         struct page *freelist;
4835
4836                         iova = find_iova(&si_domain->iovad, start_vpfn);
4837                         if (iova == NULL) {
4838                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4839                                          start_vpfn);
4840                                 break;
4841                         }
4842
4843                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4844                                                      start_vpfn, last_vpfn);
4845                         if (iova == NULL) {
4846                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4847                                         start_vpfn, last_vpfn);
4848                                 return NOTIFY_BAD;
4849                         }
4850
4851                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4852                                                iova->pfn_hi);
4853
4854                         rcu_read_lock();
4855                         for_each_active_iommu(iommu, drhd)
4856                                 iommu_flush_iotlb_psi(iommu, si_domain,
4857                                         iova->pfn_lo, iova_size(iova),
4858                                         !freelist, 0);
4859                         rcu_read_unlock();
4860                         dma_free_pagelist(freelist);
4861
4862                         start_vpfn = iova->pfn_hi + 1;
4863                         free_iova_mem(iova);
4864                 }
4865                 break;
4866         }
4867
4868         return NOTIFY_OK;
4869 }
4870
4871 static struct notifier_block intel_iommu_memory_nb = {
4872         .notifier_call = intel_iommu_memory_notifier,
4873         .priority = 0
4874 };
4875
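/*
 * CPU hotplug: when a CPU goes away, return the IOVAs it had cached
 * per-CPU in every domain back to the IOVA allocator.
 */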
4876 static void free_all_cpu_cached_iovas(unsigned int cpu)
4877 {
4878         int i;
4879
4880         for (i = 0; i < g_num_of_iommus; i++) {
4881                 struct intel_iommu *iommu = g_iommus[i];
4882                 struct dmar_domain *domain;
4883                 int did;
4884
4885                 if (!iommu)
4886                         continue;
4887
4888                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4889                         domain = get_iommu_domain(iommu, (u16)did);
4890
4891                         if (!domain)
4892                                 continue;
4893                         free_cpu_cached_iovas(cpu, &domain->iovad);
4894                 }
4895         }
4896 }
4897
4898 static int intel_iommu_cpu_dead(unsigned int cpu)
4899 {
4900         free_all_cpu_cached_iovas(cpu);
4901         return 0;
4902 }
4903
4904 static void intel_disable_iommus(void)
4905 {
4906         struct intel_iommu *iommu = NULL;
4907         struct dmar_drhd_unit *drhd;
4908
4909         for_each_iommu(iommu, drhd)
4910                 iommu_disable_translation(iommu);
4911 }
4912
4913 void intel_iommu_shutdown(void)
4914 {
4915         struct dmar_drhd_unit *drhd;
4916         struct intel_iommu *iommu = NULL;
4917
4918         if (no_iommu || dmar_disabled)
4919                 return;
4920
4921         down_write(&dmar_global_lock);
4922
4923         /* Disable PMRs explicitly here. */
4924         for_each_iommu(iommu, drhd)
4925                 iommu_disable_protect_mem_regions(iommu);
4926
4927         /* Make sure the IOMMUs are switched off */
4928         intel_disable_iommus();
4929
4930         up_write(&dmar_global_lock);
4931 }
4932
4933 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4934 {
4935         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4936
4937         return container_of(iommu_dev, struct intel_iommu, iommu);
4938 }
4939
4940 static ssize_t intel_iommu_show_version(struct device *dev,
4941                                         struct device_attribute *attr,
4942                                         char *buf)
4943 {
4944         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4945         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4946         return sprintf(buf, "%d:%d\n",
4947                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4948 }
4949 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4950
4951 static ssize_t intel_iommu_show_address(struct device *dev,
4952                                         struct device_attribute *attr,
4953                                         char *buf)
4954 {
4955         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4956         return sprintf(buf, "%llx\n", iommu->reg_phys);
4957 }
4958 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4959
4960 static ssize_t intel_iommu_show_cap(struct device *dev,
4961                                     struct device_attribute *attr,
4962                                     char *buf)
4963 {
4964         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4965         return sprintf(buf, "%llx\n", iommu->cap);
4966 }
4967 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4968
4969 static ssize_t intel_iommu_show_ecap(struct device *dev,
4970                                     struct device_attribute *attr,
4971                                     char *buf)
4972 {
4973         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4974         return sprintf(buf, "%llx\n", iommu->ecap);
4975 }
4976 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4977
4978 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4979                                       struct device_attribute *attr,
4980                                       char *buf)
4981 {
4982         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4983         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4984 }
4985 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4986
4987 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4988                                            struct device_attribute *attr,
4989                                            char *buf)
4990 {
4991         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4992         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4993                                                   cap_ndoms(iommu->cap)));
4994 }
4995 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4996
4997 static struct attribute *intel_iommu_attrs[] = {
4998         &dev_attr_version.attr,
4999         &dev_attr_address.attr,
5000         &dev_attr_cap.attr,
5001         &dev_attr_ecap.attr,
5002         &dev_attr_domains_supported.attr,
5003         &dev_attr_domains_used.attr,
5004         NULL,
5005 };
5006
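/* These attributes typically appear under /sys/class/iommu/dmar<N>/intel-iommu/. */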
5007 static struct attribute_group intel_iommu_group = {
5008         .name = "intel-iommu",
5009         .attrs = intel_iommu_attrs,
5010 };
5011
5012 const struct attribute_group *intel_iommu_groups[] = {
5013         &intel_iommu_group,
5014         NULL,
5015 };
5016
5017 static inline bool has_untrusted_dev(void)
5018 {
5019         struct pci_dev *pdev = NULL;
5020
5021         for_each_pci_dev(pdev)
5022                 if (pdev->untrusted)
5023                         return true;
5024
5025         return false;
5026 }
5027
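/*
 * If the platform opted in to DMA protection in the DMAR table and an
 * untrusted (e.g. externally exposed) PCI device is present, override a
 * command-line "off" setting and force the IOMMU on.
 */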
5028 static int __init platform_optin_force_iommu(void)
5029 {
5030         if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
5031                 return 0;
5032
5033         if (no_iommu || dmar_disabled)
5034                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
5035
5036         /*
5037          * If Intel-IOMMU is disabled by default, we will apply identity
5038          * map for all devices except those marked as being untrusted.
5039          */
5040         if (dmar_disabled)
5041                 iommu_set_default_passthrough(false);
5042
5043         dmar_disabled = 0;
5044         no_iommu = 0;
5045
5046         return 1;
5047 }
5048
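/*
 * Walk the ACPI namespace devices listed in the DRHD device scopes and
 * probe their physical companion devices with the Intel IOMMU ops, so
 * they end up in IOMMU groups like ordinary PCI devices.
 */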
5049 static int __init probe_acpi_namespace_devices(void)
5050 {
5051         struct dmar_drhd_unit *drhd;
5052         /* To avoid a -Wunused-but-set-variable warning. */
5053         struct intel_iommu *iommu __maybe_unused;
5054         struct device *dev;
5055         int i, ret = 0;
5056
5057         for_each_active_iommu(iommu, drhd) {
5058                 for_each_active_dev_scope(drhd->devices,
5059                                           drhd->devices_cnt, i, dev) {
5060                         struct acpi_device_physical_node *pn;
5061                         struct iommu_group *group;
5062                         struct acpi_device *adev;
5063
5064                         if (dev->bus != &acpi_bus_type)
5065                                 continue;
5066
5067                         adev = to_acpi_device(dev);
5068                         mutex_lock(&adev->physical_node_lock);
5069                         list_for_each_entry(pn,
5070                                             &adev->physical_node_list, node) {
5071                                 group = iommu_group_get(pn->dev);
5072                                 if (group) {
5073                                         iommu_group_put(group);
5074                                         continue;
5075                                 }
5076
5077                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
5078                                 ret = iommu_probe_device(pn->dev);
5079                                 if (ret)
5080                                         break;
5081                         }
5082                         mutex_unlock(&adev->physical_node_lock);
5083
5084                         if (ret)
5085                                 return ret;
5086                 }
5087         }
5088
5089         return 0;
5090 }
5091
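/*
 * Boot-time entry point: parse the DMAR table and device scopes, set up
 * DMA remapping via init_dmars(), register the sysfs attributes, IOMMU
 * core ops and hotplug notifiers, and finally enable translation (and
 * disable protected memory regions) on every unit.
 */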
5092 int __init intel_iommu_init(void)
5093 {
5094         int ret = -ENODEV;
5095         struct dmar_drhd_unit *drhd;
5096         struct intel_iommu *iommu;
5097
5098         /*
5099          * Intel IOMMU is required for a TXT/tboot launch or platform
5100          * opt in, so enforce that.
5101          */
5102         force_on = tboot_force_iommu() || platform_optin_force_iommu();
5103
5104         if (iommu_init_mempool()) {
5105                 if (force_on)
5106                         panic("tboot: Failed to initialize iommu memory\n");
5107                 return -ENOMEM;
5108         }
5109
5110         down_write(&dmar_global_lock);
5111         if (dmar_table_init()) {
5112                 if (force_on)
5113                         panic("tboot: Failed to initialize DMAR table\n");
5114                 goto out_free_dmar;
5115         }
5116
5117         if (dmar_dev_scope_init() < 0) {
5118                 if (force_on)
5119                         panic("tboot: Failed to initialize DMAR device scope\n");
5120                 goto out_free_dmar;
5121         }
5122
5123         up_write(&dmar_global_lock);
5124
5125         /*
5126          * The bus notifier takes the dmar_global_lock, so lockdep would
5127          * complain if we registered it while still holding the lock.
5128          */
5129         dmar_register_bus_notifier();
5130
5131         down_write(&dmar_global_lock);
5132
5133         if (no_iommu || dmar_disabled) {
5134                 /*
5135                  * We exit the function here to ensure IOMMU's remapping and
5136                  * mempool aren't setup, which means that the IOMMU's PMRs
5137                  * won't be disabled via the call to init_dmars(). So disable
5138                  * it explicitly here. The PMRs were setup by tboot prior to
5139                  * calling SENTER, but the kernel is expected to reset/tear
5140                  * down the PMRs.
5141                  */
5142                 if (intel_iommu_tboot_noforce) {
5143                         for_each_iommu(iommu, drhd)
5144                                 iommu_disable_protect_mem_regions(iommu);
5145                 }
5146
5147                 /*
5148                  * Make sure the IOMMUs are switched off, even when we
5149                  * boot into a kexec kernel and the previous kernel left
5150                  * them enabled
5151                  */
5152                 intel_disable_iommus();
5153                 goto out_free_dmar;
5154         }
5155
5156         if (list_empty(&dmar_rmrr_units))
5157                 pr_info("No RMRR found\n");
5158
5159         if (list_empty(&dmar_atsr_units))
5160                 pr_info("No ATSR found\n");
5161
5162         if (dmar_init_reserved_ranges()) {
5163                 if (force_on)
5164                         panic("tboot: Failed to reserve iommu ranges\n");
5165                 goto out_free_reserved_range;
5166         }
5167
5168         if (dmar_map_gfx)
5169                 intel_iommu_gfx_mapped = 1;
5170
5171         init_no_remapping_devices();
5172
5173         ret = init_dmars();
5174         if (ret) {
5175                 if (force_on)
5176                         panic("tboot: Failed to initialize DMARs\n");
5177                 pr_err("Initialization failed\n");
5178                 goto out_free_reserved_range;
5179         }
5180         up_write(&dmar_global_lock);
5181
5182 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5183         /*
5184          * If the system has no untrusted device, or the user has decided
5185          * to disable the bounce page mechanism, we don't need swiotlb.
5186          * Record that here so that the pre-allocated bounce pages can be
5187          * released later.
5188          */
5189         if (!has_untrusted_dev() || intel_no_bounce)
5190                 swiotlb = 0;
5191 #endif
5192         dma_ops = &intel_dma_ops;
5193
5194         init_iommu_pm_ops();
5195
5196         for_each_active_iommu(iommu, drhd) {
5197                 iommu_device_sysfs_add(&iommu->iommu, NULL,
5198                                        intel_iommu_groups,
5199                                        "%s", iommu->name);
5200                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5201                 iommu_device_register(&iommu->iommu);
5202         }
5203
5204         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5205         if (si_domain && !hw_pass_through)
5206                 register_memory_notifier(&intel_iommu_memory_nb);
5207         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5208                           intel_iommu_cpu_dead);
5209
5210         down_read(&dmar_global_lock);
5211         if (probe_acpi_namespace_devices())
5212                 pr_warn("ACPI namespace devices didn't probe correctly\n");
5213         up_read(&dmar_global_lock);
5214
5215         /* Finally, we enable the DMA remapping hardware. */
5216         for_each_iommu(iommu, drhd) {
5217                 if (!drhd->ignored && !translation_pre_enabled(iommu))
5218                         iommu_enable_translation(iommu);
5219
5220                 iommu_disable_protect_mem_regions(iommu);
5221         }
5222         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5223
5224         intel_iommu_enabled = 1;
5225         intel_iommu_debugfs_init();
5226
5227         return 0;
5228
5229 out_free_reserved_range:
5230         put_iova_domain(&reserved_iova_list);
5231 out_free_dmar:
5232         intel_iommu_free_dmars();
5233         up_write(&dmar_global_lock);
5234         iommu_exit_mempool();
5235         return ret;
5236 }
5237
5238 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5239 {
5240         struct intel_iommu *iommu = opaque;
5241
5242         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5243         return 0;
5244 }
5245
5246 /*
5247  * NB - intel-iommu lacks any sort of reference counting for the users of
5248  * dependent devices.  If multiple endpoints have intersecting dependent
5249  * devices, unbinding the driver from any one of them will possibly leave
5250  * the others unable to operate.
5251  */
5252 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5253 {
5254         if (!iommu || !dev || !dev_is_pci(dev))
5255                 return;
5256
5257         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5258 }
5259
5260 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5261 {
5262         struct dmar_domain *domain;
5263         struct intel_iommu *iommu;
5264         unsigned long flags;
5265
5266         assert_spin_locked(&device_domain_lock);
5267
5268         if (WARN_ON(!info))
5269                 return;
5270
5271         iommu = info->iommu;
5272         domain = info->domain;
5273
5274         if (info->dev) {
5275                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5276                         intel_pasid_tear_down_entry(iommu, info->dev,
5277                                         PASID_RID2PASID);
5278
5279                 iommu_disable_dev_iotlb(info);
5280                 domain_context_clear(iommu, info->dev);
5281                 intel_pasid_free_table(info->dev);
5282         }
5283
5284         unlink_domain_info(info);
5285
5286         spin_lock_irqsave(&iommu->lock, flags);
5287         domain_detach_iommu(domain, iommu);
5288         spin_unlock_irqrestore(&iommu->lock, flags);
5289
5290         /* free the private domain */
5291         if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5292             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5293             list_empty(&domain->devices))
5294                 domain_exit(info->domain);
5295
5296         free_devinfo_mem(info);
5297 }
5298
5299 static void dmar_remove_one_dev_info(struct device *dev)
5300 {
5301         struct device_domain_info *info;
5302         unsigned long flags;
5303
5304         spin_lock_irqsave(&device_domain_lock, flags);
5305         info = dev->archdata.iommu;
5306         if (info && info != DEFER_DEVICE_DOMAIN_INFO &&
5307             info != DUMMY_DEVICE_DOMAIN_INFO)
5308                 __dmar_remove_one_dev_info(info);
5309         spin_unlock_irqrestore(&device_domain_lock, flags);
5310 }
5311
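/*
 * Initialize a domain allocated through the IOMMU API: reserve the special
 * IOVA ranges, derive the AGAW from the requested guest address width and
 * allocate the top-level page directory.
 */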
5312 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5313 {
5314         int adjust_width;
5315
5316         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5317         domain_reserve_special_ranges(domain);
5318
5319         /* calculate AGAW */
5320         domain->gaw = guest_width;
5321         adjust_width = guestwidth_to_adjustwidth(guest_width);
5322         domain->agaw = width_to_agaw(adjust_width);
5323
5324         domain->iommu_coherency = 0;
5325         domain->iommu_snooping = 0;
5326         domain->iommu_superpage = 0;
5327         domain->max_addr = 0;
5328
5329         /* always allocate the top pgd */
5330         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5331         if (!domain->pgd)
5332                 return -ENOMEM;
5333         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5334         return 0;
5335 }
5336
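/*
 * Domain allocation for the IOMMU core: DMA and unmanaged requests get a
 * fresh dmar_domain (non-strict DMA domains also get an IOVA flush queue),
 * while identity requests share the single si_domain.
 */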
5337 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5338 {
5339         struct dmar_domain *dmar_domain;
5340         struct iommu_domain *domain;
5341         int ret;
5342
5343         switch (type) {
5344         case IOMMU_DOMAIN_DMA:
5345         /* fallthrough */
5346         case IOMMU_DOMAIN_UNMANAGED:
5347                 dmar_domain = alloc_domain(0);
5348                 if (!dmar_domain) {
5349                         pr_err("Can't allocate dmar_domain\n");
5350                         return NULL;
5351                 }
5352                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5353                         pr_err("Domain initialization failed\n");
5354                         domain_exit(dmar_domain);
5355                         return NULL;
5356                 }
5357
5358                 if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
5359                         ret = init_iova_flush_queue(&dmar_domain->iovad,
5360                                                     iommu_flush_iova,
5361                                                     iova_entry_free);
5362                         if (ret)
5363                                 pr_info("iova flush queue initialization failed\n");
5364                 }
5365
5366                 domain_update_iommu_cap(dmar_domain);
5367
5368                 domain = &dmar_domain->domain;
5369                 domain->geometry.aperture_start = 0;
5370                 domain->geometry.aperture_end   =
5371                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5372                 domain->geometry.force_aperture = true;
5373
5374                 return domain;
5375         case IOMMU_DOMAIN_IDENTITY:
5376                 return &si_domain->domain;
5377         default:
5378                 return NULL;
5379         }
5380
5381         return NULL;
5382 }
5383
5384 static void intel_iommu_domain_free(struct iommu_domain *domain)
5385 {
5386         if (domain != &si_domain->domain)
5387                 domain_exit(to_dmar_domain(domain));
5388 }
5389
5390 /*
5391  * Check whether a @domain could be attached to the @dev through the
5392  * aux-domain attach/detach APIs.
5393  */
5394 static inline bool
5395 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5396 {
5397         struct device_domain_info *info = dev->archdata.iommu;
5398
5399         return info && info->auxd_enabled &&
5400                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5401 }
5402
5403 static void auxiliary_link_device(struct dmar_domain *domain,
5404                                   struct device *dev)
5405 {
5406         struct device_domain_info *info = dev->archdata.iommu;
5407
5408         assert_spin_locked(&device_domain_lock);
5409         if (WARN_ON(!info))
5410                 return;
5411
5412         domain->auxd_refcnt++;
5413         list_add(&domain->auxd, &info->auxiliary_domains);
5414 }
5415
5416 static void auxiliary_unlink_device(struct dmar_domain *domain,
5417                                     struct device *dev)
5418 {
5419         struct device_domain_info *info = dev->archdata.iommu;
5420
5421         assert_spin_locked(&device_domain_lock);
5422         if (WARN_ON(!info))
5423                 return;
5424
5425         list_del(&domain->auxd);
5426         domain->auxd_refcnt--;
5427
5428         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5429                 ioasid_free(domain->default_pasid);
5430 }
5431
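/*
 * Attach @domain to @dev as an auxiliary domain: allocate the domain's
 * default PASID on first use and install a first- or second-level PASID
 * entry for it, holding device_domain_lock and iommu->lock.
 */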
5432 static int aux_domain_add_dev(struct dmar_domain *domain,
5433                               struct device *dev)
5434 {
5435         int ret;
5436         u8 bus, devfn;
5437         unsigned long flags;
5438         struct intel_iommu *iommu;
5439
5440         iommu = device_to_iommu(dev, &bus, &devfn);
5441         if (!iommu)
5442                 return -ENODEV;
5443
5444         if (domain->default_pasid <= 0) {
5445                 int pasid;
5446
5447                 /* No private data needed for the default pasid */
5448                 pasid = ioasid_alloc(NULL, PASID_MIN,
5449                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5450                                      NULL);
5451                 if (pasid == INVALID_IOASID) {
5452                         pr_err("Can't allocate default pasid\n");
5453                         return -ENODEV;
5454                 }
5455                 domain->default_pasid = pasid;
5456         }
5457
5458         spin_lock_irqsave(&device_domain_lock, flags);
5459         /*
5460          * iommu->lock must be held to attach the domain to the iommu and to
5461          * set up the PASID entry for second-level translation.
5462          */
5463         spin_lock(&iommu->lock);
5464         ret = domain_attach_iommu(domain, iommu);
5465         if (ret)
5466                 goto attach_failed;
5467
5468         /* Setup the PASID entry for mediated devices: */
5469         if (domain_use_first_level(domain))
5470                 ret = domain_setup_first_level(iommu, domain, dev,
5471                                                domain->default_pasid);
5472         else
5473                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5474                                                      domain->default_pasid);
5475         if (ret)
5476                 goto table_failed;
5477         spin_unlock(&iommu->lock);
5478
5479         auxiliary_link_device(domain, dev);
5480
5481         spin_unlock_irqrestore(&device_domain_lock, flags);
5482
5483         return 0;
5484
5485 table_failed:
5486         domain_detach_iommu(domain, iommu);
5487 attach_failed:
5488         spin_unlock(&iommu->lock);
5489         spin_unlock_irqrestore(&device_domain_lock, flags);
5490         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5491                 ioasid_free(domain->default_pasid);
5492
5493         return ret;
5494 }
5495
5496 static void aux_domain_remove_dev(struct dmar_domain *domain,
5497                                   struct device *dev)
5498 {
5499         struct device_domain_info *info;
5500         struct intel_iommu *iommu;
5501         unsigned long flags;
5502
5503         if (!is_aux_domain(dev, &domain->domain))
5504                 return;
5505
5506         spin_lock_irqsave(&device_domain_lock, flags);
5507         info = dev->archdata.iommu;
5508         iommu = info->iommu;
5509
5510         auxiliary_unlink_device(domain, dev);
5511
5512         spin_lock(&iommu->lock);
5513         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5514         domain_detach_iommu(domain, iommu);
5515         spin_unlock(&iommu->lock);
5516
5517         spin_unlock_irqrestore(&device_domain_lock, flags);
5518 }
5519
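/*
 * Common checks before (aux-)attaching @dev: make sure the IOMMU's address
 * width covers everything already mapped in the domain, then drop any
 * page-table levels this IOMMU cannot walk.
 */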
5520 static int prepare_domain_attach_device(struct iommu_domain *domain,
5521                                         struct device *dev)
5522 {
5523         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5524         struct intel_iommu *iommu;
5525         int addr_width;
5526         u8 bus, devfn;
5527
5528         iommu = device_to_iommu(dev, &bus, &devfn);
5529         if (!iommu)
5530                 return -ENODEV;
5531
5532         /* check if this iommu agaw is sufficient for max mapped address */
5533         addr_width = agaw_to_width(iommu->agaw);
5534         if (addr_width > cap_mgaw(iommu->cap))
5535                 addr_width = cap_mgaw(iommu->cap);
5536
5537         if (dmar_domain->max_addr > (1LL << addr_width)) {
5538                 dev_err(dev, "%s: iommu width (%d) is not "
5539                         "sufficient for the mapped address (%llx)\n",
5540                         __func__, addr_width, dmar_domain->max_addr);
5541                 return -EFAULT;
5542         }
5543         dmar_domain->gaw = addr_width;
5544
5545         /*
5546          * Knock out extra levels of page tables if necessary
5547          */
5548         while (iommu->agaw < dmar_domain->agaw) {
5549                 struct dma_pte *pte;
5550
5551                 pte = dmar_domain->pgd;
5552                 if (dma_pte_present(pte)) {
5553                         dmar_domain->pgd = (struct dma_pte *)
5554                                 phys_to_virt(dma_pte_addr(pte));
5555                         free_pgtable_page(pte);
5556                 }
5557                 dmar_domain->agaw--;
5558         }
5559
5560         return 0;
5561 }
5562
5563 static int intel_iommu_attach_device(struct iommu_domain *domain,
5564                                      struct device *dev)
5565 {
5566         int ret;
5567
5568         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5569             device_is_rmrr_locked(dev)) {
5570                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5571                 return -EPERM;
5572         }
5573
5574         if (is_aux_domain(dev, domain))
5575                 return -EPERM;
5576
5577         /* normally dev is not mapped */
5578         if (unlikely(domain_context_mapped(dev))) {
5579                 struct dmar_domain *old_domain;
5580
5581                 old_domain = find_domain(dev);
5582                 if (old_domain)
5583                         dmar_remove_one_dev_info(dev);
5584         }
5585
5586         ret = prepare_domain_attach_device(domain, dev);
5587         if (ret)
5588                 return ret;
5589
5590         return domain_add_dev_info(to_dmar_domain(domain), dev);
5591 }
5592
5593 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5594                                          struct device *dev)
5595 {
5596         int ret;
5597
5598         if (!is_aux_domain(dev, domain))
5599                 return -EPERM;
5600
5601         ret = prepare_domain_attach_device(domain, dev);
5602         if (ret)
5603                 return ret;
5604
5605         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5606 }
5607
5608 static void intel_iommu_detach_device(struct iommu_domain *domain,
5609                                       struct device *dev)
5610 {
5611         dmar_remove_one_dev_info(dev);
5612 }
5613
5614 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5615                                           struct device *dev)
5616 {
5617         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5618 }
5619
5620 static int intel_iommu_map(struct iommu_domain *domain,
5621                            unsigned long iova, phys_addr_t hpa,
5622                            size_t size, int iommu_prot, gfp_t gfp)
5623 {
5624         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5625         u64 max_addr;
5626         int prot = 0;
5627         int ret;
5628
5629         if (iommu_prot & IOMMU_READ)
5630                 prot |= DMA_PTE_READ;
5631         if (iommu_prot & IOMMU_WRITE)
5632                 prot |= DMA_PTE_WRITE;
5633         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5634                 prot |= DMA_PTE_SNP;
5635
5636         max_addr = iova + size;
5637         if (dmar_domain->max_addr < max_addr) {
5638                 u64 end;
5639
5640                 /* check if minimum agaw is sufficient for mapped address */
5641                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5642                 if (end < max_addr) {
5643                         pr_err("%s: iommu width (%d) is not "
5644                                "sufficient for the mapped address (%llx)\n",
5645                                __func__, dmar_domain->gaw, max_addr);
5646                         return -EFAULT;
5647                 }
5648                 dmar_domain->max_addr = max_addr;
5649         }
5650         /* Round up size to next multiple of PAGE_SIZE, if it and
5651            the low bits of hpa would take us onto the next page */
5652         size = aligned_nrpages(hpa, size);
5653         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5654                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5655         return ret;
5656 }
5657
5658 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5659                                 unsigned long iova, size_t size,
5660                                 struct iommu_iotlb_gather *gather)
5661 {
5662         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5663         struct page *freelist = NULL;
5664         unsigned long start_pfn, last_pfn;
5665         unsigned int npages;
5666         int iommu_id, level = 0;
5667
5668         /* Cope with horrid API which requires us to unmap more than the
5669            size argument if it happens to be a large-page mapping. */
5670         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5671
5672         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5673                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5674
5675         start_pfn = iova >> VTD_PAGE_SHIFT;
5676         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5677
5678         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5679
5680         npages = last_pfn - start_pfn + 1;
5681
5682         for_each_domain_iommu(iommu_id, dmar_domain)
5683                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5684                                       start_pfn, npages, !freelist, 0);
5685
5686         dma_free_pagelist(freelist);
5687
5688         if (dmar_domain->max_addr == iova + size)
5689                 dmar_domain->max_addr = iova;
5690
5691         return size;
5692 }
5693
5694 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5695                                             dma_addr_t iova)
5696 {
5697         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5698         struct dma_pte *pte;
5699         int level = 0;
5700         u64 phys = 0;
5701
5702         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5703         if (pte)
5704                 phys = dma_pte_addr(pte);
5705
5706         return phys;
5707 }
5708
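/*
 * The three helpers below report whether *every* active IOMMU supports
 * scalable mode, PASID and nested translation respectively; one unit
 * lacking the capability makes the whole check fail.
 */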
5709 static inline bool scalable_mode_support(void)
5710 {
5711         struct dmar_drhd_unit *drhd;
5712         struct intel_iommu *iommu;
5713         bool ret = true;
5714
5715         rcu_read_lock();
5716         for_each_active_iommu(iommu, drhd) {
5717                 if (!sm_supported(iommu)) {
5718                         ret = false;
5719                         break;
5720                 }
5721         }
5722         rcu_read_unlock();
5723
5724         return ret;
5725 }
5726
5727 static inline bool iommu_pasid_support(void)
5728 {
5729         struct dmar_drhd_unit *drhd;
5730         struct intel_iommu *iommu;
5731         bool ret = true;
5732
5733         rcu_read_lock();
5734         for_each_active_iommu(iommu, drhd) {
5735                 if (!pasid_supported(iommu)) {
5736                         ret = false;
5737                         break;
5738                 }
5739         }
5740         rcu_read_unlock();
5741
5742         return ret;
5743 }
5744
5745 static inline bool nested_mode_support(void)
5746 {
5747         struct dmar_drhd_unit *drhd;
5748         struct intel_iommu *iommu;
5749         bool ret = true;
5750
5751         rcu_read_lock();
5752         for_each_active_iommu(iommu, drhd) {
5753                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5754                         ret = false;
5755                         break;
5756                 }
5757         }
5758         rcu_read_unlock();
5759
5760         return ret;
5761 }
5762
5763 static bool intel_iommu_capable(enum iommu_cap cap)
5764 {
5765         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5766                 return domain_update_iommu_snooping(NULL) == 1;
5767         if (cap == IOMMU_CAP_INTR_REMAP)
5768                 return irq_remapping_enabled == 1;
5769
5770         return false;
5771 }
5772
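/*
 * Hook up a newly discovered device: link it to its IOMMU in sysfs, place
 * it in a group, and if its required default domain type differs from what
 * it was given, fall back to a private identity or DMA domain.  Devices
 * that need it (e.g. untrusted ones) also get the bounce-page DMA ops.
 */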
5773 static int intel_iommu_add_device(struct device *dev)
5774 {
5775         struct dmar_domain *dmar_domain;
5776         struct iommu_domain *domain;
5777         struct intel_iommu *iommu;
5778         struct iommu_group *group;
5779         u8 bus, devfn;
5780         int ret;
5781
5782         iommu = device_to_iommu(dev, &bus, &devfn);
5783         if (!iommu)
5784                 return -ENODEV;
5785
5786         iommu_device_link(&iommu->iommu, dev);
5787
5788         if (translation_pre_enabled(iommu))
5789                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5790
5791         group = iommu_group_get_for_dev(dev);
5792
5793         if (IS_ERR(group)) {
5794                 ret = PTR_ERR(group);
5795                 goto unlink;
5796         }
5797
5798         iommu_group_put(group);
5799
5800         domain = iommu_get_domain_for_dev(dev);
5801         dmar_domain = to_dmar_domain(domain);
5802         if (domain->type == IOMMU_DOMAIN_DMA) {
5803                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5804                         ret = iommu_request_dm_for_dev(dev);
5805                         if (ret) {
5806                                 dmar_remove_one_dev_info(dev);
5807                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5808                                 domain_add_dev_info(si_domain, dev);
5809                                 dev_info(dev,
5810                                          "Device uses a private identity domain.\n");
5811                         }
5812                 }
5813         } else {
5814                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5815                         ret = iommu_request_dma_domain_for_dev(dev);
5816                         if (ret) {
5817                                 dmar_remove_one_dev_info(dev);
5818                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5819                                 if (!get_private_domain_for_dev(dev)) {
5820                                         dev_warn(dev,
5821                                                  "Failed to get a private domain.\n");
5822                                         ret = -ENOMEM;
5823                                         goto unlink;
5824                                 }
5825
5826                                 dev_info(dev,
5827                                          "Device uses a private dma domain.\n");
5828                         }
5829                 }
5830         }
5831
5832         if (device_needs_bounce(dev)) {
5833                 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5834                 set_dma_ops(dev, &bounce_dma_ops);
5835         }
5836
5837         return 0;
5838
5839 unlink:
5840         iommu_device_unlink(&iommu->iommu, dev);
5841         return ret;
5842 }
5843
5844 static void intel_iommu_remove_device(struct device *dev)
5845 {
5846         struct intel_iommu *iommu;
5847         u8 bus, devfn;
5848
5849         iommu = device_to_iommu(dev, &bus, &devfn);
5850         if (!iommu)
5851                 return;
5852
5853         dmar_remove_one_dev_info(dev);
5854
5855         iommu_group_remove_device(dev);
5856
5857         iommu_device_unlink(&iommu->iommu, dev);
5858
5859         if (device_needs_bounce(dev))
5860                 set_dma_ops(dev, NULL);
5861 }
5862
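/*
 * Report reserved regions for @device: the RMRRs that target it (or a
 * bridge above it), the legacy ISA/floppy window when that workaround is
 * enabled, and the IOAPIC MSI range.
 */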
5863 static void intel_iommu_get_resv_regions(struct device *device,
5864                                          struct list_head *head)
5865 {
5866         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5867         struct iommu_resv_region *reg;
5868         struct dmar_rmrr_unit *rmrr;
5869         struct device *i_dev;
5870         int i;
5871
5872         down_read(&dmar_global_lock);
5873         for_each_rmrr_units(rmrr) {
5874                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5875                                           i, i_dev) {
5876                         struct iommu_resv_region *resv;
5877                         enum iommu_resv_type type;
5878                         size_t length;
5879
5880                         if (i_dev != device &&
5881                             !is_downstream_to_pci_bridge(device, i_dev))
5882                                 continue;
5883
5884                         length = rmrr->end_address - rmrr->base_address + 1;
5885
5886                         type = device_rmrr_is_relaxable(device) ?
5887                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5888
5889                         resv = iommu_alloc_resv_region(rmrr->base_address,
5890                                                        length, prot, type);
5891                         if (!resv)
5892                                 break;
5893
5894                         list_add_tail(&resv->list, head);
5895                 }
5896         }
5897         up_read(&dmar_global_lock);
5898
5899 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5900         if (dev_is_pci(device)) {
5901                 struct pci_dev *pdev = to_pci_dev(device);
5902
5903                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5904                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5905                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5906                         if (reg)
5907                                 list_add_tail(&reg->list, head);
5908                 }
5909         }
5910 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5911
5912         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5913                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5914                                       0, IOMMU_RESV_MSI);
5915         if (!reg)
5916                 return;
5917         list_add_tail(&reg->list, head);
5918 }
5919
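/*
 * Illustrative sketch, not part of the driver: how a consumer might walk
 * the reserved regions reported by the callback above through the generic
 * IOMMU core API. The helper name and the message format are hypothetical;
 * only iommu_get_resv_regions()/iommu_put_resv_regions() and the
 * struct iommu_resv_region fields used here come from the IOMMU core.
 */
static inline void example_dump_resv_regions(struct device *dev)
{
        struct iommu_resv_region *region;
        LIST_HEAD(head);

        iommu_get_resv_regions(dev, &head);
        list_for_each_entry(region, &head, list)
                dev_info(dev, "resv region: start %pa length %zu type %d\n",
                         &region->start, region->length, region->type);
        iommu_put_resv_regions(dev, &head);
}
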
5920 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5921 {
5922         struct device_domain_info *info;
5923         struct context_entry *context;
5924         struct dmar_domain *domain;
5925         unsigned long flags;
5926         u64 ctx_lo;
5927         int ret;
5928
5929         domain = find_domain(dev);
5930         if (!domain)
5931                 return -EINVAL;
5932
5933         spin_lock_irqsave(&device_domain_lock, flags);
5934         spin_lock(&iommu->lock);
5935
5936         ret = -EINVAL;
5937         info = dev->archdata.iommu;
5938         if (!info || !info->pasid_supported)
5939                 goto out;
5940
5941         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5942         if (WARN_ON(!context))
5943                 goto out;
5944
5945         ctx_lo = context[0].lo;
5946
5947         if (!(ctx_lo & CONTEXT_PASIDE)) {
5948                 ctx_lo |= CONTEXT_PASIDE;
5949                 context[0].lo = ctx_lo;
5950                 wmb();
5951                 iommu->flush.flush_context(iommu,
5952                                            domain->iommu_did[iommu->seq_id],
5953                                            PCI_DEVID(info->bus, info->devfn),
5954                                            DMA_CCMD_MASK_NOBIT,
5955                                            DMA_CCMD_DEVICE_INVL);
5956         }
5957
5958         /* Enable PASID support in the device, if it wasn't already */
5959         if (!info->pasid_enabled)
5960                 iommu_enable_dev_iotlb(info);
5961
5962         ret = 0;
5963
5964  out:
5965         spin_unlock(&iommu->lock);
5966         spin_unlock_irqrestore(&device_domain_lock, flags);
5967
5968         return ret;
5969 }
5970
5971 static void intel_iommu_apply_resv_region(struct device *dev,
5972                                           struct iommu_domain *domain,
5973                                           struct iommu_resv_region *region)
5974 {
5975         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5976         unsigned long start, end;
5977
5978         start = IOVA_PFN(region->start);
5979         end   = IOVA_PFN(region->start + region->length - 1);
5980
5981         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5982 }
5983
5984 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5985 {
5986         if (dev_is_pci(dev))
5987                 return pci_device_group(dev);
5988         return generic_device_group(dev);
5989 }
5990
5991 #ifdef CONFIG_INTEL_IOMMU_SVM
5992 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5993 {
5994         struct intel_iommu *iommu;
5995         u8 bus, devfn;
5996
5997         if (iommu_dummy(dev)) {
5998                 dev_warn(dev,
5999                          "No IOMMU translation for device; cannot enable SVM\n");
6000                 return NULL;
6001         }
6002
6003         iommu = device_to_iommu(dev, &bus, &devfn);
6004         if (!iommu) {
6005                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
6006                 return NULL;
6007         }
6008
6009         return iommu;
6010 }
6011 #endif /* CONFIG_INTEL_IOMMU_SVM */
6012
6013 static int intel_iommu_enable_auxd(struct device *dev)
6014 {
6015         struct device_domain_info *info;
6016         struct intel_iommu *iommu;
6017         unsigned long flags;
6018         u8 bus, devfn;
6019         int ret;
6020
6021         iommu = device_to_iommu(dev, &bus, &devfn);
6022         if (!iommu || dmar_disabled)
6023                 return -EINVAL;
6024
6025         if (!sm_supported(iommu) || !pasid_supported(iommu))
6026                 return -EINVAL;
6027
6028         ret = intel_iommu_enable_pasid(iommu, dev);
6029         if (ret)
6030                 return -ENODEV;
6031
6032         spin_lock_irqsave(&device_domain_lock, flags);
6033         info = dev->archdata.iommu;
6034         info->auxd_enabled = 1;
6035         spin_unlock_irqrestore(&device_domain_lock, flags);
6036
6037         return 0;
6038 }
6039
6040 static int intel_iommu_disable_auxd(struct device *dev)
6041 {
6042         struct device_domain_info *info;
6043         unsigned long flags;
6044
6045         spin_lock_irqsave(&device_domain_lock, flags);
6046         info = dev->archdata.iommu;
6047         if (!WARN_ON(!info))
6048                 info->auxd_enabled = 0;
6049         spin_unlock_irqrestore(&device_domain_lock, flags);
6050
6051         return 0;
6052 }
6053
6054 /*
6055  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
6056  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
6057  * spec so that system software and tools can detect endpoint devices that
6058  * support Intel Scalable I/O Virtualization without a host driver
6059  * dependency.
6060  *
6061  * Returns the config space offset of the matching extended capability
6062  * structure, or 0 if the device does not support it.
6063  */
6064 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6065 {
6066         int pos;
6067         u16 vendor, id;
6068
6069         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
6070         while (pos) {
6071                 pci_read_config_word(pdev, pos + 4, &vendor);
6072                 pci_read_config_word(pdev, pos + 8, &id);
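                /*
                 * Offsets 4 and 8 are the DVSEC header fields carrying the
                 * vendor ID and the DVSEC ID; the check below matches the
                 * Intel Scalable IOV DVSEC (ID 5) described in the spec
                 * referenced in the comment above.
                 */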
6073                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
6074                         return pos;
6075
6076                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
6077         }
6078
6079         return 0;
6080 }
6081
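/*
 * Illustrative sketch, hypothetical helper not used by this driver: the
 * same DVSEC walk generalized to an arbitrary vendor/DVSEC ID pair, built
 * only on pci_find_next_ext_capability() and pci_read_config_word(). 0x23
 * is the PCIe Designated Vendor-Specific Extended Capability ID, as in
 * siov_find_pci_dvsec() above.
 */
static inline int example_find_dvsec(struct pci_dev *pdev, u16 vendor, u16 id)
{
        u16 v, d;
        int pos;

        pos = pci_find_next_ext_capability(pdev, 0, 0x23);
        while (pos) {
                pci_read_config_word(pdev, pos + 4, &v);        /* DVSEC vendor */
                pci_read_config_word(pdev, pos + 8, &d);        /* DVSEC ID */
                if (v == vendor && d == id)
                        return pos;

                pos = pci_find_next_ext_capability(pdev, pos, 0x23);
        }

        return 0;
}
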
6082 static bool
6083 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6084 {
6085         if (feat == IOMMU_DEV_FEAT_AUX) {
6086                 int ret;
6087
6088                 if (!dev_is_pci(dev) || dmar_disabled ||
6089                     !scalable_mode_support() || !iommu_pasid_support())
6090                         return false;
6091
6092                 ret = pci_pasid_features(to_pci_dev(dev));
6093                 if (ret < 0)
6094                         return false;
6095
6096                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
6097         }
6098
6099         return false;
6100 }
6101
6102 static int
6103 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6104 {
6105         if (feat == IOMMU_DEV_FEAT_AUX)
6106                 return intel_iommu_enable_auxd(dev);
6107
6108         return -ENODEV;
6109 }
6110
6111 static int
6112 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6113 {
6114         if (feat == IOMMU_DEV_FEAT_AUX)
6115                 return intel_iommu_disable_auxd(dev);
6116
6117         return -ENODEV;
6118 }
6119
6120 static bool
6121 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6122 {
6123         struct device_domain_info *info = dev->archdata.iommu;
6124
6125         if (feat == IOMMU_DEV_FEAT_AUX)
6126                 return scalable_mode_support() && info && info->auxd_enabled;
6127
6128         return false;
6129 }
6130
6131 static int
6132 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6133 {
6134         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6135
6136         return dmar_domain->default_pasid > 0 ?
6137                         dmar_domain->default_pasid : -EINVAL;
6138 }
6139
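/*
 * Illustrative sketch, not part of the driver: the sequence a caller such
 * as an mdev parent driver might follow through the generic IOMMU API to
 * reach the aux-domain callbacks above. Function and variable names are
 * hypothetical and error handling is abbreviated.
 */
static inline int example_use_aux_domain(struct device *dev)
{
        struct iommu_domain *domain;
        int ret;

        ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
        if (ret)
                return ret;

        domain = iommu_domain_alloc(dev->bus);
        if (!domain) {
                iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
                return -ENOMEM;
        }

        ret = iommu_aux_attach_device(domain, dev);
        if (!ret) {
                /* The PASID that tags DMA targeting this aux domain. */
                ret = iommu_aux_get_pasid(domain, dev);
                iommu_aux_detach_device(domain, dev);
        }

        iommu_domain_free(domain);
        iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
        return ret < 0 ? ret : 0;
}
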
6140 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6141                                            struct device *dev)
6142 {
6143         return attach_deferred(dev);
6144 }
6145
6146 static int
6147 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6148                             enum iommu_attr attr, void *data)
6149 {
6150         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6151         unsigned long flags;
6152         int ret = 0;
6153
6154         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6155                 return -EINVAL;
6156
6157         switch (attr) {
6158         case DOMAIN_ATTR_NESTING:
6159                 spin_lock_irqsave(&device_domain_lock, flags);
6160                 if (nested_mode_support() &&
6161                     list_empty(&dmar_domain->devices)) {
6162                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6163                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6164                 } else {
6165                         ret = -ENODEV;
6166                 }
6167                 spin_unlock_irqrestore(&device_domain_lock, flags);
6168                 break;
6169         default:
6170                 ret = -EINVAL;
6171                 break;
6172         }
6173
6174         return ret;
6175 }
6176
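/*
 * Illustrative sketch, not part of the driver: DOMAIN_ATTR_NESTING has to
 * be requested through the generic iommu_domain_set_attr() call on an
 * unmanaged domain before any device is attached, which is what the
 * list_empty() check above enforces. Names below are hypothetical.
 */
static inline struct iommu_domain *example_alloc_nested_domain(struct device *dev)
{
        struct iommu_domain *domain;
        int nesting = 1;

        domain = iommu_domain_alloc(dev->bus);
        if (!domain)
                return NULL;

        if (iommu_domain_set_attr(domain, DOMAIN_ATTR_NESTING, &nesting) ||
            iommu_attach_device(domain, dev)) {
                iommu_domain_free(domain);
                return NULL;
        }

        return domain;
}
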
6177 const struct iommu_ops intel_iommu_ops = {
6178         .capable                = intel_iommu_capable,
6179         .domain_alloc           = intel_iommu_domain_alloc,
6180         .domain_free            = intel_iommu_domain_free,
6181         .domain_set_attr        = intel_iommu_domain_set_attr,
6182         .attach_dev             = intel_iommu_attach_device,
6183         .detach_dev             = intel_iommu_detach_device,
6184         .aux_attach_dev         = intel_iommu_aux_attach_device,
6185         .aux_detach_dev         = intel_iommu_aux_detach_device,
6186         .aux_get_pasid          = intel_iommu_aux_get_pasid,
6187         .map                    = intel_iommu_map,
6188         .unmap                  = intel_iommu_unmap,
6189         .iova_to_phys           = intel_iommu_iova_to_phys,
6190         .add_device             = intel_iommu_add_device,
6191         .remove_device          = intel_iommu_remove_device,
6192         .get_resv_regions       = intel_iommu_get_resv_regions,
6193         .put_resv_regions       = generic_iommu_put_resv_regions,
6194         .apply_resv_region      = intel_iommu_apply_resv_region,
6195         .device_group           = intel_iommu_device_group,
6196         .dev_has_feat           = intel_iommu_dev_has_feat,
6197         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6198         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6199         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6200         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6201         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6202 };
6203
6204 static void quirk_iommu_igfx(struct pci_dev *dev)
6205 {
6206         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6207         dmar_map_gfx = 0;
6208 }
6209
6210 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6211 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6212 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6213 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6214 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6215 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6216 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6217 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6218
6219 /* Broadwell igfx malfunctions with dmar */
6220 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6221 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6223 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6224 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6225 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6226 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6228 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6229 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6234 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6235 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6236 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6237 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6238 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6240 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6241 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6242 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6244
6245 static void quirk_iommu_rwbf(struct pci_dev *dev)
6246 {
6247         /*
6248          * Mobile 4 Series Chipset neglects to set RWBF capability,
6249          * but needs it. Same seems to hold for the desktop versions.
6250          */
6251         pci_info(dev, "Forcing write-buffer flush capability\n");
6252         rwbf_quirk = 1;
6253 }
6254
6255 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6256 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6257 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6258 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6259 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6260 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6261 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6262
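/*
 * GGC is the GMCH Graphics Control register in PCI config space of the
 * devices the quirk below is attached to: the fields describe the graphics
 * stolen/GTT memory size and whether it was sized with VT-d (a shadow GTT)
 * enabled, which is what quirk_calpella_no_shadow_gtt() checks.
 */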
6263 #define GGC 0x52
6264 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6265 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6266 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6267 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6268 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6269 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6270 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6271 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6272
6273 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6274 {
6275         unsigned short ggc;
6276
6277         if (pci_read_config_word(dev, GGC, &ggc))
6278                 return;
6279
6280         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6281                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6282                 dmar_map_gfx = 0;
6283         } else if (dmar_map_gfx) {
6284                 /* we have to ensure the gfx device is idle before we flush */
6285                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6286                 intel_iommu_strict = 1;
6287         }
6288 }
6289 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6290 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6291 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6292 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6293
6294 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6295    ISOCH DMAR unit for the Azalia sound device, but not give it any
6296    TLB entries, which causes it to deadlock. Check for that.  We do
6297    this in a function called from init_dmars(), instead of in a PCI
6298    quirk, because we don't want to print the obnoxious "BIOS broken"
6299    message if VT-d is actually disabled.
6300 */
6301 static void __init check_tylersburg_isoch(void)
6302 {
6303         struct pci_dev *pdev;
6304         uint32_t vtisochctrl;
6305
6306         /* If there's no Azalia in the system anyway, forget it. */
6307         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6308         if (!pdev)
6309                 return;
6310         pci_dev_put(pdev);
6311
6312         /* System Management Registers. Might be hidden, in which case
6313            we can't do the sanity check. But that's OK, because the
6314            known-broken BIOSes _don't_ actually hide it, so far. */
6315         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6316         if (!pdev)
6317                 return;
6318
6319         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6320                 pci_dev_put(pdev);
6321                 return;
6322         }
6323
6324         pci_dev_put(pdev);
6325
6326         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6327         if (vtisochctrl & 1)
6328                 return;
6329
6330         /* Drop all bits other than the number of TLB entries */
6331         vtisochctrl &= 0x1c;
6332
6333         /* If we have the recommended number of TLB entries (16), fine. */
6334         if (vtisochctrl == 0x10)
6335                 return;
6336
6337         /* Zero TLB entries? The BIOS is badly broken; warn and work around it. */
6338         if (!vtisochctrl) {
6339                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6340                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6341                      dmi_get_system_info(DMI_BIOS_VENDOR),
6342                      dmi_get_system_info(DMI_BIOS_VERSION),
6343                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6344                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6345                 return;
6346         }
6347
6348         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6349                vtisochctrl);
6350 }