1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
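
/*
 * For illustration: ~0xFFFUL sets every bit from 12 upward, so this
 * advertises 4KiB (bit 12), 8KiB, ..., 2MiB (bit 21), 1GiB (bit 30) and
 * every other power-of-two multiple of 4KiB, i.e. "all orders of 4KiB"
 * as described in the comment above.
 */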
105
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline unsigned long level_mask(int level)
132 {
133         return -1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long level_size(int level)
137 {
138         return 1UL << level_to_offset_bits(level);
139 }
140
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
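
/*
 * A worked example of the helpers above, assuming a 48-bit address width:
 *
 *   width_to_agaw(48)       == DIV_ROUND_UP(48 - 30, 9) == 2
 *   agaw_to_level(2)        == 4                  (4-level page table)
 *   agaw_to_width(2)        == min(30 + 2 * 9, 64) == 48
 *   level_to_offset_bits(4) == 27, so pfn_level_offset(pfn, 4) picks
 *                              bits 27..35 of the IOVA pfn
 */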
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
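
/*
 * For illustration: on x86 both PAGE_SHIFT and VTD_PAGE_SHIFT are 12, so
 * the conversions above shift by 0 and MM and DMA pfns coincide. With a
 * hypothetical 64KiB PAGE_SIZE (PAGE_SHIFT == 16) one MM page would cover
 * 16 VT-d pages, e.g. mm_to_dma_pfn(3) == 3 << 4 == 48.
 */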
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic the kernel if VT-d can't be enabled successfully
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
289
290 /*
291  * This domain is a statically identity mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
301
302 /*
303  * This is a DMA domain allocated through the iommu domain allocation
304  * interface. But one or more devices belonging to this domain have
305  * been chosen to use a private domain. We should avoid using the
306  * map/unmap/iova_to_phys APIs on it.
307  */
308 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
309
310 /*
311  * When VT-d works in the scalable mode, it allows DMA translation to
312  * happen through either first level or second level page table. This
313  * bit marks that the DMA translation for the domain goes through the
314  * first level page table, otherwise, it goes through the second level.
315  */
316 #define DOMAIN_FLAG_USE_FIRST_LEVEL             BIT(2)
317
318 /*
319  * Domain represents a virtual machine which demands iommu nested
320  * translation mode support.
321  */
322 #define DOMAIN_FLAG_NESTING_MODE                BIT(3)
323
324 #define for_each_domain_iommu(idx, domain)                      \
325         for (idx = 0; idx < g_num_of_iommus; idx++)             \
326                 if (domain->iommu_refcnt[idx])
327
328 struct dmar_rmrr_unit {
329         struct list_head list;          /* list of rmrr units   */
330         struct acpi_dmar_header *hdr;   /* ACPI header          */
331         u64     base_address;           /* reserved base address*/
332         u64     end_address;            /* reserved end address */
333         struct dmar_dev_scope *devices; /* target devices */
334         int     devices_cnt;            /* target device count */
335 };
336
337 struct dmar_atsr_unit {
338         struct list_head list;          /* list of ATSR units */
339         struct acpi_dmar_header *hdr;   /* ACPI header */
340         struct dmar_dev_scope *devices; /* target devices */
341         int devices_cnt;                /* target device count */
342         u8 include_all:1;               /* include all ports */
343 };
344
345 static LIST_HEAD(dmar_atsr_units);
346 static LIST_HEAD(dmar_rmrr_units);
347
348 #define for_each_rmrr_units(rmrr) \
349         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
350
351 /* bitmap for indexing intel_iommus */
352 static int g_num_of_iommus;
353
354 static void domain_exit(struct dmar_domain *domain);
355 static void domain_remove_dev_info(struct dmar_domain *domain);
356 static void dmar_remove_one_dev_info(struct device *dev);
357 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
358 static void domain_context_clear(struct intel_iommu *iommu,
359                                  struct device *dev);
360 static int domain_detach_iommu(struct dmar_domain *domain,
361                                struct intel_iommu *iommu);
362 static bool device_is_rmrr_locked(struct device *dev);
363 static int intel_iommu_attach_device(struct iommu_domain *domain,
364                                      struct device *dev);
365 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
366                                             dma_addr_t iova);
367
368 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
369 int dmar_disabled = 0;
370 #else
371 int dmar_disabled = 1;
372 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
373
374 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
375 int intel_iommu_sm = 1;
376 #else
377 int intel_iommu_sm;
378 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
379
380 int intel_iommu_enabled = 0;
381 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
382
383 static int dmar_map_gfx = 1;
384 static int dmar_forcedac;
385 static int intel_iommu_strict;
386 static int intel_iommu_superpage = 1;
387 static int iommu_identity_mapping;
388 static int intel_no_bounce;
389
390 #define IDENTMAP_GFX            2
391 #define IDENTMAP_AZALIA         4
392
393 int intel_iommu_gfx_mapped;
394 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
395
396 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
397 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
398 DEFINE_SPINLOCK(device_domain_lock);
399 static LIST_HEAD(device_domain_list);
400
401 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
402                                 to_pci_dev(d)->untrusted)
403
404 /*
405  * Iterate over elements in device_domain_list and call the specified
406  * callback @fn against each element.
407  */
408 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
409                                      void *data), void *data)
410 {
411         int ret = 0;
412         unsigned long flags;
413         struct device_domain_info *info;
414
415         spin_lock_irqsave(&device_domain_lock, flags);
416         list_for_each_entry(info, &device_domain_list, global) {
417                 ret = fn(info, data);
418                 if (ret) {
419                         spin_unlock_irqrestore(&device_domain_lock, flags);
420                         return ret;
421                 }
422         }
423         spin_unlock_irqrestore(&device_domain_lock, flags);
424
425         return 0;
426 }
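
/*
 * A minimal usage sketch (hypothetical callback, for illustration only):
 * the walk above stops as soon as the callback returns non-zero, so
 * returning 0 visits every entry on device_domain_list.
 */
static inline int count_devinfo_cb(struct device_domain_info *info, void *data)
{
	(*(int *)data)++;	/* count the entry and keep walking */
	return 0;
}
/* ... int n = 0; for_each_device_domain(count_devinfo_cb, &n); ... */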
427
428 const struct iommu_ops intel_iommu_ops;
429
430 static bool translation_pre_enabled(struct intel_iommu *iommu)
431 {
432         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
433 }
434
435 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
436 {
437         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
438 }
439
440 static void init_translation_status(struct intel_iommu *iommu)
441 {
442         u32 gsts;
443
444         gsts = readl(iommu->reg + DMAR_GSTS_REG);
445         if (gsts & DMA_GSTS_TES)
446                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
447 }
448
449 /* Convert generic 'struct iommu_domain' to private struct dmar_domain */
450 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
451 {
452         return container_of(dom, struct dmar_domain, domain);
453 }
454
455 static int __init intel_iommu_setup(char *str)
456 {
457         if (!str)
458                 return -EINVAL;
459         while (*str) {
460                 if (!strncmp(str, "on", 2)) {
461                         dmar_disabled = 0;
462                         pr_info("IOMMU enabled\n");
463                 } else if (!strncmp(str, "off", 3)) {
464                         dmar_disabled = 1;
465                         no_platform_optin = 1;
466                         pr_info("IOMMU disabled\n");
467                 } else if (!strncmp(str, "igfx_off", 8)) {
468                         dmar_map_gfx = 0;
469                         pr_info("Disable GFX device mapping\n");
470                 } else if (!strncmp(str, "forcedac", 8)) {
471                         pr_info("Forcing DAC for PCI devices\n");
472                         dmar_forcedac = 1;
473                 } else if (!strncmp(str, "strict", 6)) {
474                         pr_info("Disable batched IOTLB flush\n");
475                         intel_iommu_strict = 1;
476                 } else if (!strncmp(str, "sp_off", 6)) {
477                         pr_info("Disable supported super page\n");
478                         intel_iommu_superpage = 0;
479                 } else if (!strncmp(str, "sm_on", 5)) {
480                         pr_info("Intel-IOMMU: scalable mode supported\n");
481                         intel_iommu_sm = 1;
482                 } else if (!strncmp(str, "tboot_noforce", 13)) {
483                         printk(KERN_INFO
484                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
485                         intel_iommu_tboot_noforce = 1;
486                 } else if (!strncmp(str, "nobounce", 8)) {
487                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
488                         intel_no_bounce = 1;
489                 }
490
491                 str += strcspn(str, ",");
492                 while (*str == ',')
493                         str++;
494         }
495         return 0;
496 }
497 __setup("intel_iommu=", intel_iommu_setup);
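
/*
 * For illustration, the options parsed above combine as a comma-separated
 * list on the kernel command line, e.g.:
 *
 *   intel_iommu=on
 *   intel_iommu=off
 *   intel_iommu=on,sm_on,strict
 *   intel_iommu=on,igfx_off,sp_off
 */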
498
499 static struct kmem_cache *iommu_domain_cache;
500 static struct kmem_cache *iommu_devinfo_cache;
501
502 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
503 {
504         struct dmar_domain **domains;
505         int idx = did >> 8;
506
507         domains = iommu->domains[idx];
508         if (!domains)
509                 return NULL;
510
511         return domains[did & 0xff];
512 }
513
514 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
515                              struct dmar_domain *domain)
516 {
517         struct dmar_domain **domains;
518         int idx = did >> 8;
519
520         if (!iommu->domains[idx]) {
521                 size_t size = 256 * sizeof(struct dmar_domain *);
522                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
523         }
524
525         domains = iommu->domains[idx];
526         if (WARN_ON(!domains))
527                 return;
528         else
529                 domains[did & 0xff] = domain;
530 }
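
/*
 * For illustration: domain IDs index a two-level table of 256-entry pages,
 * so did 0x1234 lands in iommu->domains[0x12][0x34]; the page for index
 * 0x12 is allocated lazily by the first set_iommu_domain() call that
 * touches it.
 */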
531
532 void *alloc_pgtable_page(int node)
533 {
534         struct page *page;
535         void *vaddr = NULL;
536
537         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
538         if (page)
539                 vaddr = page_address(page);
540         return vaddr;
541 }
542
543 void free_pgtable_page(void *vaddr)
544 {
545         free_page((unsigned long)vaddr);
546 }
547
548 static inline void *alloc_domain_mem(void)
549 {
550         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
551 }
552
553 static void free_domain_mem(void *vaddr)
554 {
555         kmem_cache_free(iommu_domain_cache, vaddr);
556 }
557
558 static inline void *alloc_devinfo_mem(void)
559 {
560         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
561 }
562
563 static inline void free_devinfo_mem(void *vaddr)
564 {
565         kmem_cache_free(iommu_devinfo_cache, vaddr);
566 }
567
568 static inline int domain_type_is_si(struct dmar_domain *domain)
569 {
570         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
571 }
572
573 static inline bool domain_use_first_level(struct dmar_domain *domain)
574 {
575         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
576 }
577
578 static inline int domain_pfn_supported(struct dmar_domain *domain,
579                                        unsigned long pfn)
580 {
581         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
582
583         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
584 }
585
586 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
587 {
588         unsigned long sagaw;
589         int agaw = -1;
590
591         sagaw = cap_sagaw(iommu->cap);
592         for (agaw = width_to_agaw(max_gaw);
593              agaw >= 0; agaw--) {
594                 if (test_bit(agaw, &sagaw))
595                         break;
596         }
597
598         return agaw;
599 }
600
601 /*
602  * Calculate max SAGAW for each iommu.
603  */
604 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
605 {
606         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
607 }
608
609 /*
610  * calculate agaw for each iommu.
611  * "SAGAW" may be different across iommus, use a default agaw, and
612  * get a supported less agaw for iommus that don't support the default agaw.
613  */
614 int iommu_calculate_agaw(struct intel_iommu *iommu)
615 {
616         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
617 }
618
619 /* This function only returns a single iommu in a domain */
620 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
621 {
622         int iommu_id;
623
624         /* si_domain and vm domain should not get here. */
625         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
626                 return NULL;
627
628         for_each_domain_iommu(iommu_id, domain)
629                 break;
630
631         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
632                 return NULL;
633
634         return g_iommus[iommu_id];
635 }
636
637 static void domain_update_iommu_coherency(struct dmar_domain *domain)
638 {
639         struct dmar_drhd_unit *drhd;
640         struct intel_iommu *iommu;
641         bool found = false;
642         int i;
643
644         domain->iommu_coherency = 1;
645
646         for_each_domain_iommu(i, domain) {
647                 found = true;
648                 if (!ecap_coherent(g_iommus[i]->ecap)) {
649                         domain->iommu_coherency = 0;
650                         break;
651                 }
652         }
653         if (found)
654                 return;
655
656         /* No hardware attached; use lowest common denominator */
657         rcu_read_lock();
658         for_each_active_iommu(iommu, drhd) {
659                 if (!ecap_coherent(iommu->ecap)) {
660                         domain->iommu_coherency = 0;
661                         break;
662                 }
663         }
664         rcu_read_unlock();
665 }
666
667 static int domain_update_iommu_snooping(struct intel_iommu *skip)
668 {
669         struct dmar_drhd_unit *drhd;
670         struct intel_iommu *iommu;
671         int ret = 1;
672
673         rcu_read_lock();
674         for_each_active_iommu(iommu, drhd) {
675                 if (iommu != skip) {
676                         if (!ecap_sc_support(iommu->ecap)) {
677                                 ret = 0;
678                                 break;
679                         }
680                 }
681         }
682         rcu_read_unlock();
683
684         return ret;
685 }
686
687 static int domain_update_iommu_superpage(struct dmar_domain *domain,
688                                          struct intel_iommu *skip)
689 {
690         struct dmar_drhd_unit *drhd;
691         struct intel_iommu *iommu;
692         int mask = 0x3;
693
694         if (!intel_iommu_superpage) {
695                 return 0;
696         }
697
698         /* set iommu_superpage to the smallest common denominator */
699         rcu_read_lock();
700         for_each_active_iommu(iommu, drhd) {
701                 if (iommu != skip) {
702                         if (domain && domain_use_first_level(domain)) {
703                                 if (!cap_fl1gp_support(iommu->cap))
704                                         mask = 0x1;
705                         } else {
706                                 mask &= cap_super_page_val(iommu->cap);
707                         }
708
709                         if (!mask)
710                                 break;
711                 }
712         }
713         rcu_read_unlock();
714
715         return fls(mask);
716 }
717
718 /* Some capabilities may be different across iommus */
719 static void domain_update_iommu_cap(struct dmar_domain *domain)
720 {
721         domain_update_iommu_coherency(domain);
722         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
723         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
724 }
725
726 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
727                                          u8 devfn, int alloc)
728 {
729         struct root_entry *root = &iommu->root_entry[bus];
730         struct context_entry *context;
731         u64 *entry;
732
733         entry = &root->lo;
734         if (sm_supported(iommu)) {
735                 if (devfn >= 0x80) {
736                         devfn -= 0x80;
737                         entry = &root->hi;
738                 }
739                 devfn *= 2;
740         }
741         if (*entry & 1)
742                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
743         else {
744                 unsigned long phy_addr;
745                 if (!alloc)
746                         return NULL;
747
748                 context = alloc_pgtable_page(iommu->node);
749                 if (!context)
750                         return NULL;
751
752                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
753                 phy_addr = virt_to_phys((void *)context);
754                 *entry = phy_addr | 1;
755                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
756         }
757         return &context[devfn];
758 }
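
/*
 * For illustration: in scalable mode each root entry covers half a bus, so
 * for devfn 0x85 the code above selects root->hi, rebases the devfn to 0x05
 * and doubles it, returning &context[0x0a]; devfn 0x05 stays on root->lo
 * and also maps to index 0x0a, but of a different context table.
 */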
759
760 static int iommu_dummy(struct device *dev)
761 {
762         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
763 }
764
765 static bool attach_deferred(struct device *dev)
766 {
767         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
768 }
769
770 /**
771  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
772  *                               sub-hierarchy of a candidate PCI-PCI bridge
773  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
774  * @bridge: the candidate PCI-PCI bridge
775  *
776  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
777  */
778 static bool
779 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
780 {
781         struct pci_dev *pdev, *pbridge;
782
783         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
784                 return false;
785
786         pdev = to_pci_dev(dev);
787         pbridge = to_pci_dev(bridge);
788
789         if (pbridge->subordinate &&
790             pbridge->subordinate->number <= pdev->bus->number &&
791             pbridge->subordinate->busn_res.end >= pdev->bus->number)
792                 return true;
793
794         return false;
795 }
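
/*
 * For illustration: with a bridge whose subordinate bus range is
 * [0x02, 0x05], a device on bus 0x03 is reported as downstream
 * (0x02 <= 0x03 <= 0x05) while a device on bus 0x06 is not.
 */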
796
797 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
798 {
799         struct dmar_drhd_unit *drhd = NULL;
800         struct intel_iommu *iommu;
801         struct device *tmp;
802         struct pci_dev *pdev = NULL;
803         u16 segment = 0;
804         int i;
805
806         if (iommu_dummy(dev))
807                 return NULL;
808
809         if (dev_is_pci(dev)) {
810                 struct pci_dev *pf_pdev;
811
812                 pdev = pci_real_dma_dev(to_pci_dev(dev));
813
814                 /* VFs aren't listed in scope tables; we need to look up
815                  * the PF instead to find the IOMMU. */
816                 pf_pdev = pci_physfn(pdev);
817                 dev = &pf_pdev->dev;
818                 segment = pci_domain_nr(pdev->bus);
819         } else if (has_acpi_companion(dev))
820                 dev = &ACPI_COMPANION(dev)->dev;
821
822         rcu_read_lock();
823         for_each_active_iommu(iommu, drhd) {
824                 if (pdev && segment != drhd->segment)
825                         continue;
826
827                 for_each_active_dev_scope(drhd->devices,
828                                           drhd->devices_cnt, i, tmp) {
829                         if (tmp == dev) {
830                                 /* For a VF use its original BDF# not that of the PF
831                                  * which we used for the IOMMU lookup. Strictly speaking
832                                  * we could do this for all PCI devices; we only need to
833                                  * get the BDF# from the scope table for ACPI matches. */
834                                 if (pdev && pdev->is_virtfn)
835                                         goto got_pdev;
836
837                                 *bus = drhd->devices[i].bus;
838                                 *devfn = drhd->devices[i].devfn;
839                                 goto out;
840                         }
841
842                         if (is_downstream_to_pci_bridge(dev, tmp))
843                                 goto got_pdev;
844                 }
845
846                 if (pdev && drhd->include_all) {
847                 got_pdev:
848                         *bus = pdev->bus->number;
849                         *devfn = pdev->devfn;
850                         goto out;
851                 }
852         }
853         iommu = NULL;
854  out:
855         rcu_read_unlock();
856
857         return iommu;
858 }
859
860 static void domain_flush_cache(struct dmar_domain *domain,
861                                void *addr, int size)
862 {
863         if (!domain->iommu_coherency)
864                 clflush_cache_range(addr, size);
865 }
866
867 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
868 {
869         struct context_entry *context;
870         int ret = 0;
871         unsigned long flags;
872
873         spin_lock_irqsave(&iommu->lock, flags);
874         context = iommu_context_addr(iommu, bus, devfn, 0);
875         if (context)
876                 ret = context_present(context);
877         spin_unlock_irqrestore(&iommu->lock, flags);
878         return ret;
879 }
880
881 static void free_context_table(struct intel_iommu *iommu)
882 {
883         int i;
884         unsigned long flags;
885         struct context_entry *context;
886
887         spin_lock_irqsave(&iommu->lock, flags);
888         if (!iommu->root_entry) {
889                 goto out;
890         }
891         for (i = 0; i < ROOT_ENTRY_NR; i++) {
892                 context = iommu_context_addr(iommu, i, 0, 0);
893                 if (context)
894                         free_pgtable_page(context);
895
896                 if (!sm_supported(iommu))
897                         continue;
898
899                 context = iommu_context_addr(iommu, i, 0x80, 0);
900                 if (context)
901                         free_pgtable_page(context);
902
903         }
904         free_pgtable_page(iommu->root_entry);
905         iommu->root_entry = NULL;
906 out:
907         spin_unlock_irqrestore(&iommu->lock, flags);
908 }
909
910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911                                       unsigned long pfn, int *target_level)
912 {
913         struct dma_pte *parent, *pte;
914         int level = agaw_to_level(domain->agaw);
915         int offset;
916
917         BUG_ON(!domain->pgd);
918
919         if (!domain_pfn_supported(domain, pfn))
920                 /* Address beyond IOMMU's addressing capabilities. */
921                 return NULL;
922
923         parent = domain->pgd;
924
925         while (1) {
926                 void *tmp_page;
927
928                 offset = pfn_level_offset(pfn, level);
929                 pte = &parent[offset];
930                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
931                         break;
932                 if (level == *target_level)
933                         break;
934
935                 if (!dma_pte_present(pte)) {
936                         uint64_t pteval;
937
938                         tmp_page = alloc_pgtable_page(domain->nid);
939
940                         if (!tmp_page)
941                                 return NULL;
942
943                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
944                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
945                         if (domain_use_first_level(domain))
946                                 pteval |= DMA_FL_PTE_XD;
947                         if (cmpxchg64(&pte->val, 0ULL, pteval))
948                                 /* Someone else set it while we were thinking; use theirs. */
949                                 free_pgtable_page(tmp_page);
950                         else
951                                 domain_flush_cache(domain, pte, sizeof(*pte));
952                 }
953                 if (level == 1)
954                         break;
955
956                 parent = phys_to_virt(dma_pte_addr(pte));
957                 level--;
958         }
959
960         if (!*target_level)
961                 *target_level = level;
962
963         return pte;
964 }
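
/*
 * For illustration, with a 4-level table (agaw 2): a call with
 * *target_level == 1 walks levels 4 -> 3 -> 2 -> 1, allocating any missing
 * intermediate tables, and returns the leaf PTE slot for @pfn; a call with
 * *target_level == 0 instead stops at the first superpage or non-present
 * entry and reports the level it stopped at back through *target_level.
 */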
965
966 /* return address's pte at specific level */
967 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
968                                          unsigned long pfn,
969                                          int level, int *large_page)
970 {
971         struct dma_pte *parent, *pte;
972         int total = agaw_to_level(domain->agaw);
973         int offset;
974
975         parent = domain->pgd;
976         while (level <= total) {
977                 offset = pfn_level_offset(pfn, total);
978                 pte = &parent[offset];
979                 if (level == total)
980                         return pte;
981
982                 if (!dma_pte_present(pte)) {
983                         *large_page = total;
984                         break;
985                 }
986
987                 if (dma_pte_superpage(pte)) {
988                         *large_page = total;
989                         return pte;
990                 }
991
992                 parent = phys_to_virt(dma_pte_addr(pte));
993                 total--;
994         }
995         return NULL;
996 }
997
998 /* clear last level pte, a tlb flush should be followed */
999 static void dma_pte_clear_range(struct dmar_domain *domain,
1000                                 unsigned long start_pfn,
1001                                 unsigned long last_pfn)
1002 {
1003         unsigned int large_page;
1004         struct dma_pte *first_pte, *pte;
1005
1006         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1007         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1008         BUG_ON(start_pfn > last_pfn);
1009
1010         /* we don't need lock here; nobody else touches the iova range */
1011         do {
1012                 large_page = 1;
1013                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1014                 if (!pte) {
1015                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1016                         continue;
1017                 }
1018                 do {
1019                         dma_clear_pte(pte);
1020                         start_pfn += lvl_to_nr_pages(large_page);
1021                         pte++;
1022                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1023
1024                 domain_flush_cache(domain, first_pte,
1025                                    (void *)pte - (void *)first_pte);
1026
1027         } while (start_pfn && start_pfn <= last_pfn);
1028 }
1029
1030 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1031                                int retain_level, struct dma_pte *pte,
1032                                unsigned long pfn, unsigned long start_pfn,
1033                                unsigned long last_pfn)
1034 {
1035         pfn = max(start_pfn, pfn);
1036         pte = &pte[pfn_level_offset(pfn, level)];
1037
1038         do {
1039                 unsigned long level_pfn;
1040                 struct dma_pte *level_pte;
1041
1042                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1043                         goto next;
1044
1045                 level_pfn = pfn & level_mask(level);
1046                 level_pte = phys_to_virt(dma_pte_addr(pte));
1047
1048                 if (level > 2) {
1049                         dma_pte_free_level(domain, level - 1, retain_level,
1050                                            level_pte, level_pfn, start_pfn,
1051                                            last_pfn);
1052                 }
1053
1054                 /*
1055                  * Free the page table if we're below the level we want to
1056                  * retain and the range covers the entire table.
1057                  */
1058                 if (level < retain_level && !(start_pfn > level_pfn ||
1059                       last_pfn < level_pfn + level_size(level) - 1)) {
1060                         dma_clear_pte(pte);
1061                         domain_flush_cache(domain, pte, sizeof(*pte));
1062                         free_pgtable_page(level_pte);
1063                 }
1064 next:
1065                 pfn += level_size(level);
1066         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1067 }
1068
1069 /*
1070  * clear last level (leaf) ptes and free page table pages below the
1071  * level we wish to keep intact.
1072  */
1073 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1074                                    unsigned long start_pfn,
1075                                    unsigned long last_pfn,
1076                                    int retain_level)
1077 {
1078         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1079         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1080         BUG_ON(start_pfn > last_pfn);
1081
1082         dma_pte_clear_range(domain, start_pfn, last_pfn);
1083
1084         /* We don't need lock here; nobody else touches the iova range */
1085         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1086                            domain->pgd, 0, start_pfn, last_pfn);
1087
1088         /* free pgd */
1089         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1090                 free_pgtable_page(domain->pgd);
1091                 domain->pgd = NULL;
1092         }
1093 }
1094
1095 /* When a page at a given level is being unlinked from its parent, we don't
1096    need to *modify* it at all. All we need to do is make a list of all the
1097    pages which can be freed just as soon as we've flushed the IOTLB and we
1098    know the hardware page-walk will no longer touch them.
1099    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1100    be freed. */
1101 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1102                                             int level, struct dma_pte *pte,
1103                                             struct page *freelist)
1104 {
1105         struct page *pg;
1106
1107         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1108         pg->freelist = freelist;
1109         freelist = pg;
1110
1111         if (level == 1)
1112                 return freelist;
1113
1114         pte = page_address(pg);
1115         do {
1116                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117                         freelist = dma_pte_list_pagetables(domain, level - 1,
1118                                                            pte, freelist);
1119                 pte++;
1120         } while (!first_pte_in_page(pte));
1121
1122         return freelist;
1123 }
1124
1125 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1126                                         struct dma_pte *pte, unsigned long pfn,
1127                                         unsigned long start_pfn,
1128                                         unsigned long last_pfn,
1129                                         struct page *freelist)
1130 {
1131         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1132
1133         pfn = max(start_pfn, pfn);
1134         pte = &pte[pfn_level_offset(pfn, level)];
1135
1136         do {
1137                 unsigned long level_pfn;
1138
1139                 if (!dma_pte_present(pte))
1140                         goto next;
1141
1142                 level_pfn = pfn & level_mask(level);
1143
1144                 /* If range covers entire pagetable, free it */
1145                 if (start_pfn <= level_pfn &&
1146                     last_pfn >= level_pfn + level_size(level) - 1) {
1147                         /* These subordinate page tables are going away entirely. Don't
1148                            bother to clear them; we're just going to *free* them. */
1149                         if (level > 1 && !dma_pte_superpage(pte))
1150                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1151
1152                         dma_clear_pte(pte);
1153                         if (!first_pte)
1154                                 first_pte = pte;
1155                         last_pte = pte;
1156                 } else if (level > 1) {
1157                         /* Recurse down into a level that isn't *entirely* obsolete */
1158                         freelist = dma_pte_clear_level(domain, level - 1,
1159                                                        phys_to_virt(dma_pte_addr(pte)),
1160                                                        level_pfn, start_pfn, last_pfn,
1161                                                        freelist);
1162                 }
1163 next:
1164                 pfn += level_size(level);
1165         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1166
1167         if (first_pte)
1168                 domain_flush_cache(domain, first_pte,
1169                                    (void *)++last_pte - (void *)first_pte);
1170
1171         return freelist;
1172 }
1173
1174 /* We can't just free the pages because the IOMMU may still be walking
1175    the page tables, and may have cached the intermediate levels. The
1176    pages can only be freed after the IOTLB flush has been done. */
1177 static struct page *domain_unmap(struct dmar_domain *domain,
1178                                  unsigned long start_pfn,
1179                                  unsigned long last_pfn)
1180 {
1181         struct page *freelist;
1182
1183         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1184         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1185         BUG_ON(start_pfn > last_pfn);
1186
1187         /* we don't need lock here; nobody else touches the iova range */
1188         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1189                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1190
1191         /* free pgd */
1192         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1193                 struct page *pgd_page = virt_to_page(domain->pgd);
1194                 pgd_page->freelist = freelist;
1195                 freelist = pgd_page;
1196
1197                 domain->pgd = NULL;
1198         }
1199
1200         return freelist;
1201 }
1202
1203 static void dma_free_pagelist(struct page *freelist)
1204 {
1205         struct page *pg;
1206
1207         while ((pg = freelist)) {
1208                 freelist = pg->freelist;
1209                 free_pgtable_page(page_address(pg));
1210         }
1211 }
1212
1213 static void iova_entry_free(unsigned long data)
1214 {
1215         struct page *freelist = (struct page *)data;
1216
1217         dma_free_pagelist(freelist);
1218 }
1219
1220 /* iommu handling */
1221 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1222 {
1223         struct root_entry *root;
1224         unsigned long flags;
1225
1226         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1227         if (!root) {
1228                 pr_err("Allocating root entry for %s failed\n",
1229                         iommu->name);
1230                 return -ENOMEM;
1231         }
1232
1233         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1234
1235         spin_lock_irqsave(&iommu->lock, flags);
1236         iommu->root_entry = root;
1237         spin_unlock_irqrestore(&iommu->lock, flags);
1238
1239         return 0;
1240 }
1241
1242 static void iommu_set_root_entry(struct intel_iommu *iommu)
1243 {
1244         u64 addr;
1245         u32 sts;
1246         unsigned long flag;
1247
1248         addr = virt_to_phys(iommu->root_entry);
1249         if (sm_supported(iommu))
1250                 addr |= DMA_RTADDR_SMT;
1251
1252         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1253         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1254
1255         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1256
1257         /* Make sure hardware complete it */
1258         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1259                       readl, (sts & DMA_GSTS_RTPS), sts);
1260
1261         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1262 }
1263
1264 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1265 {
1266         u32 val;
1267         unsigned long flag;
1268
1269         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1270                 return;
1271
1272         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1273         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1274
1275         /* Make sure hardware complete it */
1276         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1277                       readl, (!(val & DMA_GSTS_WBFS)), val);
1278
1279         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1280 }
1281
1282 /* return value determines if we need a write buffer flush */
1283 static void __iommu_flush_context(struct intel_iommu *iommu,
1284                                   u16 did, u16 source_id, u8 function_mask,
1285                                   u64 type)
1286 {
1287         u64 val = 0;
1288         unsigned long flag;
1289
1290         switch (type) {
1291         case DMA_CCMD_GLOBAL_INVL:
1292                 val = DMA_CCMD_GLOBAL_INVL;
1293                 break;
1294         case DMA_CCMD_DOMAIN_INVL:
1295                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1296                 break;
1297         case DMA_CCMD_DEVICE_INVL:
1298                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1299                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1300                 break;
1301         default:
1302                 BUG();
1303         }
1304         val |= DMA_CCMD_ICC;
1305
1306         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1307         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1308
1309         /* Make sure hardware complete it */
1310         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1311                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1312
1313         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1314 }
1315
1316 /* return value determines if we need a write buffer flush */
1317 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1318                                 u64 addr, unsigned int size_order, u64 type)
1319 {
1320         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1321         u64 val = 0, val_iva = 0;
1322         unsigned long flag;
1323
1324         switch (type) {
1325         case DMA_TLB_GLOBAL_FLUSH:
1326                 /* global flush doesn't need set IVA_REG */
1327                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1328                 break;
1329         case DMA_TLB_DSI_FLUSH:
1330                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1331                 break;
1332         case DMA_TLB_PSI_FLUSH:
1333                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1334                 /* IH bit is passed in as part of address */
1335                 val_iva = size_order | addr;
1336                 break;
1337         default:
1338                 BUG();
1339         }
1340         /* Note: set drain read/write */
1341 #if 0
1342         /*
1343          * This is probably meant to be super secure. Looks like we can
1344          * ignore it without any impact.
1345          */
1346         if (cap_read_drain(iommu->cap))
1347                 val |= DMA_TLB_READ_DRAIN;
1348 #endif
1349         if (cap_write_drain(iommu->cap))
1350                 val |= DMA_TLB_WRITE_DRAIN;
1351
1352         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1353         /* Note: Only uses first TLB reg currently */
1354         if (val_iva)
1355                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1356         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1357
1358         /* Make sure hardware complete it */
1359         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1360                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1361
1362         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1363
1364         /* check IOTLB invalidation granularity */
1365         if (DMA_TLB_IAIG(val) == 0)
1366                 pr_err("Flush IOTLB failed\n");
1367         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1368                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1369                         (unsigned long long)DMA_TLB_IIRG(type),
1370                         (unsigned long long)DMA_TLB_IAIG(val));
1371 }
1372
1373 static struct device_domain_info *
1374 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1375                          u8 bus, u8 devfn)
1376 {
1377         struct device_domain_info *info;
1378
1379         assert_spin_locked(&device_domain_lock);
1380
1381         if (!iommu->qi)
1382                 return NULL;
1383
1384         list_for_each_entry(info, &domain->devices, link)
1385                 if (info->iommu == iommu && info->bus == bus &&
1386                     info->devfn == devfn) {
1387                         if (info->ats_supported && info->dev)
1388                                 return info;
1389                         break;
1390                 }
1391
1392         return NULL;
1393 }
1394
1395 static void domain_update_iotlb(struct dmar_domain *domain)
1396 {
1397         struct device_domain_info *info;
1398         bool has_iotlb_device = false;
1399
1400         assert_spin_locked(&device_domain_lock);
1401
1402         list_for_each_entry(info, &domain->devices, link) {
1403                 struct pci_dev *pdev;
1404
1405                 if (!info->dev || !dev_is_pci(info->dev))
1406                         continue;
1407
1408                 pdev = to_pci_dev(info->dev);
1409                 if (pdev->ats_enabled) {
1410                         has_iotlb_device = true;
1411                         break;
1412                 }
1413         }
1414
1415         domain->has_iotlb_device = has_iotlb_device;
1416 }
1417
1418 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1419 {
1420         struct pci_dev *pdev;
1421
1422         assert_spin_locked(&device_domain_lock);
1423
1424         if (!info || !dev_is_pci(info->dev))
1425                 return;
1426
1427         pdev = to_pci_dev(info->dev);
1428         /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1429          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1430          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1431          * reserved, which should be set to 0.
1432          */
1433         if (!ecap_dit(info->iommu->ecap))
1434                 info->pfsid = 0;
1435         else {
1436                 struct pci_dev *pf_pdev;
1437
1438                 /* pdev will be returned if device is not a vf */
1439                 pf_pdev = pci_physfn(pdev);
1440                 info->pfsid = pci_dev_id(pf_pdev);
1441         }
1442
1443 #ifdef CONFIG_INTEL_IOMMU_SVM
1444         /* The PCIe spec, in its wisdom, declares that the behaviour of
1445            the device if you enable PASID support after ATS support is
1446            undefined. So always enable PASID support on devices which
1447            have it, even if we can't yet know if we're ever going to
1448            use it. */
1449         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1450                 info->pasid_enabled = 1;
1451
1452         if (info->pri_supported &&
1453             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1454             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1455                 info->pri_enabled = 1;
1456 #endif
1457         if (!pdev->untrusted && info->ats_supported &&
1458             pci_ats_page_aligned(pdev) &&
1459             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1460                 info->ats_enabled = 1;
1461                 domain_update_iotlb(info->domain);
1462                 info->ats_qdep = pci_ats_queue_depth(pdev);
1463         }
1464 }
1465
1466 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1467 {
1468         struct pci_dev *pdev;
1469
1470         assert_spin_locked(&device_domain_lock);
1471
1472         if (!dev_is_pci(info->dev))
1473                 return;
1474
1475         pdev = to_pci_dev(info->dev);
1476
1477         if (info->ats_enabled) {
1478                 pci_disable_ats(pdev);
1479                 info->ats_enabled = 0;
1480                 domain_update_iotlb(info->domain);
1481         }
1482 #ifdef CONFIG_INTEL_IOMMU_SVM
1483         if (info->pri_enabled) {
1484                 pci_disable_pri(pdev);
1485                 info->pri_enabled = 0;
1486         }
1487         if (info->pasid_enabled) {
1488                 pci_disable_pasid(pdev);
1489                 info->pasid_enabled = 0;
1490         }
1491 #endif
1492 }
1493
1494 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1495                                   u64 addr, unsigned mask)
1496 {
1497         u16 sid, qdep;
1498         unsigned long flags;
1499         struct device_domain_info *info;
1500
1501         if (!domain->has_iotlb_device)
1502                 return;
1503
1504         spin_lock_irqsave(&device_domain_lock, flags);
1505         list_for_each_entry(info, &domain->devices, link) {
1506                 if (!info->ats_enabled)
1507                         continue;
1508
1509                 sid = info->bus << 8 | info->devfn;
1510                 qdep = info->ats_qdep;
1511                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1512                                 qdep, addr, mask);
1513         }
1514         spin_unlock_irqrestore(&device_domain_lock, flags);
1515 }
1516
1517 static void domain_flush_piotlb(struct intel_iommu *iommu,
1518                                 struct dmar_domain *domain,
1519                                 u64 addr, unsigned long npages, bool ih)
1520 {
1521         u16 did = domain->iommu_did[iommu->seq_id];
1522
1523         if (domain->default_pasid)
1524                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1525                                 addr, npages, ih);
1526
1527         if (!list_empty(&domain->devices))
1528                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1529 }
1530
1531 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1532                                   struct dmar_domain *domain,
1533                                   unsigned long pfn, unsigned int pages,
1534                                   int ih, int map)
1535 {
1536         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1537         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1538         u16 did = domain->iommu_did[iommu->seq_id];
1539
1540         BUG_ON(pages == 0);
1541
1542         if (ih)
1543                 ih = 1 << 6;
1544
1545         if (domain_use_first_level(domain)) {
1546                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1547         } else {
1548                 /*
1549                  * Fall back to domain-selective flush if there is no PSI support
1550                  * or the size is too big. PSI requires the page count to be a
1551                  * power of two, with the base address naturally aligned to that size.
1552                  */
1553                 if (!cap_pgsel_inv(iommu->cap) ||
1554                     mask > cap_max_amask_val(iommu->cap))
1555                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1556                                                         DMA_TLB_DSI_FLUSH);
1557                 else
1558                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1559                                                         DMA_TLB_PSI_FLUSH);
1560         }
1561
1562         /*
1563          * In caching mode, changes of pages from non-present to present require
1564          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1565          */
1566         if (!cap_caching_mode(iommu->cap) || !map)
1567                 iommu_flush_dev_iotlb(domain, addr, mask);
1568 }
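/*
 * Illustrative sketch, not part of this driver: how the PSI address mask
 * above relates to the number of pages being flushed.  The mask is log2 of
 * the page count rounded up to a power of two, e.g. flushing 3 pages rounds
 * up to 4 and yields mask = 2, i.e. a naturally aligned 4-page invalidation.
 */
#if 0	/* standalone userspace illustration */
#include <stdio.h>

static unsigned int psi_mask(unsigned long pages)
{
	unsigned int mask = 0;

	/* smallest mask such that 2^mask covers 'pages' */
	while ((1UL << mask) < pages)
		mask++;
	return mask;
}

int main(void)
{
	printf("pages=3  -> mask=%u\n", psi_mask(3));	/* 2 */
	printf("pages=16 -> mask=%u\n", psi_mask(16));	/* 4 */
	return 0;
}
#endif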
1569
1570 /* Notification for newly created mappings */
1571 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1572                                         struct dmar_domain *domain,
1573                                         unsigned long pfn, unsigned int pages)
1574 {
1575         /*
1576          * It's a non-present to present mapping. Only flush if caching mode
1577          * is enabled and the domain uses second-level translation.
1578          */
1579         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1580                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1581         else
1582                 iommu_flush_write_buffer(iommu);
1583 }
1584
1585 static void iommu_flush_iova(struct iova_domain *iovad)
1586 {
1587         struct dmar_domain *domain;
1588         int idx;
1589
1590         domain = container_of(iovad, struct dmar_domain, iovad);
1591
1592         for_each_domain_iommu(idx, domain) {
1593                 struct intel_iommu *iommu = g_iommus[idx];
1594                 u16 did = domain->iommu_did[iommu->seq_id];
1595
1596                 if (domain_use_first_level(domain))
1597                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1598                 else
1599                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1600                                                  DMA_TLB_DSI_FLUSH);
1601
1602                 if (!cap_caching_mode(iommu->cap))
1603                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1604                                               0, MAX_AGAW_PFN_WIDTH);
1605         }
1606 }
1607
1608 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1609 {
1610         u32 pmen;
1611         unsigned long flags;
1612
1613         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1614                 return;
1615
1616         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1617         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1618         pmen &= ~DMA_PMEN_EPM;
1619         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1620
1621         /* wait for the protected region status bit to clear */
1622         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1623                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1624
1625         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1626 }
1627
1628 static void iommu_enable_translation(struct intel_iommu *iommu)
1629 {
1630         u32 sts;
1631         unsigned long flags;
1632
1633         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1634         iommu->gcmd |= DMA_GCMD_TE;
1635         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1636
1637         /* Make sure the hardware completes it */
1638         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1639                       readl, (sts & DMA_GSTS_TES), sts);
1640
1641         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1642 }
1643
1644 static void iommu_disable_translation(struct intel_iommu *iommu)
1645 {
1646         u32 sts;
1647         unsigned long flag;
1648
1649         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1650         iommu->gcmd &= ~DMA_GCMD_TE;
1651         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1652
1653         /* Make sure the hardware completes it */
1654         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1655                       readl, (!(sts & DMA_GSTS_TES)), sts);
1656
1657         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1658 }
1659
1660 static int iommu_init_domains(struct intel_iommu *iommu)
1661 {
1662         u32 ndomains, nlongs;
1663         size_t size;
1664
1665         ndomains = cap_ndoms(iommu->cap);
1666         pr_debug("%s: Number of Domains supported <%d>\n",
1667                  iommu->name, ndomains);
1668         nlongs = BITS_TO_LONGS(ndomains);
1669
1670         spin_lock_init(&iommu->lock);
1671
1672         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1673         if (!iommu->domain_ids) {
1674                 pr_err("%s: Allocating domain id array failed\n",
1675                        iommu->name);
1676                 return -ENOMEM;
1677         }
1678
1679         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1680         iommu->domains = kzalloc(size, GFP_KERNEL);
1681
1682         if (iommu->domains) {
1683                 size = 256 * sizeof(struct dmar_domain *);
1684                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1685         }
1686
1687         if (!iommu->domains || !iommu->domains[0]) {
1688                 pr_err("%s: Allocating domain array failed\n",
1689                        iommu->name);
1690                 kfree(iommu->domain_ids);
1691                 kfree(iommu->domains);
1692                 iommu->domain_ids = NULL;
1693                 iommu->domains    = NULL;
1694                 return -ENOMEM;
1695         }
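        /*
         * Illustrative note, not part of the driver logic: the domain pointer
         * array is two-level, with 256 entries per second-level page.  With
         * cap_ndoms() == 65536 the outer array allocated above holds 256 page
         * pointers, and only the first page (domain-ids 0-255) is allocated
         * eagerly here.
         */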
1696
1697         /*
1698          * If Caching mode is set, then invalid translations are tagged
1699          * with domain-id 0, hence we need to pre-allocate it. We also
1700          * use domain-id 0 as a marker for non-allocated domain-id, so
1701          * make sure it is not used for a real domain.
1702          */
1703         set_bit(0, iommu->domain_ids);
1704
1705         /*
1706          * The VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1707          * entry for first-level or pass-through translation modes should
1708          * be programmed with a domain id different from those used for
1709          * second-level or nested translation. We reserve a domain id for
1710          * this purpose.
1711          */
1712         if (sm_supported(iommu))
1713                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1714
1715         return 0;
1716 }
1717
1718 static void disable_dmar_iommu(struct intel_iommu *iommu)
1719 {
1720         struct device_domain_info *info, *tmp;
1721         unsigned long flags;
1722
1723         if (!iommu->domains || !iommu->domain_ids)
1724                 return;
1725
1726         spin_lock_irqsave(&device_domain_lock, flags);
1727         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1728                 if (info->iommu != iommu)
1729                         continue;
1730
1731                 if (!info->dev || !info->domain)
1732                         continue;
1733
1734                 __dmar_remove_one_dev_info(info);
1735         }
1736         spin_unlock_irqrestore(&device_domain_lock, flags);
1737
1738         if (iommu->gcmd & DMA_GCMD_TE)
1739                 iommu_disable_translation(iommu);
1740 }
1741
1742 static void free_dmar_iommu(struct intel_iommu *iommu)
1743 {
1744         if ((iommu->domains) && (iommu->domain_ids)) {
1745                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1746                 int i;
1747
1748                 for (i = 0; i < elems; i++)
1749                         kfree(iommu->domains[i]);
1750                 kfree(iommu->domains);
1751                 kfree(iommu->domain_ids);
1752                 iommu->domains = NULL;
1753                 iommu->domain_ids = NULL;
1754         }
1755
1756         g_iommus[iommu->seq_id] = NULL;
1757
1758         /* free context mapping */
1759         free_context_table(iommu);
1760
1761 #ifdef CONFIG_INTEL_IOMMU_SVM
1762         if (pasid_supported(iommu)) {
1763                 if (ecap_prs(iommu->ecap))
1764                         intel_svm_finish_prq(iommu);
1765         }
1766 #endif
1767 }
1768
1769 /*
1770  * Check and return whether first level is used by default for
1771  * DMA translation.
1772  */
1773 static bool first_level_by_default(void)
1774 {
1775         struct dmar_drhd_unit *drhd;
1776         struct intel_iommu *iommu;
1777         static int first_level_support = -1;
1778
1779         if (likely(first_level_support != -1))
1780                 return first_level_support;
1781
1782         first_level_support = 1;
1783
1784         rcu_read_lock();
1785         for_each_active_iommu(iommu, drhd) {
1786                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1787                         first_level_support = 0;
1788                         break;
1789                 }
1790         }
1791         rcu_read_unlock();
1792
1793         return first_level_support;
1794 }
1795
1796 static struct dmar_domain *alloc_domain(int flags)
1797 {
1798         struct dmar_domain *domain;
1799
1800         domain = alloc_domain_mem();
1801         if (!domain)
1802                 return NULL;
1803
1804         memset(domain, 0, sizeof(*domain));
1805         domain->nid = NUMA_NO_NODE;
1806         domain->flags = flags;
1807         if (first_level_by_default())
1808                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1809         domain->has_iotlb_device = false;
1810         INIT_LIST_HEAD(&domain->devices);
1811
1812         return domain;
1813 }
1814
1815 /* Must be called with iommu->lock */
1816 static int domain_attach_iommu(struct dmar_domain *domain,
1817                                struct intel_iommu *iommu)
1818 {
1819         unsigned long ndomains;
1820         int num;
1821
1822         assert_spin_locked(&device_domain_lock);
1823         assert_spin_locked(&iommu->lock);
1824
1825         domain->iommu_refcnt[iommu->seq_id] += 1;
1826         domain->iommu_count += 1;
1827         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1828                 ndomains = cap_ndoms(iommu->cap);
1829                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1830
1831                 if (num >= ndomains) {
1832                         pr_err("%s: No free domain ids\n", iommu->name);
1833                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1834                         domain->iommu_count -= 1;
1835                         return -ENOSPC;
1836                 }
1837
1838                 set_bit(num, iommu->domain_ids);
1839                 set_iommu_domain(iommu, num, domain);
1840
1841                 domain->iommu_did[iommu->seq_id] = num;
1842                 domain->nid                      = iommu->node;
1843
1844                 domain_update_iommu_cap(domain);
1845         }
1846
1847         return 0;
1848 }
1849
1850 static int domain_detach_iommu(struct dmar_domain *domain,
1851                                struct intel_iommu *iommu)
1852 {
1853         int num, count;
1854
1855         assert_spin_locked(&device_domain_lock);
1856         assert_spin_locked(&iommu->lock);
1857
1858         domain->iommu_refcnt[iommu->seq_id] -= 1;
1859         count = --domain->iommu_count;
1860         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1861                 num = domain->iommu_did[iommu->seq_id];
1862                 clear_bit(num, iommu->domain_ids);
1863                 set_iommu_domain(iommu, num, NULL);
1864
1865                 domain_update_iommu_cap(domain);
1866                 domain->iommu_did[iommu->seq_id] = 0;
1867         }
1868
1869         return count;
1870 }
1871
1872 static struct iova_domain reserved_iova_list;
1873 static struct lock_class_key reserved_rbtree_key;
1874
1875 static int dmar_init_reserved_ranges(void)
1876 {
1877         struct pci_dev *pdev = NULL;
1878         struct iova *iova;
1879         int i;
1880
1881         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1882
1883         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1884                 &reserved_rbtree_key);
1885
1886         /* IOAPIC ranges shouldn't be accessed by DMA */
1887         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1888                 IOVA_PFN(IOAPIC_RANGE_END));
1889         if (!iova) {
1890                 pr_err("Reserve IOAPIC range failed\n");
1891                 return -ENODEV;
1892         }
1893
1894         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1895         for_each_pci_dev(pdev) {
1896                 struct resource *r;
1897
1898                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1899                         r = &pdev->resource[i];
1900                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1901                                 continue;
1902                         iova = reserve_iova(&reserved_iova_list,
1903                                             IOVA_PFN(r->start),
1904                                             IOVA_PFN(r->end));
1905                         if (!iova) {
1906                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1907                                 return -ENODEV;
1908                         }
1909                 }
1910         }
1911         return 0;
1912 }
1913
1914 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1915 {
1916         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1917 }
1918
1919 static inline int guestwidth_to_adjustwidth(int gaw)
1920 {
1921         int agaw;
1922         int r = (gaw - 12) % 9;
1923
1924         if (r == 0)
1925                 agaw = gaw;
1926         else
1927                 agaw = gaw + 9 - r;
1928         if (agaw > 64)
1929                 agaw = 64;
1930         return agaw;
1931 }
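/*
 * Illustrative sketch, not part of this driver: guestwidth_to_adjustwidth()
 * rounds the guest address width up to the next 12 + 9*n boundary (one
 * 9-bit stride per page-table level above the 12-bit page offset), capped
 * at 64.  E.g. 39 and 48 stay unchanged, while 40 rounds up to 48.
 */
#if 0	/* standalone userspace illustration */
#include <stdio.h>

static int adjust_width(int gaw)
{
	int r = (gaw - 12) % 9;
	int agaw = r ? gaw + 9 - r : gaw;

	return agaw > 64 ? 64 : agaw;
}

int main(void)
{
	printf("%d %d %d\n", adjust_width(39), adjust_width(40),
	       adjust_width(48));	/* 39 48 48 */
	return 0;
}
#endif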
1932
1933 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1934                        int guest_width)
1935 {
1936         int adjust_width, agaw;
1937         unsigned long sagaw;
1938         int ret;
1939
1940         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1941
1942         if (!intel_iommu_strict) {
1943                 ret = init_iova_flush_queue(&domain->iovad,
1944                                             iommu_flush_iova, iova_entry_free);
1945                 if (ret)
1946                         pr_info("iova flush queue initialization failed\n");
1947         }
1948
1949         domain_reserve_special_ranges(domain);
1950
1951         /* calculate AGAW */
1952         if (guest_width > cap_mgaw(iommu->cap))
1953                 guest_width = cap_mgaw(iommu->cap);
1954         domain->gaw = guest_width;
1955         adjust_width = guestwidth_to_adjustwidth(guest_width);
1956         agaw = width_to_agaw(adjust_width);
1957         sagaw = cap_sagaw(iommu->cap);
1958         if (!test_bit(agaw, &sagaw)) {
1959                 /* hardware doesn't support it, choose a bigger one */
1960                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1961                 agaw = find_next_bit(&sagaw, 5, agaw);
1962                 if (agaw >= 5)
1963                         return -ENODEV;
1964         }
1965         domain->agaw = agaw;
1966
1967         if (ecap_coherent(iommu->ecap))
1968                 domain->iommu_coherency = 1;
1969         else
1970                 domain->iommu_coherency = 0;
1971
1972         if (ecap_sc_support(iommu->ecap))
1973                 domain->iommu_snooping = 1;
1974         else
1975                 domain->iommu_snooping = 0;
1976
1977         if (intel_iommu_superpage)
1978                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1979         else
1980                 domain->iommu_superpage = 0;
1981
1982         domain->nid = iommu->node;
1983
1984         /* always allocate the top pgd */
1985         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1986         if (!domain->pgd)
1987                 return -ENOMEM;
1988         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1989         return 0;
1990 }
1991
1992 static void domain_exit(struct dmar_domain *domain)
1993 {
1994
1995         /* Remove associated devices and clear attached or cached domains */
1996         domain_remove_dev_info(domain);
1997
1998         /* destroy iovas */
1999         put_iova_domain(&domain->iovad);
2000
2001         if (domain->pgd) {
2002                 struct page *freelist;
2003
2004                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2005                 dma_free_pagelist(freelist);
2006         }
2007
2008         free_domain_mem(domain);
2009 }
2010
2011 /*
2012  * Get the PASID directory size for a scalable mode context entry.
2013  * Value of X in the PDTS field of a scalable mode context entry
2014  * indicates PASID directory with 2^(X + 7) entries.
2015  */
2016 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2017 {
2018         int pds, max_pde;
2019
2020         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2021         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2022         if (pds < 7)
2023                 return 0;
2024
2025         return pds - 7;
2026 }
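/*
 * Illustrative example, not part of the driver logic above: if max_pde is
 * exactly 2^14 (a single bit set), find_first_bit() returns 14, so
 * context_get_sm_pds() yields 14 - 7 = 7 and hardware decodes the PDTS
 * value back as a directory of 2^(7 + 7) = 2^14 entries.  Any directory
 * smaller than 2^7 entries is encoded as 0.
 */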
2027
2028 /*
2029  * Set the RID_PASID field of a scalable mode context entry. The
2030  * IOMMU hardware will use the PASID value set in this field for
2031  * translating DMA requests that arrive without a PASID.
2032  */
2033 static inline void
2034 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2035 {
2036         context->hi |= pasid & ((1 << 20) - 1);
2037         context->hi |= (1 << 20);
2038 }
2039
2040 /*
2041  * Set the DTE (Device-TLB Enable) field of a scalable mode context
2042  * entry.
2043  */
2044 static inline void context_set_sm_dte(struct context_entry *context)
2045 {
2046         context->lo |= (1 << 2);
2047 }
2048
2049 /*
2050  * Set the PRE (Page Request Enable) field of a scalable mode context
2051  * entry.
2052  */
2053 static inline void context_set_sm_pre(struct context_entry *context)
2054 {
2055         context->lo |= (1 << 4);
2056 }
2057
2058 /* Convert value to context PASID directory size field coding. */
2059 #define context_pdts(pds)       (((pds) & 0x7) << 9)
2060
2061 static int domain_context_mapping_one(struct dmar_domain *domain,
2062                                       struct intel_iommu *iommu,
2063                                       struct pasid_table *table,
2064                                       u8 bus, u8 devfn)
2065 {
2066         u16 did = domain->iommu_did[iommu->seq_id];
2067         int translation = CONTEXT_TT_MULTI_LEVEL;
2068         struct device_domain_info *info = NULL;
2069         struct context_entry *context;
2070         unsigned long flags;
2071         int ret;
2072
2073         WARN_ON(did == 0);
2074
2075         if (hw_pass_through && domain_type_is_si(domain))
2076                 translation = CONTEXT_TT_PASS_THROUGH;
2077
2078         pr_debug("Set context mapping for %02x:%02x.%d\n",
2079                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2080
2081         BUG_ON(!domain->pgd);
2082
2083         spin_lock_irqsave(&device_domain_lock, flags);
2084         spin_lock(&iommu->lock);
2085
2086         ret = -ENOMEM;
2087         context = iommu_context_addr(iommu, bus, devfn, 1);
2088         if (!context)
2089                 goto out_unlock;
2090
2091         ret = 0;
2092         if (context_present(context))
2093                 goto out_unlock;
2094
2095         /*
2096          * For kdump cases, old valid entries may be cached due to in-flight
2097          * DMA and the copied page tables, but there is no unmap operation
2098          * for them, so we need an explicit cache flush for the newly-mapped
2099          * device. For kdump, at this point the device is expected to have
2100          * completed reset during its driver probe stage, so no in-flight
2101          * DMA will exist and we don't need to worry about it
2102          * hereafter.
2103          */
2104         if (context_copied(context)) {
2105                 u16 did_old = context_domain_id(context);
2106
2107                 if (did_old < cap_ndoms(iommu->cap)) {
2108                         iommu->flush.flush_context(iommu, did_old,
2109                                                    (((u16)bus) << 8) | devfn,
2110                                                    DMA_CCMD_MASK_NOBIT,
2111                                                    DMA_CCMD_DEVICE_INVL);
2112                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2113                                                  DMA_TLB_DSI_FLUSH);
2114                 }
2115         }
2116
2117         context_clear_entry(context);
2118
2119         if (sm_supported(iommu)) {
2120                 unsigned long pds;
2121
2122                 WARN_ON(!table);
2123
2124                 /* Setup the PASID DIR pointer: */
2125                 pds = context_get_sm_pds(table);
2126                 context->lo = (u64)virt_to_phys(table->table) |
2127                                 context_pdts(pds);
2128
2129                 /* Setup the RID_PASID field: */
2130                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2131
2132                 /*
2133                  * Setup the Device-TLB enable bit and Page request
2134                  * Enable bit:
2135                  */
2136                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2137                 if (info && info->ats_supported)
2138                         context_set_sm_dte(context);
2139                 if (info && info->pri_supported)
2140                         context_set_sm_pre(context);
2141         } else {
2142                 struct dma_pte *pgd = domain->pgd;
2143                 int agaw;
2144
2145                 context_set_domain_id(context, did);
2146
2147                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2148                         /*
2149                          * Skip top levels of the page table for IOMMUs whose
2150                          * agaw is smaller than the domain's. Unnecessary for PT mode.
2151                          */
2152                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2153                                 ret = -ENOMEM;
2154                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2155                                 if (!dma_pte_present(pgd))
2156                                         goto out_unlock;
2157                         }
2158
2159                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2160                         if (info && info->ats_supported)
2161                                 translation = CONTEXT_TT_DEV_IOTLB;
2162                         else
2163                                 translation = CONTEXT_TT_MULTI_LEVEL;
2164
2165                         context_set_address_root(context, virt_to_phys(pgd));
2166                         context_set_address_width(context, agaw);
2167                 } else {
2168                         /*
2169                          * In pass through mode, AW must be programmed to
2170                          * indicate the largest AGAW value supported by
2171                          * hardware. And ASR is ignored by hardware.
2172                          */
2173                         context_set_address_width(context, iommu->msagaw);
2174                 }
2175
2176                 context_set_translation_type(context, translation);
2177         }
2178
2179         context_set_fault_enable(context);
2180         context_set_present(context);
2181         domain_flush_cache(domain, context, sizeof(*context));
2182
2183         /*
2184          * It's a non-present to present mapping. If hardware doesn't cache
2185          * It's a non-present to present mapping. If the hardware doesn't cache
2186          * non-present entries we only need to flush the write-buffer. If it
2187          * _does_ cache non-present entries, then it does so in the special
2188          */
2189         if (cap_caching_mode(iommu->cap)) {
2190                 iommu->flush.flush_context(iommu, 0,
2191                                            (((u16)bus) << 8) | devfn,
2192                                            DMA_CCMD_MASK_NOBIT,
2193                                            DMA_CCMD_DEVICE_INVL);
2194                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2195         } else {
2196                 iommu_flush_write_buffer(iommu);
2197         }
2198         iommu_enable_dev_iotlb(info);
2199
2200         ret = 0;
2201
2202 out_unlock:
2203         spin_unlock(&iommu->lock);
2204         spin_unlock_irqrestore(&device_domain_lock, flags);
2205
2206         return ret;
2207 }
2208
2209 struct domain_context_mapping_data {
2210         struct dmar_domain *domain;
2211         struct intel_iommu *iommu;
2212         struct pasid_table *table;
2213 };
2214
2215 static int domain_context_mapping_cb(struct pci_dev *pdev,
2216                                      u16 alias, void *opaque)
2217 {
2218         struct domain_context_mapping_data *data = opaque;
2219
2220         return domain_context_mapping_one(data->domain, data->iommu,
2221                                           data->table, PCI_BUS_NUM(alias),
2222                                           alias & 0xff);
2223 }
2224
2225 static int
2226 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2227 {
2228         struct domain_context_mapping_data data;
2229         struct pasid_table *table;
2230         struct intel_iommu *iommu;
2231         u8 bus, devfn;
2232
2233         iommu = device_to_iommu(dev, &bus, &devfn);
2234         if (!iommu)
2235                 return -ENODEV;
2236
2237         table = intel_pasid_get_table(dev);
2238
2239         if (!dev_is_pci(dev))
2240                 return domain_context_mapping_one(domain, iommu, table,
2241                                                   bus, devfn);
2242
2243         data.domain = domain;
2244         data.iommu = iommu;
2245         data.table = table;
2246
2247         return pci_for_each_dma_alias(to_pci_dev(dev),
2248                                       &domain_context_mapping_cb, &data);
2249 }
2250
2251 static int domain_context_mapped_cb(struct pci_dev *pdev,
2252                                     u16 alias, void *opaque)
2253 {
2254         struct intel_iommu *iommu = opaque;
2255
2256         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2257 }
2258
2259 static int domain_context_mapped(struct device *dev)
2260 {
2261         struct intel_iommu *iommu;
2262         u8 bus, devfn;
2263
2264         iommu = device_to_iommu(dev, &bus, &devfn);
2265         if (!iommu)
2266                 return -ENODEV;
2267
2268         if (!dev_is_pci(dev))
2269                 return device_context_mapped(iommu, bus, devfn);
2270
2271         return !pci_for_each_dma_alias(to_pci_dev(dev),
2272                                        domain_context_mapped_cb, iommu);
2273 }
2274
2275 /* Return the number of VT-d pages needed, rounded up to the MM page size */
2276 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2277                                             size_t size)
2278 {
2279         host_addr &= ~PAGE_MASK;
2280         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2281 }
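/*
 * Illustrative example, not part of the driver logic above: with 4KiB
 * pages, host_addr = 0x1003 and size = 0x2000 leave an in-page offset of
 * 3, so PAGE_ALIGN(0x2003) = 0x3000 and three VT-d pages are needed to
 * cover the buffer.
 */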
2282
2283 /* Return largest possible superpage level for a given mapping */
2284 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2285                                           unsigned long iov_pfn,
2286                                           unsigned long phy_pfn,
2287                                           unsigned long pages)
2288 {
2289         int support, level = 1;
2290         unsigned long pfnmerge;
2291
2292         support = domain->iommu_superpage;
2293
2294         /* To use a large page, the virtual *and* physical addresses
2295            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2296            of them will mean we have to use smaller pages. So just
2297            merge them and check both at once. */
2298         pfnmerge = iov_pfn | phy_pfn;
2299
2300         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2301                 pages >>= VTD_STRIDE_SHIFT;
2302                 if (!pages)
2303                         break;
2304                 pfnmerge >>= VTD_STRIDE_SHIFT;
2305                 level++;
2306                 support--;
2307         }
2308         return level;
2309 }
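/*
 * Illustrative sketch, not part of this driver: with a 9-bit stride,
 * hardware_largepage_caps() picks level 2 (2MiB pages) when both the IOVA
 * pfn and the physical pfn are 512-aligned and at least 512 contiguous
 * pages remain, subject to domain->iommu_superpage.
 */
#if 0	/* standalone userspace illustration, assumes a 9-bit stride */
#include <stdio.h>

static int largepage_level(unsigned long iov_pfn, unsigned long phy_pfn,
			   unsigned long pages, int support)
{
	unsigned long pfnmerge = iov_pfn | phy_pfn;
	int level = 1;

	while (support && !(pfnmerge & 511)) {
		pages >>= 9;
		if (!pages)
			break;
		pfnmerge >>= 9;
		level++;
		support--;
	}
	return level;
}

int main(void)
{
	/* both pfns 512-aligned, 1024 pages, HW supports two superpage sizes */
	printf("level=%d\n", largepage_level(0x1000, 0x2000, 1024, 2)); /* 2 */
	return 0;
}
#endif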
2310
2311 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2312                             struct scatterlist *sg, unsigned long phys_pfn,
2313                             unsigned long nr_pages, int prot)
2314 {
2315         struct dma_pte *first_pte = NULL, *pte = NULL;
2316         phys_addr_t uninitialized_var(pteval);
2317         unsigned long sg_res = 0;
2318         unsigned int largepage_lvl = 0;
2319         unsigned long lvl_pages = 0;
2320         u64 attr;
2321
2322         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2323
2324         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2325                 return -EINVAL;
2326
2327         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2328         if (domain_use_first_level(domain))
2329                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
2330
2331         if (!sg) {
2332                 sg_res = nr_pages;
2333                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2334         }
2335
2336         while (nr_pages > 0) {
2337                 uint64_t tmp;
2338
2339                 if (!sg_res) {
2340                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2341
2342                         sg_res = aligned_nrpages(sg->offset, sg->length);
2343                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2344                         sg->dma_length = sg->length;
2345                         pteval = (sg_phys(sg) - pgoff) | attr;
2346                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2347                 }
2348
2349                 if (!pte) {
2350                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2351
2352                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2353                         if (!pte)
2354                                 return -ENOMEM;
2355                         /* It is a large page */
2356                         if (largepage_lvl > 1) {
2357                                 unsigned long nr_superpages, end_pfn;
2358
2359                                 pteval |= DMA_PTE_LARGE_PAGE;
2360                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2361
2362                                 nr_superpages = sg_res / lvl_pages;
2363                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2364
2365                                 /*
2366                                  * Ensure that old small page tables are
2367                                  * removed to make room for superpage(s).
2368                                  * We're adding new large pages, so make sure
2369                                  * we don't remove their parent tables.
2370                                  */
2371                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2372                                                        largepage_lvl + 1);
2373                         } else {
2374                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2375                         }
2376
2377                 }
2378                 /* We don't need a lock here; nobody else
2379                  * touches this iova range.
2380                  */
2381                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2382                 if (tmp) {
2383                         static int dumps = 5;
2384                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2385                                 iov_pfn, tmp, (unsigned long long)pteval);
2386                         if (dumps) {
2387                                 dumps--;
2388                                 debug_dma_dump_mappings(NULL);
2389                         }
2390                         WARN_ON(1);
2391                 }
2392
2393                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2394
2395                 BUG_ON(nr_pages < lvl_pages);
2396                 BUG_ON(sg_res < lvl_pages);
2397
2398                 nr_pages -= lvl_pages;
2399                 iov_pfn += lvl_pages;
2400                 phys_pfn += lvl_pages;
2401                 pteval += lvl_pages * VTD_PAGE_SIZE;
2402                 sg_res -= lvl_pages;
2403
2404                 /* If the next PTE would be the first in a new page, then we
2405                    need to flush the cache on the entries we've just written.
2406                    And then we'll need to recalculate 'pte', so clear it and
2407                    let it get set again in the if (!pte) block above.
2408
2409                    If we're done (!nr_pages) we need to flush the cache too.
2410
2411                    Also if we've been setting superpages, we may need to
2412                    recalculate 'pte' and switch back to smaller pages for the
2413                    end of the mapping, if the trailing size is not enough to
2414                    use another superpage (i.e. sg_res < lvl_pages). */
2415                 pte++;
2416                 if (!nr_pages || first_pte_in_page(pte) ||
2417                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2418                         domain_flush_cache(domain, first_pte,
2419                                            (void *)pte - (void *)first_pte);
2420                         pte = NULL;
2421                 }
2422
2423                 if (!sg_res && nr_pages)
2424                         sg = sg_next(sg);
2425         }
2426         return 0;
2427 }
2428
2429 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2430                           struct scatterlist *sg, unsigned long phys_pfn,
2431                           unsigned long nr_pages, int prot)
2432 {
2433         int iommu_id, ret;
2434         struct intel_iommu *iommu;
2435
2436         /* Do the real mapping first */
2437         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2438         if (ret)
2439                 return ret;
2440
2441         for_each_domain_iommu(iommu_id, domain) {
2442                 iommu = g_iommus[iommu_id];
2443                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2444         }
2445
2446         return 0;
2447 }
2448
2449 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2450                                     struct scatterlist *sg, unsigned long nr_pages,
2451                                     int prot)
2452 {
2453         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2454 }
2455
2456 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2457                                      unsigned long phys_pfn, unsigned long nr_pages,
2458                                      int prot)
2459 {
2460         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2461 }
2462
2463 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2464 {
2465         unsigned long flags;
2466         struct context_entry *context;
2467         u16 did_old;
2468
2469         if (!iommu)
2470                 return;
2471
2472         spin_lock_irqsave(&iommu->lock, flags);
2473         context = iommu_context_addr(iommu, bus, devfn, 0);
2474         if (!context) {
2475                 spin_unlock_irqrestore(&iommu->lock, flags);
2476                 return;
2477         }
2478         did_old = context_domain_id(context);
2479         context_clear_entry(context);
2480         __iommu_flush_cache(iommu, context, sizeof(*context));
2481         spin_unlock_irqrestore(&iommu->lock, flags);
2482         iommu->flush.flush_context(iommu,
2483                                    did_old,
2484                                    (((u16)bus) << 8) | devfn,
2485                                    DMA_CCMD_MASK_NOBIT,
2486                                    DMA_CCMD_DEVICE_INVL);
2487         iommu->flush.flush_iotlb(iommu,
2488                                  did_old,
2489                                  0,
2490                                  0,
2491                                  DMA_TLB_DSI_FLUSH);
2492 }
2493
2494 static inline void unlink_domain_info(struct device_domain_info *info)
2495 {
2496         assert_spin_locked(&device_domain_lock);
2497         list_del(&info->link);
2498         list_del(&info->global);
2499         if (info->dev)
2500                 info->dev->archdata.iommu = NULL;
2501 }
2502
2503 static void domain_remove_dev_info(struct dmar_domain *domain)
2504 {
2505         struct device_domain_info *info, *tmp;
2506         unsigned long flags;
2507
2508         spin_lock_irqsave(&device_domain_lock, flags);
2509         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2510                 __dmar_remove_one_dev_info(info);
2511         spin_unlock_irqrestore(&device_domain_lock, flags);
2512 }
2513
2514 struct dmar_domain *find_domain(struct device *dev)
2515 {
2516         struct device_domain_info *info;
2517
2518         if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2519                 return NULL;
2520
2521         if (dev_is_pci(dev))
2522                 dev = &pci_real_dma_dev(to_pci_dev(dev))->dev;
2523
2524         /* No lock here, assumes no domain exit in normal case */
2525         info = dev->archdata.iommu;
2526         if (likely(info))
2527                 return info->domain;
2528
2529         return NULL;
2530 }
2531
2532 static void do_deferred_attach(struct device *dev)
2533 {
2534         struct iommu_domain *domain;
2535
2536         dev->archdata.iommu = NULL;
2537         domain = iommu_get_domain_for_dev(dev);
2538         if (domain)
2539                 intel_iommu_attach_device(domain, dev);
2540 }
2541
2542 static inline struct device_domain_info *
2543 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2544 {
2545         struct device_domain_info *info;
2546
2547         list_for_each_entry(info, &device_domain_list, global)
2548                 if (info->iommu->segment == segment && info->bus == bus &&
2549                     info->devfn == devfn)
2550                         return info;
2551
2552         return NULL;
2553 }
2554
2555 static int domain_setup_first_level(struct intel_iommu *iommu,
2556                                     struct dmar_domain *domain,
2557                                     struct device *dev,
2558                                     int pasid)
2559 {
2560         int flags = PASID_FLAG_SUPERVISOR_MODE;
2561         struct dma_pte *pgd = domain->pgd;
2562         int agaw, level;
2563
2564         /*
2565          * Skip top levels of the page table for IOMMUs whose
2566          * agaw is smaller than the domain's. Unnecessary for PT mode.
2567          */
2568         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2569                 pgd = phys_to_virt(dma_pte_addr(pgd));
2570                 if (!dma_pte_present(pgd))
2571                         return -ENOMEM;
2572         }
2573
2574         level = agaw_to_level(agaw);
2575         if (level != 4 && level != 5)
2576                 return -EINVAL;
2577
2578         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2579
2580         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2581                                              domain->iommu_did[iommu->seq_id],
2582                                              flags);
2583 }
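/*
 * Illustrative note, not part of the driver logic above: first-level
 * translation reuses the CPU page-table format, so only 4-level (48-bit VA)
 * and 5-level (57-bit VA) tables are accepted here, and PASID_FLAG_FL5LP
 * tells the PASID entry setup to use 5-level paging.
 */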
2584
2585 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2586                                                     int bus, int devfn,
2587                                                     struct device *dev,
2588                                                     struct dmar_domain *domain)
2589 {
2590         struct dmar_domain *found = NULL;
2591         struct device_domain_info *info;
2592         unsigned long flags;
2593         int ret;
2594
2595         info = alloc_devinfo_mem();
2596         if (!info)
2597                 return NULL;
2598
2599         info->bus = bus;
2600         info->devfn = devfn;
2601         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2602         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2603         info->ats_qdep = 0;
2604         info->dev = dev;
2605         info->domain = domain;
2606         info->iommu = iommu;
2607         info->pasid_table = NULL;
2608         info->auxd_enabled = 0;
2609         INIT_LIST_HEAD(&info->auxiliary_domains);
2610
2611         if (dev && dev_is_pci(dev)) {
2612                 struct pci_dev *pdev = to_pci_dev(info->dev);
2613
2614                 if (!pdev->untrusted &&
2615                     !pci_ats_disabled() &&
2616                     ecap_dev_iotlb_support(iommu->ecap) &&
2617                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2618                     dmar_find_matched_atsr_unit(pdev))
2619                         info->ats_supported = 1;
2620
2621                 if (sm_supported(iommu)) {
2622                         if (pasid_supported(iommu)) {
2623                                 int features = pci_pasid_features(pdev);
2624                                 if (features >= 0)
2625                                         info->pasid_supported = features | 1;
2626                         }
2627
2628                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2629                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2630                                 info->pri_supported = 1;
2631                 }
2632         }
2633
2634         spin_lock_irqsave(&device_domain_lock, flags);
2635         if (dev)
2636                 found = find_domain(dev);
2637
2638         if (!found) {
2639                 struct device_domain_info *info2;
2640                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2641                 if (info2) {
2642                         found      = info2->domain;
2643                         info2->dev = dev;
2644                 }
2645         }
2646
2647         if (found) {
2648                 spin_unlock_irqrestore(&device_domain_lock, flags);
2649                 free_devinfo_mem(info);
2650                 /* Caller must free the original domain */
2651                 return found;
2652         }
2653
2654         spin_lock(&iommu->lock);
2655         ret = domain_attach_iommu(domain, iommu);
2656         spin_unlock(&iommu->lock);
2657
2658         if (ret) {
2659                 spin_unlock_irqrestore(&device_domain_lock, flags);
2660                 free_devinfo_mem(info);
2661                 return NULL;
2662         }
2663
2664         list_add(&info->link, &domain->devices);
2665         list_add(&info->global, &device_domain_list);
2666         if (dev)
2667                 dev->archdata.iommu = info;
2668         spin_unlock_irqrestore(&device_domain_lock, flags);
2669
2670         /* PASID table is mandatory for a PCI device in scalable mode. */
2671         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2672                 ret = intel_pasid_alloc_table(dev);
2673                 if (ret) {
2674                         dev_err(dev, "PASID table allocation failed\n");
2675                         dmar_remove_one_dev_info(dev);
2676                         return NULL;
2677                 }
2678
2679                 /* Setup the PASID entry for requests without PASID: */
2680                 spin_lock(&iommu->lock);
2681                 if (hw_pass_through && domain_type_is_si(domain))
2682                         ret = intel_pasid_setup_pass_through(iommu, domain,
2683                                         dev, PASID_RID2PASID);
2684                 else if (domain_use_first_level(domain))
2685                         ret = domain_setup_first_level(iommu, domain, dev,
2686                                         PASID_RID2PASID);
2687                 else
2688                         ret = intel_pasid_setup_second_level(iommu, domain,
2689                                         dev, PASID_RID2PASID);
2690                 spin_unlock(&iommu->lock);
2691                 if (ret) {
2692                         dev_err(dev, "Setup RID2PASID failed\n");
2693                         dmar_remove_one_dev_info(dev);
2694                         return NULL;
2695                 }
2696         }
2697
2698         if (dev && domain_context_mapping(domain, dev)) {
2699                 dev_err(dev, "Domain context map failed\n");
2700                 dmar_remove_one_dev_info(dev);
2701                 return NULL;
2702         }
2703
2704         return domain;
2705 }
2706
2707 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2708 {
2709         *(u16 *)opaque = alias;
2710         return 0;
2711 }
2712
2713 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2714 {
2715         struct device_domain_info *info;
2716         struct dmar_domain *domain = NULL;
2717         struct intel_iommu *iommu;
2718         u16 dma_alias;
2719         unsigned long flags;
2720         u8 bus, devfn;
2721
2722         iommu = device_to_iommu(dev, &bus, &devfn);
2723         if (!iommu)
2724                 return NULL;
2725
2726         if (dev_is_pci(dev)) {
2727                 struct pci_dev *pdev = to_pci_dev(dev);
2728
2729                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2730
2731                 spin_lock_irqsave(&device_domain_lock, flags);
2732                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2733                                                       PCI_BUS_NUM(dma_alias),
2734                                                       dma_alias & 0xff);
2735                 if (info) {
2736                         iommu = info->iommu;
2737                         domain = info->domain;
2738                 }
2739                 spin_unlock_irqrestore(&device_domain_lock, flags);
2740
2741                 /* DMA alias already has a domain, use it */
2742                 if (info)
2743                         goto out;
2744         }
2745
2746         /* Allocate and initialize new domain for the device */
2747         domain = alloc_domain(0);
2748         if (!domain)
2749                 return NULL;
2750         if (domain_init(domain, iommu, gaw)) {
2751                 domain_exit(domain);
2752                 return NULL;
2753         }
2754
2755 out:
2756         return domain;
2757 }
2758
2759 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2760                                               struct dmar_domain *domain)
2761 {
2762         struct intel_iommu *iommu;
2763         struct dmar_domain *tmp;
2764         u16 req_id, dma_alias;
2765         u8 bus, devfn;
2766
2767         iommu = device_to_iommu(dev, &bus, &devfn);
2768         if (!iommu)
2769                 return NULL;
2770
2771         req_id = ((u16)bus << 8) | devfn;
2772
2773         if (dev_is_pci(dev)) {
2774                 struct pci_dev *pdev = to_pci_dev(dev);
2775
2776                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2777
2778                 /* register PCI DMA alias device */
2779                 if (req_id != dma_alias) {
2780                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2781                                         dma_alias & 0xff, NULL, domain);
2782
2783                         if (!tmp || tmp != domain)
2784                                 return tmp;
2785                 }
2786         }
2787
2788         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2789         if (!tmp || tmp != domain)
2790                 return tmp;
2791
2792         return domain;
2793 }
2794
2795 static int iommu_domain_identity_map(struct dmar_domain *domain,
2796                                      unsigned long long start,
2797                                      unsigned long long end)
2798 {
2799         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2800         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2801
2802         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2803                           dma_to_mm_pfn(last_vpfn))) {
2804                 pr_err("Reserving iova failed\n");
2805                 return -ENOMEM;
2806         }
2807
2808         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2809         /*
2810          * RMRR range might have overlap with physical memory range,
2811          * clear it first
2812          */
2813         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2814
2815         return __domain_mapping(domain, first_vpfn, NULL,
2816                                 first_vpfn, last_vpfn - first_vpfn + 1,
2817                                 DMA_PTE_READ|DMA_PTE_WRITE);
2818 }
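/*
 * Illustrative example, not part of the driver logic above: on a
 * 4KiB-page system, identity mapping an RMRR of [0x0 - 0x9ffff] reserves
 * vpfns 0x0-0x9f in the domain's iova allocator and installs 160
 * page-table entries whose IOVA equals the physical address.
 */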
2819
2820 static int domain_prepare_identity_map(struct device *dev,
2821                                        struct dmar_domain *domain,
2822                                        unsigned long long start,
2823                                        unsigned long long end)
2824 {
2825         /* For _hardware_ passthrough, don't bother. But for software
2826            passthrough, we do it anyway -- it may indicate a memory
2827            range which is reserved in E820 and therefore didn't get set
2828            up in the si_domain to start with */
2829         if (domain == si_domain && hw_pass_through) {
2830                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2831                          start, end);
2832                 return 0;
2833         }
2834
2835         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2836
2837         if (end < start) {
2838                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2839                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2840                         dmi_get_system_info(DMI_BIOS_VENDOR),
2841                         dmi_get_system_info(DMI_BIOS_VERSION),
2842                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2843                 return -EIO;
2844         }
2845
2846         if (end >> agaw_to_width(domain->agaw)) {
2847                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2848                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2849                      agaw_to_width(domain->agaw),
2850                      dmi_get_system_info(DMI_BIOS_VENDOR),
2851                      dmi_get_system_info(DMI_BIOS_VERSION),
2852                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2853                 return -EIO;
2854         }
2855
2856         return iommu_domain_identity_map(domain, start, end);
2857 }
2858
2859 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2860
2861 static int __init si_domain_init(int hw)
2862 {
2863         struct dmar_rmrr_unit *rmrr;
2864         struct device *dev;
2865         int i, nid, ret;
2866
2867         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2868         if (!si_domain)
2869                 return -EFAULT;
2870
2871         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2872                 domain_exit(si_domain);
2873                 return -EFAULT;
2874         }
2875
2876         if (hw)
2877                 return 0;
2878
2879         for_each_online_node(nid) {
2880                 unsigned long start_pfn, end_pfn;
2881                 int i;
2882
2883                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2884                         ret = iommu_domain_identity_map(si_domain,
2885                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2886                         if (ret)
2887                                 return ret;
2888                 }
2889         }
2890
2891         /*
2892          * Identity map the RMRRs so that devices with RMRRs can also use
2893          * the si_domain.
2894          */
2895         for_each_rmrr_units(rmrr) {
2896                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2897                                           i, dev) {
2898                         unsigned long long start = rmrr->base_address;
2899                         unsigned long long end = rmrr->end_address;
2900
2901                         if (WARN_ON(end < start ||
2902                                     end >> agaw_to_width(si_domain->agaw)))
2903                                 continue;
2904
2905                         ret = iommu_domain_identity_map(si_domain, start, end);
2906                         if (ret)
2907                                 return ret;
2908                 }
2909         }
2910
2911         return 0;
2912 }
2913
2914 static int identity_mapping(struct device *dev)
2915 {
2916         struct device_domain_info *info;
2917
2918         info = dev->archdata.iommu;
2919         if (info)
2920                 return (info->domain == si_domain);
2921
2922         return 0;
2923 }
2924
2925 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2926 {
2927         struct dmar_domain *ndomain;
2928         struct intel_iommu *iommu;
2929         u8 bus, devfn;
2930
2931         iommu = device_to_iommu(dev, &bus, &devfn);
2932         if (!iommu)
2933                 return -ENODEV;
2934
2935         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2936         if (ndomain != domain)
2937                 return -EBUSY;
2938
2939         return 0;
2940 }
2941
2942 static bool device_has_rmrr(struct device *dev)
2943 {
2944         struct dmar_rmrr_unit *rmrr;
2945         struct device *tmp;
2946         int i;
2947
2948         rcu_read_lock();
2949         for_each_rmrr_units(rmrr) {
2950                 /*
2951                  * Return TRUE if this RMRR contains the device that
2952                  * is passed in.
2953                  */
2954                 for_each_active_dev_scope(rmrr->devices,
2955                                           rmrr->devices_cnt, i, tmp)
2956                         if (tmp == dev ||
2957                             is_downstream_to_pci_bridge(dev, tmp)) {
2958                                 rcu_read_unlock();
2959                                 return true;
2960                         }
2961         }
2962         rcu_read_unlock();
2963         return false;
2964 }
2965
2966 /**
2967  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2968  * is relaxable (i.e. is allowed to go unenforced under some conditions)
2969  * @dev: device handle
2970  *
2971  * We assume that PCI USB devices with RMRRs have them largely
2972  * for historical reasons and that the RMRR space is not actively used post
2973  * boot.  This exclusion may change if vendors begin to abuse it.
2974  *
2975  * The same exception is made for graphics devices, with the requirement that
2976  * any use of the RMRR regions will be torn down before assigning the device
2977  * to a guest.
2978  *
2979  * Return: true if the RMRR is relaxable, false otherwise
2980  */
2981 static bool device_rmrr_is_relaxable(struct device *dev)
2982 {
2983         struct pci_dev *pdev;
2984
2985         if (!dev_is_pci(dev))
2986                 return false;
2987
2988         pdev = to_pci_dev(dev);
2989         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2990                 return true;
2991         else
2992                 return false;
2993 }
2994
2995 /*
2996  * There are a couple cases where we need to restrict the functionality of
2997  * devices associated with RMRRs.  The first is when evaluating a device for
2998  * identity mapping because problems exist when devices are moved in and out
2999  * of domains and their respective RMRR information is lost.  This means that
3000  * a device with associated RMRRs will never be in a "passthrough" domain.
3001  * The second is use of the device through the IOMMU API.  This interface
3002  * expects to have full control of the IOVA space for the device.  We cannot
3003  * satisfy both the requirement that RMRR access is maintained and have an
3004  * unencumbered IOVA space.  We also have no ability to quiesce the device's
3005  * use of the RMRR space or even inform the IOMMU API user of the restriction.
3006  * We therefore prevent devices associated with an RMRR from participating in
3007  * the IOMMU API, which eliminates them from device assignment.
3008  *
3009  * In both cases, devices which have relaxable RMRRs are not concerned by this
3010  * restriction. See device_rmrr_is_relaxable comment.
3011  */
3012 static bool device_is_rmrr_locked(struct device *dev)
3013 {
3014         if (!device_has_rmrr(dev))
3015                 return false;
3016
3017         if (device_rmrr_is_relaxable(dev))
3018                 return false;
3019
3020         return true;
3021 }
3022
3023 /*
3024  * Return the required default domain type for a specific device.
3025  *
3026  * @dev: the device in question
3028  *
3029  * Returns:
3030  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3031  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
3032  *  - 0: both identity and dynamic domains work for this device
3033  */
3034 static int device_def_domain_type(struct device *dev)
3035 {
3036         if (dev_is_pci(dev)) {
3037                 struct pci_dev *pdev = to_pci_dev(dev);
3038
3039                 /*
3040                  * Prevent any device marked as untrusted from getting
3041                  * placed into the static identity mapping domain.
3042                  */
3043                 if (pdev->untrusted)
3044                         return IOMMU_DOMAIN_DMA;
3045
3046                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3047                         return IOMMU_DOMAIN_IDENTITY;
3048
3049                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3050                         return IOMMU_DOMAIN_IDENTITY;
3051
3052                 /*
3053                  * We want to start off with all devices in the 1:1 domain, and
3054                  * take them out later if we find they can't access all of memory.
3055                  *
3056                  * However, we can't do this for PCI devices behind bridges,
3057                  * because all PCI devices behind the same bridge will end up
3058                  * with the same source-id on their transactions.
3059                  *
3060                  * Practically speaking, we can't change things around for these
3061                  * devices at run-time, because we can't be sure there'll be no
3062                  * DMA transactions in flight for any of their siblings.
3063                  *
3064                  * So PCI devices (unless they're on the root bus) as well as
3065                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
3066                  * the 1:1 domain, just in _case_ one of their siblings turns out
3067                  * not to be able to map all of memory.
3068                  */
3069                 if (!pci_is_pcie(pdev)) {
3070                         if (!pci_is_root_bus(pdev->bus))
3071                                 return IOMMU_DOMAIN_DMA;
3072                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
3073                                 return IOMMU_DOMAIN_DMA;
3074                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
3075                         return IOMMU_DOMAIN_DMA;
3076         }
3077
3078         return 0;
3079 }
3080
3081 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3082 {
3083         /*
3084          * Start from a sane IOMMU hardware state.
3085          * If queued invalidation was already initialized by us
3086          * (for example, while enabling interrupt remapping), then
3087          * things are already rolling from a sane state.
3088          */
3089         if (!iommu->qi) {
3090                 /*
3091                  * Clear any previous faults.
3092                  */
3093                 dmar_fault(-1, iommu);
3094                 /*
3095                  * Disable queued invalidation if supported and already enabled
3096                  * before OS handover.
3097                  */
3098                 dmar_disable_qi(iommu);
3099         }
3100
3101         if (dmar_enable_qi(iommu)) {
3102                 /*
3103                  * Queued invalidation is not enabled, use register-based invalidation.
3104                  */
3105                 iommu->flush.flush_context = __iommu_flush_context;
3106                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3107                 pr_info("%s: Using Register based invalidation\n",
3108                         iommu->name);
3109         } else {
3110                 iommu->flush.flush_context = qi_flush_context;
3111                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3112                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3113         }
3114 }
3115
3116 static int copy_context_table(struct intel_iommu *iommu,
3117                               struct root_entry *old_re,
3118                               struct context_entry **tbl,
3119                               int bus, bool ext)
3120 {
3121         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3122         struct context_entry *new_ce = NULL, ce;
3123         struct context_entry *old_ce = NULL;
3124         struct root_entry re;
3125         phys_addr_t old_ce_phys;
3126
3127         tbl_idx = ext ? bus * 2 : bus;
3128         memcpy(&re, old_re, sizeof(re));
3129
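        /*
         * With extended context entries each entry is twice the size, so
         * every bus needs two context tables: tbl_idx selects the pair
         * and idx wraps at 256 halfway through the devfn space (e.g.
         * devfn 0x80 lands at idx 0 of the upper table).
         */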
3130         for (devfn = 0; devfn < 256; devfn++) {
3131                 /* First calculate the correct index */
3132                 idx = (ext ? devfn * 2 : devfn) % 256;
3133
3134                 if (idx == 0) {
3135                         /* First save what we may have and clean up */
3136                         if (new_ce) {
3137                                 tbl[tbl_idx] = new_ce;
3138                                 __iommu_flush_cache(iommu, new_ce,
3139                                                     VTD_PAGE_SIZE);
3140                                 pos = 1;
3141                         }
3142
3143                         if (old_ce)
3144                                 memunmap(old_ce);
3145
3146                         ret = 0;
3147                         if (devfn < 0x80)
3148                                 old_ce_phys = root_entry_lctp(&re);
3149                         else
3150                                 old_ce_phys = root_entry_uctp(&re);
3151
3152                         if (!old_ce_phys) {
3153                                 if (ext && devfn == 0) {
3154                                         /* No LCTP, try UCTP */
3155                                         devfn = 0x7f;
3156                                         continue;
3157                                 } else {
3158                                         goto out;
3159                                 }
3160                         }
3161
3162                         ret = -ENOMEM;
3163                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3164                                         MEMREMAP_WB);
3165                         if (!old_ce)
3166                                 goto out;
3167
3168                         new_ce = alloc_pgtable_page(iommu->node);
3169                         if (!new_ce)
3170                                 goto out_unmap;
3171
3172                         ret = 0;
3173                 }
3174
3175                 /* Now copy the context entry */
3176                 memcpy(&ce, old_ce + idx, sizeof(ce));
3177
3178                 if (!__context_present(&ce))
3179                         continue;
3180
3181                 did = context_domain_id(&ce);
3182                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3183                         set_bit(did, iommu->domain_ids);
3184
3185                 /*
3186                  * We need a marker for copied context entries. This
3187                  * marker needs to work for the old format as well as
3188                  * for extended context entries.
3189                  *
3190                  * Bit 67 of the context entry is used. In the old
3191                  * format this bit is available to software, in the
3192                  * extended format it is the PGE bit, but PGE is ignored
3193                  * by HW if PASIDs are disabled (and thus still
3194                  * available).
3195                  *
3196                  * So disable PASIDs first and then mark the entry
3197                  * copied. This means that we don't copy PASID
3198                  * translations from the old kernel, but this is fine as
3199                  * faults there are not fatal.
3200                  */
3201                 context_clear_pasid_enable(&ce);
3202                 context_set_copied(&ce);
3203
3204                 new_ce[idx] = ce;
3205         }
3206
3207         tbl[tbl_idx + pos] = new_ce;
3208
3209         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3210
3211 out_unmap:
3212         memunmap(old_ce);
3213
3214 out:
3215         return ret;
3216 }
3217
3218 static int copy_translation_tables(struct intel_iommu *iommu)
3219 {
3220         struct context_entry **ctxt_tbls;
3221         struct root_entry *old_rt;
3222         phys_addr_t old_rt_phys;
3223         int ctxt_table_entries;
3224         unsigned long flags;
3225         u64 rtaddr_reg;
3226         int bus, ret;
3227         bool new_ext, ext;
3228
3229         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3230         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3231         new_ext    = !!ecap_ecs(iommu->ecap);
3232
3233         /*
3234          * The RTT bit can only be changed when translation is disabled,
3235          * but disabling translation would open a window for data
3236          * corruption. So bail out and don't copy anything if we would
3237          * have to change the bit.
3238          */
3239         if (new_ext != ext)
3240                 return -EINVAL;
3241
3242         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3243         if (!old_rt_phys)
3244                 return -EINVAL;
3245
3246         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3247         if (!old_rt)
3248                 return -ENOMEM;
3249
3250         /* This is too big for the stack - allocate it from slab */
3251         ctxt_table_entries = ext ? 512 : 256;
3252         ret = -ENOMEM;
3253         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3254         if (!ctxt_tbls)
3255                 goto out_unmap;
3256
3257         for (bus = 0; bus < 256; bus++) {
3258                 ret = copy_context_table(iommu, &old_rt[bus],
3259                                          ctxt_tbls, bus, ext);
3260                 if (ret) {
3261                         pr_err("%s: Failed to copy context table for bus %d\n",
3262                                 iommu->name, bus);
3263                         continue;
3264                 }
3265         }
3266
3267         spin_lock_irqsave(&iommu->lock, flags);
3268
3269         /* Context tables are copied, now write them to the root_entry table */
3270         for (bus = 0; bus < 256; bus++) {
3271                 int idx = ext ? bus * 2 : bus;
3272                 u64 val;
3273
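                /* Bit 0 marks the context-table pointer as present. */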
3274                 if (ctxt_tbls[idx]) {
3275                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3276                         iommu->root_entry[bus].lo = val;
3277                 }
3278
3279                 if (!ext || !ctxt_tbls[idx + 1])
3280                         continue;
3281
3282                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3283                 iommu->root_entry[bus].hi = val;
3284         }
3285
3286         spin_unlock_irqrestore(&iommu->lock, flags);
3287
3288         kfree(ctxt_tbls);
3289
3290         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3291
3292         ret = 0;
3293
3294 out_unmap:
3295         memunmap(old_rt);
3296
3297         return ret;
3298 }
3299
3300 static int __init init_dmars(void)
3301 {
3302         struct dmar_drhd_unit *drhd;
3303         struct intel_iommu *iommu;
3304         int ret;
3305
3306         /*
3307          * for each drhd
3308          *    allocate root
3309          *    initialize and program root entry to not present
3310          * endfor
3311          */
3312         for_each_drhd_unit(drhd) {
3313                 /*
3314                  * No lock needed: this is only incremented in the
3315                  * single-threaded kernel __init code path; all other
3316                  * accesses are read only.
3317                  */
3318                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3319                         g_num_of_iommus++;
3320                         continue;
3321                 }
3322                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3323         }
3324
3325         /* Preallocate enough resources for IOMMU hot-addition */
3326         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3327                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3328
3329         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3330                         GFP_KERNEL);
3331         if (!g_iommus) {
3332                 pr_err("Allocating global iommu array failed\n");
3333                 ret = -ENOMEM;
3334                 goto error;
3335         }
3336
3337         for_each_iommu(iommu, drhd) {
3338                 if (drhd->ignored) {
3339                         iommu_disable_translation(iommu);
3340                         continue;
3341                 }
3342
3343                 /*
3344                  * Find the max PASID size of all IOMMUs in the system.
3345                  * We need to ensure the system PASID table is no bigger
3346                  * than the smallest supported size.
3347                  */
3348                 if (pasid_supported(iommu)) {
3349                         u32 temp = 2 << ecap_pss(iommu->ecap);
3350
3351                         intel_pasid_max_id = min_t(u32, temp,
3352                                                    intel_pasid_max_id);
3353                 }
3354
3355                 g_iommus[iommu->seq_id] = iommu;
3356
3357                 intel_iommu_init_qi(iommu);
3358
3359                 ret = iommu_init_domains(iommu);
3360                 if (ret)
3361                         goto free_iommu;
3362
3363                 init_translation_status(iommu);
3364
3365                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3366                         iommu_disable_translation(iommu);
3367                         clear_translation_pre_enabled(iommu);
3368                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3369                                 iommu->name);
3370                 }
3371
3372                 /*
3373                  * TBD:
3374                  * We could share the same root & context tables
3375                  * among all IOMMUs; this needs to be split out later.
3376                  */
3377                 ret = iommu_alloc_root_entry(iommu);
3378                 if (ret)
3379                         goto free_iommu;
3380
3381                 if (translation_pre_enabled(iommu)) {
3382                         pr_info("Translation already enabled - trying to copy translation structures\n");
3383
3384                         ret = copy_translation_tables(iommu);
3385                         if (ret) {
3386                                 /*
3387                                  * We found the IOMMU with translation
3388                                  * enabled - but failed to copy over the
3389                                  * old root-entry table. Try to proceed
3390                                  * by disabling translation now and
3391                                  * allocating a clean root-entry table.
3392                                  * This might cause DMAR faults, but
3393                                  * probably the dump will still succeed.
3394                                  */
3395                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3396                                        iommu->name);
3397                                 iommu_disable_translation(iommu);
3398                                 clear_translation_pre_enabled(iommu);
3399                         } else {
3400                                 pr_info("Copied translation tables from previous kernel for %s\n",
3401                                         iommu->name);
3402                         }
3403                 }
3404
3405                 if (!ecap_pass_through(iommu->ecap))
3406                         hw_pass_through = 0;
3407                 intel_svm_check(iommu);
3408         }
3409
3410         /*
3411          * Now that qi is enabled on all iommus, set the root entry and flush
3412          * caches. This is required on some Intel X58 chipsets, otherwise the
3413          * flush_context function will loop forever and the boot hangs.
3414          */
3415         for_each_active_iommu(iommu, drhd) {
3416                 iommu_flush_write_buffer(iommu);
3417                 iommu_set_root_entry(iommu);
3418                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3419                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3420         }
3421
3422 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3423         dmar_map_gfx = 0;
3424 #endif
3425
3426         if (!dmar_map_gfx)
3427                 iommu_identity_mapping |= IDENTMAP_GFX;
3428
3429         check_tylersburg_isoch();
3430
3431         ret = si_domain_init(hw_pass_through);
3432         if (ret)
3433                 goto free_iommu;
3434
3435         /*
3436          * for each drhd
3437          *   enable fault log
3438          *   global invalidate context cache
3439          *   global invalidate iotlb
3440          *   enable translation
3441          */
3442         for_each_iommu(iommu, drhd) {
3443                 if (drhd->ignored) {
3444                         /*
3445                          * we always have to disable PMRs or DMA may fail on
3446                          * this device
3447                          */
3448                         if (force_on)
3449                                 iommu_disable_protect_mem_regions(iommu);
3450                         continue;
3451                 }
3452
3453                 iommu_flush_write_buffer(iommu);
3454
3455 #ifdef CONFIG_INTEL_IOMMU_SVM
3456                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3457                         /*
3458                          * Calling dmar_alloc_hwirq() with dmar_global_lock
3459                          * held could cause a lock race, so drop the lock here.
3460                          */
3461                         up_write(&dmar_global_lock);
3462                         ret = intel_svm_enable_prq(iommu);
3463                         down_write(&dmar_global_lock);
3464                         if (ret)
3465                                 goto free_iommu;
3466                 }
3467 #endif
3468                 ret = dmar_set_interrupt(iommu);
3469                 if (ret)
3470                         goto free_iommu;
3471         }
3472
3473         return 0;
3474
3475 free_iommu:
3476         for_each_active_iommu(iommu, drhd) {
3477                 disable_dmar_iommu(iommu);
3478                 free_dmar_iommu(iommu);
3479         }
3480
3481         kfree(g_iommus);
3482
3483 error:
3484         return ret;
3485 }
3486
3487 /* This takes a number of _MM_ pages, not VTD pages */
3488 static unsigned long intel_alloc_iova(struct device *dev,
3489                                      struct dmar_domain *domain,
3490                                      unsigned long nrpages, uint64_t dma_mask)
3491 {
3492         unsigned long iova_pfn;
3493
3494         /*
3495          * Restrict dma_mask to the width that the iommu can handle.
3496          * First-level translation restricts the input-address to a
3497          * canonical address (i.e., address bits 63:N have the same
3498          * value as address bit [N-1], where N is 48-bits with 4-level
3499          * value as address bit [N-1], where N is 48 with 4-level
3500          * paging and 57 with 5-level paging). Hence, skip bit
3501          */
3502         if (domain_use_first_level(domain))
3503                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3504                                  dma_mask);
3505         else
3506                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3507                                  dma_mask);
3508
3509         /* Ensure we reserve the whole size-aligned region */
3510         nrpages = __roundup_pow_of_two(nrpages);
3511
3512         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3513                 /*
3514                  * First try to allocate an IO virtual address in
3515                  * DMA_BIT_MASK(32); if that fails, then try allocating
3516                  * from the higher range.
3517                  */
3518                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3519                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3520                 if (iova_pfn)
3521                         return iova_pfn;
3522         }
3523         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3524                                    IOVA_PFN(dma_mask), true);
3525         if (unlikely(!iova_pfn)) {
3526                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3527                              nrpages);
3528                 return 0;
3529         }
3530
3531         return iova_pfn;
3532 }
3533
3534 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3535 {
3536         struct dmar_domain *domain, *tmp;
3537         struct dmar_rmrr_unit *rmrr;
3538         struct device *i_dev;
3539         int i, ret;
3540
3541         /* The device should not already be attached to any domain. */
3542         domain = find_domain(dev);
3543         if (domain)
3544                 return NULL;
3545
3546         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3547         if (!domain)
3548                 goto out;
3549
3550         /* We have a new domain - set up possible RMRRs for the device */
3551         rcu_read_lock();
3552         for_each_rmrr_units(rmrr) {
3553                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3554                                           i, i_dev) {
3555                         if (i_dev != dev)
3556                                 continue;
3557
3558                         ret = domain_prepare_identity_map(dev, domain,
3559                                                           rmrr->base_address,
3560                                                           rmrr->end_address);
3561                         if (ret)
3562                                 dev_err(dev, "Mapping reserved region failed\n");
3563                 }
3564         }
3565         rcu_read_unlock();
3566
3567         tmp = set_domain_for_dev(dev, domain);
3568         if (!tmp || domain != tmp) {
3569                 domain_exit(domain);
3570                 domain = tmp;
3571         }
3572
3573 out:
3574         if (!domain)
3575                 dev_err(dev, "Allocating domain failed\n");
3576         else
3577                 domain->domain.type = IOMMU_DOMAIN_DMA;
3578
3579         return domain;
3580 }
3581
3582 /* Check if the device needs to go through the non-identity map and unmap process. */
3583 static bool iommu_need_mapping(struct device *dev)
3584 {
3585         int ret;
3586
3587         if (iommu_dummy(dev))
3588                 return false;
3589
3590         if (unlikely(attach_deferred(dev)))
3591                 do_deferred_attach(dev);
3592
3593         ret = identity_mapping(dev);
3594         if (ret) {
3595                 u64 dma_mask = *dev->dma_mask;
3596
3597                 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3598                         dma_mask = dev->coherent_dma_mask;
3599
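                /*
                 * The device's DMA mask covers all memory it may need to
                 * reach, so it can stay in the identity map and use
                 * direct DMA.
                 */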
3600                 if (dma_mask >= dma_direct_get_required_mask(dev))
3601                         return false;
3602
3603                 /*
3604                  * 32 bit DMA devices are removed from si_domain and
3605                  * fall back to a non-identity mapping.
3606                  */
3607                 dmar_remove_one_dev_info(dev);
3608                 ret = iommu_request_dma_domain_for_dev(dev);
3609                 if (ret) {
3610                         struct iommu_domain *domain;
3611                         struct dmar_domain *dmar_domain;
3612
3613                         domain = iommu_get_domain_for_dev(dev);
3614                         if (domain) {
3615                                 dmar_domain = to_dmar_domain(domain);
3616                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3617                         }
3618                         dmar_remove_one_dev_info(dev);
3619                         get_private_domain_for_dev(dev);
3620                 }
3621
3622                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3623         }
3624
3625         return true;
3626 }
3627
3628 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3629                                      size_t size, int dir, u64 dma_mask)
3630 {
3631         struct dmar_domain *domain;
3632         phys_addr_t start_paddr;
3633         unsigned long iova_pfn;
3634         int prot = 0;
3635         int ret;
3636         struct intel_iommu *iommu;
3637         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3638
3639         BUG_ON(dir == DMA_NONE);
3640
3641         domain = find_domain(dev);
3642         if (!domain)
3643                 return DMA_MAPPING_ERROR;
3644
3645         iommu = domain_get_iommu(domain);
3646         size = aligned_nrpages(paddr, size);
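        /* From here on, size is a page count in VTD_PAGE_SIZE units. */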
3647
3648         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3649         if (!iova_pfn)
3650                 goto error;
3651
3652         /*
3653          * Check if DMAR supports zero-length reads on write-only
3654          * mappings.
3655          */
3656         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3657                         !cap_zlr(iommu->cap))
3658                 prot |= DMA_PTE_READ;
3659         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3660                 prot |= DMA_PTE_WRITE;
3661         /*
3662          * The range paddr .. paddr + size may only cover part of a page,
3663          * so we map the whole page.  Note: if two parts of one page are
3664          * mapped separately, we may get two guest addresses mapping to
3665          * the same host paddr, but this is not a big problem.
3666          */
3667         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3668                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3669         if (ret)
3670                 goto error;
3671
3672         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3673         start_paddr += paddr & ~PAGE_MASK;
3674
3675         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3676
3677         return start_paddr;
3678
3679 error:
3680         if (iova_pfn)
3681                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3682         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3683                 size, (unsigned long long)paddr, dir);
3684         return DMA_MAPPING_ERROR;
3685 }
3686
3687 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3688                                  unsigned long offset, size_t size,
3689                                  enum dma_data_direction dir,
3690                                  unsigned long attrs)
3691 {
3692         if (iommu_need_mapping(dev))
3693                 return __intel_map_single(dev, page_to_phys(page) + offset,
3694                                 size, dir, *dev->dma_mask);
3695         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3696 }
3697
3698 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3699                                      size_t size, enum dma_data_direction dir,
3700                                      unsigned long attrs)
3701 {
3702         if (iommu_need_mapping(dev))
3703                 return __intel_map_single(dev, phys_addr, size, dir,
3704                                 *dev->dma_mask);
3705         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3706 }
3707
3708 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3709 {
3710         struct dmar_domain *domain;
3711         unsigned long start_pfn, last_pfn;
3712         unsigned long nrpages;
3713         unsigned long iova_pfn;
3714         struct intel_iommu *iommu;
3715         struct page *freelist;
3716         struct pci_dev *pdev = NULL;
3717
3718         domain = find_domain(dev);
3719         BUG_ON(!domain);
3720
3721         iommu = domain_get_iommu(domain);
3722
3723         iova_pfn = IOVA_PFN(dev_addr);
3724
3725         nrpages = aligned_nrpages(dev_addr, size);
3726         start_pfn = mm_to_dma_pfn(iova_pfn);
3727         last_pfn = start_pfn + nrpages - 1;
3728
3729         if (dev_is_pci(dev))
3730                 pdev = to_pci_dev(dev);
3731
3732         freelist = domain_unmap(domain, start_pfn, last_pfn);
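        /*
         * Flush the IOTLB and free the IOVA immediately in strict mode,
         * for untrusted devices, or when no flush queue is available;
         * otherwise defer the flush via the IOVA flush queue.
         */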
3733         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3734                         !has_iova_flush_queue(&domain->iovad)) {
3735                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3736                                       nrpages, !freelist, 0);
3737                 /* free iova */
3738                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3739                 dma_free_pagelist(freelist);
3740         } else {
3741                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3742                            (unsigned long)freelist);
3743                 /*
3744                  * Queue up the release of the unmap to save roughly 1/6th
3745                  * of the CPU time used by the IOTLB flush operation.
3746                  */
3747         }
3748
3749         trace_unmap_single(dev, dev_addr, size);
3750 }
3751
3752 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3753                              size_t size, enum dma_data_direction dir,
3754                              unsigned long attrs)
3755 {
3756         if (iommu_need_mapping(dev))
3757                 intel_unmap(dev, dev_addr, size);
3758         else
3759                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3760 }
3761
3762 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3763                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3764 {
3765         if (iommu_need_mapping(dev))
3766                 intel_unmap(dev, dev_addr, size);
3767 }
3768
3769 static void *intel_alloc_coherent(struct device *dev, size_t size,
3770                                   dma_addr_t *dma_handle, gfp_t flags,
3771                                   unsigned long attrs)
3772 {
3773         struct page *page = NULL;
3774         int order;
3775
3776         if (!iommu_need_mapping(dev))
3777                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3778
3779         size = PAGE_ALIGN(size);
3780         order = get_order(size);
3781
3782         if (gfpflags_allow_blocking(flags)) {
3783                 unsigned int count = size >> PAGE_SHIFT;
3784
3785                 page = dma_alloc_from_contiguous(dev, count, order,
3786                                                  flags & __GFP_NOWARN);
3787         }
3788
3789         if (!page)
3790                 page = alloc_pages(flags, order);
3791         if (!page)
3792                 return NULL;
3793         memset(page_address(page), 0, size);
3794
3795         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3796                                          DMA_BIDIRECTIONAL,
3797                                          dev->coherent_dma_mask);
3798         if (*dma_handle != DMA_MAPPING_ERROR)
3799                 return page_address(page);
3800         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3801                 __free_pages(page, order);
3802
3803         return NULL;
3804 }
3805
3806 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3807                                 dma_addr_t dma_handle, unsigned long attrs)
3808 {
3809         int order;
3810         struct page *page = virt_to_page(vaddr);
3811
3812         if (!iommu_need_mapping(dev))
3813                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3814
3815         size = PAGE_ALIGN(size);
3816         order = get_order(size);
3817
3818         intel_unmap(dev, dma_handle, size);
3819         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3820                 __free_pages(page, order);
3821 }
3822
3823 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3824                            int nelems, enum dma_data_direction dir,
3825                            unsigned long attrs)
3826 {
3827         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3828         unsigned long nrpages = 0;
3829         struct scatterlist *sg;
3830         int i;
3831
3832         if (!iommu_need_mapping(dev))
3833                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3834
3835         for_each_sg(sglist, sg, nelems, i) {
3836                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3837         }
3838
3839         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3840
3841         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3842 }
3843
3844 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3845                         enum dma_data_direction dir, unsigned long attrs)
3846 {
3847         int i;
3848         struct dmar_domain *domain;
3849         size_t size = 0;
3850         int prot = 0;
3851         unsigned long iova_pfn;
3852         int ret;
3853         struct scatterlist *sg;
3854         unsigned long start_vpfn;
3855         struct intel_iommu *iommu;
3856
3857         BUG_ON(dir == DMA_NONE);
3858         if (!iommu_need_mapping(dev))
3859                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3860
3861         domain = find_domain(dev);
3862         if (!domain)
3863                 return 0;
3864
3865         iommu = domain_get_iommu(domain);
3866
3867         for_each_sg(sglist, sg, nelems, i)
3868                 size += aligned_nrpages(sg->offset, sg->length);
3869
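        /* Allocate one contiguous IOVA range covering all segments. */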
3870         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3871                                 *dev->dma_mask);
3872         if (!iova_pfn) {
3873                 sglist->dma_length = 0;
3874                 return 0;
3875         }
3876
3877         /*
3878          * Check if DMAR supports zero-length reads on write-only
3879          * mappings.
3880          */
3881         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3882                         !cap_zlr(iommu->cap))
3883                 prot |= DMA_PTE_READ;
3884         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3885                 prot |= DMA_PTE_WRITE;
3886
3887         start_vpfn = mm_to_dma_pfn(iova_pfn);
3888
3889         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3890         if (unlikely(ret)) {
3891                 dma_pte_free_pagetable(domain, start_vpfn,
3892                                        start_vpfn + size - 1,
3893                                        agaw_to_level(domain->agaw) + 1);
3894                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3895                 return 0;
3896         }
3897
3898         for_each_sg(sglist, sg, nelems, i)
3899                 trace_map_sg(dev, i + 1, nelems, sg);
3900
3901         return nelems;
3902 }
3903
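/*
 * With IOMMU translation in use, 32 bits of IOVA space are sufficient
 * for any device, so report DMA_BIT_MASK(32) as the required mask.
 */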
3904 static u64 intel_get_required_mask(struct device *dev)
3905 {
3906         if (!iommu_need_mapping(dev))
3907                 return dma_direct_get_required_mask(dev);
3908         return DMA_BIT_MASK(32);
3909 }
3910
3911 static const struct dma_map_ops intel_dma_ops = {
3912         .alloc = intel_alloc_coherent,
3913         .free = intel_free_coherent,
3914         .map_sg = intel_map_sg,
3915         .unmap_sg = intel_unmap_sg,
3916         .map_page = intel_map_page,
3917         .unmap_page = intel_unmap_page,
3918         .map_resource = intel_map_resource,
3919         .unmap_resource = intel_unmap_resource,
3920         .dma_supported = dma_direct_supported,
3921         .mmap = dma_common_mmap,
3922         .get_sgtable = dma_common_get_sgtable,
3923         .get_required_mask = intel_get_required_mask,
3924 };
3925
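/*
 * Sync a possibly bounced mapping: translate the IOVA back to a physical
 * address and, if it falls inside the swiotlb, sync the bounce buffer
 * with the original buffer.
 */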
3926 static void
3927 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3928                    enum dma_data_direction dir, enum dma_sync_target target)
3929 {
3930         struct dmar_domain *domain;
3931         phys_addr_t tlb_addr;
3932
3933         domain = find_domain(dev);
3934         if (WARN_ON(!domain))
3935                 return;
3936
3937         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3938         if (is_swiotlb_buffer(tlb_addr))
3939                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3940 }
3941
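/*
 * Map a buffer through the bounce path: if either end of the buffer is
 * not VTD_PAGE_SIZE aligned, it is bounced through the swiotlb so the
 * device cannot DMA beyond the data it was actually handed.
 */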
3942 static dma_addr_t
3943 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3944                   enum dma_data_direction dir, unsigned long attrs,
3945                   u64 dma_mask)
3946 {
3947         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3948         struct dmar_domain *domain;
3949         struct intel_iommu *iommu;
3950         unsigned long iova_pfn;
3951         unsigned long nrpages;
3952         phys_addr_t tlb_addr;
3953         int prot = 0;
3954         int ret;
3955
3956         if (unlikely(attach_deferred(dev)))
3957                 do_deferred_attach(dev);
3958
3959         domain = find_domain(dev);
3960
3961         if (WARN_ON(dir == DMA_NONE || !domain))
3962                 return DMA_MAPPING_ERROR;
3963
3964         iommu = domain_get_iommu(domain);
3965         if (WARN_ON(!iommu))
3966                 return DMA_MAPPING_ERROR;
3967
3968         nrpages = aligned_nrpages(0, size);
3969         iova_pfn = intel_alloc_iova(dev, domain,
3970                                     dma_to_mm_pfn(nrpages), dma_mask);
3971         if (!iova_pfn)
3972                 return DMA_MAPPING_ERROR;
3973
3974         /*
3975          * Check if DMAR supports zero-length reads on write-only
3976          * mappings.
3977          */
3978         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3979                         !cap_zlr(iommu->cap))
3980                 prot |= DMA_PTE_READ;
3981         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3982                 prot |= DMA_PTE_WRITE;
3983
3984         /*
3985          * If both the physical buffer start address and size are
3986          * page aligned, we don't need to use a bounce page.
3987          */
3988         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3989                 tlb_addr = swiotlb_tbl_map_single(dev,
3990                                 __phys_to_dma(dev, io_tlb_start),
3991                                 paddr, size, aligned_size, dir, attrs);
3992                 if (tlb_addr == DMA_MAPPING_ERROR) {
3993                         goto swiotlb_error;
3994                 } else {
3995                         /* Cleanup the padding area. */
3996                         void *padding_start = phys_to_virt(tlb_addr);
3997                         size_t padding_size = aligned_size;
3998
3999                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
4000                             (dir == DMA_TO_DEVICE ||
4001                              dir == DMA_BIDIRECTIONAL)) {
4002                                 padding_start += size;
4003                                 padding_size -= size;
4004                         }
4005
4006                         memset(padding_start, 0, padding_size);
4007                 }
4008         } else {
4009                 tlb_addr = paddr;
4010         }
4011
4012         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
4013                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
4014         if (ret)
4015                 goto mapping_error;
4016
4017         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
4018
4019         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
4020
4021 mapping_error:
4022         if (is_swiotlb_buffer(tlb_addr))
4023                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4024                                          aligned_size, dir, attrs);
4025 swiotlb_error:
4026         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
4027         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
4028                 size, (unsigned long long)paddr, dir);
4029
4030         return DMA_MAPPING_ERROR;
4031 }
4032
4033 static void
4034 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
4035                     enum dma_data_direction dir, unsigned long attrs)
4036 {
4037         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
4038         struct dmar_domain *domain;
4039         phys_addr_t tlb_addr;
4040
4041         domain = find_domain(dev);
4042         if (WARN_ON(!domain))
4043                 return;
4044
4045         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
4046         if (WARN_ON(!tlb_addr))
4047                 return;
4048
4049         intel_unmap(dev, dev_addr, size);
4050         if (is_swiotlb_buffer(tlb_addr))
4051                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4052                                          aligned_size, dir, attrs);
4053
4054         trace_bounce_unmap_single(dev, dev_addr, size);
4055 }
4056
4057 static dma_addr_t
4058 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
4059                 size_t size, enum dma_data_direction dir, unsigned long attrs)
4060 {
4061         return bounce_map_single(dev, page_to_phys(page) + offset,
4062                                  size, dir, attrs, *dev->dma_mask);
4063 }
4064
4065 static dma_addr_t
4066 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
4067                     enum dma_data_direction dir, unsigned long attrs)
4068 {
4069         return bounce_map_single(dev, phys_addr, size,
4070                                  dir, attrs, *dev->dma_mask);
4071 }
4072
4073 static void
4074 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4075                   enum dma_data_direction dir, unsigned long attrs)
4076 {
4077         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4078 }
4079
4080 static void
4081 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4082                       enum dma_data_direction dir, unsigned long attrs)
4083 {
4084         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4085 }
4086
4087 static void
4088 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4089                 enum dma_data_direction dir, unsigned long attrs)
4090 {
4091         struct scatterlist *sg;
4092         int i;
4093
4094         for_each_sg(sglist, sg, nelems, i)
4095                 bounce_unmap_page(dev, sg->dma_address,
4096                                   sg_dma_len(sg), dir, attrs);
4097 }
4098
4099 static int
4100 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4101               enum dma_data_direction dir, unsigned long attrs)
4102 {
4103         int i;
4104         struct scatterlist *sg;
4105
4106         for_each_sg(sglist, sg, nelems, i) {
4107                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
4108                                                   sg->offset, sg->length,
4109                                                   dir, attrs);
4110                 if (sg->dma_address == DMA_MAPPING_ERROR)
4111                         goto out_unmap;
4112                 sg_dma_len(sg) = sg->length;
4113         }
4114
4115         for_each_sg(sglist, sg, nelems, i)
4116                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
4117
4118         return nelems;
4119
4120 out_unmap:
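        /* Only the first i segments were mapped; undo just those. */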
4121         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4122         return 0;
4123 }
4124
4125 static void
4126 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4127                            size_t size, enum dma_data_direction dir)
4128 {
4129         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4130 }
4131
4132 static void
4133 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4134                               size_t size, enum dma_data_direction dir)
4135 {
4136         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4137 }
4138
4139 static void
4140 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4141                        int nelems, enum dma_data_direction dir)
4142 {
4143         struct scatterlist *sg;
4144         int i;
4145
4146         for_each_sg(sglist, sg, nelems, i)
4147                 bounce_sync_single(dev, sg_dma_address(sg),
4148                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
4149 }
4150
4151 static void
4152 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4153                           int nelems, enum dma_data_direction dir)
4154 {
4155         struct scatterlist *sg;
4156         int i;
4157
4158         for_each_sg(sglist, sg, nelems, i)
4159                 bounce_sync_single(dev, sg_dma_address(sg),
4160                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4161 }
4162
4163 static const struct dma_map_ops bounce_dma_ops = {
4164         .alloc                  = intel_alloc_coherent,
4165         .free                   = intel_free_coherent,
4166         .map_sg                 = bounce_map_sg,
4167         .unmap_sg               = bounce_unmap_sg,
4168         .map_page               = bounce_map_page,
4169         .unmap_page             = bounce_unmap_page,
4170         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
4171         .sync_single_for_device = bounce_sync_single_for_device,
4172         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4173         .sync_sg_for_device     = bounce_sync_sg_for_device,
4174         .map_resource           = bounce_map_resource,
4175         .unmap_resource         = bounce_unmap_resource,
4176         .dma_supported          = dma_direct_supported,
4177 };
4178
4179 static inline int iommu_domain_cache_init(void)
4180 {
4181         int ret = 0;
4182
4183         iommu_domain_cache = kmem_cache_create("iommu_domain",
4184                                          sizeof(struct dmar_domain),
4185                                          0,
4186                                          SLAB_HWCACHE_ALIGN,
4188                                          NULL);
4189         if (!iommu_domain_cache) {
4190                 pr_err("Couldn't create iommu_domain cache\n");
4191                 ret = -ENOMEM;
4192         }
4193
4194         return ret;
4195 }
4196
4197 static inline int iommu_devinfo_cache_init(void)
4198 {
4199         int ret = 0;
4200
4201         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4202                                          sizeof(struct device_domain_info),
4203                                          0,
4204                                          SLAB_HWCACHE_ALIGN,
4205                                          NULL);
4206         if (!iommu_devinfo_cache) {
4207                 pr_err("Couldn't create devinfo cache\n");
4208                 ret = -ENOMEM;
4209         }
4210
4211         return ret;
4212 }
4213
4214 static int __init iommu_init_mempool(void)
4215 {
4216         int ret;
4217         ret = iova_cache_get();
4218         if (ret)
4219                 return ret;
4220
4221         ret = iommu_domain_cache_init();
4222         if (ret)
4223                 goto domain_error;
4224
4225         ret = iommu_devinfo_cache_init();
4226         if (!ret)
4227                 return ret;
4228
4229         kmem_cache_destroy(iommu_domain_cache);
4230 domain_error:
4231         iova_cache_put();
4232
4233         return -ENOMEM;
4234 }
4235
4236 static void __init iommu_exit_mempool(void)
4237 {
4238         kmem_cache_destroy(iommu_devinfo_cache);
4239         kmem_cache_destroy(iommu_domain_cache);
4240         iova_cache_put();
4241 }
4242
4243 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4244 {
4245         struct dmar_drhd_unit *drhd;
4246         u32 vtbar;
4247         int rc;
4248
4249         /* We know that this device on this chipset has its own IOMMU.
4250          * If we find it under a different IOMMU, then the BIOS is lying
4251          * to us. Hope that the IOMMU for this device is actually
4252          * disabled, and it needs no translation...
4253          */
4254         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4255         if (rc) {
4256                 /* "can't" happen */
4257                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4258                 return;
4259         }
4260         vtbar &= 0xffff0000;
4261
4262         /* We know that this IOMMU should be at offset 0xa000 from vtbar */
4263         drhd = dmar_find_matched_drhd_unit(pdev);
4264         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4265                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4266                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4267                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4268         }
4269 }
4270 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4271
4272 static void __init init_no_remapping_devices(void)
4273 {
4274         struct dmar_drhd_unit *drhd;
4275         struct device *dev;
4276         int i;
4277
4278         for_each_drhd_unit(drhd) {
4279                 if (!drhd->include_all) {
4280                         for_each_active_dev_scope(drhd->devices,
4281                                                   drhd->devices_cnt, i, dev)
4282                                 break;
4283                         /* ignore DMAR unit if no devices exist */
4284                         if (i == drhd->devices_cnt)
4285                                 drhd->ignored = 1;
4286                 }
4287         }
4288
4289         for_each_active_drhd_unit(drhd) {
4290                 if (drhd->include_all)
4291                         continue;
4292
4293                 for_each_active_dev_scope(drhd->devices,
4294                                           drhd->devices_cnt, i, dev)
4295                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4296                                 break;
4297                 if (i < drhd->devices_cnt)
4298                         continue;
4299
4300                 /* This IOMMU has *only* gfx devices. If we are not
4301                    mapping gfx devices, bypass it entirely. */
4302                 if (!dmar_map_gfx) {
4303                         drhd->ignored = 1;
4304                         for_each_active_dev_scope(drhd->devices,
4305                                                   drhd->devices_cnt, i, dev)
4306                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4307                 }
4308         }
4309 }
4310
4311 #ifdef CONFIG_SUSPEND
4312 static int init_iommu_hw(void)
4313 {
4314         struct dmar_drhd_unit *drhd;
4315         struct intel_iommu *iommu = NULL;
4316
4317         for_each_active_iommu(iommu, drhd)
4318                 if (iommu->qi)
4319                         dmar_reenable_qi(iommu);
4320
4321         for_each_iommu(iommu, drhd) {
4322                 if (drhd->ignored) {
4323                         /*
4324                          * we always have to disable PMRs or DMA may fail on
4325                          * this device
4326                          */
4327                         if (force_on)
4328                                 iommu_disable_protect_mem_regions(iommu);
4329                         continue;
4330                 }
4331
4332                 iommu_flush_write_buffer(iommu);
4333
4334                 iommu_set_root_entry(iommu);
4335
4336                 iommu->flush.flush_context(iommu, 0, 0, 0,
4337                                            DMA_CCMD_GLOBAL_INVL);
4338                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4339                 iommu_enable_translation(iommu);
4340                 iommu_disable_protect_mem_regions(iommu);
4341         }
4342
4343         return 0;
4344 }
4345
4346 static void iommu_flush_all(void)
4347 {
4348         struct dmar_drhd_unit *drhd;
4349         struct intel_iommu *iommu;
4350
4351         for_each_active_iommu(iommu, drhd) {
4352                 iommu->flush.flush_context(iommu, 0, 0, 0,
4353                                            DMA_CCMD_GLOBAL_INVL);
4354                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4355                                          DMA_TLB_GLOBAL_FLUSH);
4356         }
4357 }
4358
4359 static int iommu_suspend(void)
4360 {
4361         struct dmar_drhd_unit *drhd;
4362         struct intel_iommu *iommu = NULL;
4363         unsigned long flag;
4364
4365         for_each_active_iommu(iommu, drhd) {
4366                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4367                                                  GFP_ATOMIC);
4368                 if (!iommu->iommu_state)
4369                         goto nomem;
4370         }
4371
4372         iommu_flush_all();
4373
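        /*
         * Disable translation and save the fault event registers so they
         * can be restored on resume.
         */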
4374         for_each_active_iommu(iommu, drhd) {
4375                 iommu_disable_translation(iommu);
4376
4377                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4378
4379                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4380                         readl(iommu->reg + DMAR_FECTL_REG);
4381                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4382                         readl(iommu->reg + DMAR_FEDATA_REG);
4383                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4384                         readl(iommu->reg + DMAR_FEADDR_REG);
4385                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4386                         readl(iommu->reg + DMAR_FEUADDR_REG);
4387
4388                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4389         }
4390         return 0;
4391
4392 nomem:
4393         for_each_active_iommu(iommu, drhd)
4394                 kfree(iommu->iommu_state);
4395
4396         return -ENOMEM;
4397 }
4398
4399 static void iommu_resume(void)
4400 {
4401         struct dmar_drhd_unit *drhd;
4402         struct intel_iommu *iommu = NULL;
4403         unsigned long flag;
4404
4405         if (init_iommu_hw()) {
4406                 if (force_on)
4407                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4408                 else
4409                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4410                 return;
4411         }
4412
4413         for_each_active_iommu(iommu, drhd) {
4414
4415                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4416
4417                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4418                         iommu->reg + DMAR_FECTL_REG);
4419                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4420                         iommu->reg + DMAR_FEDATA_REG);
4421                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4422                         iommu->reg + DMAR_FEADDR_REG);
4423                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4424                         iommu->reg + DMAR_FEUADDR_REG);
4425
4426                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4427         }
4428
4429         for_each_active_iommu(iommu, drhd)
4430                 kfree(iommu->iommu_state);
4431 }
4432
4433 static struct syscore_ops iommu_syscore_ops = {
4434         .resume         = iommu_resume,
4435         .suspend        = iommu_suspend,
4436 };
4437
4438 static void __init init_iommu_pm_ops(void)
4439 {
4440         register_syscore_ops(&iommu_syscore_ops);
4441 }
4442
4443 #else
4444 static inline void init_iommu_pm_ops(void) {}
4445 #endif  /* CONFIG_PM */
4446
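/*
 * Reject an RMRR whose range is not page aligned, is empty or inverted, or
 * fails the architecture-specific sanity check.
 */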
4447 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4448 {
4449         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4450             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4451             rmrr->end_address <= rmrr->base_address ||
4452             arch_rmrr_sanity_check(rmrr))
4453                 return -EINVAL;
4454
4455         return 0;
4456 }
4457
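/*
 * Parse one ACPI RMRR structure. A malformed entry is reported as a
 * firmware bug and taints the kernel, but is still recorded; the resulting
 * unit is added to the global dmar_rmrr_units list.
 */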
4458 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4459 {
4460         struct acpi_dmar_reserved_memory *rmrr;
4461         struct dmar_rmrr_unit *rmrru;
4462
4463         rmrr = (struct acpi_dmar_reserved_memory *)header;
4464         if (rmrr_sanity_check(rmrr)) {
4465                 pr_warn(FW_BUG
4466                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4467                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4468                            rmrr->base_address, rmrr->end_address,
4469                            dmi_get_system_info(DMI_BIOS_VENDOR),
4470                            dmi_get_system_info(DMI_BIOS_VERSION),
4471                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4472                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4473         }
4474
4475         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4476         if (!rmrru)
4477                 goto out;
4478
4479         rmrru->hdr = header;
4480
4481         rmrru->base_address = rmrr->base_address;
4482         rmrru->end_address = rmrr->end_address;
4483
4484         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4485                                 ((void *)rmrr) + rmrr->header.length,
4486                                 &rmrru->devices_cnt);
4487         if (rmrru->devices_cnt && rmrru->devices == NULL)
4488                 goto free_rmrru;
4489
4490         list_add(&rmrru->list, &dmar_rmrr_units);
4491
4492         return 0;
4493 free_rmrru:
4494         kfree(rmrru);
4495 out:
4496         return -ENOMEM;
4497 }
4498
4499 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4500 {
4501         struct dmar_atsr_unit *atsru;
4502         struct acpi_dmar_atsr *tmp;
4503
4504         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4505                                 dmar_rcu_check()) {
4506                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4507                 if (atsr->segment != tmp->segment)
4508                         continue;
4509                 if (atsr->header.length != tmp->header.length)
4510                         continue;
4511                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4512                         return atsru;
4513         }
4514
4515         return NULL;
4516 }
4517
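/*
 * Parse one ACPI ATSR structure and add it to dmar_atsr_units unless an
 * identical unit is already registered. The header is copied into the new
 * unit because the buffer passed in may be freed on return.
 */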
4518 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4519 {
4520         struct acpi_dmar_atsr *atsr;
4521         struct dmar_atsr_unit *atsru;
4522
4523         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4524                 return 0;
4525
4526         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4527         atsru = dmar_find_atsr(atsr);
4528         if (atsru)
4529                 return 0;
4530
4531         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4532         if (!atsru)
4533                 return -ENOMEM;
4534
4535         /*
4536          * If memory is allocated from slab by ACPI _DSM method, we need to
4537          * copy the memory content because the memory buffer will be freed
4538          * on return.
4539          */
4540         atsru->hdr = (void *)(atsru + 1);
4541         memcpy(atsru->hdr, hdr, hdr->length);
4542         atsru->include_all = atsr->flags & 0x1;
4543         if (!atsru->include_all) {
4544                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4545                                 (void *)atsr + atsr->header.length,
4546                                 &atsru->devices_cnt);
4547                 if (atsru->devices_cnt && atsru->devices == NULL) {
4548                         kfree(atsru);
4549                         return -ENOMEM;
4550                 }
4551         }
4552
4553         list_add_rcu(&atsru->list, &dmar_atsr_units);
4554
4555         return 0;
4556 }
4557
4558 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4559 {
4560         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4561         kfree(atsru);
4562 }
4563
4564 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4565 {
4566         struct acpi_dmar_atsr *atsr;
4567         struct dmar_atsr_unit *atsru;
4568
4569         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4570         atsru = dmar_find_atsr(atsr);
4571         if (atsru) {
4572                 list_del_rcu(&atsru->list);
4573                 synchronize_rcu();
4574                 intel_iommu_free_atsr(atsru);
4575         }
4576
4577         return 0;
4578 }
4579
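/*
 * Check whether an ATSR unit can be removed: return -EBUSY if any device
 * in its scope is still present, 0 otherwise.
 */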
4580 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4581 {
4582         int i;
4583         struct device *dev;
4584         struct acpi_dmar_atsr *atsr;
4585         struct dmar_atsr_unit *atsru;
4586
4587         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4588         atsru = dmar_find_atsr(atsr);
4589         if (!atsru)
4590                 return 0;
4591
4592         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4593                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4594                                           i, dev)
4595                         return -EBUSY;
4596         }
4597
4598         return 0;
4599 }
4600
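/*
 * Bring a hot-added DMAR unit's IOMMU online: verify it supports the
 * features already in use (pass-through, snooping, super pages), allocate
 * its domains and root entry, set up queued invalidation, interrupts and,
 * where supported, the page request queue, then enable translation.
 */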
4601 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4602 {
4603         int sp, ret;
4604         struct intel_iommu *iommu = dmaru->iommu;
4605
4606         if (g_iommus[iommu->seq_id])
4607                 return 0;
4608
4609         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4610                 pr_warn("%s: Doesn't support hardware pass through.\n",
4611                         iommu->name);
4612                 return -ENXIO;
4613         }
4614         if (!ecap_sc_support(iommu->ecap) &&
4615             domain_update_iommu_snooping(iommu)) {
4616                 pr_warn("%s: Doesn't support snooping.\n",
4617                         iommu->name);
4618                 return -ENXIO;
4619         }
4620         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4621         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4622                 pr_warn("%s: Doesn't support large page.\n",
4623                         iommu->name);
4624                 return -ENXIO;
4625         }
4626
4627         /*
4628          * Disable translation if already enabled prior to OS handover.
4629          */
4630         if (iommu->gcmd & DMA_GCMD_TE)
4631                 iommu_disable_translation(iommu);
4632
4633         g_iommus[iommu->seq_id] = iommu;
4634         ret = iommu_init_domains(iommu);
4635         if (ret == 0)
4636                 ret = iommu_alloc_root_entry(iommu);
4637         if (ret)
4638                 goto out;
4639
4640         intel_svm_check(iommu);
4641
4642         if (dmaru->ignored) {
4643                 /*
4644                  * We always have to disable PMRs, or DMA may fail on this device.
4645                  */
4646                 if (force_on)
4647                         iommu_disable_protect_mem_regions(iommu);
4648                 return 0;
4649         }
4650
4651         intel_iommu_init_qi(iommu);
4652         iommu_flush_write_buffer(iommu);
4653
4654 #ifdef CONFIG_INTEL_IOMMU_SVM
4655         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4656                 ret = intel_svm_enable_prq(iommu);
4657                 if (ret)
4658                         goto disable_iommu;
4659         }
4660 #endif
4661         ret = dmar_set_interrupt(iommu);
4662         if (ret)
4663                 goto disable_iommu;
4664
4665         iommu_set_root_entry(iommu);
4666         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4667         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4668         iommu_enable_translation(iommu);
4669
4670         iommu_disable_protect_mem_regions(iommu);
4671         return 0;
4672
4673 disable_iommu:
4674         disable_dmar_iommu(iommu);
4675 out:
4676         free_dmar_iommu(iommu);
4677         return ret;
4678 }
4679
4680 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4681 {
4682         int ret = 0;
4683         struct intel_iommu *iommu = dmaru->iommu;
4684
4685         if (!intel_iommu_enabled)
4686                 return 0;
4687         if (iommu == NULL)
4688                 return -EINVAL;
4689
4690         if (insert) {
4691                 ret = intel_iommu_add(dmaru);
4692         } else {
4693                 disable_dmar_iommu(iommu);
4694                 free_dmar_iommu(iommu);
4695         }
4696
4697         return ret;
4698 }
4699
4700 static void intel_iommu_free_dmars(void)
4701 {
4702         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4703         struct dmar_atsr_unit *atsru, *atsr_n;
4704
4705         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4706                 list_del(&rmrru->list);
4707                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4708                 kfree(rmrru);
4709         }
4710
4711         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4712                 list_del(&atsru->list);
4713                 intel_iommu_free_atsr(atsru);
4714         }
4715 }
4716
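/*
 * Decide whether ATS may be used for @dev: root-complex integrated devices
 * are always allowed, devices behind conventional PCI are not, and
 * everything else is allowed only if its root port is listed in an ATSR or
 * covered by an ATSR with the include-all flag. Returns 1 if ATS is
 * allowed.
 */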
4717 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4718 {
4719         int i, ret = 1;
4720         struct pci_bus *bus;
4721         struct pci_dev *bridge = NULL;
4722         struct device *tmp;
4723         struct acpi_dmar_atsr *atsr;
4724         struct dmar_atsr_unit *atsru;
4725
4726         dev = pci_physfn(dev);
4727         for (bus = dev->bus; bus; bus = bus->parent) {
4728                 bridge = bus->self;
4729                 /* If it's an integrated device, allow ATS */
4730                 if (!bridge)
4731                         return 1;
4732                 /* Connected via non-PCIe: no ATS */
4733                 if (!pci_is_pcie(bridge) ||
4734                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4735                         return 0;
4736                 /* If we found the root port, look it up in the ATSR */
4737                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4738                         break;
4739         }
4740
4741         rcu_read_lock();
4742         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4743                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4744                 if (atsr->segment != pci_domain_nr(dev->bus))
4745                         continue;
4746
4747                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4748                         if (tmp == &bridge->dev)
4749                                 goto out;
4750
4751                 if (atsru->include_all)
4752                         goto out;
4753         }
4754         ret = 0;
4755 out:
4756         rcu_read_unlock();
4757
4758         return ret;
4759 }
4760
4761 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4762 {
4763         int ret;
4764         struct dmar_rmrr_unit *rmrru;
4765         struct dmar_atsr_unit *atsru;
4766         struct acpi_dmar_atsr *atsr;
4767         struct acpi_dmar_reserved_memory *rmrr;
4768
4769         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4770                 return 0;
4771
4772         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4773                 rmrr = container_of(rmrru->hdr,
4774                                     struct acpi_dmar_reserved_memory, header);
4775                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4776                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4777                                 ((void *)rmrr) + rmrr->header.length,
4778                                 rmrr->segment, rmrru->devices,
4779                                 rmrru->devices_cnt);
4780                         if (ret < 0)
4781                                 return ret;
4782                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4783                         dmar_remove_dev_scope(info, rmrr->segment,
4784                                 rmrru->devices, rmrru->devices_cnt);
4785                 }
4786         }
4787
4788         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4789                 if (atsru->include_all)
4790                         continue;
4791
4792                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4793                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4794                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4795                                         (void *)atsr + atsr->header.length,
4796                                         atsr->segment, atsru->devices,
4797                                         atsru->devices_cnt);
4798                         if (ret > 0)
4799                                 break;
4800                         else if (ret < 0)
4801                                 return ret;
4802                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4803                         if (dmar_remove_dev_scope(info, atsr->segment,
4804                                         atsru->devices, atsru->devices_cnt))
4805                                 break;
4806                 }
4807         }
4808
4809         return 0;
4810 }
4811
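/*
 * Memory hotplug notifier for the static identity map: add an identity
 * mapping to si_domain when a memory block is about to come online, and
 * tear the mapping down, flush the IOTLBs and release the IOVA when the
 * block goes offline again.
 */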
4812 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4813                                        unsigned long val, void *v)
4814 {
4815         struct memory_notify *mhp = v;
4816         unsigned long long start, end;
4817         unsigned long start_vpfn, last_vpfn;
4818
4819         switch (val) {
4820         case MEM_GOING_ONLINE:
4821                 start = mhp->start_pfn << PAGE_SHIFT;
4822                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4823                 if (iommu_domain_identity_map(si_domain, start, end)) {
4824                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4825                                 start, end);
4826                         return NOTIFY_BAD;
4827                 }
4828                 break;
4829
4830         case MEM_OFFLINE:
4831         case MEM_CANCEL_ONLINE:
4832                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4833                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4834                 while (start_vpfn <= last_vpfn) {
4835                         struct iova *iova;
4836                         struct dmar_drhd_unit *drhd;
4837                         struct intel_iommu *iommu;
4838                         struct page *freelist;
4839
4840                         iova = find_iova(&si_domain->iovad, start_vpfn);
4841                         if (iova == NULL) {
4842                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4843                                          start_vpfn);
4844                                 break;
4845                         }
4846
4847                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4848                                                      start_vpfn, last_vpfn);
4849                         if (iova == NULL) {
4850                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4851                                         start_vpfn, last_vpfn);
4852                                 return NOTIFY_BAD;
4853                         }
4854
4855                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4856                                                iova->pfn_hi);
4857
4858                         rcu_read_lock();
4859                         for_each_active_iommu(iommu, drhd)
4860                                 iommu_flush_iotlb_psi(iommu, si_domain,
4861                                         iova->pfn_lo, iova_size(iova),
4862                                         !freelist, 0);
4863                         rcu_read_unlock();
4864                         dma_free_pagelist(freelist);
4865
4866                         start_vpfn = iova->pfn_hi + 1;
4867                         free_iova_mem(iova);
4868                 }
4869                 break;
4870         }
4871
4872         return NOTIFY_OK;
4873 }
4874
4875 static struct notifier_block intel_iommu_memory_nb = {
4876         .notifier_call = intel_iommu_memory_notifier,
4877         .priority = 0
4878 };
4879
4880 static void free_all_cpu_cached_iovas(unsigned int cpu)
4881 {
4882         int i;
4883
4884         for (i = 0; i < g_num_of_iommus; i++) {
4885                 struct intel_iommu *iommu = g_iommus[i];
4886                 struct dmar_domain *domain;
4887                 int did;
4888
4889                 if (!iommu)
4890                         continue;
4891
4892                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4893                         domain = get_iommu_domain(iommu, (u16)did);
4894
4895                         if (!domain)
4896                                 continue;
4897                         free_cpu_cached_iovas(cpu, &domain->iovad);
4898                 }
4899         }
4900 }
4901
4902 static int intel_iommu_cpu_dead(unsigned int cpu)
4903 {
4904         free_all_cpu_cached_iovas(cpu);
4905         return 0;
4906 }
4907
4908 static void intel_disable_iommus(void)
4909 {
4910         struct intel_iommu *iommu = NULL;
4911         struct dmar_drhd_unit *drhd;
4912
4913         for_each_iommu(iommu, drhd)
4914                 iommu_disable_translation(iommu);
4915 }
4916
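/*
 * Disable protected memory regions and switch translation off on all
 * IOMMUs when the system is shutting down.
 */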
4917 void intel_iommu_shutdown(void)
4918 {
4919         struct dmar_drhd_unit *drhd;
4920         struct intel_iommu *iommu = NULL;
4921
4922         if (no_iommu || dmar_disabled)
4923                 return;
4924
4925         down_write(&dmar_global_lock);
4926
4927         /* Disable PMRs explicitly here. */
4928         for_each_iommu(iommu, drhd)
4929                 iommu_disable_protect_mem_regions(iommu);
4930
4931         /* Make sure the IOMMUs are switched off */
4932         intel_disable_iommus();
4933
4934         up_write(&dmar_global_lock);
4935 }
4936
4937 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4938 {
4939         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4940
4941         return container_of(iommu_dev, struct intel_iommu, iommu);
4942 }
4943
4944 static ssize_t intel_iommu_show_version(struct device *dev,
4945                                         struct device_attribute *attr,
4946                                         char *buf)
4947 {
4948         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4949         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4950         return sprintf(buf, "%d:%d\n",
4951                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4952 }
4953 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4954
4955 static ssize_t intel_iommu_show_address(struct device *dev,
4956                                         struct device_attribute *attr,
4957                                         char *buf)
4958 {
4959         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4960         return sprintf(buf, "%llx\n", iommu->reg_phys);
4961 }
4962 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4963
4964 static ssize_t intel_iommu_show_cap(struct device *dev,
4965                                     struct device_attribute *attr,
4966                                     char *buf)
4967 {
4968         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4969         return sprintf(buf, "%llx\n", iommu->cap);
4970 }
4971 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4972
4973 static ssize_t intel_iommu_show_ecap(struct device *dev,
4974                                     struct device_attribute *attr,
4975                                     char *buf)
4976 {
4977         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4978         return sprintf(buf, "%llx\n", iommu->ecap);
4979 }
4980 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4981
4982 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4983                                       struct device_attribute *attr,
4984                                       char *buf)
4985 {
4986         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4987         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4988 }
4989 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4990
4991 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4992                                            struct device_attribute *attr,
4993                                            char *buf)
4994 {
4995         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4996         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4997                                                   cap_ndoms(iommu->cap)));
4998 }
4999 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
5000
5001 static struct attribute *intel_iommu_attrs[] = {
5002         &dev_attr_version.attr,
5003         &dev_attr_address.attr,
5004         &dev_attr_cap.attr,
5005         &dev_attr_ecap.attr,
5006         &dev_attr_domains_supported.attr,
5007         &dev_attr_domains_used.attr,
5008         NULL,
5009 };
5010
5011 static struct attribute_group intel_iommu_group = {
5012         .name = "intel-iommu",
5013         .attrs = intel_iommu_attrs,
5014 };
5015
5016 const struct attribute_group *intel_iommu_groups[] = {
5017         &intel_iommu_group,
5018         NULL,
5019 };
5020
5021 static inline bool has_untrusted_dev(void)
5022 {
5023         struct pci_dev *pdev = NULL;
5024
5025         for_each_pci_dev(pdev)
5026                 if (pdev->untrusted)
5027                         return true;
5028
5029         return false;
5030 }
5031
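/*
 * If the firmware has set the DMAR platform opt-in flag and untrusted PCI
 * devices are present, force the IOMMU on even if it was disabled on the
 * kernel command line. Returns 1 when the IOMMU is force enabled.
 */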
5032 static int __init platform_optin_force_iommu(void)
5033 {
5034         if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
5035                 return 0;
5036
5037         if (no_iommu || dmar_disabled)
5038                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
5039
5040         /*
5041          * If Intel-IOMMU is disabled by default, we will apply identity
5042          * map for all devices except those marked as being untrusted.
5043          */
5044         if (dmar_disabled)
5045                 iommu_set_default_passthrough(false);
5046
5047         dmar_disabled = 0;
5048         no_iommu = 0;
5049
5050         return 1;
5051 }
5052
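/*
 * Walk the device scopes of all active DMAR units and probe the ACPI
 * namespace devices found there, so that their physical companion devices
 * are attached to the IOMMU as well.
 */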
5053 static int __init probe_acpi_namespace_devices(void)
5054 {
5055         struct dmar_drhd_unit *drhd;
5056         /* To avoid a -Wunused-but-set-variable warning. */
5057         struct intel_iommu *iommu __maybe_unused;
5058         struct device *dev;
5059         int i, ret = 0;
5060
5061         for_each_active_iommu(iommu, drhd) {
5062                 for_each_active_dev_scope(drhd->devices,
5063                                           drhd->devices_cnt, i, dev) {
5064                         struct acpi_device_physical_node *pn;
5065                         struct iommu_group *group;
5066                         struct acpi_device *adev;
5067
5068                         if (dev->bus != &acpi_bus_type)
5069                                 continue;
5070
5071                         adev = to_acpi_device(dev);
5072                         mutex_lock(&adev->physical_node_lock);
5073                         list_for_each_entry(pn,
5074                                             &adev->physical_node_list, node) {
5075                                 group = iommu_group_get(pn->dev);
5076                                 if (group) {
5077                                         iommu_group_put(group);
5078                                         continue;
5079                                 }
5080
5081                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
5082                                 ret = iommu_probe_device(pn->dev);
5083                                 if (ret)
5084                                         break;
5085                         }
5086                         mutex_unlock(&adev->physical_node_lock);
5087
5088                         if (ret)
5089                                 return ret;
5090                 }
5091         }
5092
5093         return 0;
5094 }
5095
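/*
 * Main initialization entry point: parse the DMAR tables, set up DMA
 * remapping (or bail out with translation disabled if the IOMMU is turned
 * off), register sysfs attributes and IOMMU ops, hook up the memory and
 * CPU hotplug notifiers and finally enable translation on all usable
 * IOMMUs.
 */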
5096 int __init intel_iommu_init(void)
5097 {
5098         int ret = -ENODEV;
5099         struct dmar_drhd_unit *drhd;
5100         struct intel_iommu *iommu;
5101
5102         /*
5103          * Intel IOMMU is required for a TXT/tboot launch or platform
5104          * opt in, so enforce that.
5105          */
5106         force_on = tboot_force_iommu() || platform_optin_force_iommu();
5107
5108         if (iommu_init_mempool()) {
5109                 if (force_on)
5110                         panic("tboot: Failed to initialize iommu memory\n");
5111                 return -ENOMEM;
5112         }
5113
5114         down_write(&dmar_global_lock);
5115         if (dmar_table_init()) {
5116                 if (force_on)
5117                         panic("tboot: Failed to initialize DMAR table\n");
5118                 goto out_free_dmar;
5119         }
5120
5121         if (dmar_dev_scope_init() < 0) {
5122                 if (force_on)
5123                         panic("tboot: Failed to initialize DMAR device scope\n");
5124                 goto out_free_dmar;
5125         }
5126
5127         up_write(&dmar_global_lock);
5128
5129         /*
5130          * The bus notifier takes the dmar_global_lock, so lockdep will
5131          * complain later when we register it under the lock.
5132          */
5133         dmar_register_bus_notifier();
5134
5135         down_write(&dmar_global_lock);
5136
5137         if (!no_iommu)
5138                 intel_iommu_debugfs_init();
5139
5140         if (no_iommu || dmar_disabled) {
5141                 /*
5142                  * We exit the function here to ensure IOMMU's remapping and
5143                  * mempool aren't set up, which means that the IOMMU's PMRs
5144                  * won't be disabled via the call to init_dmars(). So disable
5145                  * them explicitly here. The PMRs were set up by tboot prior to
5146                  * calling SENTER, but the kernel is expected to reset/tear
5147                  * down the PMRs.
5148                  */
5149                 if (intel_iommu_tboot_noforce) {
5150                         for_each_iommu(iommu, drhd)
5151                                 iommu_disable_protect_mem_regions(iommu);
5152                 }
5153
5154                 /*
5155                  * Make sure the IOMMUs are switched off, even when we
5156                  * boot into a kexec kernel and the previous kernel left
5157                  * them enabled
5158                  */
5159                 intel_disable_iommus();
5160                 goto out_free_dmar;
5161         }
5162
5163         if (list_empty(&dmar_rmrr_units))
5164                 pr_info("No RMRR found\n");
5165
5166         if (list_empty(&dmar_atsr_units))
5167                 pr_info("No ATSR found\n");
5168
5169         if (dmar_init_reserved_ranges()) {
5170                 if (force_on)
5171                         panic("tboot: Failed to reserve iommu ranges\n");
5172                 goto out_free_reserved_range;
5173         }
5174
5175         if (dmar_map_gfx)
5176                 intel_iommu_gfx_mapped = 1;
5177
5178         init_no_remapping_devices();
5179
5180         ret = init_dmars();
5181         if (ret) {
5182                 if (force_on)
5183                         panic("tboot: Failed to initialize DMARs\n");
5184                 pr_err("Initialization failed\n");
5185                 goto out_free_reserved_range;
5186         }
5187         up_write(&dmar_global_lock);
5188
5189 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5190         /*
5191          * If the system has no untrusted device, or the user has decided
5192          * to disable the bounce page mechanism, we don't need swiotlb.
5193          * Mark this, so that the pre-allocated bounce pages are released
5194          * later.
5195          */
5196         if (!has_untrusted_dev() || intel_no_bounce)
5197                 swiotlb = 0;
5198 #endif
5199         dma_ops = &intel_dma_ops;
5200
5201         init_iommu_pm_ops();
5202
5203         down_read(&dmar_global_lock);
5204         for_each_active_iommu(iommu, drhd) {
5205                 iommu_device_sysfs_add(&iommu->iommu, NULL,
5206                                        intel_iommu_groups,
5207                                        "%s", iommu->name);
5208                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5209                 iommu_device_register(&iommu->iommu);
5210         }
5211         up_read(&dmar_global_lock);
5212
5213         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5214         if (si_domain && !hw_pass_through)
5215                 register_memory_notifier(&intel_iommu_memory_nb);
5216         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5217                           intel_iommu_cpu_dead);
5218
5219         down_read(&dmar_global_lock);
5220         if (probe_acpi_namespace_devices())
5221                 pr_warn("ACPI namespace devices didn't probe correctly\n");
5222
5223         /* Finally, we enable the DMA remapping hardware. */
5224         for_each_iommu(iommu, drhd) {
5225                 if (!drhd->ignored && !translation_pre_enabled(iommu))
5226                         iommu_enable_translation(iommu);
5227
5228                 iommu_disable_protect_mem_regions(iommu);
5229         }
5230         up_read(&dmar_global_lock);
5231
5232         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5233
5234         intel_iommu_enabled = 1;
5235
5236         return 0;
5237
5238 out_free_reserved_range:
5239         put_iova_domain(&reserved_iova_list);
5240 out_free_dmar:
5241         intel_iommu_free_dmars();
5242         up_write(&dmar_global_lock);
5243         iommu_exit_mempool();
5244         return ret;
5245 }
5246
5247 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5248 {
5249         struct intel_iommu *iommu = opaque;
5250
5251         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5252         return 0;
5253 }
5254
5255 /*
5256  * NB - intel-iommu lacks any sort of reference counting for the users of
5257  * dependent devices.  If multiple endpoints have intersecting dependent
5258  * devices, unbinding the driver from any one of them will possibly leave
5259  * the others unable to operate.
5260  */
5261 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5262 {
5263         if (!iommu || !dev || !dev_is_pci(dev))
5264                 return;
5265
5266         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5267 }
5268
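/*
 * Tear down a device's attachment to its domain: clear its PASID and
 * context entries, disable the device IOTLB, detach the domain from the
 * IOMMU and, if this was a private domain that just lost its last device,
 * free the domain as well. Called with device_domain_lock held.
 */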
5269 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5270 {
5271         struct dmar_domain *domain;
5272         struct intel_iommu *iommu;
5273         unsigned long flags;
5274
5275         assert_spin_locked(&device_domain_lock);
5276
5277         if (WARN_ON(!info))
5278                 return;
5279
5280         iommu = info->iommu;
5281         domain = info->domain;
5282
5283         if (info->dev) {
5284                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5285                         intel_pasid_tear_down_entry(iommu, info->dev,
5286                                         PASID_RID2PASID);
5287
5288                 iommu_disable_dev_iotlb(info);
5289                 domain_context_clear(iommu, info->dev);
5290                 intel_pasid_free_table(info->dev);
5291         }
5292
5293         unlink_domain_info(info);
5294
5295         spin_lock_irqsave(&iommu->lock, flags);
5296         domain_detach_iommu(domain, iommu);
5297         spin_unlock_irqrestore(&iommu->lock, flags);
5298
5299         /* free the private domain */
5300         if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5301             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5302             list_empty(&domain->devices))
5303                 domain_exit(info->domain);
5304
5305         free_devinfo_mem(info);
5306 }
5307
5308 static void dmar_remove_one_dev_info(struct device *dev)
5309 {
5310         struct device_domain_info *info;
5311         unsigned long flags;
5312
5313         spin_lock_irqsave(&device_domain_lock, flags);
5314         info = dev->archdata.iommu;
5315         if (info && info != DEFER_DEVICE_DOMAIN_INFO
5316             && info != DUMMY_DEVICE_DOMAIN_INFO)
5317                 __dmar_remove_one_dev_info(info);
5318         spin_unlock_irqrestore(&device_domain_lock, flags);
5319 }
5320
5321 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5322 {
5323         int adjust_width;
5324
5325         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5326         domain_reserve_special_ranges(domain);
5327
5328         /* calculate AGAW */
5329         domain->gaw = guest_width;
5330         adjust_width = guestwidth_to_adjustwidth(guest_width);
5331         domain->agaw = width_to_agaw(adjust_width);
5332
5333         domain->iommu_coherency = 0;
5334         domain->iommu_snooping = 0;
5335         domain->iommu_superpage = 0;
5336         domain->max_addr = 0;
5337
5338         /* always allocate the top pgd */
5339         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5340         if (!domain->pgd)
5341                 return -ENOMEM;
5342         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5343         return 0;
5344 }
5345
5346 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5347 {
5348         struct dmar_domain *dmar_domain;
5349         struct iommu_domain *domain;
5350         int ret;
5351
5352         switch (type) {
5353         case IOMMU_DOMAIN_DMA:
5354         /* fallthrough */
5355         case IOMMU_DOMAIN_UNMANAGED:
5356                 dmar_domain = alloc_domain(0);
5357                 if (!dmar_domain) {
5358                         pr_err("Can't allocate dmar_domain\n");
5359                         return NULL;
5360                 }
5361                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5362                         pr_err("Domain initialization failed\n");
5363                         domain_exit(dmar_domain);
5364                         return NULL;
5365                 }
5366
5367                 if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
5368                         ret = init_iova_flush_queue(&dmar_domain->iovad,
5369                                                     iommu_flush_iova,
5370                                                     iova_entry_free);
5371                         if (ret)
5372                                 pr_info("iova flush queue initialization failed\n");
5373                 }
5374
5375                 domain_update_iommu_cap(dmar_domain);
5376
5377                 domain = &dmar_domain->domain;
5378                 domain->geometry.aperture_start = 0;
5379                 domain->geometry.aperture_end   =
5380                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5381                 domain->geometry.force_aperture = true;
5382
5383                 return domain;
5384         case IOMMU_DOMAIN_IDENTITY:
5385                 return &si_domain->domain;
5386         default:
5387                 return NULL;
5388         }
5389
5390         return NULL;
5391 }
5392
5393 static void intel_iommu_domain_free(struct iommu_domain *domain)
5394 {
5395         if (domain != &si_domain->domain)
5396                 domain_exit(to_dmar_domain(domain));
5397 }
5398
5399 /*
5400  * Check whether a @domain could be attached to the @dev through the
5401  * aux-domain attach/detach APIs.
5402  */
5403 static inline bool
5404 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5405 {
5406         struct device_domain_info *info = dev->archdata.iommu;
5407
5408         return info && info->auxd_enabled &&
5409                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5410 }
5411
5412 static void auxiliary_link_device(struct dmar_domain *domain,
5413                                   struct device *dev)
5414 {
5415         struct device_domain_info *info = dev->archdata.iommu;
5416
5417         assert_spin_locked(&device_domain_lock);
5418         if (WARN_ON(!info))
5419                 return;
5420
5421         domain->auxd_refcnt++;
5422         list_add(&domain->auxd, &info->auxiliary_domains);
5423 }
5424
5425 static void auxiliary_unlink_device(struct dmar_domain *domain,
5426                                     struct device *dev)
5427 {
5428         struct device_domain_info *info = dev->archdata.iommu;
5429
5430         assert_spin_locked(&device_domain_lock);
5431         if (WARN_ON(!info))
5432                 return;
5433
5434         list_del(&domain->auxd);
5435         domain->auxd_refcnt--;
5436
5437         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5438                 ioasid_free(domain->default_pasid);
5439 }
5440
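/*
 * Attach @domain to @dev as an auxiliary domain: allocate a default PASID
 * for the domain if it does not have one yet, install a first- or
 * second-level PASID entry for it and link the domain to the device.
 */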
5441 static int aux_domain_add_dev(struct dmar_domain *domain,
5442                               struct device *dev)
5443 {
5444         int ret;
5445         u8 bus, devfn;
5446         unsigned long flags;
5447         struct intel_iommu *iommu;
5448
5449         iommu = device_to_iommu(dev, &bus, &devfn);
5450         if (!iommu)
5451                 return -ENODEV;
5452
5453         if (domain->default_pasid <= 0) {
5454                 int pasid;
5455
5456                 /* No private data needed for the default pasid */
5457                 pasid = ioasid_alloc(NULL, PASID_MIN,
5458                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5459                                      NULL);
5460                 if (pasid == INVALID_IOASID) {
5461                         pr_err("Can't allocate default pasid\n");
5462                         return -ENODEV;
5463                 }
5464                 domain->default_pasid = pasid;
5465         }
5466
5467         spin_lock_irqsave(&device_domain_lock, flags);
5468         /*
5469          * iommu->lock must be held to attach the domain to the iommu and to
5470          * set up the PASID entry for second-level translation.
5471          */
5472         spin_lock(&iommu->lock);
5473         ret = domain_attach_iommu(domain, iommu);
5474         if (ret)
5475                 goto attach_failed;
5476
5477         /* Set up the PASID entry for mediated devices: */
5478         if (domain_use_first_level(domain))
5479                 ret = domain_setup_first_level(iommu, domain, dev,
5480                                                domain->default_pasid);
5481         else
5482                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5483                                                      domain->default_pasid);
5484         if (ret)
5485                 goto table_failed;
5486         spin_unlock(&iommu->lock);
5487
5488         auxiliary_link_device(domain, dev);
5489
5490         spin_unlock_irqrestore(&device_domain_lock, flags);
5491
5492         return 0;
5493
5494 table_failed:
5495         domain_detach_iommu(domain, iommu);
5496 attach_failed:
5497         spin_unlock(&iommu->lock);
5498         spin_unlock_irqrestore(&device_domain_lock, flags);
5499         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5500                 ioasid_free(domain->default_pasid);
5501
5502         return ret;
5503 }
5504
5505 static void aux_domain_remove_dev(struct dmar_domain *domain,
5506                                   struct device *dev)
5507 {
5508         struct device_domain_info *info;
5509         struct intel_iommu *iommu;
5510         unsigned long flags;
5511
5512         if (!is_aux_domain(dev, &domain->domain))
5513                 return;
5514
5515         spin_lock_irqsave(&device_domain_lock, flags);
5516         info = dev->archdata.iommu;
5517         iommu = info->iommu;
5518
5519         auxiliary_unlink_device(domain, dev);
5520
5521         spin_lock(&iommu->lock);
5522         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5523         domain_detach_iommu(domain, iommu);
5524         spin_unlock(&iommu->lock);
5525
5526         spin_unlock_irqrestore(&device_domain_lock, flags);
5527 }
5528
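/*
 * Before attaching @domain to @dev, check that the IOMMU behind the device
 * can address everything already mapped in the domain, and trim the
 * domain's page-table depth down to the IOMMU's AGAW if necessary.
 */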
5529 static int prepare_domain_attach_device(struct iommu_domain *domain,
5530                                         struct device *dev)
5531 {
5532         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5533         struct intel_iommu *iommu;
5534         int addr_width;
5535         u8 bus, devfn;
5536
5537         iommu = device_to_iommu(dev, &bus, &devfn);
5538         if (!iommu)
5539                 return -ENODEV;
5540
5541         /* check if this iommu agaw is sufficient for max mapped address */
5542         addr_width = agaw_to_width(iommu->agaw);
5543         if (addr_width > cap_mgaw(iommu->cap))
5544                 addr_width = cap_mgaw(iommu->cap);
5545
5546         if (dmar_domain->max_addr > (1LL << addr_width)) {
5547                 dev_err(dev, "%s: iommu width (%d) is not "
5548                         "sufficient for the mapped address (%llx)\n",
5549                         __func__, addr_width, dmar_domain->max_addr);
5550                 return -EFAULT;
5551         }
5552         dmar_domain->gaw = addr_width;
5553
5554         /*
5555          * Knock out extra levels of page tables if necessary
5556          */
5557         while (iommu->agaw < dmar_domain->agaw) {
5558                 struct dma_pte *pte;
5559
5560                 pte = dmar_domain->pgd;
5561                 if (dma_pte_present(pte)) {
5562                         dmar_domain->pgd = (struct dma_pte *)
5563                                 phys_to_virt(dma_pte_addr(pte));
5564                         free_pgtable_page(pte);
5565                 }
5566                 dmar_domain->agaw--;
5567         }
5568
5569         return 0;
5570 }
5571
5572 static int intel_iommu_attach_device(struct iommu_domain *domain,
5573                                      struct device *dev)
5574 {
5575         int ret;
5576
5577         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5578             device_is_rmrr_locked(dev)) {
5579                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5580                 return -EPERM;
5581         }
5582
5583         if (is_aux_domain(dev, domain))
5584                 return -EPERM;
5585
5586         /* Normally the device is not mapped; tear down any stale mapping. */
5587         if (unlikely(domain_context_mapped(dev))) {
5588                 struct dmar_domain *old_domain;
5589
5590                 old_domain = find_domain(dev);
5591                 if (old_domain)
5592                         dmar_remove_one_dev_info(dev);
5593         }
5594
5595         ret = prepare_domain_attach_device(domain, dev);
5596         if (ret)
5597                 return ret;
5598
5599         return domain_add_dev_info(to_dmar_domain(domain), dev);
5600 }
5601
5602 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5603                                          struct device *dev)
5604 {
5605         int ret;
5606
5607         if (!is_aux_domain(dev, domain))
5608                 return -EPERM;
5609
5610         ret = prepare_domain_attach_device(domain, dev);
5611         if (ret)
5612                 return ret;
5613
5614         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5615 }
5616
5617 static void intel_iommu_detach_device(struct iommu_domain *domain,
5618                                       struct device *dev)
5619 {
5620         dmar_remove_one_dev_info(dev);
5621 }
5622
5623 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5624                                           struct device *dev)
5625 {
5626         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5627 }
5628
5629 static int intel_iommu_map(struct iommu_domain *domain,
5630                            unsigned long iova, phys_addr_t hpa,
5631                            size_t size, int iommu_prot, gfp_t gfp)
5632 {
5633         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5634         u64 max_addr;
5635         int prot = 0;
5636         int ret;
5637
5638         if (iommu_prot & IOMMU_READ)
5639                 prot |= DMA_PTE_READ;
5640         if (iommu_prot & IOMMU_WRITE)
5641                 prot |= DMA_PTE_WRITE;
5642         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5643                 prot |= DMA_PTE_SNP;
5644
5645         max_addr = iova + size;
5646         if (dmar_domain->max_addr < max_addr) {
5647                 u64 end;
5648
5649                 /* check if minimum agaw is sufficient for mapped address */
5650                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5651                 if (end < max_addr) {
5652                         pr_err("%s: iommu width (%d) is not "
5653                                "sufficient for the mapped address (%llx)\n",
5654                                __func__, dmar_domain->gaw, max_addr);
5655                         return -EFAULT;
5656                 }
5657                 dmar_domain->max_addr = max_addr;
5658         }
5659         /* Round up size to the next multiple of PAGE_SIZE if it and the
5660          * low bits of hpa would take us onto the next page. */
5661         size = aligned_nrpages(hpa, size);
5662         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5663                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5664         return ret;
5665 }
5666
5667 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5668                                 unsigned long iova, size_t size,
5669                                 struct iommu_iotlb_gather *gather)
5670 {
5671         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5672         struct page *freelist = NULL;
5673         unsigned long start_pfn, last_pfn;
5674         unsigned int npages;
5675         int iommu_id, level = 0;
5676
5677         /* Cope with horrid API which requires us to unmap more than the
5678            size argument if it happens to be a large-page mapping. */
5679         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5680
5681         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5682                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5683
5684         start_pfn = iova >> VTD_PAGE_SHIFT;
5685         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5686
5687         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5688
5689         npages = last_pfn - start_pfn + 1;
5690
5691         for_each_domain_iommu(iommu_id, dmar_domain)
5692                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5693                                       start_pfn, npages, !freelist, 0);
5694
5695         dma_free_pagelist(freelist);
5696
5697         if (dmar_domain->max_addr == iova + size)
5698                 dmar_domain->max_addr = iova;
5699
5700         return size;
5701 }
5702
5703 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5704                                             dma_addr_t iova)
5705 {
5706         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5707         struct dma_pte *pte;
5708         int level = 0;
5709         u64 phys = 0;
5710
5711         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5712         if (pte && dma_pte_present(pte))
5713                 phys = dma_pte_addr(pte) +
5714                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5715                                                 VTD_PAGE_SHIFT) - 1));
5716
5717         return phys;
5718 }
5719
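/*
 * The three helpers below report whether *all* active IOMMUs support
 * scalable mode, PASID and nested translation respectively; a single IOMMU
 * lacking the feature makes the corresponding helper return false.
 */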
5720 static inline bool scalable_mode_support(void)
5721 {
5722         struct dmar_drhd_unit *drhd;
5723         struct intel_iommu *iommu;
5724         bool ret = true;
5725
5726         rcu_read_lock();
5727         for_each_active_iommu(iommu, drhd) {
5728                 if (!sm_supported(iommu)) {
5729                         ret = false;
5730                         break;
5731                 }
5732         }
5733         rcu_read_unlock();
5734
5735         return ret;
5736 }
5737
5738 static inline bool iommu_pasid_support(void)
5739 {
5740         struct dmar_drhd_unit *drhd;
5741         struct intel_iommu *iommu;
5742         bool ret = true;
5743
5744         rcu_read_lock();
5745         for_each_active_iommu(iommu, drhd) {
5746                 if (!pasid_supported(iommu)) {
5747                         ret = false;
5748                         break;
5749                 }
5750         }
5751         rcu_read_unlock();
5752
5753         return ret;
5754 }
5755
5756 static inline bool nested_mode_support(void)
5757 {
5758         struct dmar_drhd_unit *drhd;
5759         struct intel_iommu *iommu;
5760         bool ret = true;
5761
5762         rcu_read_lock();
5763         for_each_active_iommu(iommu, drhd) {
5764                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5765                         ret = false;
5766                         break;
5767                 }
5768         }
5769         rcu_read_unlock();
5770
5771         return ret;
5772 }
5773
5774 static bool intel_iommu_capable(enum iommu_cap cap)
5775 {
5776         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5777                 return domain_update_iommu_snooping(NULL) == 1;
5778         if (cap == IOMMU_CAP_INTR_REMAP)
5779                 return irq_remapping_enabled == 1;
5780
5781         return false;
5782 }
5783
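/*
 * IOMMU ops ->add_device callback: link the device to its IOMMU, place it
 * in a group and, if the default domain type does not match what the
 * device requires, move it to an identity or DMA domain (falling back to a
 * private domain when that fails). Devices that need bounce buffering are
 * switched to the bounce-buffer DMA ops.
 */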
5784 static int intel_iommu_add_device(struct device *dev)
5785 {
5786         struct dmar_domain *dmar_domain;
5787         struct iommu_domain *domain;
5788         struct intel_iommu *iommu;
5789         struct iommu_group *group;
5790         u8 bus, devfn;
5791         int ret;
5792
5793         iommu = device_to_iommu(dev, &bus, &devfn);
5794         if (!iommu)
5795                 return -ENODEV;
5796
5797         iommu_device_link(&iommu->iommu, dev);
5798
5799         if (translation_pre_enabled(iommu))
5800                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5801
5802         group = iommu_group_get_for_dev(dev);
5803
5804         if (IS_ERR(group)) {
5805                 ret = PTR_ERR(group);
5806                 goto unlink;
5807         }
5808
5809         iommu_group_put(group);
5810
5811         domain = iommu_get_domain_for_dev(dev);
5812         dmar_domain = to_dmar_domain(domain);
5813         if (domain->type == IOMMU_DOMAIN_DMA) {
5814                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5815                         ret = iommu_request_dm_for_dev(dev);
5816                         if (ret) {
5817                                 dmar_remove_one_dev_info(dev);
5818                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5819                                 domain_add_dev_info(si_domain, dev);
5820                                 dev_info(dev,
5821                                          "Device uses a private identity domain.\n");
5822                         }
5823                 }
5824         } else {
5825                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5826                         ret = iommu_request_dma_domain_for_dev(dev);
5827                         if (ret) {
5828                                 dmar_remove_one_dev_info(dev);
5829                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5830                                 if (!get_private_domain_for_dev(dev)) {
5831                                         dev_warn(dev,
5832                                                  "Failed to get a private domain.\n");
5833                                         ret = -ENOMEM;
5834                                         goto unlink;
5835                                 }
5836
5837                                 dev_info(dev,
5838                                          "Device uses a private dma domain.\n");
5839                         }
5840                 }
5841         }
5842
5843         if (device_needs_bounce(dev)) {
5844                 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5845                 set_dma_ops(dev, &bounce_dma_ops);
5846         }
5847
5848         return 0;
5849
5850 unlink:
5851         iommu_device_unlink(&iommu->iommu, dev);
5852         return ret;
5853 }
5854
5855 static void intel_iommu_remove_device(struct device *dev)
5856 {
5857         struct intel_iommu *iommu;
5858         u8 bus, devfn;
5859
5860         iommu = device_to_iommu(dev, &bus, &devfn);
5861         if (!iommu)
5862                 return;
5863
5864         dmar_remove_one_dev_info(dev);
5865
5866         iommu_group_remove_device(dev);
5867
5868         iommu_device_unlink(&iommu->iommu, dev);
5869
5870         if (device_needs_bounce(dev))
5871                 set_dma_ops(dev, NULL);
5872 }
5873
5874 static void intel_iommu_get_resv_regions(struct device *device,
5875                                          struct list_head *head)
5876 {
5877         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5878         struct iommu_resv_region *reg;
5879         struct dmar_rmrr_unit *rmrr;
5880         struct device *i_dev;
5881         int i;
5882
5883         down_read(&dmar_global_lock);
5884         for_each_rmrr_units(rmrr) {
5885                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5886                                           i, i_dev) {
5887                         struct iommu_resv_region *resv;
5888                         enum iommu_resv_type type;
5889                         size_t length;
5890
5891                         if (i_dev != device &&
5892                             !is_downstream_to_pci_bridge(device, i_dev))
5893                                 continue;
5894
5895                         length = rmrr->end_address - rmrr->base_address + 1;
5896
5897                         type = device_rmrr_is_relaxable(device) ?
5898                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5899
5900                         resv = iommu_alloc_resv_region(rmrr->base_address,
5901                                                        length, prot, type);
5902                         if (!resv)
5903                                 break;
5904
5905                         list_add_tail(&resv->list, head);
5906                 }
5907         }
5908         up_read(&dmar_global_lock);
5909
5910 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5911         if (dev_is_pci(device)) {
5912                 struct pci_dev *pdev = to_pci_dev(device);
5913
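                     /*
                      * The floppy controller behind an ISA bridge does DMA to
                      * low memory; reserve a relaxable identity mapping of the
                      * first 16MB for such devices so legacy floppy DMA keeps
                      * working.
                      */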
5914                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5915                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5916                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5917                         if (reg)
5918                                 list_add_tail(&reg->list, head);
5919                 }
5920         }
5921 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5922
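             /*
              * The IOAPIC window doubles as the MSI message address range;
              * always report it as an MSI reserved region so it is never
              * handed out as IOVA.
              */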
5923         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5924                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5925                                       0, IOMMU_RESV_MSI);
5926         if (!reg)
5927                 return;
5928         list_add_tail(&reg->list, head);
5929 }
5930
5931 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5932 {
5933         struct device_domain_info *info;
5934         struct context_entry *context;
5935         struct dmar_domain *domain;
5936         unsigned long flags;
5937         u64 ctx_lo;
5938         int ret;
5939
5940         domain = find_domain(dev);
5941         if (!domain)
5942                 return -EINVAL;
5943
5944         spin_lock_irqsave(&device_domain_lock, flags);
5945         spin_lock(&iommu->lock);
5946
5947         ret = -EINVAL;
5948         info = dev->archdata.iommu;
5949         if (!info || !info->pasid_supported)
5950                 goto out;
5951
5952         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5953         if (WARN_ON(!context))
5954                 goto out;
5955
5956         ctx_lo = context[0].lo;
5957
5958         if (!(ctx_lo & CONTEXT_PASIDE)) {
5959                 ctx_lo |= CONTEXT_PASIDE;
5960                 context[0].lo = ctx_lo;
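                     /*
                      * Make sure the PASID-enable update to the context entry
                      * is globally visible before the context cache is flushed
                      * below.
                      */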
5961                 wmb();
5962                 iommu->flush.flush_context(iommu,
5963                                            domain->iommu_did[iommu->seq_id],
5964                                            PCI_DEVID(info->bus, info->devfn),
5965                                            DMA_CCMD_MASK_NOBIT,
5966                                            DMA_CCMD_DEVICE_INVL);
5967         }
5968
5969         /* Enable PASID support in the device, if it wasn't already */
5970         if (!info->pasid_enabled)
5971                 iommu_enable_dev_iotlb(info);
5972
5973         ret = 0;
5974
5975  out:
5976         spin_unlock(&iommu->lock);
5977         spin_unlock_irqrestore(&device_domain_lock, flags);
5978
5979         return ret;
5980 }
5981
5982 static void intel_iommu_apply_resv_region(struct device *dev,
5983                                           struct iommu_domain *domain,
5984                                           struct iommu_resv_region *region)
5985 {
5986         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5987         unsigned long start, end;
5988
5989         start = IOVA_PFN(region->start);
5990         end   = IOVA_PFN(region->start + region->length - 1);
5991
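             /*
              * Carve the reserved region out of the domain's IOVA allocator so
              * DMA address allocation never hands out an address inside it.
              */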
5992         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5993 }
5994
5995 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5996 {
5997         if (dev_is_pci(dev))
5998                 return pci_device_group(dev);
5999         return generic_device_group(dev);
6000 }
6001
6002 #ifdef CONFIG_INTEL_IOMMU_SVM
6003 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
6004 {
6005         struct intel_iommu *iommu;
6006         u8 bus, devfn;
6007
6008         if (iommu_dummy(dev)) {
6009                 dev_warn(dev,
6010                          "No IOMMU translation for device; cannot enable SVM\n");
6011                 return NULL;
6012         }
6013
6014         iommu = device_to_iommu(dev, &bus, &devfn);
6015         if (!iommu) {
6016                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
6017                 return NULL;
6018         }
6019
6020         return iommu;
6021 }
6022 #endif /* CONFIG_INTEL_IOMMU_SVM */
6023
6024 static int intel_iommu_enable_auxd(struct device *dev)
6025 {
6026         struct device_domain_info *info;
6027         struct intel_iommu *iommu;
6028         unsigned long flags;
6029         u8 bus, devfn;
6030         int ret;
6031
6032         iommu = device_to_iommu(dev, &bus, &devfn);
6033         if (!iommu || dmar_disabled)
6034                 return -EINVAL;
6035
6036         if (!sm_supported(iommu) || !pasid_supported(iommu))
6037                 return -EINVAL;
6038
6039         ret = intel_iommu_enable_pasid(iommu, dev);
6040         if (ret)
6041                 return -ENODEV;
6042
6043         spin_lock_irqsave(&device_domain_lock, flags);
6044         info = dev->archdata.iommu;
6045         info->auxd_enabled = 1;
6046         spin_unlock_irqrestore(&device_domain_lock, flags);
6047
6048         return 0;
6049 }
6050
6051 static int intel_iommu_disable_auxd(struct device *dev)
6052 {
6053         struct device_domain_info *info;
6054         unsigned long flags;
6055
6056         spin_lock_irqsave(&device_domain_lock, flags);
6057         info = dev->archdata.iommu;
6058         if (!WARN_ON(!info))
6059                 info->auxd_enabled = 0;
6060         spin_unlock_irqrestore(&device_domain_lock, flags);
6061
6062         return 0;
6063 }
6064
6065 /*
6066  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
6067  * defined in section 3.7 of the Intel Scalable I/O Virtualization spec so
6068  * that system software and tools can detect endpoint devices supporting
6069  * Intel Scalable I/O Virtualization without any host driver dependency.
6070  *
6071  * Returns the offset of the matching extended capability structure within
6072  * the device's PCI configuration space, or 0 if the device does not
6073  * support it.
6074  */
6075 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6076 {
6077         int pos;
6078         u16 vendor, id;
6079
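             /*
              * Walk the DVSEC instances (extended capability ID 0x23): DVSEC
              * header 1 (offset 4) holds the vendor ID and DVSEC header 2
              * (offset 8) the DVSEC ID; ID 5 under the Intel vendor ID is the
              * SIOV capability described in the spec referenced above.
              */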
6080         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
6081         while (pos) {
6082                 pci_read_config_word(pdev, pos + 4, &vendor);
6083                 pci_read_config_word(pdev, pos + 8, &id);
6084                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
6085                         return pos;
6086
6087                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
6088         }
6089
6090         return 0;
6091 }
6092
6093 static bool
6094 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6095 {
6096         if (feat == IOMMU_DEV_FEAT_AUX) {
6097                 int ret;
6098
6099                 if (!dev_is_pci(dev) || dmar_disabled ||
6100                     !scalable_mode_support() || !iommu_pasid_support())
6101                         return false;
6102
6103                 ret = pci_pasid_features(to_pci_dev(dev));
6104                 if (ret < 0)
6105                         return false;
6106
6107                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
6108         }
6109
6110         return false;
6111 }
6112
6113 static int
6114 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6115 {
6116         if (feat == IOMMU_DEV_FEAT_AUX)
6117                 return intel_iommu_enable_auxd(dev);
6118
6119         return -ENODEV;
6120 }
6121
6122 static int
6123 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6124 {
6125         if (feat == IOMMU_DEV_FEAT_AUX)
6126                 return intel_iommu_disable_auxd(dev);
6127
6128         return -ENODEV;
6129 }
6130
6131 static bool
6132 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6133 {
6134         struct device_domain_info *info = dev->archdata.iommu;
6135
6136         if (feat == IOMMU_DEV_FEAT_AUX)
6137                 return scalable_mode_support() && info && info->auxd_enabled;
6138
6139         return false;
6140 }
6141
6142 static int
6143 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6144 {
6145         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6146
6147         return dmar_domain->default_pasid > 0 ?
6148                         dmar_domain->default_pasid : -EINVAL;
6149 }
6150
6151 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6152                                            struct device *dev)
6153 {
6154         return attach_deferred(dev);
6155 }
6156
6157 static int
6158 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6159                             enum iommu_attr attr, void *data)
6160 {
6161         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6162         unsigned long flags;
6163         int ret = 0;
6164
6165         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6166                 return -EINVAL;
6167
6168         switch (attr) {
6169         case DOMAIN_ATTR_NESTING:
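                     /*
                      * Nested translation can only be turned on when the
                      * hardware supports it and no device has been attached to
                      * the domain yet; it also stops the domain from using
                      * first-level page tables.
                      */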
6170                 spin_lock_irqsave(&device_domain_lock, flags);
6171                 if (nested_mode_support() &&
6172                     list_empty(&dmar_domain->devices)) {
6173                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6174                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6175                 } else {
6176                         ret = -ENODEV;
6177                 }
6178                 spin_unlock_irqrestore(&device_domain_lock, flags);
6179                 break;
6180         default:
6181                 ret = -EINVAL;
6182                 break;
6183         }
6184
6185         return ret;
6186 }
6187
6188 const struct iommu_ops intel_iommu_ops = {
6189         .capable                = intel_iommu_capable,
6190         .domain_alloc           = intel_iommu_domain_alloc,
6191         .domain_free            = intel_iommu_domain_free,
6192         .domain_set_attr        = intel_iommu_domain_set_attr,
6193         .attach_dev             = intel_iommu_attach_device,
6194         .detach_dev             = intel_iommu_detach_device,
6195         .aux_attach_dev         = intel_iommu_aux_attach_device,
6196         .aux_detach_dev         = intel_iommu_aux_detach_device,
6197         .aux_get_pasid          = intel_iommu_aux_get_pasid,
6198         .map                    = intel_iommu_map,
6199         .unmap                  = intel_iommu_unmap,
6200         .iova_to_phys           = intel_iommu_iova_to_phys,
6201         .add_device             = intel_iommu_add_device,
6202         .remove_device          = intel_iommu_remove_device,
6203         .get_resv_regions       = intel_iommu_get_resv_regions,
6204         .put_resv_regions       = generic_iommu_put_resv_regions,
6205         .apply_resv_region      = intel_iommu_apply_resv_region,
6206         .device_group           = intel_iommu_device_group,
6207         .dev_has_feat           = intel_iommu_dev_has_feat,
6208         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6209         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6210         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6211         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6212         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6213 };
6214
6215 static void quirk_iommu_igfx(struct pci_dev *dev)
6216 {
6217         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6218         dmar_map_gfx = 0;
6219 }
6220
6221 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6223 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6224 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6225 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6226 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6228 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6229
6230 /* Broadwell igfx malfunctions with dmar */
6231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6234 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6235 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6236 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6237 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6238 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6240 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6241 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6242 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6244 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6250 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6253 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6254 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6255
6256 static void quirk_iommu_rwbf(struct pci_dev *dev)
6257 {
6258         /*
6259          * Mobile 4 Series Chipset neglects to set RWBF capability,
6260          * but needs it. Same seems to hold for the desktop versions.
6261          */
6262         pci_info(dev, "Forcing write-buffer flush capability\n");
6263         rwbf_quirk = 1;
6264 }
6265
6266 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6267 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6268 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6269 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6270 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6271 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6272 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6273
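     /*
      * GGC is the graphics control register of the affected host bridges;
      * going by the field names, bits 11:8 encode how much GTT space the BIOS
      * allocated and whether a separate VT-d (shadow) GTT was set up.
      */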
6274 #define GGC 0x52
6275 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6276 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6277 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6278 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6279 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6280 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6281 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6282 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6283
6284 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6285 {
6286         unsigned short ggc;
6287
6288         if (pci_read_config_word(dev, GGC, &ggc))
6289                 return;
6290
6291         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6292                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6293                 dmar_map_gfx = 0;
6294         } else if (dmar_map_gfx) {
6295                 /* we have to ensure the gfx device is idle before we flush */
6296                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6297                 intel_iommu_strict = 1;
6298         }
6299 }
6300 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6301 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6302 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6303 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6304
6305 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6306    ISOCH DMAR unit for the Azalia sound device, but not give it any
6307    TLB entries, which causes it to deadlock. Check for that.  We do
6308    this in a function called from init_dmars(), instead of in a PCI
6309    quirk, because we don't want to print the obnoxious "BIOS broken"
6310    message if VT-d is actually disabled.
6311 */
6312 static void __init check_tylersburg_isoch(void)
6313 {
6314         struct pci_dev *pdev;
6315         uint32_t vtisochctrl;
6316
6317         /* If there's no Azalia in the system anyway, forget it. */
6318         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6319         if (!pdev)
6320                 return;
6321         pci_dev_put(pdev);
6322
6323         /* System Management Registers. Might be hidden, in which case
6324            we can't do the sanity check. But that's OK, because the
6325            known-broken BIOSes _don't_ actually hide it, so far. */
6326         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6327         if (!pdev)
6328                 return;
6329
6330         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6331                 pci_dev_put(pdev);
6332                 return;
6333         }
6334
6335         pci_dev_put(pdev);
6336
6337         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6338         if (vtisochctrl & 1)
6339                 return;
6340
6341         /* Drop all bits other than the number of TLB entries */
6342         vtisochctrl &= 0x1c;
6343
6344         /* If we have the recommended number of TLB entries (16), fine. */
6345         if (vtisochctrl == 0x10)
6346                 return;
6347
6348         /* Zero TLB entries? You get to ride the short bus to school. */
6349         if (!vtisochctrl) {
6350                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6351                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6352                      dmi_get_system_info(DMI_BIOS_VENDOR),
6353                      dmi_get_system_info(DMI_BIOS_VERSION),
6354                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6355                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6356                 return;
6357         }
6358
6359         pr_warn("Recommended TLB entries for ISOCH unit: 16; your BIOS set %d\n",
6360                 vtisochctrl);
6361 }