drivers/iommu/intel-iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is a power-of-two multiple of 4KiB and
96  * that the mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are a power-of-two multiple of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
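
/*
 * For example, with VTD_PAGE_SHIFT == 12, ~0xFFFUL has every bit from
 * 12 upwards set, so 4KiB, 8KiB, 16KiB and every larger power-of-two
 * size are advertised, rather than only the sizes the hardware can map
 * directly (4KiB plus 2MiB/1GiB superpages where supported).
 */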
105
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
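
/*
 * For example, for the default 57-bit domain address width:
 *
 *	width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH)	== 3
 *	agaw_to_level(3)				== 5
 *	agaw_to_width(3)				== 57
 *
 * i.e. a five-level page table covering a 57-bit IOVA space, while
 * agaw 2 gives the familiar four-level, 48-bit layout.
 */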
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline unsigned long level_mask(int level)
132 {
133         return -1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long level_size(int level)
137 {
138         return 1UL << level_to_offset_bits(level);
139 }
140
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
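
/*
 * For example, walking a four-level table for DMA pfn 0x12345:
 *
 *	pfn_level_offset(0x12345, 4) == 0x000	(pfn bits 35..27)
 *	pfn_level_offset(0x12345, 3) == 0x000	(pfn bits 26..18)
 *	pfn_level_offset(0x12345, 2) == 0x091	(pfn bits 17..9)
 *	pfn_level_offset(0x12345, 1) == 0x145	(pfn bits 8..0)
 *
 * and level_size(2) == 512 pfns, so a single level-2 entry spans 2MiB
 * of IOVA space.
 */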
150
151 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
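
/*
 * On x86 both PAGE_SHIFT and VTD_PAGE_SHIFT are 12, so the conversions
 * above are identity operations; they only become real shifts on
 * configurations where MM pages are larger than the 4KiB VT-d page.
 */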
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic kernel if can't successfully enable VT-d
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return (c->hi >> 8) & 0xffff;
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
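
/*
 * Summarizing the helpers above, the bits they touch in a context
 * entry are:
 *
 *	lo[0]		present
 *	lo[1]		fault processing disable (cleared by context_set_fault_enable())
 *	lo[3:2]		translation type
 *	lo[11]		PASID enable
 *	lo[63:12]	address-space root (page-table pointer)
 *	hi[2:0]		address width
 *	hi[3]		"copied" software flag
 *	hi[23:8]	domain id
 */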
289
290 /*
291  * This domain is a statically identity mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 #define for_each_domain_iommu(idx, domain)                      \
300         for (idx = 0; idx < g_num_of_iommus; idx++)             \
301                 if (domain->iommu_refcnt[idx])
302
303 struct dmar_rmrr_unit {
304         struct list_head list;          /* list of rmrr units   */
305         struct acpi_dmar_header *hdr;   /* ACPI header          */
306         u64     base_address;           /* reserved base address*/
307         u64     end_address;            /* reserved end address */
308         struct dmar_dev_scope *devices; /* target devices */
309         int     devices_cnt;            /* target device count */
310 };
311
312 struct dmar_atsr_unit {
313         struct list_head list;          /* list of ATSR units */
314         struct acpi_dmar_header *hdr;   /* ACPI header */
315         struct dmar_dev_scope *devices; /* target devices */
316         int devices_cnt;                /* target device count */
317         u8 include_all:1;               /* include all ports */
318 };
319
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322
323 #define for_each_rmrr_units(rmrr) \
324         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325
326 /* number of registered IOMMUs, used to size and index g_iommus */
327 static int g_num_of_iommus;
328
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334                                      struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336                                             dma_addr_t iova);
337
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359
360 #define IDENTMAP_GFX            2
361 #define IDENTMAP_AZALIA         4
362
363 int intel_iommu_gfx_mapped;
364 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
365
366 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
369 {
370         struct device_domain_info *info;
371
372         if (!dev)
373                 return NULL;
374
375         info = dev->archdata.iommu;
376         if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO ||
377                      info == DEFER_DEVICE_DOMAIN_INFO))
378                 return NULL;
379
380         return info;
381 }
382
383 DEFINE_SPINLOCK(device_domain_lock);
384 static LIST_HEAD(device_domain_list);
385
386 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
387                                 to_pci_dev(d)->untrusted)
388
389 /*
390  * Iterate over elements in device_domain_list and call the specified
391  * callback @fn against each element.
392  */
393 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
394                                      void *data), void *data)
395 {
396         int ret = 0;
397         unsigned long flags;
398         struct device_domain_info *info;
399
400         spin_lock_irqsave(&device_domain_lock, flags);
401         list_for_each_entry(info, &device_domain_list, global) {
402                 ret = fn(info, data);
403                 if (ret) {
404                         spin_unlock_irqrestore(&device_domain_lock, flags);
405                         return ret;
406                 }
407         }
408         spin_unlock_irqrestore(&device_domain_lock, flags);
409
410         return 0;
411 }
412
413 const struct iommu_ops intel_iommu_ops;
414
415 static bool translation_pre_enabled(struct intel_iommu *iommu)
416 {
417         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
418 }
419
420 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
421 {
422         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
423 }
424
425 static void init_translation_status(struct intel_iommu *iommu)
426 {
427         u32 gsts;
428
429         gsts = readl(iommu->reg + DMAR_GSTS_REG);
430         if (gsts & DMA_GSTS_TES)
431                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
432 }
433
434 static int __init intel_iommu_setup(char *str)
435 {
436         if (!str)
437                 return -EINVAL;
438         while (*str) {
439                 if (!strncmp(str, "on", 2)) {
440                         dmar_disabled = 0;
441                         pr_info("IOMMU enabled\n");
442                 } else if (!strncmp(str, "off", 3)) {
443                         dmar_disabled = 1;
444                         no_platform_optin = 1;
445                         pr_info("IOMMU disabled\n");
446                 } else if (!strncmp(str, "igfx_off", 8)) {
447                         dmar_map_gfx = 0;
448                         pr_info("Disable GFX device mapping\n");
449                 } else if (!strncmp(str, "forcedac", 8)) {
450                         pr_info("Forcing DAC for PCI devices\n");
451                         dmar_forcedac = 1;
452                 } else if (!strncmp(str, "strict", 6)) {
453                         pr_info("Disable batched IOTLB flush\n");
454                         intel_iommu_strict = 1;
455                 } else if (!strncmp(str, "sp_off", 6)) {
456                         pr_info("Disable supported super page\n");
457                         intel_iommu_superpage = 0;
458                 } else if (!strncmp(str, "sm_on", 5)) {
459                         pr_info("Intel-IOMMU: scalable mode supported\n");
460                         intel_iommu_sm = 1;
461                 } else if (!strncmp(str, "tboot_noforce", 13)) {
462                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
463                         intel_iommu_tboot_noforce = 1;
464                 } else if (!strncmp(str, "nobounce", 8)) {
465                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
466                         intel_no_bounce = 1;
467                 }
468
469                 str += strcspn(str, ",");
470                 while (*str == ',')
471                         str++;
472         }
473         return 0;
474 }
475 __setup("intel_iommu=", intel_iommu_setup);
476
477 static struct kmem_cache *iommu_domain_cache;
478 static struct kmem_cache *iommu_devinfo_cache;
479
480 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
481 {
482         struct dmar_domain **domains;
483         int idx = did >> 8;
484
485         domains = iommu->domains[idx];
486         if (!domains)
487                 return NULL;
488
489         return domains[did & 0xff];
490 }
491
492 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
493                              struct dmar_domain *domain)
494 {
495         struct dmar_domain **domains;
496         int idx = did >> 8;
497
498         if (!iommu->domains[idx]) {
499                 size_t size = 256 * sizeof(struct dmar_domain *);
500                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
501         }
502
503         domains = iommu->domains[idx];
504         if (WARN_ON(!domains))
505                 return;
506         else
507                 domains[did & 0xff] = domain;
508 }
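
/*
 * Domain IDs are thus looked up through a two-level table with 256
 * pointers per second-level page, e.g. did 0x1234 resolves to
 * iommu->domains[0x12][0x34]; the second-level page is allocated
 * lazily on first use.
 */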
509
510 void *alloc_pgtable_page(int node)
511 {
512         struct page *page;
513         void *vaddr = NULL;
514
515         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
516         if (page)
517                 vaddr = page_address(page);
518         return vaddr;
519 }
520
521 void free_pgtable_page(void *vaddr)
522 {
523         free_page((unsigned long)vaddr);
524 }
525
526 static inline void *alloc_domain_mem(void)
527 {
528         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
529 }
530
531 static void free_domain_mem(void *vaddr)
532 {
533         kmem_cache_free(iommu_domain_cache, vaddr);
534 }
535
536 static inline void *alloc_devinfo_mem(void)
537 {
538         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
539 }
540
541 static inline void free_devinfo_mem(void *vaddr)
542 {
543         kmem_cache_free(iommu_devinfo_cache, vaddr);
544 }
545
546 static inline int domain_type_is_si(struct dmar_domain *domain)
547 {
548         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
549 }
550
551 static inline bool domain_use_first_level(struct dmar_domain *domain)
552 {
553         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
554 }
555
556 static inline int domain_pfn_supported(struct dmar_domain *domain,
557                                        unsigned long pfn)
558 {
559         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
560
561         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
562 }
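
/*
 * For example, a domain with agaw 2 has a 48-bit address width, so
 * addr_width above is 36 and any pfn at or above 1UL << 36 is rejected
 * as being outside the domain's addressable range.
 */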
563
564 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
565 {
566         unsigned long sagaw;
567         int agaw = -1;
568
569         sagaw = cap_sagaw(iommu->cap);
570         for (agaw = width_to_agaw(max_gaw);
571              agaw >= 0; agaw--) {
572                 if (test_bit(agaw, &sagaw))
573                         break;
574         }
575
576         return agaw;
577 }
578
579 /*
580  * Calculate max SAGAW for each iommu.
581  */
582 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
583 {
584         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
585 }
586
587 /*
588  * Calculate agaw for each iommu.
589  * "SAGAW" may be different across iommus; use a default agaw, and
590  * fall back to a smaller supported agaw for iommus that don't support the default agaw.
591  */
592 int iommu_calculate_agaw(struct intel_iommu *iommu)
593 {
594         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
595 }
596
597 /* This function only returns a single iommu in a domain */
598 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
599 {
600         int iommu_id;
601
602         /* si_domain and vm domain should not get here. */
603         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
604                 return NULL;
605
606         for_each_domain_iommu(iommu_id, domain)
607                 break;
608
609         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
610                 return NULL;
611
612         return g_iommus[iommu_id];
613 }
614
615 static void domain_update_iommu_coherency(struct dmar_domain *domain)
616 {
617         struct dmar_drhd_unit *drhd;
618         struct intel_iommu *iommu;
619         bool found = false;
620         int i;
621
622         domain->iommu_coherency = 1;
623
624         for_each_domain_iommu(i, domain) {
625                 found = true;
626                 if (!ecap_coherent(g_iommus[i]->ecap)) {
627                         domain->iommu_coherency = 0;
628                         break;
629                 }
630         }
631         if (found)
632                 return;
633
634         /* No hardware attached; use lowest common denominator */
635         rcu_read_lock();
636         for_each_active_iommu(iommu, drhd) {
637                 if (!ecap_coherent(iommu->ecap)) {
638                         domain->iommu_coherency = 0;
639                         break;
640                 }
641         }
642         rcu_read_unlock();
643 }
644
645 static int domain_update_iommu_snooping(struct intel_iommu *skip)
646 {
647         struct dmar_drhd_unit *drhd;
648         struct intel_iommu *iommu;
649         int ret = 1;
650
651         rcu_read_lock();
652         for_each_active_iommu(iommu, drhd) {
653                 if (iommu != skip) {
654                         if (!ecap_sc_support(iommu->ecap)) {
655                                 ret = 0;
656                                 break;
657                         }
658                 }
659         }
660         rcu_read_unlock();
661
662         return ret;
663 }
664
665 static int domain_update_iommu_superpage(struct dmar_domain *domain,
666                                          struct intel_iommu *skip)
667 {
668         struct dmar_drhd_unit *drhd;
669         struct intel_iommu *iommu;
670         int mask = 0x3;
671
672         if (!intel_iommu_superpage) {
673                 return 0;
674         }
675
676         /* set iommu_superpage to the smallest common denominator */
677         rcu_read_lock();
678         for_each_active_iommu(iommu, drhd) {
679                 if (iommu != skip) {
680                         if (domain && domain_use_first_level(domain)) {
681                                 if (!cap_fl1gp_support(iommu->cap))
682                                         mask = 0x1;
683                         } else {
684                                 mask &= cap_super_page_val(iommu->cap);
685                         }
686
687                         if (!mask)
688                                 break;
689                 }
690         }
691         rcu_read_unlock();
692
693         return fls(mask);
694 }
695
696 /* Some capabilities may be different across iommus */
697 static void domain_update_iommu_cap(struct dmar_domain *domain)
698 {
699         domain_update_iommu_coherency(domain);
700         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
701         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
702 }
703
704 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
705                                          u8 devfn, int alloc)
706 {
707         struct root_entry *root = &iommu->root_entry[bus];
708         struct context_entry *context;
709         u64 *entry;
710
711         entry = &root->lo;
712         if (sm_supported(iommu)) {
713                 if (devfn >= 0x80) {
714                         devfn -= 0x80;
715                         entry = &root->hi;
716                 }
717                 devfn *= 2;
718         }
719         if (*entry & 1)
720                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
721         else {
722                 unsigned long phy_addr;
723                 if (!alloc)
724                         return NULL;
725
726                 context = alloc_pgtable_page(iommu->node);
727                 if (!context)
728                         return NULL;
729
730                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
731                 phy_addr = virt_to_phys((void *)context);
732                 *entry = phy_addr | 1;
733                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
734         }
735         return &context[devfn];
736 }
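
/*
 * In scalable mode each context entry is twice as wide, so a single
 * 4KiB context table only covers 128 device functions: devfn 0x00-0x7f
 * is reached through root->lo and devfn 0x80-0xff through root->hi,
 * with devfn doubled to index the wider entries.
 */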
737
738 static int iommu_dummy(struct device *dev)
739 {
740         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
741 }
742
743 static bool attach_deferred(struct device *dev)
744 {
745         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
746 }
747
748 /**
749  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
750  *                               sub-hierarchy of a candidate PCI-PCI bridge
751  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
752  * @bridge: the candidate PCI-PCI bridge
753  *
754  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
755  */
756 static bool
757 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
758 {
759         struct pci_dev *pdev, *pbridge;
760
761         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
762                 return false;
763
764         pdev = to_pci_dev(dev);
765         pbridge = to_pci_dev(bridge);
766
767         if (pbridge->subordinate &&
768             pbridge->subordinate->number <= pdev->bus->number &&
769             pbridge->subordinate->busn_res.end >= pdev->bus->number)
770                 return true;
771
772         return false;
773 }
774
775 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
776 {
777         struct dmar_drhd_unit *drhd = NULL;
778         struct intel_iommu *iommu;
779         struct device *tmp;
780         struct pci_dev *pdev = NULL;
781         u16 segment = 0;
782         int i;
783
784         if (iommu_dummy(dev))
785                 return NULL;
786
787         if (dev_is_pci(dev)) {
788                 struct pci_dev *pf_pdev;
789
790                 pdev = pci_real_dma_dev(to_pci_dev(dev));
791
792                 /* VFs aren't listed in scope tables; we need to look up
793                  * the PF instead to find the IOMMU. */
794                 pf_pdev = pci_physfn(pdev);
795                 dev = &pf_pdev->dev;
796                 segment = pci_domain_nr(pdev->bus);
797         } else if (has_acpi_companion(dev))
798                 dev = &ACPI_COMPANION(dev)->dev;
799
800         rcu_read_lock();
801         for_each_active_iommu(iommu, drhd) {
802                 if (pdev && segment != drhd->segment)
803                         continue;
804
805                 for_each_active_dev_scope(drhd->devices,
806                                           drhd->devices_cnt, i, tmp) {
807                         if (tmp == dev) {
808                                 /* For a VF use its original BDF# not that of the PF
809                                  * which we used for the IOMMU lookup. Strictly speaking
810                                  * we could do this for all PCI devices; we only need to
811                                  * get the BDF# from the scope table for ACPI matches. */
812                                 if (pdev && pdev->is_virtfn)
813                                         goto got_pdev;
814
815                                 *bus = drhd->devices[i].bus;
816                                 *devfn = drhd->devices[i].devfn;
817                                 goto out;
818                         }
819
820                         if (is_downstream_to_pci_bridge(dev, tmp))
821                                 goto got_pdev;
822                 }
823
824                 if (pdev && drhd->include_all) {
825                 got_pdev:
826                         *bus = pdev->bus->number;
827                         *devfn = pdev->devfn;
828                         goto out;
829                 }
830         }
831         iommu = NULL;
832  out:
833         rcu_read_unlock();
834
835         return iommu;
836 }
837
838 static void domain_flush_cache(struct dmar_domain *domain,
839                                void *addr, int size)
840 {
841         if (!domain->iommu_coherency)
842                 clflush_cache_range(addr, size);
843 }
844
845 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
846 {
847         struct context_entry *context;
848         int ret = 0;
849         unsigned long flags;
850
851         spin_lock_irqsave(&iommu->lock, flags);
852         context = iommu_context_addr(iommu, bus, devfn, 0);
853         if (context)
854                 ret = context_present(context);
855         spin_unlock_irqrestore(&iommu->lock, flags);
856         return ret;
857 }
858
859 static void free_context_table(struct intel_iommu *iommu)
860 {
861         int i;
862         unsigned long flags;
863         struct context_entry *context;
864
865         spin_lock_irqsave(&iommu->lock, flags);
866         if (!iommu->root_entry) {
867                 goto out;
868         }
869         for (i = 0; i < ROOT_ENTRY_NR; i++) {
870                 context = iommu_context_addr(iommu, i, 0, 0);
871                 if (context)
872                         free_pgtable_page(context);
873
874                 if (!sm_supported(iommu))
875                         continue;
876
877                 context = iommu_context_addr(iommu, i, 0x80, 0);
878                 if (context)
879                         free_pgtable_page(context);
880
881         }
882         free_pgtable_page(iommu->root_entry);
883         iommu->root_entry = NULL;
884 out:
885         spin_unlock_irqrestore(&iommu->lock, flags);
886 }
887
888 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
889                                       unsigned long pfn, int *target_level)
890 {
891         struct dma_pte *parent, *pte;
892         int level = agaw_to_level(domain->agaw);
893         int offset;
894
895         BUG_ON(!domain->pgd);
896
897         if (!domain_pfn_supported(domain, pfn))
898                 /* Address beyond IOMMU's addressing capabilities. */
899                 return NULL;
900
901         parent = domain->pgd;
902
903         while (1) {
904                 void *tmp_page;
905
906                 offset = pfn_level_offset(pfn, level);
907                 pte = &parent[offset];
908                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
909                         break;
910                 if (level == *target_level)
911                         break;
912
913                 if (!dma_pte_present(pte)) {
914                         uint64_t pteval;
915
916                         tmp_page = alloc_pgtable_page(domain->nid);
917
918                         if (!tmp_page)
919                                 return NULL;
920
921                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
922                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
923                         if (domain_use_first_level(domain))
924                                 pteval |= DMA_FL_PTE_XD;
925                         if (cmpxchg64(&pte->val, 0ULL, pteval))
926                                 /* Someone else set it while we were thinking; use theirs. */
927                                 free_pgtable_page(tmp_page);
928                         else
929                                 domain_flush_cache(domain, pte, sizeof(*pte));
930                 }
931                 if (level == 1)
932                         break;
933
934                 parent = phys_to_virt(dma_pte_addr(pte));
935                 level--;
936         }
937
938         if (!*target_level)
939                 *target_level = level;
940
941         return pte;
942 }
943
944 /* return address's pte at specific level */
945 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
946                                          unsigned long pfn,
947                                          int level, int *large_page)
948 {
949         struct dma_pte *parent, *pte;
950         int total = agaw_to_level(domain->agaw);
951         int offset;
952
953         parent = domain->pgd;
954         while (level <= total) {
955                 offset = pfn_level_offset(pfn, total);
956                 pte = &parent[offset];
957                 if (level == total)
958                         return pte;
959
960                 if (!dma_pte_present(pte)) {
961                         *large_page = total;
962                         break;
963                 }
964
965                 if (dma_pte_superpage(pte)) {
966                         *large_page = total;
967                         return pte;
968                 }
969
970                 parent = phys_to_virt(dma_pte_addr(pte));
971                 total--;
972         }
973         return NULL;
974 }
975
976 /* clear last level pte, a tlb flush should be followed */
977 static void dma_pte_clear_range(struct dmar_domain *domain,
978                                 unsigned long start_pfn,
979                                 unsigned long last_pfn)
980 {
981         unsigned int large_page;
982         struct dma_pte *first_pte, *pte;
983
984         BUG_ON(!domain_pfn_supported(domain, start_pfn));
985         BUG_ON(!domain_pfn_supported(domain, last_pfn));
986         BUG_ON(start_pfn > last_pfn);
987
988         /* we don't need lock here; nobody else touches the iova range */
989         do {
990                 large_page = 1;
991                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
992                 if (!pte) {
993                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
994                         continue;
995                 }
996                 do {
997                         dma_clear_pte(pte);
998                         start_pfn += lvl_to_nr_pages(large_page);
999                         pte++;
1000                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1001
1002                 domain_flush_cache(domain, first_pte,
1003                                    (void *)pte - (void *)first_pte);
1004
1005         } while (start_pfn && start_pfn <= last_pfn);
1006 }
1007
1008 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1009                                int retain_level, struct dma_pte *pte,
1010                                unsigned long pfn, unsigned long start_pfn,
1011                                unsigned long last_pfn)
1012 {
1013         pfn = max(start_pfn, pfn);
1014         pte = &pte[pfn_level_offset(pfn, level)];
1015
1016         do {
1017                 unsigned long level_pfn;
1018                 struct dma_pte *level_pte;
1019
1020                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1021                         goto next;
1022
1023                 level_pfn = pfn & level_mask(level);
1024                 level_pte = phys_to_virt(dma_pte_addr(pte));
1025
1026                 if (level > 2) {
1027                         dma_pte_free_level(domain, level - 1, retain_level,
1028                                            level_pte, level_pfn, start_pfn,
1029                                            last_pfn);
1030                 }
1031
1032                 /*
1033                  * Free the page table if we're below the level we want to
1034                  * retain and the range covers the entire table.
1035                  */
1036                 if (level < retain_level && !(start_pfn > level_pfn ||
1037                       last_pfn < level_pfn + level_size(level) - 1)) {
1038                         dma_clear_pte(pte);
1039                         domain_flush_cache(domain, pte, sizeof(*pte));
1040                         free_pgtable_page(level_pte);
1041                 }
1042 next:
1043                 pfn += level_size(level);
1044         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1045 }
1046
1047 /*
1048  * clear last level (leaf) ptes and free page table pages below the
1049  * level we wish to keep intact.
1050  */
1051 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1052                                    unsigned long start_pfn,
1053                                    unsigned long last_pfn,
1054                                    int retain_level)
1055 {
1056         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1057         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1058         BUG_ON(start_pfn > last_pfn);
1059
1060         dma_pte_clear_range(domain, start_pfn, last_pfn);
1061
1062         /* We don't need lock here; nobody else touches the iova range */
1063         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1064                            domain->pgd, 0, start_pfn, last_pfn);
1065
1066         /* free pgd */
1067         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1068                 free_pgtable_page(domain->pgd);
1069                 domain->pgd = NULL;
1070         }
1071 }
1072
1073 /* When a page at a given level is being unlinked from its parent, we don't
1074    need to *modify* it at all. All we need to do is make a list of all the
1075    pages which can be freed just as soon as we've flushed the IOTLB and we
1076    know the hardware page-walk will no longer touch them.
1077    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1078    be freed. */
1079 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1080                                             int level, struct dma_pte *pte,
1081                                             struct page *freelist)
1082 {
1083         struct page *pg;
1084
1085         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1086         pg->freelist = freelist;
1087         freelist = pg;
1088
1089         if (level == 1)
1090                 return freelist;
1091
1092         pte = page_address(pg);
1093         do {
1094                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1095                         freelist = dma_pte_list_pagetables(domain, level - 1,
1096                                                            pte, freelist);
1097                 pte++;
1098         } while (!first_pte_in_page(pte));
1099
1100         return freelist;
1101 }
1102
1103 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1104                                         struct dma_pte *pte, unsigned long pfn,
1105                                         unsigned long start_pfn,
1106                                         unsigned long last_pfn,
1107                                         struct page *freelist)
1108 {
1109         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1110
1111         pfn = max(start_pfn, pfn);
1112         pte = &pte[pfn_level_offset(pfn, level)];
1113
1114         do {
1115                 unsigned long level_pfn;
1116
1117                 if (!dma_pte_present(pte))
1118                         goto next;
1119
1120                 level_pfn = pfn & level_mask(level);
1121
1122                 /* If range covers entire pagetable, free it */
1123                 if (start_pfn <= level_pfn &&
1124                     last_pfn >= level_pfn + level_size(level) - 1) {
1125                         /* These subordinate page tables are going away entirely. Don't
1126                            bother to clear them; we're just going to *free* them. */
1127                         if (level > 1 && !dma_pte_superpage(pte))
1128                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1129
1130                         dma_clear_pte(pte);
1131                         if (!first_pte)
1132                                 first_pte = pte;
1133                         last_pte = pte;
1134                 } else if (level > 1) {
1135                         /* Recurse down into a level that isn't *entirely* obsolete */
1136                         freelist = dma_pte_clear_level(domain, level - 1,
1137                                                        phys_to_virt(dma_pte_addr(pte)),
1138                                                        level_pfn, start_pfn, last_pfn,
1139                                                        freelist);
1140                 }
1141 next:
1142                 pfn += level_size(level);
1143         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1144
1145         if (first_pte)
1146                 domain_flush_cache(domain, first_pte,
1147                                    (void *)++last_pte - (void *)first_pte);
1148
1149         return freelist;
1150 }
1151
1152 /* We can't just free the pages because the IOMMU may still be walking
1153    the page tables, and may have cached the intermediate levels. The
1154    pages can only be freed after the IOTLB flush has been done. */
1155 static struct page *domain_unmap(struct dmar_domain *domain,
1156                                  unsigned long start_pfn,
1157                                  unsigned long last_pfn)
1158 {
1159         struct page *freelist;
1160
1161         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1162         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1163         BUG_ON(start_pfn > last_pfn);
1164
1165         /* we don't need lock here; nobody else touches the iova range */
1166         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1167                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1168
1169         /* free pgd */
1170         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1171                 struct page *pgd_page = virt_to_page(domain->pgd);
1172                 pgd_page->freelist = freelist;
1173                 freelist = pgd_page;
1174
1175                 domain->pgd = NULL;
1176         }
1177
1178         return freelist;
1179 }
1180
1181 static void dma_free_pagelist(struct page *freelist)
1182 {
1183         struct page *pg;
1184
1185         while ((pg = freelist)) {
1186                 freelist = pg->freelist;
1187                 free_pgtable_page(page_address(pg));
1188         }
1189 }
1190
1191 static void iova_entry_free(unsigned long data)
1192 {
1193         struct page *freelist = (struct page *)data;
1194
1195         dma_free_pagelist(freelist);
1196 }
1197
1198 /* iommu handling */
1199 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1200 {
1201         struct root_entry *root;
1202         unsigned long flags;
1203
1204         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1205         if (!root) {
1206                 pr_err("Allocating root entry for %s failed\n",
1207                         iommu->name);
1208                 return -ENOMEM;
1209         }
1210
1211         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1212
1213         spin_lock_irqsave(&iommu->lock, flags);
1214         iommu->root_entry = root;
1215         spin_unlock_irqrestore(&iommu->lock, flags);
1216
1217         return 0;
1218 }
1219
1220 static void iommu_set_root_entry(struct intel_iommu *iommu)
1221 {
1222         u64 addr;
1223         u32 sts;
1224         unsigned long flag;
1225
1226         addr = virt_to_phys(iommu->root_entry);
1227         if (sm_supported(iommu))
1228                 addr |= DMA_RTADDR_SMT;
1229
1230         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1231         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1232
1233         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1234
1235         /* Make sure hardware completes it */
1236         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1237                       readl, (sts & DMA_GSTS_RTPS), sts);
1238
1239         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1240 }
1241
1242 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1243 {
1244         u32 val;
1245         unsigned long flag;
1246
1247         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1248                 return;
1249
1250         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1251         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1252
1253         /* Make sure hardware completes it */
1254         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1255                       readl, (!(val & DMA_GSTS_WBFS)), val);
1256
1257         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1258 }
1259
1260 /* return value determines if we need a write buffer flush */
1261 static void __iommu_flush_context(struct intel_iommu *iommu,
1262                                   u16 did, u16 source_id, u8 function_mask,
1263                                   u64 type)
1264 {
1265         u64 val = 0;
1266         unsigned long flag;
1267
1268         switch (type) {
1269         case DMA_CCMD_GLOBAL_INVL:
1270                 val = DMA_CCMD_GLOBAL_INVL;
1271                 break;
1272         case DMA_CCMD_DOMAIN_INVL:
1273                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1274                 break;
1275         case DMA_CCMD_DEVICE_INVL:
1276                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1277                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1278                 break;
1279         default:
1280                 BUG();
1281         }
1282         val |= DMA_CCMD_ICC;
1283
1284         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1286
1287         /* Make sure hardware completes it */
1288         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1289                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1290
1291         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292 }
1293
1294 /* return value determines if we need a write buffer flush */
1295 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1296                                 u64 addr, unsigned int size_order, u64 type)
1297 {
1298         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1299         u64 val = 0, val_iva = 0;
1300         unsigned long flag;
1301
1302         switch (type) {
1303         case DMA_TLB_GLOBAL_FLUSH:
1304                 /* global flush doesn't need to set IVA_REG */
1305                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1306                 break;
1307         case DMA_TLB_DSI_FLUSH:
1308                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309                 break;
1310         case DMA_TLB_PSI_FLUSH:
1311                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1312                 /* IH bit is passed in as part of address */
1313                 val_iva = size_order | addr;
1314                 break;
1315         default:
1316                 BUG();
1317         }
1318         /* Note: set drain read/write */
1319 #if 0
1320         /*
1321          * This is probably to be super secure.. Looks like we can
1322          * ignore it without any impact.
1323          */
1324         if (cap_read_drain(iommu->cap))
1325                 val |= DMA_TLB_READ_DRAIN;
1326 #endif
1327         if (cap_write_drain(iommu->cap))
1328                 val |= DMA_TLB_WRITE_DRAIN;
1329
1330         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1331         /* Note: Only uses first TLB reg currently */
1332         if (val_iva)
1333                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1334         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1335
1336         /* Make sure hardware completes it */
1337         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1338                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1339
1340         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1341
1342         /* check IOTLB invalidation granularity */
1343         if (DMA_TLB_IAIG(val) == 0)
1344                 pr_err("Flush IOTLB failed\n");
1345         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1346                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1347                         (unsigned long long)DMA_TLB_IIRG(type),
1348                         (unsigned long long)DMA_TLB_IAIG(val));
1349 }
1350
1351 static struct device_domain_info *
1352 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1353                          u8 bus, u8 devfn)
1354 {
1355         struct device_domain_info *info;
1356
1357         assert_spin_locked(&device_domain_lock);
1358
1359         if (!iommu->qi)
1360                 return NULL;
1361
1362         list_for_each_entry(info, &domain->devices, link)
1363                 if (info->iommu == iommu && info->bus == bus &&
1364                     info->devfn == devfn) {
1365                         if (info->ats_supported && info->dev)
1366                                 return info;
1367                         break;
1368                 }
1369
1370         return NULL;
1371 }
1372
1373 static void domain_update_iotlb(struct dmar_domain *domain)
1374 {
1375         struct device_domain_info *info;
1376         bool has_iotlb_device = false;
1377
1378         assert_spin_locked(&device_domain_lock);
1379
1380         list_for_each_entry(info, &domain->devices, link) {
1381                 struct pci_dev *pdev;
1382
1383                 if (!info->dev || !dev_is_pci(info->dev))
1384                         continue;
1385
1386                 pdev = to_pci_dev(info->dev);
1387                 if (pdev->ats_enabled) {
1388                         has_iotlb_device = true;
1389                         break;
1390                 }
1391         }
1392
1393         domain->has_iotlb_device = has_iotlb_device;
1394 }
1395
1396 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1397 {
1398         struct pci_dev *pdev;
1399
1400         assert_spin_locked(&device_domain_lock);
1401
1402         if (!info || !dev_is_pci(info->dev))
1403                 return;
1404
1405         pdev = to_pci_dev(info->dev);
1406         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1407          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1408          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1409          * reserved, which should be set to 0.
1410          */
1411         if (!ecap_dit(info->iommu->ecap))
1412                 info->pfsid = 0;
1413         else {
1414                 struct pci_dev *pf_pdev;
1415
1416                 /* pdev will be returned if device is not a vf */
1417                 pf_pdev = pci_physfn(pdev);
1418                 info->pfsid = pci_dev_id(pf_pdev);
1419         }
1420
1421 #ifdef CONFIG_INTEL_IOMMU_SVM
1422         /* The PCIe spec, in its wisdom, declares that the behaviour of
1423            the device if you enable PASID support after ATS support is
1424            undefined. So always enable PASID support on devices which
1425            have it, even if we can't yet know if we're ever going to
1426            use it. */
1427         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1428                 info->pasid_enabled = 1;
1429
1430         if (info->pri_supported &&
1431             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1432             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1433                 info->pri_enabled = 1;
1434 #endif
1435         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1436             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1437                 info->ats_enabled = 1;
1438                 domain_update_iotlb(info->domain);
1439                 info->ats_qdep = pci_ats_queue_depth(pdev);
1440         }
1441 }
1442
1443 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1444 {
1445         struct pci_dev *pdev;
1446
1447         assert_spin_locked(&device_domain_lock);
1448
1449         if (!dev_is_pci(info->dev))
1450                 return;
1451
1452         pdev = to_pci_dev(info->dev);
1453
1454         if (info->ats_enabled) {
1455                 pci_disable_ats(pdev);
1456                 info->ats_enabled = 0;
1457                 domain_update_iotlb(info->domain);
1458         }
1459 #ifdef CONFIG_INTEL_IOMMU_SVM
1460         if (info->pri_enabled) {
1461                 pci_disable_pri(pdev);
1462                 info->pri_enabled = 0;
1463         }
1464         if (info->pasid_enabled) {
1465                 pci_disable_pasid(pdev);
1466                 info->pasid_enabled = 0;
1467         }
1468 #endif
1469 }
1470
1471 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1472                                   u64 addr, unsigned mask)
1473 {
1474         u16 sid, qdep;
1475         unsigned long flags;
1476         struct device_domain_info *info;
1477
1478         if (!domain->has_iotlb_device)
1479                 return;
1480
1481         spin_lock_irqsave(&device_domain_lock, flags);
1482         list_for_each_entry(info, &domain->devices, link) {
1483                 if (!info->ats_enabled)
1484                         continue;
1485
1486                 sid = info->bus << 8 | info->devfn;
1487                 qdep = info->ats_qdep;
1488                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1489                                 qdep, addr, mask);
1490         }
1491         spin_unlock_irqrestore(&device_domain_lock, flags);
1492 }
1493
1494 static void domain_flush_piotlb(struct intel_iommu *iommu,
1495                                 struct dmar_domain *domain,
1496                                 u64 addr, unsigned long npages, bool ih)
1497 {
1498         u16 did = domain->iommu_did[iommu->seq_id];
1499
1500         if (domain->default_pasid)
1501                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1502                                 addr, npages, ih);
1503
1504         if (!list_empty(&domain->devices))
1505                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1506 }
1507
1508 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1509                                   struct dmar_domain *domain,
1510                                   unsigned long pfn, unsigned int pages,
1511                                   int ih, int map)
1512 {
1513         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1514         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1515         u16 did = domain->iommu_did[iommu->seq_id];
1516
1517         BUG_ON(pages == 0);
1518
1519         if (ih)
1520                 ih = 1 << 6;
1521
1522         if (domain_use_first_level(domain)) {
1523                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1524         } else {
1525                 /*
1526                  * Fallback to domain selective flush if no PSI support or
1527                  * the size is too big. PSI requires page size to be 2 ^ x,
1528                  * and the base address is naturally aligned to the size.
1529                  */
1530                 if (!cap_pgsel_inv(iommu->cap) ||
1531                     mask > cap_max_amask_val(iommu->cap))
1532                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1533                                                         DMA_TLB_DSI_FLUSH);
1534                 else
1535                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1536                                                         DMA_TLB_PSI_FLUSH);
1537         }
1538
1539         /*
1540          * In caching mode, changes of pages from non-present to present require
1541          * flush. However, device IOTLB doesn't need to be flushed in this case.
1542          */
1543         if (!cap_caching_mode(iommu->cap) || !map)
1544                 iommu_flush_dev_iotlb(domain, addr, mask);
1545 }
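
/*
 * For example, flushing 5 pages gives
 *
 *	mask = ilog2(__roundup_pow_of_two(5)) == 3
 *
 * i.e. a PSI covering 2^3 = 8 naturally aligned pages; when the mask
 * exceeds cap_max_amask_val(), the code above falls back to a domain
 * selective flush instead.
 */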
1546
1547 /* Notification for newly created mappings */
1548 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1549                                         struct dmar_domain *domain,
1550                                         unsigned long pfn, unsigned int pages)
1551 {
1552         /*
1553          * It's a non-present to present mapping. Only flush if caching mode
1554          * is enabled and second-level translation is in use.
1555          */
1556         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1557                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1558         else
1559                 iommu_flush_write_buffer(iommu);
1560 }
1561
1562 static void iommu_flush_iova(struct iova_domain *iovad)
1563 {
1564         struct dmar_domain *domain;
1565         int idx;
1566
1567         domain = container_of(iovad, struct dmar_domain, iovad);
1568
1569         for_each_domain_iommu(idx, domain) {
1570                 struct intel_iommu *iommu = g_iommus[idx];
1571                 u16 did = domain->iommu_did[iommu->seq_id];
1572
1573                 if (domain_use_first_level(domain))
1574                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1575                 else
1576                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1577                                                  DMA_TLB_DSI_FLUSH);
1578
1579                 if (!cap_caching_mode(iommu->cap))
1580                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1581                                               0, MAX_AGAW_PFN_WIDTH);
1582         }
1583 }
1584
1585 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1586 {
1587         u32 pmen;
1588         unsigned long flags;
1589
1590         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1591                 return;
1592
1593         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1594         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1595         pmen &= ~DMA_PMEN_EPM;
1596         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1597
1598         /* wait for the protected region status bit to clear */
1599         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1600                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1601
1602         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1603 }
1604
1605 static void iommu_enable_translation(struct intel_iommu *iommu)
1606 {
1607         u32 sts;
1608         unsigned long flags;
1609
1610         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1611         iommu->gcmd |= DMA_GCMD_TE;
1612         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1613
1614         /* Make sure hardware completes it */
1615         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1616                       readl, (sts & DMA_GSTS_TES), sts);
1617
1618         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1619 }
1620
1621 static void iommu_disable_translation(struct intel_iommu *iommu)
1622 {
1623         u32 sts;
1624         unsigned long flag;
1625
1626         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1627         iommu->gcmd &= ~DMA_GCMD_TE;
1628         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1629
1630         /* Make sure hardware completes it */
1631         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1632                       readl, (!(sts & DMA_GSTS_TES)), sts);
1633
1634         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1635 }
1636
1637 static int iommu_init_domains(struct intel_iommu *iommu)
1638 {
1639         u32 ndomains, nlongs;
1640         size_t size;
1641
1642         ndomains = cap_ndoms(iommu->cap);
1643         pr_debug("%s: Number of Domains supported <%d>\n",
1644                  iommu->name, ndomains);
1645         nlongs = BITS_TO_LONGS(ndomains);
1646
1647         spin_lock_init(&iommu->lock);
1648
1649         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1650         if (!iommu->domain_ids) {
1651                 pr_err("%s: Allocating domain id array failed\n",
1652                        iommu->name);
1653                 return -ENOMEM;
1654         }
1655
1656         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1657         iommu->domains = kzalloc(size, GFP_KERNEL);
1658
1659         if (iommu->domains) {
1660                 size = 256 * sizeof(struct dmar_domain *);
1661                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1662         }
1663
1664         if (!iommu->domains || !iommu->domains[0]) {
1665                 pr_err("%s: Allocating domain array failed\n",
1666                        iommu->name);
1667                 kfree(iommu->domain_ids);
1668                 kfree(iommu->domains);
1669                 iommu->domain_ids = NULL;
1670                 iommu->domains    = NULL;
1671                 return -ENOMEM;
1672         }
1673
1674         /*
1675          * If Caching mode is set, then invalid translations are tagged
1676          * with domain-id 0, hence we need to pre-allocate it. We also
1677          * use domain-id 0 as a marker for non-allocated domain-id, so
1678          * make sure it is not used for a real domain.
1679          */
1680         set_bit(0, iommu->domain_ids);
1681
1682         /*
1683          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1684          * entry for first-level or pass-through translation modes should
1685          * be programmed with a domain id different from those used for
1686          * second-level or nested translation. We reserve a domain id for
1687          * this purpose.
1688          */
1689         if (sm_supported(iommu))
1690                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1691
1692         return 0;
1693 }
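/*
 * Note on the layout built above: iommu->domains is a two-level array. The
 * top level holds ALIGN(ndomains, 256) / 256 chunk pointers and each chunk
 * holds 256 struct dmar_domain pointers. Only chunk 0 is allocated here;
 * further chunks are expected to be allocated on demand when a domain id in
 * their range is first used (see set_iommu_domain()).
 */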
1694
1695 static void disable_dmar_iommu(struct intel_iommu *iommu)
1696 {
1697         struct device_domain_info *info, *tmp;
1698         unsigned long flags;
1699
1700         if (!iommu->domains || !iommu->domain_ids)
1701                 return;
1702
1703         spin_lock_irqsave(&device_domain_lock, flags);
1704         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1705                 if (info->iommu != iommu)
1706                         continue;
1707
1708                 if (!info->dev || !info->domain)
1709                         continue;
1710
1711                 __dmar_remove_one_dev_info(info);
1712         }
1713         spin_unlock_irqrestore(&device_domain_lock, flags);
1714
1715         if (iommu->gcmd & DMA_GCMD_TE)
1716                 iommu_disable_translation(iommu);
1717 }
1718
1719 static void free_dmar_iommu(struct intel_iommu *iommu)
1720 {
1721         if ((iommu->domains) && (iommu->domain_ids)) {
1722                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1723                 int i;
1724
1725                 for (i = 0; i < elems; i++)
1726                         kfree(iommu->domains[i]);
1727                 kfree(iommu->domains);
1728                 kfree(iommu->domain_ids);
1729                 iommu->domains = NULL;
1730                 iommu->domain_ids = NULL;
1731         }
1732
1733         g_iommus[iommu->seq_id] = NULL;
1734
1735         /* free context mapping */
1736         free_context_table(iommu);
1737
1738 #ifdef CONFIG_INTEL_IOMMU_SVM
1739         if (pasid_supported(iommu)) {
1740                 if (ecap_prs(iommu->ecap))
1741                         intel_svm_finish_prq(iommu);
1742         }
1743         if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1744                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1745
1746 #endif
1747 }
1748
1749 /*
1750  * Check and return whether first level is used by default for
1751  * DMA translation.
1752  */
1753 static bool first_level_by_default(void)
1754 {
1755         struct dmar_drhd_unit *drhd;
1756         struct intel_iommu *iommu;
1757         static int first_level_support = -1;
1758
1759         if (likely(first_level_support != -1))
1760                 return first_level_support;
1761
1762         first_level_support = 1;
1763
1764         rcu_read_lock();
1765         for_each_active_iommu(iommu, drhd) {
1766                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1767                         first_level_support = 0;
1768                         break;
1769                 }
1770         }
1771         rcu_read_unlock();
1772
1773         return first_level_support;
1774 }
1775
1776 static struct dmar_domain *alloc_domain(int flags)
1777 {
1778         struct dmar_domain *domain;
1779
1780         domain = alloc_domain_mem();
1781         if (!domain)
1782                 return NULL;
1783
1784         memset(domain, 0, sizeof(*domain));
1785         domain->nid = NUMA_NO_NODE;
1786         domain->flags = flags;
1787         if (first_level_by_default())
1788                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1789         domain->has_iotlb_device = false;
1790         INIT_LIST_HEAD(&domain->devices);
1791
1792         return domain;
1793 }
1794
1795 /* Must be called with iommu->lock */
1796 static int domain_attach_iommu(struct dmar_domain *domain,
1797                                struct intel_iommu *iommu)
1798 {
1799         unsigned long ndomains;
1800         int num;
1801
1802         assert_spin_locked(&device_domain_lock);
1803         assert_spin_locked(&iommu->lock);
1804
1805         domain->iommu_refcnt[iommu->seq_id] += 1;
1806         domain->iommu_count += 1;
1807         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1808                 ndomains = cap_ndoms(iommu->cap);
1809                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1810
1811                 if (num >= ndomains) {
1812                         pr_err("%s: No free domain ids\n", iommu->name);
1813                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1814                         domain->iommu_count -= 1;
1815                         return -ENOSPC;
1816                 }
1817
1818                 set_bit(num, iommu->domain_ids);
1819                 set_iommu_domain(iommu, num, domain);
1820
1821                 domain->iommu_did[iommu->seq_id] = num;
1822                 domain->nid                      = iommu->node;
1823
1824                 domain_update_iommu_cap(domain);
1825         }
1826
1827         return 0;
1828 }
1829
1830 static int domain_detach_iommu(struct dmar_domain *domain,
1831                                struct intel_iommu *iommu)
1832 {
1833         int num, count;
1834
1835         assert_spin_locked(&device_domain_lock);
1836         assert_spin_locked(&iommu->lock);
1837
1838         domain->iommu_refcnt[iommu->seq_id] -= 1;
1839         count = --domain->iommu_count;
1840         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1841                 num = domain->iommu_did[iommu->seq_id];
1842                 clear_bit(num, iommu->domain_ids);
1843                 set_iommu_domain(iommu, num, NULL);
1844
1845                 domain_update_iommu_cap(domain);
1846                 domain->iommu_did[iommu->seq_id] = 0;
1847         }
1848
1849         return count;
1850 }
1851
1852 static struct iova_domain reserved_iova_list;
1853 static struct lock_class_key reserved_rbtree_key;
1854
1855 static int dmar_init_reserved_ranges(void)
1856 {
1857         struct pci_dev *pdev = NULL;
1858         struct iova *iova;
1859         int i;
1860
1861         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1862
1863         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1864                 &reserved_rbtree_key);
1865
1866         /* IOAPIC ranges shouldn't be accessed by DMA */
1867         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1868                 IOVA_PFN(IOAPIC_RANGE_END));
1869         if (!iova) {
1870                 pr_err("Reserve IOAPIC range failed\n");
1871                 return -ENODEV;
1872         }
1873
1874         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1875         for_each_pci_dev(pdev) {
1876                 struct resource *r;
1877
1878                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1879                         r = &pdev->resource[i];
1880                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1881                                 continue;
1882                         iova = reserve_iova(&reserved_iova_list,
1883                                             IOVA_PFN(r->start),
1884                                             IOVA_PFN(r->end));
1885                         if (!iova) {
1886                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1887                                 return -ENODEV;
1888                         }
1889                 }
1890         }
1891         return 0;
1892 }
1893
1894 static inline int guestwidth_to_adjustwidth(int gaw)
1895 {
1896         int agaw;
1897         int r = (gaw - 12) % 9;
1898
1899         if (r == 0)
1900                 agaw = gaw;
1901         else
1902                 agaw = gaw + 9 - r;
1903         if (agaw > 64)
1904                 agaw = 64;
1905         return agaw;
1906 }
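/*
 * Worked examples for guestwidth_to_adjustwidth(): the guest address width
 * is rounded up to the next width reachable with 9-bit page-table levels on
 * top of a 12-bit page offset. gaw = 48 gives r = (48 - 12) % 9 = 0 and
 * agaw = 48; gaw = 40 gives r = 1 and agaw = 40 + 9 - 1 = 48.
 */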
1907
1908 static void domain_exit(struct dmar_domain *domain)
1909 {
1910
1911         /* Remove associated devices and clear attached or cached domains */
1912         domain_remove_dev_info(domain);
1913
1914         /* destroy iovas */
1915         if (domain->domain.type == IOMMU_DOMAIN_DMA)
1916                 put_iova_domain(&domain->iovad);
1917
1918         if (domain->pgd) {
1919                 struct page *freelist;
1920
1921                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1922                 dma_free_pagelist(freelist);
1923         }
1924
1925         free_domain_mem(domain);
1926 }
1927
1928 /*
1929  * Get the PASID directory size for a scalable mode context entry.
1930  * A value of X in the PDTS field of a scalable mode context entry
1931  * indicates a PASID directory with 2^(X + 7) entries.
1932  */
1933 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1934 {
1935         int pds, max_pde;
1936
1937         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1938         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1939         if (pds < 7)
1940                 return 0;
1941
1942         return pds - 7;
1943 }
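/*
 * Worked example for context_get_sm_pds(), assuming PASID_PDE_SHIFT is 6
 * (64 PASID-table entries per directory entry): max_pasid = 0x10000 gives
 * max_pde = 0x400, find_first_bit() returns 10 and pds = 10 - 7 = 3, i.e.
 * a PASID directory with 2^(3 + 7) = 1024 entries.
 */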
1944
1945 /*
1946  * Set the RID_PASID field of a scalable mode context entry. The
1947  * IOMMU hardware will use the PASID value set in this field for
1948  * translating DMA requests without PASID.
1949  */
1950 static inline void
1951 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1952 {
1953         context->hi |= pasid & ((1 << 20) - 1);
1954         context->hi |= (1 << 20);
1955 }
1956
1957 /*
1958  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1959  * entry.
1960  */
1961 static inline void context_set_sm_dte(struct context_entry *context)
1962 {
1963         context->lo |= (1 << 2);
1964 }
1965
1966 /*
1967  * Set the PRE(Page Request Enable) field of a scalable mode context
1968  * entry.
1969  */
1970 static inline void context_set_sm_pre(struct context_entry *context)
1971 {
1972         context->lo |= (1 << 4);
1973 }
1974
1975 /* Convert value to context PASID directory size field coding. */
1976 #define context_pdts(pds)       (((pds) & 0x7) << 9)
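/* e.g. pds = 3 from context_get_sm_pds() is encoded as (3 << 9) in context->lo */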
1977
1978 static int domain_context_mapping_one(struct dmar_domain *domain,
1979                                       struct intel_iommu *iommu,
1980                                       struct pasid_table *table,
1981                                       u8 bus, u8 devfn)
1982 {
1983         u16 did = domain->iommu_did[iommu->seq_id];
1984         int translation = CONTEXT_TT_MULTI_LEVEL;
1985         struct device_domain_info *info = NULL;
1986         struct context_entry *context;
1987         unsigned long flags;
1988         int ret;
1989
1990         WARN_ON(did == 0);
1991
1992         if (hw_pass_through && domain_type_is_si(domain))
1993                 translation = CONTEXT_TT_PASS_THROUGH;
1994
1995         pr_debug("Set context mapping for %02x:%02x.%d\n",
1996                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1997
1998         BUG_ON(!domain->pgd);
1999
2000         spin_lock_irqsave(&device_domain_lock, flags);
2001         spin_lock(&iommu->lock);
2002
2003         ret = -ENOMEM;
2004         context = iommu_context_addr(iommu, bus, devfn, 1);
2005         if (!context)
2006                 goto out_unlock;
2007
2008         ret = 0;
2009         if (context_present(context))
2010                 goto out_unlock;
2011
2012         /*
2013          * For kdump cases, old valid entries may be cached due to
2014          * in-flight DMA and the copied page tables, but there is no
2015          * unmapping behaviour for them, so we need an explicit cache
2016          * flush for the newly-mapped device. At this point the device
2017          * is supposed to have finished its reset at driver probe stage,
2018          * so no in-flight DMA will exist and we don't need to worry
2019          * about it afterwards.
2020          */
2021         if (context_copied(context)) {
2022                 u16 did_old = context_domain_id(context);
2023
2024                 if (did_old < cap_ndoms(iommu->cap)) {
2025                         iommu->flush.flush_context(iommu, did_old,
2026                                                    (((u16)bus) << 8) | devfn,
2027                                                    DMA_CCMD_MASK_NOBIT,
2028                                                    DMA_CCMD_DEVICE_INVL);
2029                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2030                                                  DMA_TLB_DSI_FLUSH);
2031                 }
2032         }
2033
2034         context_clear_entry(context);
2035
2036         if (sm_supported(iommu)) {
2037                 unsigned long pds;
2038
2039                 WARN_ON(!table);
2040
2041                 /* Setup the PASID DIR pointer: */
2042                 pds = context_get_sm_pds(table);
2043                 context->lo = (u64)virt_to_phys(table->table) |
2044                                 context_pdts(pds);
2045
2046                 /* Setup the RID_PASID field: */
2047                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2048
2049                 /*
2050                  * Setup the Device-TLB enable bit and Page request
2051                  * Enable bit:
2052                  */
2053                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2054                 if (info && info->ats_supported)
2055                         context_set_sm_dte(context);
2056                 if (info && info->pri_supported)
2057                         context_set_sm_pre(context);
2058         } else {
2059                 struct dma_pte *pgd = domain->pgd;
2060                 int agaw;
2061
2062                 context_set_domain_id(context, did);
2063
2064                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2065                         /*
2066                          * Skip top levels of page tables for an IOMMU which has
2067                          * less agaw than the default. Unnecessary for PT mode.
2068                          */
2069                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2070                                 ret = -ENOMEM;
2071                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2072                                 if (!dma_pte_present(pgd))
2073                                         goto out_unlock;
2074                         }
2075
2076                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2077                         if (info && info->ats_supported)
2078                                 translation = CONTEXT_TT_DEV_IOTLB;
2079                         else
2080                                 translation = CONTEXT_TT_MULTI_LEVEL;
2081
2082                         context_set_address_root(context, virt_to_phys(pgd));
2083                         context_set_address_width(context, agaw);
2084                 } else {
2085                         /*
2086                          * In pass-through mode, AW must be programmed to
2087                          * indicate the largest AGAW value supported by the
2088                          * hardware; ASR is ignored by the hardware.
2089                          */
2090                         context_set_address_width(context, iommu->msagaw);
2091                 }
2092
2093                 context_set_translation_type(context, translation);
2094         }
2095
2096         context_set_fault_enable(context);
2097         context_set_present(context);
2098         domain_flush_cache(domain, context, sizeof(*context));
2099
2100         /*
2101          * It's a non-present to present mapping. If hardware doesn't cache
2102          * non-present entries we only need to flush the write-buffer. If it
2103          * _does_ cache non-present entries, then it does so in the special
2104          * domain #0, which we have to flush:
2105          */
2106         if (cap_caching_mode(iommu->cap)) {
2107                 iommu->flush.flush_context(iommu, 0,
2108                                            (((u16)bus) << 8) | devfn,
2109                                            DMA_CCMD_MASK_NOBIT,
2110                                            DMA_CCMD_DEVICE_INVL);
2111                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2112         } else {
2113                 iommu_flush_write_buffer(iommu);
2114         }
2115         iommu_enable_dev_iotlb(info);
2116
2117         ret = 0;
2118
2119 out_unlock:
2120         spin_unlock(&iommu->lock);
2121         spin_unlock_irqrestore(&device_domain_lock, flags);
2122
2123         return ret;
2124 }
2125
2126 struct domain_context_mapping_data {
2127         struct dmar_domain *domain;
2128         struct intel_iommu *iommu;
2129         struct pasid_table *table;
2130 };
2131
2132 static int domain_context_mapping_cb(struct pci_dev *pdev,
2133                                      u16 alias, void *opaque)
2134 {
2135         struct domain_context_mapping_data *data = opaque;
2136
2137         return domain_context_mapping_one(data->domain, data->iommu,
2138                                           data->table, PCI_BUS_NUM(alias),
2139                                           alias & 0xff);
2140 }
2141
2142 static int
2143 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2144 {
2145         struct domain_context_mapping_data data;
2146         struct pasid_table *table;
2147         struct intel_iommu *iommu;
2148         u8 bus, devfn;
2149
2150         iommu = device_to_iommu(dev, &bus, &devfn);
2151         if (!iommu)
2152                 return -ENODEV;
2153
2154         table = intel_pasid_get_table(dev);
2155
2156         if (!dev_is_pci(dev))
2157                 return domain_context_mapping_one(domain, iommu, table,
2158                                                   bus, devfn);
2159
2160         data.domain = domain;
2161         data.iommu = iommu;
2162         data.table = table;
2163
2164         return pci_for_each_dma_alias(to_pci_dev(dev),
2165                                       &domain_context_mapping_cb, &data);
2166 }
2167
2168 static int domain_context_mapped_cb(struct pci_dev *pdev,
2169                                     u16 alias, void *opaque)
2170 {
2171         struct intel_iommu *iommu = opaque;
2172
2173         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2174 }
2175
2176 static int domain_context_mapped(struct device *dev)
2177 {
2178         struct intel_iommu *iommu;
2179         u8 bus, devfn;
2180
2181         iommu = device_to_iommu(dev, &bus, &devfn);
2182         if (!iommu)
2183                 return -ENODEV;
2184
2185         if (!dev_is_pci(dev))
2186                 return device_context_mapped(iommu, bus, devfn);
2187
2188         return !pci_for_each_dma_alias(to_pci_dev(dev),
2189                                        domain_context_mapped_cb, iommu);
2190 }
2191
2192 /* Returns the number of VTD pages, but aligned to MM page size */
2193 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2194                                             size_t size)
2195 {
2196         host_addr &= ~PAGE_MASK;
2197         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2198 }
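/*
 * Example: aligned_nrpages(0x1234, 0x3000) keeps only the in-page offset
 * (0x234), rounds 0x234 + 0x3000 up to 0x4000 and returns 4 VTD pages,
 * assuming 4KiB MM pages (PAGE_SHIFT == VTD_PAGE_SHIFT == 12).
 */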
2199
2200 /* Return largest possible superpage level for a given mapping */
2201 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2202                                           unsigned long iov_pfn,
2203                                           unsigned long phy_pfn,
2204                                           unsigned long pages)
2205 {
2206         int support, level = 1;
2207         unsigned long pfnmerge;
2208
2209         support = domain->iommu_superpage;
2210
2211         /* To use a large page, the virtual *and* physical addresses
2212            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2213            of them will mean we have to use smaller pages. So just
2214            merge them and check both at once. */
2215         pfnmerge = iov_pfn | phy_pfn;
2216
2217         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2218                 pages >>= VTD_STRIDE_SHIFT;
2219                 if (!pages)
2220                         break;
2221                 pfnmerge >>= VTD_STRIDE_SHIFT;
2222                 level++;
2223                 support--;
2224         }
2225         return level;
2226 }
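/*
 * Example: with domain->iommu_superpage >= 1, iov_pfn and phy_pfn both
 * 2MiB aligned (low 9 bits clear) and pages >= 512, the loop above runs
 * once and returns level 2, i.e. a 2MiB superpage can be used.
 */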
2227
2228 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2229                             struct scatterlist *sg, unsigned long phys_pfn,
2230                             unsigned long nr_pages, int prot)
2231 {
2232         struct dma_pte *first_pte = NULL, *pte = NULL;
2233         phys_addr_t uninitialized_var(pteval);
2234         unsigned long sg_res = 0;
2235         unsigned int largepage_lvl = 0;
2236         unsigned long lvl_pages = 0;
2237         u64 attr;
2238
2239         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2240
2241         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2242                 return -EINVAL;
2243
2244         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2245         if (domain_use_first_level(domain))
2246                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
2247
2248         if (!sg) {
2249                 sg_res = nr_pages;
2250                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2251         }
2252
2253         while (nr_pages > 0) {
2254                 uint64_t tmp;
2255
2256                 if (!sg_res) {
2257                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2258
2259                         sg_res = aligned_nrpages(sg->offset, sg->length);
2260                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2261                         sg->dma_length = sg->length;
2262                         pteval = (sg_phys(sg) - pgoff) | attr;
2263                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2264                 }
2265
2266                 if (!pte) {
2267                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2268
2269                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2270                         if (!pte)
2271                                 return -ENOMEM;
2272                         /* It is a large page */
2273                         if (largepage_lvl > 1) {
2274                                 unsigned long nr_superpages, end_pfn;
2275
2276                                 pteval |= DMA_PTE_LARGE_PAGE;
2277                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2278
2279                                 nr_superpages = sg_res / lvl_pages;
2280                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2281
2282                                 /*
2283                                  * Ensure that old small page tables are
2284                                  * removed to make room for superpage(s).
2285                                  * We're adding new large pages, so make sure
2286                                  * we don't remove their parent tables.
2287                                  */
2288                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2289                                                        largepage_lvl + 1);
2290                         } else {
2291                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2292                         }
2293
2294                 }
2295                 /* We don't need a lock here; nobody else
2296                  * touches the iova range
2297                  */
2298                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2299                 if (tmp) {
2300                         static int dumps = 5;
2301                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2302                                 iov_pfn, tmp, (unsigned long long)pteval);
2303                         if (dumps) {
2304                                 dumps--;
2305                                 debug_dma_dump_mappings(NULL);
2306                         }
2307                         WARN_ON(1);
2308                 }
2309
2310                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2311
2312                 BUG_ON(nr_pages < lvl_pages);
2313                 BUG_ON(sg_res < lvl_pages);
2314
2315                 nr_pages -= lvl_pages;
2316                 iov_pfn += lvl_pages;
2317                 phys_pfn += lvl_pages;
2318                 pteval += lvl_pages * VTD_PAGE_SIZE;
2319                 sg_res -= lvl_pages;
2320
2321                 /* If the next PTE would be the first in a new page, then we
2322                    need to flush the cache on the entries we've just written.
2323                    And then we'll need to recalculate 'pte', so clear it and
2324                    let it get set again in the if (!pte) block above.
2325
2326                    If we're done (!nr_pages) we need to flush the cache too.
2327
2328                    Also if we've been setting superpages, we may need to
2329                    recalculate 'pte' and switch back to smaller pages for the
2330                    end of the mapping, if the trailing size is not enough to
2331                    use another superpage (i.e. sg_res < lvl_pages). */
2332                 pte++;
2333                 if (!nr_pages || first_pte_in_page(pte) ||
2334                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2335                         domain_flush_cache(domain, first_pte,
2336                                            (void *)pte - (void *)first_pte);
2337                         pte = NULL;
2338                 }
2339
2340                 if (!sg_res && nr_pages)
2341                         sg = sg_next(sg);
2342         }
2343         return 0;
2344 }
2345
2346 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2347                           struct scatterlist *sg, unsigned long phys_pfn,
2348                           unsigned long nr_pages, int prot)
2349 {
2350         int iommu_id, ret;
2351         struct intel_iommu *iommu;
2352
2353         /* Do the real mapping first */
2354         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2355         if (ret)
2356                 return ret;
2357
2358         for_each_domain_iommu(iommu_id, domain) {
2359                 iommu = g_iommus[iommu_id];
2360                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2361         }
2362
2363         return 0;
2364 }
2365
2366 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2367                                     struct scatterlist *sg, unsigned long nr_pages,
2368                                     int prot)
2369 {
2370         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2371 }
2372
2373 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2374                                      unsigned long phys_pfn, unsigned long nr_pages,
2375                                      int prot)
2376 {
2377         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2378 }
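/*
 * The two wrappers above select between the scatterlist and the plain
 * pfn-range form of domain_mapping(): domain_sg_mapping() takes the physical
 * pages from each sg entry, while domain_pfn_mapping() maps nr_pages
 * contiguous pages starting at phys_pfn, e.g.
 * domain_pfn_mapping(domain, iov_pfn, phys_pfn, 1,
 *                    DMA_PTE_READ | DMA_PTE_WRITE) for a single page.
 */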
2379
2380 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2381 {
2382         unsigned long flags;
2383         struct context_entry *context;
2384         u16 did_old;
2385
2386         if (!iommu)
2387                 return;
2388
2389         spin_lock_irqsave(&iommu->lock, flags);
2390         context = iommu_context_addr(iommu, bus, devfn, 0);
2391         if (!context) {
2392                 spin_unlock_irqrestore(&iommu->lock, flags);
2393                 return;
2394         }
2395         did_old = context_domain_id(context);
2396         context_clear_entry(context);
2397         __iommu_flush_cache(iommu, context, sizeof(*context));
2398         spin_unlock_irqrestore(&iommu->lock, flags);
2399         iommu->flush.flush_context(iommu,
2400                                    did_old,
2401                                    (((u16)bus) << 8) | devfn,
2402                                    DMA_CCMD_MASK_NOBIT,
2403                                    DMA_CCMD_DEVICE_INVL);
2404         iommu->flush.flush_iotlb(iommu,
2405                                  did_old,
2406                                  0,
2407                                  0,
2408                                  DMA_TLB_DSI_FLUSH);
2409 }
2410
2411 static inline void unlink_domain_info(struct device_domain_info *info)
2412 {
2413         assert_spin_locked(&device_domain_lock);
2414         list_del(&info->link);
2415         list_del(&info->global);
2416         if (info->dev)
2417                 info->dev->archdata.iommu = NULL;
2418 }
2419
2420 static void domain_remove_dev_info(struct dmar_domain *domain)
2421 {
2422         struct device_domain_info *info, *tmp;
2423         unsigned long flags;
2424
2425         spin_lock_irqsave(&device_domain_lock, flags);
2426         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2427                 __dmar_remove_one_dev_info(info);
2428         spin_unlock_irqrestore(&device_domain_lock, flags);
2429 }
2430
2431 struct dmar_domain *find_domain(struct device *dev)
2432 {
2433         struct device_domain_info *info;
2434
2435         if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2436                 return NULL;
2437
2438         /* No lock here, assumes no domain exit in normal case */
2439         info = get_domain_info(dev);
2440         if (likely(info))
2441                 return info->domain;
2442
2443         return NULL;
2444 }
2445
2446 static void do_deferred_attach(struct device *dev)
2447 {
2448         struct iommu_domain *domain;
2449
2450         dev->archdata.iommu = NULL;
2451         domain = iommu_get_domain_for_dev(dev);
2452         if (domain)
2453                 intel_iommu_attach_device(domain, dev);
2454 }
2455
2456 static inline struct device_domain_info *
2457 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2458 {
2459         struct device_domain_info *info;
2460
2461         list_for_each_entry(info, &device_domain_list, global)
2462                 if (info->segment == segment && info->bus == bus &&
2463                     info->devfn == devfn)
2464                         return info;
2465
2466         return NULL;
2467 }
2468
2469 static int domain_setup_first_level(struct intel_iommu *iommu,
2470                                     struct dmar_domain *domain,
2471                                     struct device *dev,
2472                                     int pasid)
2473 {
2474         int flags = PASID_FLAG_SUPERVISOR_MODE;
2475         struct dma_pte *pgd = domain->pgd;
2476         int agaw, level;
2477
2478         /*
2479          * Skip top levels of page tables for an IOMMU which has
2480          * less agaw than the default. Unnecessary for PT mode.
2481          */
2482         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2483                 pgd = phys_to_virt(dma_pte_addr(pgd));
2484                 if (!dma_pte_present(pgd))
2485                         return -ENOMEM;
2486         }
2487
2488         level = agaw_to_level(agaw);
2489         if (level != 4 && level != 5)
2490                 return -EINVAL;
2491
2492         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2493
2494         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2495                                              domain->iommu_did[iommu->seq_id],
2496                                              flags);
2497 }
2498
2499 static bool dev_is_real_dma_subdevice(struct device *dev)
2500 {
2501         return dev && dev_is_pci(dev) &&
2502                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2503 }
2504
2505 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2506                                                     int bus, int devfn,
2507                                                     struct device *dev,
2508                                                     struct dmar_domain *domain)
2509 {
2510         struct dmar_domain *found = NULL;
2511         struct device_domain_info *info;
2512         unsigned long flags;
2513         int ret;
2514
2515         info = alloc_devinfo_mem();
2516         if (!info)
2517                 return NULL;
2518
2519         if (!dev_is_real_dma_subdevice(dev)) {
2520                 info->bus = bus;
2521                 info->devfn = devfn;
2522                 info->segment = iommu->segment;
2523         } else {
2524                 struct pci_dev *pdev = to_pci_dev(dev);
2525
2526                 info->bus = pdev->bus->number;
2527                 info->devfn = pdev->devfn;
2528                 info->segment = pci_domain_nr(pdev->bus);
2529         }
2530
2531         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2532         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2533         info->ats_qdep = 0;
2534         info->dev = dev;
2535         info->domain = domain;
2536         info->iommu = iommu;
2537         info->pasid_table = NULL;
2538         info->auxd_enabled = 0;
2539         INIT_LIST_HEAD(&info->auxiliary_domains);
2540
2541         if (dev && dev_is_pci(dev)) {
2542                 struct pci_dev *pdev = to_pci_dev(info->dev);
2543
2544                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2545                     pci_ats_supported(pdev) &&
2546                     dmar_find_matched_atsr_unit(pdev))
2547                         info->ats_supported = 1;
2548
2549                 if (sm_supported(iommu)) {
2550                         if (pasid_supported(iommu)) {
2551                                 int features = pci_pasid_features(pdev);
2552                                 if (features >= 0)
2553                                         info->pasid_supported = features | 1;
2554                         }
2555
2556                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2557                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2558                                 info->pri_supported = 1;
2559                 }
2560         }
2561
2562         spin_lock_irqsave(&device_domain_lock, flags);
2563         if (dev)
2564                 found = find_domain(dev);
2565
2566         if (!found) {
2567                 struct device_domain_info *info2;
2568                 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2569                                                        info->devfn);
2570                 if (info2) {
2571                         found      = info2->domain;
2572                         info2->dev = dev;
2573                 }
2574         }
2575
2576         if (found) {
2577                 spin_unlock_irqrestore(&device_domain_lock, flags);
2578                 free_devinfo_mem(info);
2579                 /* Caller must free the original domain */
2580                 return found;
2581         }
2582
2583         spin_lock(&iommu->lock);
2584         ret = domain_attach_iommu(domain, iommu);
2585         spin_unlock(&iommu->lock);
2586
2587         if (ret) {
2588                 spin_unlock_irqrestore(&device_domain_lock, flags);
2589                 free_devinfo_mem(info);
2590                 return NULL;
2591         }
2592
2593         list_add(&info->link, &domain->devices);
2594         list_add(&info->global, &device_domain_list);
2595         if (dev)
2596                 dev->archdata.iommu = info;
2597         spin_unlock_irqrestore(&device_domain_lock, flags);
2598
2599         /* PASID table is mandatory for a PCI device in scalable mode. */
2600         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2601                 ret = intel_pasid_alloc_table(dev);
2602                 if (ret) {
2603                         dev_err(dev, "PASID table allocation failed\n");
2604                         dmar_remove_one_dev_info(dev);
2605                         return NULL;
2606                 }
2607
2608                 /* Setup the PASID entry for requests without PASID: */
2609                 spin_lock(&iommu->lock);
2610                 if (hw_pass_through && domain_type_is_si(domain))
2611                         ret = intel_pasid_setup_pass_through(iommu, domain,
2612                                         dev, PASID_RID2PASID);
2613                 else if (domain_use_first_level(domain))
2614                         ret = domain_setup_first_level(iommu, domain, dev,
2615                                         PASID_RID2PASID);
2616                 else
2617                         ret = intel_pasid_setup_second_level(iommu, domain,
2618                                         dev, PASID_RID2PASID);
2619                 spin_unlock(&iommu->lock);
2620                 if (ret) {
2621                         dev_err(dev, "Setup RID2PASID failed\n");
2622                         dmar_remove_one_dev_info(dev);
2623                         return NULL;
2624                 }
2625         }
2626
2627         if (dev && domain_context_mapping(domain, dev)) {
2628                 dev_err(dev, "Domain context map failed\n");
2629                 dmar_remove_one_dev_info(dev);
2630                 return NULL;
2631         }
2632
2633         return domain;
2634 }
2635
2636 static int iommu_domain_identity_map(struct dmar_domain *domain,
2637                                      unsigned long first_vpfn,
2638                                      unsigned long last_vpfn)
2639 {
2640         /*
2641          * The RMRR range might overlap with the physical memory range,
2642          * so clear it first.
2643          */
2644         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2645
2646         return __domain_mapping(domain, first_vpfn, NULL,
2647                                 first_vpfn, last_vpfn - first_vpfn + 1,
2648                                 DMA_PTE_READ|DMA_PTE_WRITE);
2649 }
2650
2651 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2652
2653 static int __init si_domain_init(int hw)
2654 {
2655         struct dmar_rmrr_unit *rmrr;
2656         struct device *dev;
2657         int i, nid, ret;
2658
2659         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2660         if (!si_domain)
2661                 return -EFAULT;
2662
2663         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2664                 domain_exit(si_domain);
2665                 return -EFAULT;
2666         }
2667
2668         if (hw)
2669                 return 0;
2670
2671         for_each_online_node(nid) {
2672                 unsigned long start_pfn, end_pfn;
2673                 int i;
2674
2675                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2676                         ret = iommu_domain_identity_map(si_domain,
2677                                         mm_to_dma_pfn(start_pfn),
2678                                         mm_to_dma_pfn(end_pfn));
2679                         if (ret)
2680                                 return ret;
2681                 }
2682         }
2683
2684         /*
2685          * Identity map the RMRRs so that devices with RMRRs can also use
2686          * the si_domain.
2687          */
2688         for_each_rmrr_units(rmrr) {
2689                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2690                                           i, dev) {
2691                         unsigned long long start = rmrr->base_address;
2692                         unsigned long long end = rmrr->end_address;
2693
2694                         if (WARN_ON(end < start ||
2695                                     end >> agaw_to_width(si_domain->agaw)))
2696                                 continue;
2697
2698                         ret = iommu_domain_identity_map(si_domain, start, end);
2699                         if (ret)
2700                                 return ret;
2701                 }
2702         }
2703
2704         return 0;
2705 }
2706
2707 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2708 {
2709         struct dmar_domain *ndomain;
2710         struct intel_iommu *iommu;
2711         u8 bus, devfn;
2712
2713         iommu = device_to_iommu(dev, &bus, &devfn);
2714         if (!iommu)
2715                 return -ENODEV;
2716
2717         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2718         if (ndomain != domain)
2719                 return -EBUSY;
2720
2721         return 0;
2722 }
2723
2724 static bool device_has_rmrr(struct device *dev)
2725 {
2726         struct dmar_rmrr_unit *rmrr;
2727         struct device *tmp;
2728         int i;
2729
2730         rcu_read_lock();
2731         for_each_rmrr_units(rmrr) {
2732                 /*
2733                  * Return TRUE if this RMRR contains the device that
2734                  * is passed in.
2735                  */
2736                 for_each_active_dev_scope(rmrr->devices,
2737                                           rmrr->devices_cnt, i, tmp)
2738                         if (tmp == dev ||
2739                             is_downstream_to_pci_bridge(dev, tmp)) {
2740                                 rcu_read_unlock();
2741                                 return true;
2742                         }
2743         }
2744         rcu_read_unlock();
2745         return false;
2746 }
2747
2748 /**
2749  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2750  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2751  * @dev: device handle
2752  *
2753  * We assume that PCI USB devices with RMRRs have them largely
2754  * for historical reasons and that the RMRR space is not actively used post
2755  * boot.  This exclusion may change if vendors begin to abuse it.
2756  *
2757  * The same exception is made for graphics devices, with the requirement that
2758  * any use of the RMRR regions will be torn down before assigning the device
2759  * to a guest.
2760  *
2761  * Return: true if the RMRR is relaxable, false otherwise
2762  */
2763 static bool device_rmrr_is_relaxable(struct device *dev)
2764 {
2765         struct pci_dev *pdev;
2766
2767         if (!dev_is_pci(dev))
2768                 return false;
2769
2770         pdev = to_pci_dev(dev);
2771         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2772                 return true;
2773         else
2774                 return false;
2775 }
2776
2777 /*
2778  * There are a couple of cases where we need to restrict the functionality of
2779  * devices associated with RMRRs.  The first is when evaluating a device for
2780  * identity mapping because problems exist when devices are moved in and out
2781  * of domains and their respective RMRR information is lost.  This means that
2782  * a device with associated RMRRs will never be in a "passthrough" domain.
2783  * The second is use of the device through the IOMMU API.  This interface
2784  * expects to have full control of the IOVA space for the device.  We cannot
2785  * satisfy both the requirement that RMRR access is maintained and have an
2786  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2787  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2788  * We therefore prevent devices associated with an RMRR from participating in
2789  * the IOMMU API, which eliminates them from device assignment.
2790  *
2791  * In both cases, devices which have relaxable RMRRs are not concerned by this
2792  * restriction. See device_rmrr_is_relaxable comment.
2793  */
2794 static bool device_is_rmrr_locked(struct device *dev)
2795 {
2796         if (!device_has_rmrr(dev))
2797                 return false;
2798
2799         if (device_rmrr_is_relaxable(dev))
2800                 return false;
2801
2802         return true;
2803 }
2804
2805 /*
2806  * Return the required default domain type for a specific device.
2807  *
2808  * @dev: the device in query
2810  *
2811  * Returns:
2812  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2813  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2814  *  - 0: both identity and dynamic domains work for this device
2815  */
2816 static int device_def_domain_type(struct device *dev)
2817 {
2818         if (dev_is_pci(dev)) {
2819                 struct pci_dev *pdev = to_pci_dev(dev);
2820
2821                 /*
2822                  * Prevent any device marked as untrusted from getting
2823                  * placed into the static identity mapping domain.
2824                  */
2825                 if (pdev->untrusted)
2826                         return IOMMU_DOMAIN_DMA;
2827
2828                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2829                         return IOMMU_DOMAIN_IDENTITY;
2830
2831                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2832                         return IOMMU_DOMAIN_IDENTITY;
2833         }
2834
2835         return 0;
2836 }
2837
2838 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2839 {
2840         /*
2841          * Start from a sane IOMMU hardware state.
2842          * If queued invalidation was already initialized by us
2843          * (for example, while enabling interrupt remapping) then
2844          * things are already rolling from a sane state.
2845          */
2846         if (!iommu->qi) {
2847                 /*
2848                  * Clear any previous faults.
2849                  */
2850                 dmar_fault(-1, iommu);
2851                 /*
2852                  * Disable queued invalidation if supported and already enabled
2853                  * before OS handover.
2854                  */
2855                 dmar_disable_qi(iommu);
2856         }
2857
2858         if (dmar_enable_qi(iommu)) {
2859                 /*
2860                  * Queued invalidation is not enabled, so use register-based invalidation
2861                  */
2862                 iommu->flush.flush_context = __iommu_flush_context;
2863                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2864                 pr_info("%s: Using Register based invalidation\n",
2865                         iommu->name);
2866         } else {
2867                 iommu->flush.flush_context = qi_flush_context;
2868                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2869                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2870         }
2871 }
2872
2873 static int copy_context_table(struct intel_iommu *iommu,
2874                               struct root_entry *old_re,
2875                               struct context_entry **tbl,
2876                               int bus, bool ext)
2877 {
2878         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2879         struct context_entry *new_ce = NULL, ce;
2880         struct context_entry *old_ce = NULL;
2881         struct root_entry re;
2882         phys_addr_t old_ce_phys;
2883
2884         tbl_idx = ext ? bus * 2 : bus;
2885         memcpy(&re, old_re, sizeof(re));
2886
2887         for (devfn = 0; devfn < 256; devfn++) {
2888                 /* First calculate the correct index */
2889                 idx = (ext ? devfn * 2 : devfn) % 256;
2890
2891                 if (idx == 0) {
2892                         /* First save what we may have and clean up */
2893                         if (new_ce) {
2894                                 tbl[tbl_idx] = new_ce;
2895                                 __iommu_flush_cache(iommu, new_ce,
2896                                                     VTD_PAGE_SIZE);
2897                                 pos = 1;
2898                         }
2899
2900                         if (old_ce)
2901                                 memunmap(old_ce);
2902
2903                         ret = 0;
2904                         if (devfn < 0x80)
2905                                 old_ce_phys = root_entry_lctp(&re);
2906                         else
2907                                 old_ce_phys = root_entry_uctp(&re);
2908
2909                         if (!old_ce_phys) {
2910                                 if (ext && devfn == 0) {
2911                                         /* No LCTP, try UCTP */
2912                                         devfn = 0x7f;
2913                                         continue;
2914                                 } else {
2915                                         goto out;
2916                                 }
2917                         }
2918
2919                         ret = -ENOMEM;
2920                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2921                                         MEMREMAP_WB);
2922                         if (!old_ce)
2923                                 goto out;
2924
2925                         new_ce = alloc_pgtable_page(iommu->node);
2926                         if (!new_ce)
2927                                 goto out_unmap;
2928
2929                         ret = 0;
2930                 }
2931
2932                 /* Now copy the context entry */
2933                 memcpy(&ce, old_ce + idx, sizeof(ce));
2934
2935                 if (!__context_present(&ce))
2936                         continue;
2937
2938                 did = context_domain_id(&ce);
2939                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2940                         set_bit(did, iommu->domain_ids);
2941
2942                 /*
2943                  * We need a marker for copied context entries. This
2944                  * marker needs to work for the old format as well as
2945                  * for extended context entries.
2946                  *
2947                  * Bit 67 of the context entry is used. In the old
2948                  * format this bit is available to software, in the
2949                  * extended format it is the PGE bit, but PGE is ignored
2950                  * by HW if PASIDs are disabled (and thus still
2951                  * available).
2952                  *
2953                  * So disable PASIDs first and then mark the entry
2954                  * copied. This means that we don't copy PASID
2955                  * translations from the old kernel, but this is fine as
2956                  * faults there are not fatal.
2957                  */
2958                 context_clear_pasid_enable(&ce);
2959                 context_set_copied(&ce);
2960
2961                 new_ce[idx] = ce;
2962         }
2963
2964         tbl[tbl_idx + pos] = new_ce;
2965
2966         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2967
2968 out_unmap:
2969         memunmap(old_ce);
2970
2971 out:
2972         return ret;
2973 }
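/*
 * With extended root entries each bus has two context tables, one for
 * devfn 0x00-0x7f (reached via the LCTP) and one for devfn 0x80-0xff
 * (via the UCTP), which is why tbl_idx above is bus * 2 and the caller
 * allocates 512 table slots in the ext case.
 */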
2974
2975 static int copy_translation_tables(struct intel_iommu *iommu)
2976 {
2977         struct context_entry **ctxt_tbls;
2978         struct root_entry *old_rt;
2979         phys_addr_t old_rt_phys;
2980         int ctxt_table_entries;
2981         unsigned long flags;
2982         u64 rtaddr_reg;
2983         int bus, ret;
2984         bool new_ext, ext;
2985
2986         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2987         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2988         new_ext    = !!ecap_ecs(iommu->ecap);
2989
2990         /*
2991          * The RTT bit can only be changed when translation is disabled,
2992          * but disabling translation would open a window for data
2993          * corruption. So bail out and don't copy anything if we would
2994          * have to change the bit.
2995          */
2996         if (new_ext != ext)
2997                 return -EINVAL;
2998
2999         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3000         if (!old_rt_phys)
3001                 return -EINVAL;
3002
3003         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3004         if (!old_rt)
3005                 return -ENOMEM;
3006
3007         /* This is too big for the stack - allocate it from slab */
3008         ctxt_table_entries = ext ? 512 : 256;
3009         ret = -ENOMEM;
3010         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3011         if (!ctxt_tbls)
3012                 goto out_unmap;
3013
3014         for (bus = 0; bus < 256; bus++) {
3015                 ret = copy_context_table(iommu, &old_rt[bus],
3016                                          ctxt_tbls, bus, ext);
3017                 if (ret) {
3018                         pr_err("%s: Failed to copy context table for bus %d\n",
3019                                 iommu->name, bus);
3020                         continue;
3021                 }
3022         }
3023
3024         spin_lock_irqsave(&iommu->lock, flags);
3025
3026         /* Context tables are copied; now write them to the root_entry table */
3027         for (bus = 0; bus < 256; bus++) {
3028                 int idx = ext ? bus * 2 : bus;
3029                 u64 val;
3030
3031                 if (ctxt_tbls[idx]) {
3032                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3033                         iommu->root_entry[bus].lo = val;
3034                 }
3035
3036                 if (!ext || !ctxt_tbls[idx + 1])
3037                         continue;
3038
3039                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3040                 iommu->root_entry[bus].hi = val;
3041         }
3042
3043         spin_unlock_irqrestore(&iommu->lock, flags);
3044
3045         kfree(ctxt_tbls);
3046
3047         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3048
3049         ret = 0;
3050
3051 out_unmap:
3052         memunmap(old_rt);
3053
3054         return ret;
3055 }
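
/*
 * Worked example for the root-entry programming loop above
 * (illustration only): with the extended format (ext == true) each
 * bus owns two context-table pointers, so for bus 3 the loop writes
 * ctxt_tbls[6] into root_entry[3].lo (devfn 0x00-0x7f) and
 * ctxt_tbls[7] into root_entry[3].hi (devfn 0x80-0xff). The "| 1"
 * sets bit 0, the present bit, so hardware will actually walk the
 * copied context tables.
 */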
3056
3057 #ifdef CONFIG_INTEL_IOMMU_SVM
3058 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3059 {
3060         struct intel_iommu *iommu = data;
3061         ioasid_t ioasid;
3062
3063         if (!iommu)
3064                 return INVALID_IOASID;
3065         /*
3066          * The VT-d virtual command interface always uses the full 20 bit
3067          * PASID range. The host can partition the guest PASID range based
3068          * on its policies, but this is out of the guest's control.
3069          */
3070         if (min < PASID_MIN || max > intel_pasid_max_id)
3071                 return INVALID_IOASID;
3072
3073         if (vcmd_alloc_pasid(iommu, &ioasid))
3074                 return INVALID_IOASID;
3075
3076         return ioasid;
3077 }
3078
3079 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3080 {
3081         struct intel_iommu *iommu = data;
3082
3083         if (!iommu)
3084                 return;
3085         /*
3086          * Sanity checking of the ioasid owner is done at the upper layer,
3087          * e.g. VFIO. We can only free the PASID when all devices are unbound.
3088          */
3089         if (ioasid_find(NULL, ioasid, NULL)) {
3090                 pr_alert("Cannot free active IOASID %d\n", ioasid);
3091                 return;
3092         }
3093         vcmd_free_pasid(iommu, ioasid);
3094 }
3095
3096 static void register_pasid_allocator(struct intel_iommu *iommu)
3097 {
3098         /*
3099          * If we are running in the host, there is no need for a custom
3100          * allocator since PASIDs are allocated by the host system-wide.
3101          */
3102         if (!cap_caching_mode(iommu->cap))
3103                 return;
3104
3105         if (!sm_supported(iommu)) {
3106                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3107                 return;
3108         }
3109
3110         /*
3111          * Register a custom PASID allocator if we are running in a guest;
3112          * guest PASIDs must be obtained via the virtual command interface.
3113          * There can be multiple vIOMMUs in each guest but only one allocator
3114          * is active. All vIOMMU allocators eventually call the same
3115          * host allocator.
3116          */
3117         if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3118                 return;
3119
3120         pr_info("Register custom PASID allocator\n");
3121         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3122         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3123         iommu->pasid_allocator.pdata = (void *)iommu;
3124         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3125                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3126                 /*
3127                  * Disable scalable mode on this IOMMU if there
3128          * is no custom allocator. Mixing SM-capable and
3129          * non-SM vIOMMUs is not supported.
3130                  */
3131                 intel_iommu_sm = 0;
3132         }
3133 }
3134 #endif
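
/*
 * Usage sketch, not part of the driver: once the custom allocator
 * above is registered, PASID allocations made through the generic
 * ioasid layer are routed to intel_vcmd_ioasid_alloc() and thus to
 * the host via the virtual command interface. Assuming the
 * ioasid_alloc()/ioasid_free() API from drivers/iommu/ioasid.c, a
 * consumer would do something like:
 *
 *	ioasid_t pasid = ioasid_alloc(NULL, PASID_MIN,
 *				      intel_pasid_max_id - 1, NULL);
 *	if (pasid == INVALID_IOASID)
 *		return -ENOSPC;
 *	// ... set up the PASID table entry and use the PASID ...
 *	ioasid_free(pasid);
 *
 * The set, range and private-data arguments above are example values.
 */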
3135
3136 static int __init init_dmars(void)
3137 {
3138         struct dmar_drhd_unit *drhd;
3139         struct intel_iommu *iommu;
3140         int ret;
3141
3142         /*
3143          * for each drhd
3144          *    allocate root
3145          *    initialize and program root entry to not present
3146          * endfor
3147          */
3148         for_each_drhd_unit(drhd) {
3149                 /*
3150                  * No lock needed: this is only incremented in the
3151                  * single-threaded kernel __init code path; all other
3152                  * accesses are read-only.
3153                  */
3154                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3155                         g_num_of_iommus++;
3156                         continue;
3157                 }
3158                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3159         }
3160
3161         /* Preallocate enough resources for IOMMU hot-addition */
3162         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3163                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3164
3165         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3166                         GFP_KERNEL);
3167         if (!g_iommus) {
3168                 pr_err("Allocating global iommu array failed\n");
3169                 ret = -ENOMEM;
3170                 goto error;
3171         }
3172
3173         for_each_iommu(iommu, drhd) {
3174                 if (drhd->ignored) {
3175                         iommu_disable_translation(iommu);
3176                         continue;
3177                 }
3178
3179                 /*
3180                  * Find the max PASID size of all IOMMUs in the system.
3181                  * We need to ensure the system PASID table is no bigger
3182                  * than the smallest supported size.
3183                  */
3184                 if (pasid_supported(iommu)) {
3185                         u32 temp = 2 << ecap_pss(iommu->ecap);
3186
3187                         intel_pasid_max_id = min_t(u32, temp,
3188                                                    intel_pasid_max_id);
3189                 }
3190
3191                 g_iommus[iommu->seq_id] = iommu;
3192
3193                 intel_iommu_init_qi(iommu);
3194
3195                 ret = iommu_init_domains(iommu);
3196                 if (ret)
3197                         goto free_iommu;
3198
3199                 init_translation_status(iommu);
3200
3201                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3202                         iommu_disable_translation(iommu);
3203                         clear_translation_pre_enabled(iommu);
3204                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3205                                 iommu->name);
3206                 }
3207
3208                 /*
3209                  * TBD:
3210                  * we could share the same root & context tables
3211                  * among all IOMMUs; need to split this later.
3212                  */
3213                 ret = iommu_alloc_root_entry(iommu);
3214                 if (ret)
3215                         goto free_iommu;
3216
3217                 if (translation_pre_enabled(iommu)) {
3218                         pr_info("Translation already enabled - trying to copy translation structures\n");
3219
3220                         ret = copy_translation_tables(iommu);
3221                         if (ret) {
3222                                 /*
3223                                  * We found the IOMMU with translation
3224                                  * enabled - but failed to copy over the
3225                                  * old root-entry table. Try to proceed
3226                                  * by disabling translation now and
3227                                  * allocating a clean root-entry table.
3228                                  * This might cause DMAR faults, but
3229                                  * probably the dump will still succeed.
3230                                  */
3231                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3232                                        iommu->name);
3233                                 iommu_disable_translation(iommu);
3234                                 clear_translation_pre_enabled(iommu);
3235                         } else {
3236                                 pr_info("Copied translation tables from previous kernel for %s\n",
3237                                         iommu->name);
3238                         }
3239                 }
3240
3241                 if (!ecap_pass_through(iommu->ecap))
3242                         hw_pass_through = 0;
3243                 intel_svm_check(iommu);
3244         }
3245
3246         /*
3247          * Now that qi is enabled on all iommus, set the root entry and flush
3248          * caches. This is required on some Intel X58 chipsets; otherwise the
3249          * flush_context function will loop forever and the boot hangs.
3250          */
3251         for_each_active_iommu(iommu, drhd) {
3252                 iommu_flush_write_buffer(iommu);
3253 #ifdef CONFIG_INTEL_IOMMU_SVM
3254                 register_pasid_allocator(iommu);
3255 #endif
3256                 iommu_set_root_entry(iommu);
3257                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3258                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3259         }
3260
3261 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3262         dmar_map_gfx = 0;
3263 #endif
3264
3265         if (!dmar_map_gfx)
3266                 iommu_identity_mapping |= IDENTMAP_GFX;
3267
3268         check_tylersburg_isoch();
3269
3270         ret = si_domain_init(hw_pass_through);
3271         if (ret)
3272                 goto free_iommu;
3273
3274         /*
3275          * for each drhd
3276          *   enable fault log
3277          *   global invalidate context cache
3278          *   global invalidate iotlb
3279          *   enable translation
3280          */
3281         for_each_iommu(iommu, drhd) {
3282                 if (drhd->ignored) {
3283                         /*
3284                          * we always have to disable PMRs or DMA may fail on
3285                          * this device
3286                          */
3287                         if (force_on)
3288                                 iommu_disable_protect_mem_regions(iommu);
3289                         continue;
3290                 }
3291
3292                 iommu_flush_write_buffer(iommu);
3293
3294 #ifdef CONFIG_INTEL_IOMMU_SVM
3295                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3296                         /*
3297                          * Calling dmar_alloc_hwirq() with dmar_global_lock
3298                          * held could cause a lock race condition.
3299                          */
3300                         up_write(&dmar_global_lock);
3301                         ret = intel_svm_enable_prq(iommu);
3302                         down_write(&dmar_global_lock);
3303                         if (ret)
3304                                 goto free_iommu;
3305                 }
3306 #endif
3307                 ret = dmar_set_interrupt(iommu);
3308                 if (ret)
3309                         goto free_iommu;
3310         }
3311
3312         return 0;
3313
3314 free_iommu:
3315         for_each_active_iommu(iommu, drhd) {
3316                 disable_dmar_iommu(iommu);
3317                 free_dmar_iommu(iommu);
3318         }
3319
3320         kfree(g_iommus);
3321
3322 error:
3323         return ret;
3324 }
3325
3326 /* This takes a number of _MM_ pages, not VTD pages */
3327 static unsigned long intel_alloc_iova(struct device *dev,
3328                                      struct dmar_domain *domain,
3329                                      unsigned long nrpages, uint64_t dma_mask)
3330 {
3331         unsigned long iova_pfn;
3332
3333         /*
3334          * Restrict dma_mask to the width that the iommu can handle.
3335          * First-level translation restricts the input-address to a
3336          * canonical address (i.e., address bits 63:N have the same
3337          * value as address bit [N-1], where N is 48-bits with 4-level
3338          * paging and 57-bits with 5-level paging). Hence, skip bit
3339          * [N-1].
3340          */
3341         if (domain_use_first_level(domain))
3342                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3343                                  dma_mask);
3344         else
3345                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3346                                  dma_mask);
3347
3348         /* Ensure we reserve the whole size-aligned region */
3349         nrpages = __roundup_pow_of_two(nrpages);
3350
3351         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3352                 /*
3353                  * First try to allocate an I/O virtual address within
3354                  * DMA_BIT_MASK(32); if that fails, try allocating from
3355                  * the higher range.
3356                  */
3357                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3358                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3359                 if (iova_pfn)
3360                         return iova_pfn;
3361         }
3362         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3363                                    IOVA_PFN(dma_mask), true);
3364         if (unlikely(!iova_pfn)) {
3365                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3366                              nrpages);
3367                 return 0;
3368         }
3369
3370         return iova_pfn;
3371 }
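
/*
 * Worked example for intel_alloc_iova() above (illustration only):
 * with 4-level paging (domain->gaw == 48) and first-level translation,
 * the mask is clamped to DOMAIN_MAX_ADDR(47), which keeps bit 47 clear
 * so the resulting IOVAs stay canonical. A request for 3 pages is
 * rounded up to 4 by __roundup_pow_of_two(), and with a 64-bit
 * dma_mask (and no forcedac) those 4 pages are first placed below
 * 4GiB, falling back to the full range only if that fails.
 */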
3372
3373 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3374                                      size_t size, int dir, u64 dma_mask)
3375 {
3376         struct dmar_domain *domain;
3377         phys_addr_t start_paddr;
3378         unsigned long iova_pfn;
3379         int prot = 0;
3380         int ret;
3381         struct intel_iommu *iommu;
3382         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3383
3384         BUG_ON(dir == DMA_NONE);
3385
3386         if (unlikely(attach_deferred(dev)))
3387                 do_deferred_attach(dev);
3388
3389         domain = find_domain(dev);
3390         if (!domain)
3391                 return DMA_MAPPING_ERROR;
3392
3393         iommu = domain_get_iommu(domain);
3394         size = aligned_nrpages(paddr, size);
3395
3396         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3397         if (!iova_pfn)
3398                 goto error;
3399
3400         /*
3401          * Check if DMAR supports zero-length reads on write-only
3402          * mappings.
3403          */
3404         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3405                         !cap_zlr(iommu->cap))
3406                 prot |= DMA_PTE_READ;
3407         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3408                 prot |= DMA_PTE_WRITE;
3409         /*
3410          * paddr .. (paddr + size) might cover a partial page, so map the
3411          * whole page.  Note: if two parts of one page are mapped separately,
3412          * we might have two guest addresses mapping to the same host paddr,
3413          * but this is not a big problem.
3414          */
3415         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3416                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3417         if (ret)
3418                 goto error;
3419
3420         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3421         start_paddr += paddr & ~PAGE_MASK;
3422
3423         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3424
3425         return start_paddr;
3426
3427 error:
3428         if (iova_pfn)
3429                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3430         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3431                 size, (unsigned long long)paddr, dir);
3432         return DMA_MAPPING_ERROR;
3433 }
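
/*
 * Illustration of the prot computation in __intel_map_single() (not
 * extra driver logic): a DMA_TO_DEVICE mapping gets DMA_PTE_READ only;
 * a DMA_FROM_DEVICE mapping on hardware that supports zero-length
 * reads (cap_zlr()) gets DMA_PTE_WRITE only; and the same
 * DMA_FROM_DEVICE mapping on hardware without ZLR support gets
 * DMA_PTE_READ | DMA_PTE_WRITE so that any zero-length reads the
 * device issues do not fault.
 */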
3434
3435 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3436                                  unsigned long offset, size_t size,
3437                                  enum dma_data_direction dir,
3438                                  unsigned long attrs)
3439 {
3440         return __intel_map_single(dev, page_to_phys(page) + offset,
3441                                   size, dir, *dev->dma_mask);
3442 }
3443
3444 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3445                                      size_t size, enum dma_data_direction dir,
3446                                      unsigned long attrs)
3447 {
3448         return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3449 }
3450
3451 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3452 {
3453         struct dmar_domain *domain;
3454         unsigned long start_pfn, last_pfn;
3455         unsigned long nrpages;
3456         unsigned long iova_pfn;
3457         struct intel_iommu *iommu;
3458         struct page *freelist;
3459         struct pci_dev *pdev = NULL;
3460
3461         domain = find_domain(dev);
3462         BUG_ON(!domain);
3463
3464         iommu = domain_get_iommu(domain);
3465
3466         iova_pfn = IOVA_PFN(dev_addr);
3467
3468         nrpages = aligned_nrpages(dev_addr, size);
3469         start_pfn = mm_to_dma_pfn(iova_pfn);
3470         last_pfn = start_pfn + nrpages - 1;
3471
3472         if (dev_is_pci(dev))
3473                 pdev = to_pci_dev(dev);
3474
3475         freelist = domain_unmap(domain, start_pfn, last_pfn);
3476         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3477                         !has_iova_flush_queue(&domain->iovad)) {
3478                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3479                                       nrpages, !freelist, 0);
3480                 /* free iova */
3481                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3482                 dma_free_pagelist(freelist);
3483         } else {
3484                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3485                            (unsigned long)freelist);
3486                 /*
3487                  * Queue up the release of the unmap to save the roughly
3488                  * 1/6th of the CPU time spent on the iotlb flush operation.
3489                  */
3490         }
3491
3492         trace_unmap_single(dev, dev_addr, size);
3493 }
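
/*
 * Illustration of the unmap policy above (not extra driver logic):
 * for an untrusted PCI device (e.g. one plugged in behind an
 * external-facing port), or when intel_iommu_strict is set, the IOTLB
 * is flushed and the IOVA freed synchronously before intel_unmap()
 * returns; for a trusted device with default settings the range is
 * handed to queue_iova() and the flush is batched, trading a short
 * window of stale cached translations for far fewer invalidations.
 */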
3494
3495 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3496                              size_t size, enum dma_data_direction dir,
3497                              unsigned long attrs)
3498 {
3499         intel_unmap(dev, dev_addr, size);
3500 }
3501
3502 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3503                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3504 {
3505         intel_unmap(dev, dev_addr, size);
3506 }
3507
3508 static void *intel_alloc_coherent(struct device *dev, size_t size,
3509                                   dma_addr_t *dma_handle, gfp_t flags,
3510                                   unsigned long attrs)
3511 {
3512         struct page *page = NULL;
3513         int order;
3514
3515         if (unlikely(attach_deferred(dev)))
3516                 do_deferred_attach(dev);
3517
3518         size = PAGE_ALIGN(size);
3519         order = get_order(size);
3520
3521         if (gfpflags_allow_blocking(flags)) {
3522                 unsigned int count = size >> PAGE_SHIFT;
3523
3524                 page = dma_alloc_from_contiguous(dev, count, order,
3525                                                  flags & __GFP_NOWARN);
3526         }
3527
3528         if (!page)
3529                 page = alloc_pages(flags, order);
3530         if (!page)
3531                 return NULL;
3532         memset(page_address(page), 0, size);
3533
3534         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3535                                          DMA_BIDIRECTIONAL,
3536                                          dev->coherent_dma_mask);
3537         if (*dma_handle != DMA_MAPPING_ERROR)
3538                 return page_address(page);
3539         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3540                 __free_pages(page, order);
3541
3542         return NULL;
3543 }
3544
3545 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3546                                 dma_addr_t dma_handle, unsigned long attrs)
3547 {
3548         int order;
3549         struct page *page = virt_to_page(vaddr);
3550
3551         size = PAGE_ALIGN(size);
3552         order = get_order(size);
3553
3554         intel_unmap(dev, dma_handle, size);
3555         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3556                 __free_pages(page, order);
3557 }
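
/*
 * Usage sketch, not part of the driver: intel_alloc_coherent() and
 * intel_free_coherent() are never called directly. A device driver
 * goes through the generic DMA API, which dispatches to these
 * dma_map_ops once they are installed for the device, e.g.:
 *
 *	dma_addr_t dma;
 *	void *buf = dma_alloc_coherent(dev, SZ_4K, &dma, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;
 *	// ... hand "dma" to the device, access "buf" from the CPU ...
 *	dma_free_coherent(dev, SZ_4K, buf, dma);
 *
 * The size and flags above are arbitrary example values.
 */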
3558
3559 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3560                            int nelems, enum dma_data_direction dir,
3561                            unsigned long attrs)
3562 {
3563         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3564         unsigned long nrpages = 0;
3565         struct scatterlist *sg;
3566         int i;
3567
3568         for_each_sg(sglist, sg, nelems, i) {
3569                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3570         }
3571
3572         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3573
3574         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3575 }
3576
3577 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3578                         enum dma_data_direction dir, unsigned long attrs)
3579 {
3580         int i;
3581         struct dmar_domain *domain;
3582         size_t size = 0;
3583         int prot = 0;
3584         unsigned long iova_pfn;
3585         int ret;
3586         struct scatterlist *sg;
3587         unsigned long start_vpfn;
3588         struct intel_iommu *iommu;
3589
3590         BUG_ON(dir == DMA_NONE);
3591
3592         if (unlikely(attach_deferred(dev)))
3593                 do_deferred_attach(dev);
3594
3595         domain = find_domain(dev);
3596         if (!domain)
3597                 return 0;
3598
3599         iommu = domain_get_iommu(domain);
3600
3601         for_each_sg(sglist, sg, nelems, i)
3602                 size += aligned_nrpages(sg->offset, sg->length);
3603
3604         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3605                                 *dev->dma_mask);
3606         if (!iova_pfn) {
3607                 sglist->dma_length = 0;
3608                 return 0;
3609         }
3610
3611         /*
3612          * Check if DMAR supports zero-length reads on write-only
3613          * mappings.
3614          */
3615         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3616                         !cap_zlr(iommu->cap))
3617                 prot |= DMA_PTE_READ;
3618         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3619                 prot |= DMA_PTE_WRITE;
3620
3621         start_vpfn = mm_to_dma_pfn(iova_pfn);
3622
3623         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3624         if (unlikely(ret)) {
3625                 dma_pte_free_pagetable(domain, start_vpfn,
3626                                        start_vpfn + size - 1,
3627                                        agaw_to_level(domain->agaw) + 1);
3628                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3629                 return 0;
3630         }
3631
3632         for_each_sg(sglist, sg, nelems, i)
3633                 trace_map_sg(dev, i + 1, nelems, sg);
3634
3635         return nelems;
3636 }
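
/*
 * Worked example for the size computation in intel_map_sg() above
 * (illustration only, assuming aligned_nrpages() rounds offset + length
 * up to whole VT-d pages): an sg entry with offset 0x100 and length
 * 0x2000 contributes 3 pages (0x100 + 0x2000 = 0x2100, rounded up to
 * 0x3000), so a two-entry list of such segments needs 6 pages of IOVA
 * space, which intel_alloc_iova() then reserves rounded up to 8, the
 * next power of two, before domain_sg_mapping() fills in the PTEs.
 */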
3637
3638 static u64 intel_get_required_mask(struct device *dev)
3639 {
3640         return DMA_BIT_MASK(32);
3641 }
3642
3643 static const struct dma_map_ops intel_dma_ops = {
3644         .alloc = intel_alloc_coherent,
3645         .free = intel_free_coherent,
3646         .map_sg = intel_map_sg,
3647         .unmap_sg = intel_unmap_sg,
3648         .map_page = intel_map_page,
3649         .unmap_page = intel_unmap_page,
3650         .map_resource = intel_map_resource,
3651         .unmap_resource = intel_unmap_resource,
3652         .dma_supported = dma_direct_supported,
3653         .mmap = dma_common_mmap,
3654         .get_sgtable = dma_common_get_sgtable,
3655         .get_required_mask = intel_get_required_mask,
3656 };
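
/*
 * Usage sketch for the streaming side of intel_dma_ops, not part of
 * the driver: with these ops installed for a device, a generic
 * dma_map_page()/dma_unmap_page() call from its driver ends up in
 * intel_map_page()/intel_unmap_page() above, e.g.:
 *
 *	dma_addr_t dma = dma_map_page(dev, page, 0, PAGE_SIZE,
 *				      DMA_TO_DEVICE);
 *	if (dma_mapping_error(dev, dma))
 *		return -ENOMEM;
 *	// ... let the device read from "dma" ...
 *	dma_unmap_page(dev, dma, PAGE_SIZE, DMA_TO_DEVICE);
 *
 * "page" stands for any page the driver owns; the size and direction
 * are example values.
 */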
3657
3658 static void
3659 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3660                    enum dma_data_direction dir, enum dma_sync_target target)
3661 {
3662         struct dmar_domain *domain;
3663         phys_addr_t tlb_addr;
3664
3665         domain = find_domain(dev);
3666         if (WARN_ON(!domain))
3667                 return;
3668
3669         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3670         if (is_swiotlb_buffer(tlb_addr))
3671                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3672 }
3673
3674 static dma_addr_t
3675 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3676                   enum dma_data_direction dir, unsigned long attrs,
3677                   u64 dma_mask)
3678 {
3679         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3680         struct dmar_domain *domain;
3681         struct intel_iommu *iommu;
3682         unsigned long iova_pfn;
3683         unsigned long nrpages;
3684         phys_addr_t tlb_addr;
3685         int prot = 0;
3686         int ret;
3687
3688         if (unlikely(attach_deferred(dev)))
3689                 do_deferred_attach(dev);
3690
3691         domain = find_domain(dev);
3692
3693         if (WARN_ON(dir == DMA_NONE || !domain))
3694                 return DMA_MAPPING_ERROR;
3695
3696         iommu = domain_get_iommu(domain);
3697         if (WARN_ON(!iommu))
3698                 return DMA_MAPPING_ERROR;
3699
3700         nrpages = aligned_nrpages(0, size);
3701         iova_pfn = intel_alloc_iova(dev, domain,
3702                                     dma_to_mm_pfn(nrpages), dma_mask);
3703         if (!iova_pfn)
3704                 return DMA_MAPPING_ERROR;
3705
3706         /*
3707          * Check if DMAR supports zero-length reads on write-only
3708          * mappings.
3709          */
3710         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3711                         !cap_zlr(iommu->cap))
3712                 prot |= DMA_PTE_READ;
3713         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3714                 prot |= DMA_PTE_WRITE;
3715
3716         /*
3717          * If both the physical buffer start address and size are
3718          * page aligned, we don't need to use a bounce page.
3719          */
3720         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3721                 tlb_addr = swiotlb_tbl_map_single(dev,
3722                                 __phys_to_dma(dev, io_tlb_start),
3723                                 paddr, size, aligned_size, dir, attrs);
3724                 if (tlb_addr == DMA_MAPPING_ERROR) {
3725                         goto swiotlb_error;
3726                 } else {
3727                         /* Clean up the padding area. */
3728                         void *padding_start = phys_to_virt(tlb_addr);
3729                         size_t padding_size = aligned_size;
3730
3731                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3732                             (dir == DMA_TO_DEVICE ||
3733                              dir == DMA_BIDIRECTIONAL)) {
3734                                 padding_start += size;
3735                                 padding_size -= size;
3736                         }
3737
3738                         memset(padding_start, 0, padding_size);
3739                 }
3740         } else {
3741                 tlb_addr = paddr;
3742         }
3743
3744         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3745                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3746         if (ret)
3747                 goto mapping_error;
3748
3749         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3750
3751         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3752
3753 mapping_error:
3754         if (is_swiotlb_buffer(tlb_addr))
3755                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3756                                          aligned_size, dir, attrs);
3757 swiotlb_error:
3758         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3759         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3760                 size, (unsigned long long)paddr, dir);
3761
3762         return DMA_MAPPING_ERROR;
3763 }
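
/*
 * Worked example for bounce_map_single() above (illustration only):
 * mapping paddr 0x1234 with size 0x800 is not VT-d page aligned, so
 * the data is bounced through swiotlb; aligned_size becomes 0x1000,
 * and for a DMA_TO_DEVICE mapping the 0x800 bytes of padding after
 * the copied data are zeroed so the device can never observe stale
 * swiotlb contents in the rest of the bounce page. A buffer whose
 * paddr and size are both multiples of 0x1000 is mapped in place with
 * tlb_addr == paddr and no bounce copy at all.
 */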
3764
3765 static void
3766 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3767                     enum dma_data_direction dir, unsigned long attrs)
3768 {
3769         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3770         struct dmar_domain *domain;
3771         phys_addr_t tlb_addr;
3772
3773         domain = find_domain(dev);
3774         if (WARN_ON(!domain))
3775                 return;
3776
3777         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3778         if (WARN_ON(!tlb_addr))
3779                 return;
3780
3781         intel_unmap(dev, dev_addr, size);
3782         if (is_swiotlb_buffer(tlb_addr))
3783                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3784                                          aligned_size, dir, attrs);
3785
3786         trace_bounce_unmap_single(dev, dev_addr, size);
3787 }
3788
3789 static dma_addr_t
3790 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3791                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3792 {
3793         return bounce_map_single(dev, page_to_phys(page) + offset,
3794                                  size, dir, attrs, *dev->dma_mask);
3795 }
3796
3797 static dma_addr_t
3798 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3799                     enum dma_data_direction dir, unsigned long attrs)
3800 {
3801         return bounce_map_single(dev, phys_addr, size,
3802                                  dir, attrs, *dev->dma_mask);
3803 }
3804
3805 static void
3806 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3807                   enum dma_data_direction dir, unsigned long attrs)
3808 {
3809         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3810 }
3811
3812 static void
3813 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3814                       enum dma_data_direction dir, unsigned long attrs)
3815 {
3816         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3817 }
3818
3819 static void
3820 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3821                 enum dma_data_direction dir, unsigned long attrs)
3822 {
3823         struct scatterlist *sg;
3824         int i;
3825
3826         for_each_sg(sglist, sg, nelems, i)
3827                 bounce_unmap_page(dev, sg->dma_address,
3828                                   sg_dma_len(sg), dir, attrs);
3829 }
3830
3831 static int
3832 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3833               enum dma_data_direction dir, unsigned long attrs)
3834 {
3835         int i;
3836         struct scatterlist *sg;
3837
3838         for_each_sg(sglist, sg, nelems, i) {
3839                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3840                                                   sg->offset, sg->length,
3841                                                   dir, attrs);
3842                 if (sg->dma_address == DMA_MAPPING_ERROR)
3843                         goto out_unmap;
3844                 sg_dma_len(sg) = sg->length;
3845         }
3846
3847         for_each_sg(sglist, sg, nelems, i)
3848                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3849
3850         return nelems;
3851
3852 out_unmap:
3853         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3854         return 0;
3855 }
3856
3857 static void
3858 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3859                            size_t size, enum dma_data_direction dir)
3860 {
3861         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3862 }
3863
3864 static void
3865 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3866                               size_t size, enum dma_data_direction dir)
3867 {
3868         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3869 }
3870
3871 static void
3872 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3873                        int nelems, enum dma_data_direction dir)
3874 {
3875         struct scatterlist *sg;
3876         int i;
3877
3878         for_each_sg(sglist, sg, nelems, i)
3879                 bounce_sync_single(dev, sg_dma_address(sg),
3880                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
3881 }
3882
3883 static void
3884 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3885                           int nelems, enum dma_data_direction dir)
3886 {
3887         struct scatterlist *sg;
3888         int i;
3889
3890         for_each_sg(sglist, sg, nelems, i)
3891                 bounce_sync_single(dev, sg_dma_address(sg),
3892                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3893 }
3894
3895 static const struct dma_map_ops bounce_dma_ops = {
3896         .alloc                  = intel_alloc_coherent,
3897         .free                   = intel_free_coherent,
3898         .map_sg                 = bounce_map_sg,
3899         .unmap_sg               = bounce_unmap_sg,
3900         .map_page               = bounce_map_page,
3901         .unmap_page             = bounce_unmap_page,
3902         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
3903         .sync_single_for_device = bounce_sync_single_for_device,
3904         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
3905         .sync_sg_for_device     = bounce_sync_sg_for_device,
3906         .map_resource           = bounce_map_resource,
3907         .unmap_resource         = bounce_unmap_resource,
3908         .dma_supported          = dma_direct_supported,
3909 };
3910
3911 static inline int iommu_domain_cache_init(void)
3912 {
3913         int ret = 0;
3914
3915         iommu_domain_cache = kmem_cache_create("iommu_domain",
3916                                          sizeof(struct dmar_domain),
3917                                          0,
3918                                          SLAB_HWCACHE_ALIGN,
3919                                          NULL);
3921         if (!iommu_domain_cache) {
3922                 pr_err("Couldn't create iommu_domain cache\n");
3923                 ret = -ENOMEM;
3924         }
3925
3926         return ret;
3927 }
3928
3929 static inline int iommu_devinfo_cache_init(void)
3930 {
3931         int ret = 0;
3932
3933         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3934                                          sizeof(struct device_domain_info),
3935                                          0,
3936                                          SLAB_HWCACHE_ALIGN,
3937                                          NULL);
3938         if (!iommu_devinfo_cache) {
3939                 pr_err("Couldn't create devinfo cache\n");
3940                 ret = -ENOMEM;
3941         }
3942
3943         return ret;
3944 }
3945
3946 static int __init iommu_init_mempool(void)
3947 {
3948         int ret;
3949         ret = iova_cache_get();
3950         if (ret)
3951                 return ret;
3952
3953         ret = iommu_domain_cache_init();
3954         if (ret)
3955                 goto domain_error;
3956
3957         ret = iommu_devinfo_cache_init();
3958         if (!ret)
3959                 return ret;
3960
3961         kmem_cache_destroy(iommu_domain_cache);
3962 domain_error:
3963         iova_cache_put();
3964
3965         return -ENOMEM;
3966 }
3967
3968 static void __init iommu_exit_mempool(void)
3969 {
3970         kmem_cache_destroy(iommu_devinfo_cache);
3971         kmem_cache_destroy(iommu_domain_cache);
3972         iova_cache_put();
3973 }
3974
3975 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3976 {
3977         struct dmar_drhd_unit *drhd;
3978         u32 vtbar;
3979         int rc;
3980
3981         /* We know that this device on this chipset has its own IOMMU.
3982          * If we find it under a different IOMMU, then the BIOS is lying
3983          * to us. Hope that the IOMMU for this device is actually
3984          * disabled, and it needs no translation...
3985          */
3986         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3987         if (rc) {
3988                 /* "can't" happen */
3989                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3990                 return;
3991         }
3992         vtbar &= 0xffff0000;
3993
3994         /* we know that this iommu should be at offset 0xa000 from vtbar */
3995         drhd = dmar_find_matched_drhd_unit(pdev);
3996         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
3997                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
3998                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3999                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4000         }
4001 }
4002 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4003
4004 static void __init init_no_remapping_devices(void)
4005 {
4006         struct dmar_drhd_unit *drhd;
4007         struct device *dev;
4008         int i;
4009
4010         for_each_drhd_unit(drhd) {
4011                 if (!drhd->include_all) {
4012                         for_each_active_dev_scope(drhd->devices,
4013                                                   drhd->devices_cnt, i, dev)
4014                                 break;
4015                         /* ignore DMAR unit if no devices exist */
4016                         if (i == drhd->devices_cnt)
4017                                 drhd->ignored = 1;
4018                 }
4019         }
4020
4021         for_each_active_drhd_unit(drhd) {
4022                 if (drhd->include_all)
4023                         continue;
4024
4025                 for_each_active_dev_scope(drhd->devices,
4026                                           drhd->devices_cnt, i, dev)
4027                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4028                                 break;
4029                 if (i < drhd->devices_cnt)
4030                         continue;
4031
4032                 /* This IOMMU has *only* gfx devices. Either bypass it or
4033                    set the gfx_mapped flag, as appropriate */
4034                 if (!dmar_map_gfx) {
4035                         drhd->ignored = 1;
4036                         for_each_active_dev_scope(drhd->devices,
4037                                                   drhd->devices_cnt, i, dev)
4038                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4039                 }
4040         }
4041 }
4042
4043 #ifdef CONFIG_SUSPEND
4044 static int init_iommu_hw(void)
4045 {
4046         struct dmar_drhd_unit *drhd;
4047         struct intel_iommu *iommu = NULL;
4048
4049         for_each_active_iommu(iommu, drhd)
4050                 if (iommu->qi)
4051                         dmar_reenable_qi(iommu);
4052
4053         for_each_iommu(iommu, drhd) {
4054                 if (drhd->ignored) {
4055                         /*
4056                          * we always have to disable PMRs or DMA may fail on
4057                          * this device
4058                          */
4059                         if (force_on)
4060                                 iommu_disable_protect_mem_regions(iommu);
4061                         continue;
4062                 }
4063
4064                 iommu_flush_write_buffer(iommu);
4065
4066                 iommu_set_root_entry(iommu);
4067
4068                 iommu->flush.flush_context(iommu, 0, 0, 0,
4069                                            DMA_CCMD_GLOBAL_INVL);
4070                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4071                 iommu_enable_translation(iommu);
4072                 iommu_disable_protect_mem_regions(iommu);
4073         }
4074
4075         return 0;
4076 }
4077
4078 static void iommu_flush_all(void)
4079 {
4080         struct dmar_drhd_unit *drhd;
4081         struct intel_iommu *iommu;
4082
4083         for_each_active_iommu(iommu, drhd) {
4084                 iommu->flush.flush_context(iommu, 0, 0, 0,
4085                                            DMA_CCMD_GLOBAL_INVL);
4086                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4087                                          DMA_TLB_GLOBAL_FLUSH);
4088         }
4089 }
4090
4091 static int iommu_suspend(void)
4092 {
4093         struct dmar_drhd_unit *drhd;
4094         struct intel_iommu *iommu = NULL;
4095         unsigned long flag;
4096
4097         for_each_active_iommu(iommu, drhd) {
4098                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4099                                                  GFP_ATOMIC);
4100                 if (!iommu->iommu_state)
4101                         goto nomem;
4102         }
4103
4104         iommu_flush_all();
4105
4106         for_each_active_iommu(iommu, drhd) {
4107                 iommu_disable_translation(iommu);
4108
4109                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4110
4111                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4112                         readl(iommu->reg + DMAR_FECTL_REG);
4113                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4114                         readl(iommu->reg + DMAR_FEDATA_REG);
4115                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4116                         readl(iommu->reg + DMAR_FEADDR_REG);
4117                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4118                         readl(iommu->reg + DMAR_FEUADDR_REG);
4119
4120                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4121         }
4122         return 0;
4123
4124 nomem:
4125         for_each_active_iommu(iommu, drhd)
4126                 kfree(iommu->iommu_state);
4127
4128         return -ENOMEM;
4129 }
4130
4131 static void iommu_resume(void)
4132 {
4133         struct dmar_drhd_unit *drhd;
4134         struct intel_iommu *iommu = NULL;
4135         unsigned long flag;
4136
4137         if (init_iommu_hw()) {
4138                 if (force_on)
4139                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4140                 else
4141                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4142                 return;
4143         }
4144
4145         for_each_active_iommu(iommu, drhd) {
4146
4147                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4148
4149                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4150                         iommu->reg + DMAR_FECTL_REG);
4151                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4152                         iommu->reg + DMAR_FEDATA_REG);
4153                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4154                         iommu->reg + DMAR_FEADDR_REG);
4155                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4156                         iommu->reg + DMAR_FEUADDR_REG);
4157
4158                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4159         }
4160
4161         for_each_active_iommu(iommu, drhd)
4162                 kfree(iommu->iommu_state);
4163 }
4164
4165 static struct syscore_ops iommu_syscore_ops = {
4166         .resume         = iommu_resume,
4167         .suspend        = iommu_suspend,
4168 };
4169
4170 static void __init init_iommu_pm_ops(void)
4171 {
4172         register_syscore_ops(&iommu_syscore_ops);
4173 }
4174
4175 #else
4176 static inline void init_iommu_pm_ops(void) {}
4177 #endif  /* CONFIG_SUSPEND */
4178
4179 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4180 {
4181         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4182             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4183             rmrr->end_address <= rmrr->base_address ||
4184             arch_rmrr_sanity_check(rmrr))
4185                 return -EINVAL;
4186
4187         return 0;
4188 }
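
/*
 * Worked example for rmrr_sanity_check() above (illustration only):
 * an RMRR of [0xd0000000, 0xd00fffff] passes, since the base and
 * end_address + 1 are both page aligned and the range is non-empty,
 * while [0xd0000000, 0xd00ffffe] fails: the check returns -EINVAL,
 * which dmar_parse_one_rmrr() below turns into a FW_BUG warning and a
 * firmware-workaround taint rather than dropping the region.
 */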
4189
4190 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4191 {
4192         struct acpi_dmar_reserved_memory *rmrr;
4193         struct dmar_rmrr_unit *rmrru;
4194
4195         rmrr = (struct acpi_dmar_reserved_memory *)header;
4196         if (rmrr_sanity_check(rmrr)) {
4197                 pr_warn(FW_BUG
4198                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4199                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4200                            rmrr->base_address, rmrr->end_address,
4201                            dmi_get_system_info(DMI_BIOS_VENDOR),
4202                            dmi_get_system_info(DMI_BIOS_VERSION),
4203                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4204                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4205         }
4206
4207         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4208         if (!rmrru)
4209                 goto out;
4210
4211         rmrru->hdr = header;
4212
4213         rmrru->base_address = rmrr->base_address;
4214         rmrru->end_address = rmrr->end_address;
4215
4216         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4217                                 ((void *)rmrr) + rmrr->header.length,
4218                                 &rmrru->devices_cnt);
4219         if (rmrru->devices_cnt && rmrru->devices == NULL)
4220                 goto free_rmrru;
4221
4222         list_add(&rmrru->list, &dmar_rmrr_units);
4223
4224         return 0;
4225 free_rmrru:
4226         kfree(rmrru);
4227 out:
4228         return -ENOMEM;
4229 }
4230
4231 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4232 {
4233         struct dmar_atsr_unit *atsru;
4234         struct acpi_dmar_atsr *tmp;
4235
4236         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4237                                 dmar_rcu_check()) {
4238                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4239                 if (atsr->segment != tmp->segment)
4240                         continue;
4241                 if (atsr->header.length != tmp->header.length)
4242                         continue;
4243                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4244                         return atsru;
4245         }
4246
4247         return NULL;
4248 }
4249
4250 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4251 {
4252         struct acpi_dmar_atsr *atsr;
4253         struct dmar_atsr_unit *atsru;
4254
4255         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4256                 return 0;
4257
4258         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4259         atsru = dmar_find_atsr(atsr);
4260         if (atsru)
4261                 return 0;
4262
4263         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4264         if (!atsru)
4265                 return -ENOMEM;
4266
4267         /*
4268          * If memory is allocated from slab by ACPI _DSM method, we need to
4269          * copy the memory content because the memory buffer will be freed
4270          * on return.
4271          */
4272         atsru->hdr = (void *)(atsru + 1);
4273         memcpy(atsru->hdr, hdr, hdr->length);
4274         atsru->include_all = atsr->flags & 0x1;
4275         if (!atsru->include_all) {
4276                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4277                                 (void *)atsr + atsr->header.length,
4278                                 &atsru->devices_cnt);
4279                 if (atsru->devices_cnt && atsru->devices == NULL) {
4280                         kfree(atsru);
4281                         return -ENOMEM;
4282                 }
4283         }
4284
4285         list_add_rcu(&atsru->list, &dmar_atsr_units);
4286
4287         return 0;
4288 }
4289
4290 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4291 {
4292         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4293         kfree(atsru);
4294 }
4295
4296 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4297 {
4298         struct acpi_dmar_atsr *atsr;
4299         struct dmar_atsr_unit *atsru;
4300
4301         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4302         atsru = dmar_find_atsr(atsr);
4303         if (atsru) {
4304                 list_del_rcu(&atsru->list);
4305                 synchronize_rcu();
4306                 intel_iommu_free_atsr(atsru);
4307         }
4308
4309         return 0;
4310 }
4311
4312 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4313 {
4314         int i;
4315         struct device *dev;
4316         struct acpi_dmar_atsr *atsr;
4317         struct dmar_atsr_unit *atsru;
4318
4319         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4320         atsru = dmar_find_atsr(atsr);
4321         if (!atsru)
4322                 return 0;
4323
4324         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4325                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4326                                           i, dev)
4327                         return -EBUSY;
4328         }
4329
4330         return 0;
4331 }
4332
4333 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4334 {
4335         int sp, ret;
4336         struct intel_iommu *iommu = dmaru->iommu;
4337
4338         if (g_iommus[iommu->seq_id])
4339                 return 0;
4340
4341         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4342                 pr_warn("%s: Doesn't support hardware pass through.\n",
4343                         iommu->name);
4344                 return -ENXIO;
4345         }
4346         if (!ecap_sc_support(iommu->ecap) &&
4347             domain_update_iommu_snooping(iommu)) {
4348                 pr_warn("%s: Doesn't support snooping.\n",
4349                         iommu->name);
4350                 return -ENXIO;
4351         }
4352         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4353         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4354                 pr_warn("%s: Doesn't support large page.\n",
4355                         iommu->name);
4356                 return -ENXIO;
4357         }
4358
4359         /*
4360          * Disable translation if already enabled prior to OS handover.
4361          */
4362         if (iommu->gcmd & DMA_GCMD_TE)
4363                 iommu_disable_translation(iommu);
4364
4365         g_iommus[iommu->seq_id] = iommu;
4366         ret = iommu_init_domains(iommu);
4367         if (ret == 0)
4368                 ret = iommu_alloc_root_entry(iommu);
4369         if (ret)
4370                 goto out;
4371
4372         intel_svm_check(iommu);
4373
4374         if (dmaru->ignored) {
4375                 /*
4376                  * we always have to disable PMRs or DMA may fail on this device
4377                  */
4378                 if (force_on)
4379                         iommu_disable_protect_mem_regions(iommu);
4380                 return 0;
4381         }
4382
4383         intel_iommu_init_qi(iommu);
4384         iommu_flush_write_buffer(iommu);
4385
4386 #ifdef CONFIG_INTEL_IOMMU_SVM
4387         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4388                 ret = intel_svm_enable_prq(iommu);
4389                 if (ret)
4390                         goto disable_iommu;
4391         }
4392 #endif
4393         ret = dmar_set_interrupt(iommu);
4394         if (ret)
4395                 goto disable_iommu;
4396
4397         iommu_set_root_entry(iommu);
4398         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4399         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4400         iommu_enable_translation(iommu);
4401
4402         iommu_disable_protect_mem_regions(iommu);
4403         return 0;
4404
4405 disable_iommu:
4406         disable_dmar_iommu(iommu);
4407 out:
4408         free_dmar_iommu(iommu);
4409         return ret;
4410 }
4411
4412 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4413 {
4414         int ret = 0;
4415         struct intel_iommu *iommu = dmaru->iommu;
4416
4417         if (!intel_iommu_enabled)
4418                 return 0;
4419         if (iommu == NULL)
4420                 return -EINVAL;
4421
4422         if (insert) {
4423                 ret = intel_iommu_add(dmaru);
4424         } else {
4425                 disable_dmar_iommu(iommu);
4426                 free_dmar_iommu(iommu);
4427         }
4428
4429         return ret;
4430 }
4431
4432 static void intel_iommu_free_dmars(void)
4433 {
4434         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4435         struct dmar_atsr_unit *atsru, *atsr_n;
4436
4437         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4438                 list_del(&rmrru->list);
4439                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4440                 kfree(rmrru);
4441         }
4442
4443         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4444                 list_del(&atsru->list);
4445                 intel_iommu_free_atsr(atsru);
4446         }
4447 }
4448
4449 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4450 {
4451         int i, ret = 1;
4452         struct pci_bus *bus;
4453         struct pci_dev *bridge = NULL;
4454         struct device *tmp;
4455         struct acpi_dmar_atsr *atsr;
4456         struct dmar_atsr_unit *atsru;
4457
4458         dev = pci_physfn(dev);
4459         for (bus = dev->bus; bus; bus = bus->parent) {
4460                 bridge = bus->self;
4461                 /* If it's an integrated device, allow ATS */
4462                 if (!bridge)
4463                         return 1;
4464                 /* Connected via non-PCIe: no ATS */
4465                 if (!pci_is_pcie(bridge) ||
4466                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4467                         return 0;
4468                 /* If we found the root port, look it up in the ATSR */
4469                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4470                         break;
4471         }
4472
4473         rcu_read_lock();
4474         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4475                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4476                 if (atsr->segment != pci_domain_nr(dev->bus))
4477                         continue;
4478
4479                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4480                         if (tmp == &bridge->dev)
4481                                 goto out;
4482
4483                 if (atsru->include_all)
4484                         goto out;
4485         }
4486         ret = 0;
4487 out:
4488         rcu_read_unlock();
4489
4490         return ret;
4491 }
4492
4493 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4494 {
4495         int ret;
4496         struct dmar_rmrr_unit *rmrru;
4497         struct dmar_atsr_unit *atsru;
4498         struct acpi_dmar_atsr *atsr;
4499         struct acpi_dmar_reserved_memory *rmrr;
4500
4501         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4502                 return 0;
4503
4504         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4505                 rmrr = container_of(rmrru->hdr,
4506                                     struct acpi_dmar_reserved_memory, header);
4507                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4508                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4509                                 ((void *)rmrr) + rmrr->header.length,
4510                                 rmrr->segment, rmrru->devices,
4511                                 rmrru->devices_cnt);
4512                         if (ret < 0)
4513                                 return ret;
4514                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4515                         dmar_remove_dev_scope(info, rmrr->segment,
4516                                 rmrru->devices, rmrru->devices_cnt);
4517                 }
4518         }
4519
4520         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4521                 if (atsru->include_all)
4522                         continue;
4523
4524                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4525                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4526                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4527                                         (void *)atsr + atsr->header.length,
4528                                         atsr->segment, atsru->devices,
4529                                         atsru->devices_cnt);
4530                         if (ret > 0)
4531                                 break;
4532                         else if (ret < 0)
4533                                 return ret;
4534                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4535                         if (dmar_remove_dev_scope(info, atsr->segment,
4536                                         atsru->devices, atsru->devices_cnt))
4537                                 break;
4538                 }
4539         }
4540
4541         return 0;
4542 }
4543
4544 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4545                                        unsigned long val, void *v)
4546 {
4547         struct memory_notify *mhp = v;
4548         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4549         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4550                         mhp->nr_pages - 1);
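        /*
         * mm_to_dma_pfn() converts CPU page frame numbers into VT-d (4KiB)
         * page units; on x86, where PAGE_SHIFT == VTD_PAGE_SHIFT, this is an
         * identity conversion, so the hotplugged block maps 1:1 onto the
         * IOVA range [start_vpfn, last_vpfn] in si_domain.
         */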
4551
4552         switch (val) {
4553         case MEM_GOING_ONLINE:
4554                 if (iommu_domain_identity_map(si_domain,
4555                                               start_vpfn, last_vpfn)) {
4556                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
4557                                 start_vpfn, last_vpfn);
4558                         return NOTIFY_BAD;
4559                 }
4560                 break;
4561
4562         case MEM_OFFLINE:
4563         case MEM_CANCEL_ONLINE:
4564                 {
4565                         struct dmar_drhd_unit *drhd;
4566                         struct intel_iommu *iommu;
4567                         struct page *freelist;
4568
4569                         freelist = domain_unmap(si_domain,
4570                                                 start_vpfn, last_vpfn);
4571
4572                         rcu_read_lock();
4573                         for_each_active_iommu(iommu, drhd)
4574                                 iommu_flush_iotlb_psi(iommu, si_domain,
4575                                         start_vpfn, mhp->nr_pages,
4576                                         !freelist, 0);
4577                         rcu_read_unlock();
4578                         dma_free_pagelist(freelist);
4579                 }
4580                 break;
4581         }
4582
4583         return NOTIFY_OK;
4584 }
4585
4586 static struct notifier_block intel_iommu_memory_nb = {
4587         .notifier_call = intel_iommu_memory_notifier,
4588         .priority = 0
4589 };
4590
4591 static void free_all_cpu_cached_iovas(unsigned int cpu)
4592 {
4593         int i;
4594
4595         for (i = 0; i < g_num_of_iommus; i++) {
4596                 struct intel_iommu *iommu = g_iommus[i];
4597                 struct dmar_domain *domain;
4598                 int did;
4599
4600                 if (!iommu)
4601                         continue;
4602
4603                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4604                         domain = get_iommu_domain(iommu, (u16)did);
4605
4606                         if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4607                                 continue;
4608
4609                         free_cpu_cached_iovas(cpu, &domain->iovad);
4610                 }
4611         }
4612 }
4613
4614 static int intel_iommu_cpu_dead(unsigned int cpu)
4615 {
4616         free_all_cpu_cached_iovas(cpu);
4617         return 0;
4618 }
4619
4620 static void intel_disable_iommus(void)
4621 {
4622         struct intel_iommu *iommu = NULL;
4623         struct dmar_drhd_unit *drhd;
4624
4625         for_each_iommu(iommu, drhd)
4626                 iommu_disable_translation(iommu);
4627 }
4628
4629 void intel_iommu_shutdown(void)
4630 {
4631         struct dmar_drhd_unit *drhd;
4632         struct intel_iommu *iommu = NULL;
4633
4634         if (no_iommu || dmar_disabled)
4635                 return;
4636
4637         down_write(&dmar_global_lock);
4638
4639         /* Disable PMRs explicitly here. */
4640         for_each_iommu(iommu, drhd)
4641                 iommu_disable_protect_mem_regions(iommu);
4642
4643         /* Make sure the IOMMUs are switched off */
4644         intel_disable_iommus();
4645
4646         up_write(&dmar_global_lock);
4647 }
4648
4649 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4650 {
4651         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4652
4653         return container_of(iommu_dev, struct intel_iommu, iommu);
4654 }
4655
4656 static ssize_t intel_iommu_show_version(struct device *dev,
4657                                         struct device_attribute *attr,
4658                                         char *buf)
4659 {
4660         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4661         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4662         return sprintf(buf, "%d:%d\n",
4663                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4664 }
4665 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4666
4667 static ssize_t intel_iommu_show_address(struct device *dev,
4668                                         struct device_attribute *attr,
4669                                         char *buf)
4670 {
4671         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4672         return sprintf(buf, "%llx\n", iommu->reg_phys);
4673 }
4674 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4675
4676 static ssize_t intel_iommu_show_cap(struct device *dev,
4677                                     struct device_attribute *attr,
4678                                     char *buf)
4679 {
4680         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4681         return sprintf(buf, "%llx\n", iommu->cap);
4682 }
4683 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4684
4685 static ssize_t intel_iommu_show_ecap(struct device *dev,
4686                                     struct device_attribute *attr,
4687                                     char *buf)
4688 {
4689         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4690         return sprintf(buf, "%llx\n", iommu->ecap);
4691 }
4692 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4693
4694 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4695                                       struct device_attribute *attr,
4696                                       char *buf)
4697 {
4698         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4699         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4700 }
4701 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4702
4703 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4704                                            struct device_attribute *attr,
4705                                            char *buf)
4706 {
4707         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4708         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4709                                                   cap_ndoms(iommu->cap)));
4710 }
4711 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4712
4713 static struct attribute *intel_iommu_attrs[] = {
4714         &dev_attr_version.attr,
4715         &dev_attr_address.attr,
4716         &dev_attr_cap.attr,
4717         &dev_attr_ecap.attr,
4718         &dev_attr_domains_supported.attr,
4719         &dev_attr_domains_used.attr,
4720         NULL,
4721 };
4722
4723 static struct attribute_group intel_iommu_group = {
4724         .name = "intel-iommu",
4725         .attrs = intel_iommu_attrs,
4726 };
4727
4728 const struct attribute_group *intel_iommu_groups[] = {
4729         &intel_iommu_group,
4730         NULL,
4731 };
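
/*
 * These attributes are registered via iommu_device_sysfs_add() in
 * intel_iommu_init() below and typically appear in sysfs as, for example:
 *
 *   /sys/class/iommu/dmar0/intel-iommu/version
 *   /sys/class/iommu/dmar0/intel-iommu/cap
 *   /sys/class/iommu/dmar0/intel-iommu/domains_used
 *
 * where "dmar0" is the iommu->name of the unit; the exact paths depend on
 * the sysfs layout of the running kernel.
 */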
4732
4733 static inline bool has_untrusted_dev(void)
4734 {
4735         struct pci_dev *pdev = NULL;
4736
4737         for_each_pci_dev(pdev)
4738                 if (pdev->untrusted)
4739                         return true;
4740
4741         return false;
4742 }
4743
4744 static int __init platform_optin_force_iommu(void)
4745 {
4746         if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4747                 return 0;
4748
4749         if (no_iommu || dmar_disabled)
4750                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4751
4752         /*
4753          * If Intel-IOMMU is disabled by default, we will apply identity
4754          * map for all devices except those marked as being untrusted.
4755          */
4756         if (dmar_disabled)
4757                 iommu_set_default_passthrough(false);
4758
4759         dmar_disabled = 0;
4760         no_iommu = 0;
4761
4762         return 1;
4763 }
4764
4765 static int __init probe_acpi_namespace_devices(void)
4766 {
4767         struct dmar_drhd_unit *drhd;
4768         /* To avoid a -Wunused-but-set-variable warning. */
4769         struct intel_iommu *iommu __maybe_unused;
4770         struct device *dev;
4771         int i, ret = 0;
4772
4773         for_each_active_iommu(iommu, drhd) {
4774                 for_each_active_dev_scope(drhd->devices,
4775                                           drhd->devices_cnt, i, dev) {
4776                         struct acpi_device_physical_node *pn;
4777                         struct iommu_group *group;
4778                         struct acpi_device *adev;
4779
4780                         if (dev->bus != &acpi_bus_type)
4781                                 continue;
4782
4783                         adev = to_acpi_device(dev);
4784                         mutex_lock(&adev->physical_node_lock);
4785                         list_for_each_entry(pn,
4786                                             &adev->physical_node_list, node) {
4787                                 group = iommu_group_get(pn->dev);
4788                                 if (group) {
4789                                         iommu_group_put(group);
4790                                         continue;
4791                                 }
4792
4793                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4794                                 ret = iommu_probe_device(pn->dev);
4795                                 if (ret)
4796                                         break;
4797                         }
4798                         mutex_unlock(&adev->physical_node_lock);
4799
4800                         if (ret)
4801                                 return ret;
4802                 }
4803         }
4804
4805         return 0;
4806 }
4807
4808 int __init intel_iommu_init(void)
4809 {
4810         int ret = -ENODEV;
4811         struct dmar_drhd_unit *drhd;
4812         struct intel_iommu *iommu;
4813
4814         /*
4815          * Intel IOMMU is required for a TXT/tboot launch or platform
4816          * opt in, so enforce that.
4817          */
4818         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4819
4820         if (iommu_init_mempool()) {
4821                 if (force_on)
4822                         panic("tboot: Failed to initialize iommu memory\n");
4823                 return -ENOMEM;
4824         }
4825
4826         down_write(&dmar_global_lock);
4827         if (dmar_table_init()) {
4828                 if (force_on)
4829                         panic("tboot: Failed to initialize DMAR table\n");
4830                 goto out_free_dmar;
4831         }
4832
4833         if (dmar_dev_scope_init() < 0) {
4834                 if (force_on)
4835                         panic("tboot: Failed to initialize DMAR device scope\n");
4836                 goto out_free_dmar;
4837         }
4838
4839         up_write(&dmar_global_lock);
4840
4841         /*
4842          * The bus notifier takes the dmar_global_lock, so lockdep will
4843          * complain later when we register it under the lock.
4844          */
4845         dmar_register_bus_notifier();
4846
4847         down_write(&dmar_global_lock);
4848
4849         if (!no_iommu)
4850                 intel_iommu_debugfs_init();
4851
4852         if (no_iommu || dmar_disabled) {
4853                 /*
4854                  * We exit the function here to ensure the IOMMU's remapping and
4855                  * mempool aren't set up, which means the IOMMU's PMRs won't be
4856                  * disabled via the call to init_dmars(). So disable them
4857                  * explicitly here. The PMRs were set up by tboot prior to
4858                  * calling SENTER, but the kernel is expected to reset/tear
4859                  * down the PMRs.
4860                  */
4861                 if (intel_iommu_tboot_noforce) {
4862                         for_each_iommu(iommu, drhd)
4863                                 iommu_disable_protect_mem_regions(iommu);
4864                 }
4865
4866                 /*
4867                  * Make sure the IOMMUs are switched off, even when we
4868                  * boot into a kexec kernel and the previous kernel left
4869                  * them enabled
4870                  */
4871                 intel_disable_iommus();
4872                 goto out_free_dmar;
4873         }
4874
4875         if (list_empty(&dmar_rmrr_units))
4876                 pr_info("No RMRR found\n");
4877
4878         if (list_empty(&dmar_atsr_units))
4879                 pr_info("No ATSR found\n");
4880
4881         if (dmar_init_reserved_ranges()) {
4882                 if (force_on)
4883                         panic("tboot: Failed to reserve iommu ranges\n");
4884                 goto out_free_reserved_range;
4885         }
4886
4887         if (dmar_map_gfx)
4888                 intel_iommu_gfx_mapped = 1;
4889
4890         init_no_remapping_devices();
4891
4892         ret = init_dmars();
4893         if (ret) {
4894                 if (force_on)
4895                         panic("tboot: Failed to initialize DMARs\n");
4896                 pr_err("Initialization failed\n");
4897                 goto out_free_reserved_range;
4898         }
4899         up_write(&dmar_global_lock);
4900
4901         init_iommu_pm_ops();
4902
4903         down_read(&dmar_global_lock);
4904         for_each_active_iommu(iommu, drhd) {
4905                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4906                                        intel_iommu_groups,
4907                                        "%s", iommu->name);
4908                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4909                 iommu_device_register(&iommu->iommu);
4910         }
4911         up_read(&dmar_global_lock);
4912
4913         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4914         if (si_domain && !hw_pass_through)
4915                 register_memory_notifier(&intel_iommu_memory_nb);
4916         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4917                           intel_iommu_cpu_dead);
4918
4919         down_read(&dmar_global_lock);
4920         if (probe_acpi_namespace_devices())
4921                 pr_warn("ACPI name space devices didn't probe correctly\n");
4922
4923         /* Finally, we enable the DMA remapping hardware. */
4924         for_each_iommu(iommu, drhd) {
4925                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4926                         iommu_enable_translation(iommu);
4927
4928                 iommu_disable_protect_mem_regions(iommu);
4929         }
4930         up_read(&dmar_global_lock);
4931
4932         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4933
4934         intel_iommu_enabled = 1;
4935
4936         return 0;
4937
4938 out_free_reserved_range:
4939         put_iova_domain(&reserved_iova_list);
4940 out_free_dmar:
4941         intel_iommu_free_dmars();
4942         up_write(&dmar_global_lock);
4943         iommu_exit_mempool();
4944         return ret;
4945 }
4946
4947 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4948 {
4949         struct intel_iommu *iommu = opaque;
4950
4951         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4952         return 0;
4953 }
4954
4955 /*
4956  * NB - intel-iommu lacks any sort of reference counting for the users of
4957  * dependent devices.  If multiple endpoints have intersecting dependent
4958  * devices, unbinding the driver from any one of them will possibly leave
4959  * the others unable to operate.
4960  */
4961 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4962 {
4963         if (!iommu || !dev || !dev_is_pci(dev))
4964                 return;
4965
4966         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4967 }
4968
4969 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4970 {
4971         struct dmar_domain *domain;
4972         struct intel_iommu *iommu;
4973         unsigned long flags;
4974
4975         assert_spin_locked(&device_domain_lock);
4976
4977         if (WARN_ON(!info))
4978                 return;
4979
4980         iommu = info->iommu;
4981         domain = info->domain;
4982
4983         if (info->dev) {
4984                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4985                         intel_pasid_tear_down_entry(iommu, info->dev,
4986                                         PASID_RID2PASID, false);
4987
4988                 iommu_disable_dev_iotlb(info);
4989                 if (!dev_is_real_dma_subdevice(info->dev))
4990                         domain_context_clear(iommu, info->dev);
4991                 intel_pasid_free_table(info->dev);
4992         }
4993
4994         unlink_domain_info(info);
4995
4996         spin_lock_irqsave(&iommu->lock, flags);
4997         domain_detach_iommu(domain, iommu);
4998         spin_unlock_irqrestore(&iommu->lock, flags);
4999
5000         free_devinfo_mem(info);
5001 }
5002
5003 static void dmar_remove_one_dev_info(struct device *dev)
5004 {
5005         struct device_domain_info *info;
5006         unsigned long flags;
5007
5008         spin_lock_irqsave(&device_domain_lock, flags);
5009         info = get_domain_info(dev);
5010         if (info)
5011                 __dmar_remove_one_dev_info(info);
5012         spin_unlock_irqrestore(&device_domain_lock, flags);
5013 }
5014
5015 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5016 {
5017         int adjust_width;
5018
5019         /* calculate AGAW */
5020         domain->gaw = guest_width;
5021         adjust_width = guestwidth_to_adjustwidth(guest_width);
5022         domain->agaw = width_to_agaw(adjust_width);
5023
5024         domain->iommu_coherency = 0;
5025         domain->iommu_snooping = 0;
5026         domain->iommu_superpage = 0;
5027         domain->max_addr = 0;
5028
5029         /* always allocate the top pgd */
5030         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5031         if (!domain->pgd)
5032                 return -ENOMEM;
5033         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5034         return 0;
5035 }
5036
5037 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5038 {
5039         init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5040         copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5041
5042         if (!intel_iommu_strict &&
5043             init_iova_flush_queue(&dmar_domain->iovad,
5044                                   iommu_flush_iova, iova_entry_free))
5045                 pr_info("iova flush queue initialization failed\n");
5046 }
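
/*
 * Unless intel_iommu_strict is set, the flush queue initialized above lets
 * IOTLB invalidation and IOVA freeing be deferred (lazy unmapping), with
 * iommu_flush_iova() and iova_entry_free() doing the eventual flush and
 * release.
 */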
5047
5048 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5049 {
5050         struct dmar_domain *dmar_domain;
5051         struct iommu_domain *domain;
5052
5053         switch (type) {
5054         case IOMMU_DOMAIN_DMA:
5055         /* fallthrough */
5056         case IOMMU_DOMAIN_UNMANAGED:
5057                 dmar_domain = alloc_domain(0);
5058                 if (!dmar_domain) {
5059                         pr_err("Can't allocate dmar_domain\n");
5060                         return NULL;
5061                 }
5062                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5063                         pr_err("Domain initialization failed\n");
5064                         domain_exit(dmar_domain);
5065                         return NULL;
5066                 }
5067
5068                 if (type == IOMMU_DOMAIN_DMA)
5069                         intel_init_iova_domain(dmar_domain);
5070
5071                 domain_update_iommu_cap(dmar_domain);
5072
5073                 domain = &dmar_domain->domain;
5074                 domain->geometry.aperture_start = 0;
5075                 domain->geometry.aperture_end   =
5076                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5077                 domain->geometry.force_aperture = true;
5078
5079                 return domain;
5080         case IOMMU_DOMAIN_IDENTITY:
5081                 return &si_domain->domain;
5082         default:
5083                 return NULL;
5084         }
5085
5086         return NULL;
5087 }
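
/*
 * Note that IOMMU_DOMAIN_IDENTITY hands back the single global si_domain
 * instead of allocating a new domain, which is why the domain_free callback
 * below deliberately skips it.
 */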
5088
5089 static void intel_iommu_domain_free(struct iommu_domain *domain)
5090 {
5091         if (domain != &si_domain->domain)
5092                 domain_exit(to_dmar_domain(domain));
5093 }
5094
5095 /*
5096  * Check whether a @domain could be attached to the @dev through the
5097  * aux-domain attach/detach APIs.
5098  */
5099 static inline bool
5100 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5101 {
5102         struct device_domain_info *info = get_domain_info(dev);
5103
5104         return info && info->auxd_enabled &&
5105                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5106 }
5107
5108 static void auxiliary_link_device(struct dmar_domain *domain,
5109                                   struct device *dev)
5110 {
5111         struct device_domain_info *info = get_domain_info(dev);
5112
5113         assert_spin_locked(&device_domain_lock);
5114         if (WARN_ON(!info))
5115                 return;
5116
5117         domain->auxd_refcnt++;
5118         list_add(&domain->auxd, &info->auxiliary_domains);
5119 }
5120
5121 static void auxiliary_unlink_device(struct dmar_domain *domain,
5122                                     struct device *dev)
5123 {
5124         struct device_domain_info *info = get_domain_info(dev);
5125
5126         assert_spin_locked(&device_domain_lock);
5127         if (WARN_ON(!info))
5128                 return;
5129
5130         list_del(&domain->auxd);
5131         domain->auxd_refcnt--;
5132
5133         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5134                 ioasid_free(domain->default_pasid);
5135 }
5136
5137 static int aux_domain_add_dev(struct dmar_domain *domain,
5138                               struct device *dev)
5139 {
5140         int ret;
5141         u8 bus, devfn;
5142         unsigned long flags;
5143         struct intel_iommu *iommu;
5144
5145         iommu = device_to_iommu(dev, &bus, &devfn);
5146         if (!iommu)
5147                 return -ENODEV;
5148
5149         if (domain->default_pasid <= 0) {
5150                 int pasid;
5151
5152                 /* No private data needed for the default pasid */
5153                 pasid = ioasid_alloc(NULL, PASID_MIN,
5154                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5155                                      NULL);
5156                 if (pasid == INVALID_IOASID) {
5157                         pr_err("Can't allocate default pasid\n");
5158                         return -ENODEV;
5159                 }
5160                 domain->default_pasid = pasid;
5161         }
5162
5163         spin_lock_irqsave(&device_domain_lock, flags);
5164         /*
5165          * iommu->lock must be held to attach domain to iommu and set up the
5166          * PASID entry for second-level translation.
5167          */
5168         spin_lock(&iommu->lock);
5169         ret = domain_attach_iommu(domain, iommu);
5170         if (ret)
5171                 goto attach_failed;
5172
5173         /* Set up the PASID entry for mediated devices: */
5174         if (domain_use_first_level(domain))
5175                 ret = domain_setup_first_level(iommu, domain, dev,
5176                                                domain->default_pasid);
5177         else
5178                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5179                                                      domain->default_pasid);
5180         if (ret)
5181                 goto table_failed;
5182         spin_unlock(&iommu->lock);
5183
5184         auxiliary_link_device(domain, dev);
5185
5186         spin_unlock_irqrestore(&device_domain_lock, flags);
5187
5188         return 0;
5189
5190 table_failed:
5191         domain_detach_iommu(domain, iommu);
5192 attach_failed:
5193         spin_unlock(&iommu->lock);
5194         spin_unlock_irqrestore(&device_domain_lock, flags);
5195         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5196                 ioasid_free(domain->default_pasid);
5197
5198         return ret;
5199 }
5200
5201 static void aux_domain_remove_dev(struct dmar_domain *domain,
5202                                   struct device *dev)
5203 {
5204         struct device_domain_info *info;
5205         struct intel_iommu *iommu;
5206         unsigned long flags;
5207
5208         if (!is_aux_domain(dev, &domain->domain))
5209                 return;
5210
5211         spin_lock_irqsave(&device_domain_lock, flags);
5212         info = get_domain_info(dev);
5213         iommu = info->iommu;
5214
5215         auxiliary_unlink_device(domain, dev);
5216
5217         spin_lock(&iommu->lock);
5218         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5219         domain_detach_iommu(domain, iommu);
5220         spin_unlock(&iommu->lock);
5221
5222         spin_unlock_irqrestore(&device_domain_lock, flags);
5223 }
5224
5225 static int prepare_domain_attach_device(struct iommu_domain *domain,
5226                                         struct device *dev)
5227 {
5228         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5229         struct intel_iommu *iommu;
5230         int addr_width;
5231         u8 bus, devfn;
5232
5233         iommu = device_to_iommu(dev, &bus, &devfn);
5234         if (!iommu)
5235                 return -ENODEV;
5236
5237         /* check if this iommu agaw is sufficient for max mapped address */
5238         addr_width = agaw_to_width(iommu->agaw);
5239         if (addr_width > cap_mgaw(iommu->cap))
5240                 addr_width = cap_mgaw(iommu->cap);
5241
5242         if (dmar_domain->max_addr > (1LL << addr_width)) {
5243                 dev_err(dev, "%s: iommu width (%d) is not "
5244                         "sufficient for the mapped address (%llx)\n",
5245                         __func__, addr_width, dmar_domain->max_addr);
5246                 return -EFAULT;
5247         }
5248         dmar_domain->gaw = addr_width;
5249
5250         /*
5251          * Knock out extra levels of page tables if necessary
5252          */
5253         while (iommu->agaw < dmar_domain->agaw) {
5254                 struct dma_pte *pte;
5255
5256                 pte = dmar_domain->pgd;
5257                 if (dma_pte_present(pte)) {
5258                         dmar_domain->pgd = (struct dma_pte *)
5259                                 phys_to_virt(dma_pte_addr(pte));
5260                         free_pgtable_page(pte);
5261                 }
5262                 dmar_domain->agaw--;
5263         }
5264
5265         return 0;
5266 }
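
/*
 * Illustration of the AGAW adjustment above: a domain built with 4-level
 * page tables (48-bit address width) attached to an IOMMU that only
 * supports 3-level tables (39-bit) has its top-level pgd popped once, so
 * both sides agree on the page-table depth before the device is attached.
 * The concrete level/width pairs follow the VT-d spec and are illustrative.
 */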
5267
5268 static int intel_iommu_attach_device(struct iommu_domain *domain,
5269                                      struct device *dev)
5270 {
5271         int ret;
5272
5273         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5274             device_is_rmrr_locked(dev)) {
5275                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5276                 return -EPERM;
5277         }
5278
5279         if (is_aux_domain(dev, domain))
5280                 return -EPERM;
5281
5282         /* normally dev is not mapped */
5283         if (unlikely(domain_context_mapped(dev))) {
5284                 struct dmar_domain *old_domain;
5285
5286                 old_domain = find_domain(dev);
5287                 if (old_domain)
5288                         dmar_remove_one_dev_info(dev);
5289         }
5290
5291         ret = prepare_domain_attach_device(domain, dev);
5292         if (ret)
5293                 return ret;
5294
5295         return domain_add_dev_info(to_dmar_domain(domain), dev);
5296 }
5297
5298 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5299                                          struct device *dev)
5300 {
5301         int ret;
5302
5303         if (!is_aux_domain(dev, domain))
5304                 return -EPERM;
5305
5306         ret = prepare_domain_attach_device(domain, dev);
5307         if (ret)
5308                 return ret;
5309
5310         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5311 }
5312
5313 static void intel_iommu_detach_device(struct iommu_domain *domain,
5314                                       struct device *dev)
5315 {
5316         dmar_remove_one_dev_info(dev);
5317 }
5318
5319 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5320                                           struct device *dev)
5321 {
5322         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5323 }
5324
5325 /*
5326  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5327  * VT-d granularity. Invalidation is typically included in the unmap operation
5328  * as a result of DMA or VFIO unmap. However, for assigned devices guest
5329  * owns the first level page tables. Invalidations of translation caches in the
5330  * guest are trapped and passed down to the host.
5331  *
5332  * vIOMMU in the guest will only expose first level page tables, therefore
5333  * we do not support IOTLB granularity for requests without PASID (second level).
5334  *
5335  * For example, to find the VT-d granularity encoding for IOTLB
5336  * type and page selective granularity within PASID:
5337  * X: indexed by iommu cache type
5338  * Y: indexed by enum iommu_inv_granularity
5339  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5340  */
5341
5342 static const int
5343 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5344         /*
5345          * PASID based IOTLB invalidation: PASID selective (per PASID),
5346          * page selective (address granularity)
5347          */
5348         {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5349         /* PASID based dev TLBs */
5350         {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5351         /* PASID cache */
5352         {-EINVAL, -EINVAL, -EINVAL}
5353 };
5354
5355 static inline int to_vtd_granularity(int type, int granu)
5356 {
5357         return inv_type_granu_table[type][granu];
5358 }
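
/*
 * For example, an IOTLB invalidation (cache type bit 0) requested with
 * IOMMU_INV_GRANU_ADDR granularity maps to QI_GRAN_PSI_PASID, i.e.
 * page-selective-within-PASID; combinations marked -EINVAL in the table
 * are rejected by the caller.
 */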
5359
5360 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5361 {
5362         u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5363
5364         /* VT-d size is encoded as 2^size of 4K pages: 0 for 4KiB, 9 for 2MiB, etc.
5365          * The IOMMU cache invalidate API passes granu_size in bytes and the
5366          * number of granules of that size in contiguous memory.
5367          */
5368         return order_base_2(nr_pages);
5369 }
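
/*
 * Worked example of the encoding above: granu_size = 4KiB and
 * nr_granules = 512 give nr_pages = 512, and order_base_2(512) = 9,
 * i.e. the 2MiB case mentioned in the comment.
 */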
5370
5371 #ifdef CONFIG_INTEL_IOMMU_SVM
5372 static int
5373 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5374                            struct iommu_cache_invalidate_info *inv_info)
5375 {
5376         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5377         struct device_domain_info *info;
5378         struct intel_iommu *iommu;
5379         unsigned long flags;
5380         int cache_type;
5381         u8 bus, devfn;
5382         u16 did, sid;
5383         int ret = 0;
5384         u64 size = 0;
5385
5386         if (!inv_info || !dmar_domain ||
5387             inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
5388                 return -EINVAL;
5389
5390         if (!dev || !dev_is_pci(dev))
5391                 return -ENODEV;
5392
5393         iommu = device_to_iommu(dev, &bus, &devfn);
5394         if (!iommu)
5395                 return -ENODEV;
5396
5397         if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5398                 return -EINVAL;
5399
5400         spin_lock_irqsave(&device_domain_lock, flags);
5401         spin_lock(&iommu->lock);
5402         info = get_domain_info(dev);
5403         if (!info) {
5404                 ret = -EINVAL;
5405                 goto out_unlock;
5406         }
5407         did = dmar_domain->iommu_did[iommu->seq_id];
5408         sid = PCI_DEVID(bus, devfn);
5409
5410         /* Size is only valid in address selective invalidation */
5411         if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
5412                 size = to_vtd_size(inv_info->addr_info.granule_size,
5413                                    inv_info->addr_info.nb_granules);
5414
5415         for_each_set_bit(cache_type,
5416                          (unsigned long *)&inv_info->cache,
5417                          IOMMU_CACHE_INV_TYPE_NR) {
5418                 int granu = 0;
5419                 u64 pasid = 0;
5420
5421                 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5422                 if (granu == -EINVAL) {
5423                         pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5424                                            cache_type, inv_info->granularity);
5425                         break;
5426                 }
5427
5428                 /*
5429                  * PASID is stored in different locations based on the
5430                  * granularity.
5431                  */
5432                 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5433                     (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5434                         pasid = inv_info->pasid_info.pasid;
5435                 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5436                          (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5437                         pasid = inv_info->addr_info.pasid;
5438
5439                 switch (BIT(cache_type)) {
5440                 case IOMMU_CACHE_INV_TYPE_IOTLB:
5441                         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5442                             size &&
5443                             (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5444                                 pr_err_ratelimited("Address out of range, 0x%llx, size order %llu\n",
5445                                                    inv_info->addr_info.addr, size);
5446                                 ret = -ERANGE;
5447                                 goto out_unlock;
5448                         }
5449
5450                         /*
5451                          * If granu is PASID-selective, address is ignored.
5452                          * We use npages = -1 to indicate that.
5453                          */
5454                         qi_flush_piotlb(iommu, did, pasid,
5455                                         mm_to_dma_pfn(inv_info->addr_info.addr),
5456                                         (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5457                                         inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5458
5459                         /*
5460                          * Always flush device IOTLB if ATS is enabled. vIOMMU
5461                          * in the guest may assume IOTLB flush is inclusive,
5462                          * which is more efficient.
5463                          */
5464                         if (info->ats_enabled)
5465                                 qi_flush_dev_iotlb_pasid(iommu, sid,
5466                                                 info->pfsid, pasid,
5467                                                 info->ats_qdep,
5468                                                 inv_info->addr_info.addr,
5469                                                 size, granu);
5470                         break;
5471                 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5472                         if (info->ats_enabled)
5473                                 qi_flush_dev_iotlb_pasid(iommu, sid,
5474                                                 info->pfsid, pasid,
5475                                                 info->ats_qdep,
5476                                                 inv_info->addr_info.addr,
5477                                                 size, granu);
5478                         else
5479                                 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5480                         break;
5481                 default:
5482                         dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5483                                             cache_type);
5484                         ret = -EINVAL;
5485                 }
5486         }
5487 out_unlock:
5488         spin_unlock(&iommu->lock);
5489         spin_unlock_irqrestore(&device_domain_lock, flags);
5490
5491         return ret;
5492 }
5493 #endif
5494
5495 static int intel_iommu_map(struct iommu_domain *domain,
5496                            unsigned long iova, phys_addr_t hpa,
5497                            size_t size, int iommu_prot, gfp_t gfp)
5498 {
5499         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5500         u64 max_addr;
5501         int prot = 0;
5502         int ret;
5503
5504         if (iommu_prot & IOMMU_READ)
5505                 prot |= DMA_PTE_READ;
5506         if (iommu_prot & IOMMU_WRITE)
5507                 prot |= DMA_PTE_WRITE;
5508         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5509                 prot |= DMA_PTE_SNP;
5510
5511         max_addr = iova + size;
5512         if (dmar_domain->max_addr < max_addr) {
5513                 u64 end;
5514
5515                 /* check if minimum agaw is sufficient for mapped address */
5516                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5517                 if (end < max_addr) {
5518                         pr_err("%s: iommu width (%d) is not "
5519                                "sufficient for the mapped address (%llx)\n",
5520                                __func__, dmar_domain->gaw, max_addr);
5521                         return -EFAULT;
5522                 }
5523                 dmar_domain->max_addr = max_addr;
5524         }
5525         /* Round up size to next multiple of PAGE_SIZE, if it and
5526            the low bits of hpa would take us onto the next page */
5527         size = aligned_nrpages(hpa, size);
5528         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5529                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5530         return ret;
5531 }
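
/*
 * A typical path into the callback above is the generic mapping API, e.g.
 *
 *   iommu_map(domain, iova, paddr, SZ_2M, IOMMU_READ | IOMMU_WRITE);
 *
 * where the core splits the request according to the driver's page-size
 * bitmap and the IOMMU_* prot flags are translated into DMA_PTE_READ and
 * DMA_PTE_WRITE (plus DMA_PTE_SNP when snooping is supported) before the
 * PTEs are written.
 */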
5532
5533 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5534                                 unsigned long iova, size_t size,
5535                                 struct iommu_iotlb_gather *gather)
5536 {
5537         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5538         struct page *freelist = NULL;
5539         unsigned long start_pfn, last_pfn;
5540         unsigned int npages;
5541         int iommu_id, level = 0;
5542
5543         /* Cope with horrid API which requires us to unmap more than the
5544            size argument if it happens to be a large-page mapping. */
5545         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5546
5547         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5548                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5549
5550         start_pfn = iova >> VTD_PAGE_SHIFT;
5551         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5552
5553         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5554
5555         npages = last_pfn - start_pfn + 1;
5556
5557         for_each_domain_iommu(iommu_id, dmar_domain)
5558                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5559                                       start_pfn, npages, !freelist, 0);
5560
5561         dma_free_pagelist(freelist);
5562
5563         if (dmar_domain->max_addr == iova + size)
5564                 dmar_domain->max_addr = iova;
5565
5566         return size;
5567 }
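
/*
 * Note on the large-page handling above: when the IOVA hits a superpage
 * PTE, the unmap is widened to that page size, e.g. a 4KiB unmap request
 * that lands inside a 2MiB mapping tears down and reports the full 2MiB,
 * which callers have to cope with.
 */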
5568
5569 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5570                                             dma_addr_t iova)
5571 {
5572         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5573         struct dma_pte *pte;
5574         int level = 0;
5575         u64 phys = 0;
5576
5577         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5578         if (pte && dma_pte_present(pte))
5579                 phys = dma_pte_addr(pte) +
5580                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5581                                                 VTD_PAGE_SHIFT) - 1));
5582
5583         return phys;
5584 }
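
/*
 * The offset math above keeps the IOVA bits below the leaf level: with
 * LEVEL_STRIDE = 9 and VTD_PAGE_SHIFT = 12, a level-1 (4KiB) PTE keeps
 * the low 12 bits and a level-2 (2MiB) PTE the low 21 bits of the IOVA
 * as the offset into the mapped physical range.
 */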
5585
5586 static inline bool scalable_mode_support(void)
5587 {
5588         struct dmar_drhd_unit *drhd;
5589         struct intel_iommu *iommu;
5590         bool ret = true;
5591
5592         rcu_read_lock();
5593         for_each_active_iommu(iommu, drhd) {
5594                 if (!sm_supported(iommu)) {
5595                         ret = false;
5596                         break;
5597                 }
5598         }
5599         rcu_read_unlock();
5600
5601         return ret;
5602 }
5603
5604 static inline bool iommu_pasid_support(void)
5605 {
5606         struct dmar_drhd_unit *drhd;
5607         struct intel_iommu *iommu;
5608         bool ret = true;
5609
5610         rcu_read_lock();
5611         for_each_active_iommu(iommu, drhd) {
5612                 if (!pasid_supported(iommu)) {
5613                         ret = false;
5614                         break;
5615                 }
5616         }
5617         rcu_read_unlock();
5618
5619         return ret;
5620 }
5621
5622 static inline bool nested_mode_support(void)
5623 {
5624         struct dmar_drhd_unit *drhd;
5625         struct intel_iommu *iommu;
5626         bool ret = true;
5627
5628         rcu_read_lock();
5629         for_each_active_iommu(iommu, drhd) {
5630                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5631                         ret = false;
5632                         break;
5633                 }
5634         }
5635         rcu_read_unlock();
5636
5637         return ret;
5638 }
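
/*
 * Each of the three helpers above requires the capability on every active
 * IOMMU, so a single unit lacking scalable mode, PASID or nesting support
 * disables the corresponding feature system-wide.
 */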
5639
5640 static bool intel_iommu_capable(enum iommu_cap cap)
5641 {
5642         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5643                 return domain_update_iommu_snooping(NULL) == 1;
5644         if (cap == IOMMU_CAP_INTR_REMAP)
5645                 return irq_remapping_enabled == 1;
5646
5647         return false;
5648 }
5649
5650 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5651 {
5652         struct intel_iommu *iommu;
5653         u8 bus, devfn;
5654
5655         iommu = device_to_iommu(dev, &bus, &devfn);
5656         if (!iommu)
5657                 return ERR_PTR(-ENODEV);
5658
5659         if (translation_pre_enabled(iommu))
5660                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5661
5662         return &iommu->iommu;
5663 }
5664
5665 static void intel_iommu_release_device(struct device *dev)
5666 {
5667         struct intel_iommu *iommu;
5668         u8 bus, devfn;
5669
5670         iommu = device_to_iommu(dev, &bus, &devfn);
5671         if (!iommu)
5672                 return;
5673
5674         dmar_remove_one_dev_info(dev);
5675
5676         set_dma_ops(dev, NULL);
5677 }
5678
5679 static void intel_iommu_probe_finalize(struct device *dev)
5680 {
5681         struct iommu_domain *domain;
5682
5683         domain = iommu_get_domain_for_dev(dev);
5684         if (device_needs_bounce(dev))
5685                 set_dma_ops(dev, &bounce_dma_ops);
5686         else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5687                 set_dma_ops(dev, &intel_dma_ops);
5688         else
5689                 set_dma_ops(dev, NULL);
5690 }
5691
5692 static void intel_iommu_get_resv_regions(struct device *device,
5693                                          struct list_head *head)
5694 {
5695         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5696         struct iommu_resv_region *reg;
5697         struct dmar_rmrr_unit *rmrr;
5698         struct device *i_dev;
5699         int i;
5700
5701         down_read(&dmar_global_lock);
5702         for_each_rmrr_units(rmrr) {
5703                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5704                                           i, i_dev) {
5705                         struct iommu_resv_region *resv;
5706                         enum iommu_resv_type type;
5707                         size_t length;
5708
5709                         if (i_dev != device &&
5710                             !is_downstream_to_pci_bridge(device, i_dev))
5711                                 continue;
5712
5713                         length = rmrr->end_address - rmrr->base_address + 1;
5714
5715                         type = device_rmrr_is_relaxable(device) ?
5716                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5717
5718                         resv = iommu_alloc_resv_region(rmrr->base_address,
5719                                                        length, prot, type);
5720                         if (!resv)
5721                                 break;
5722
5723                         list_add_tail(&resv->list, head);
5724                 }
5725         }
5726         up_read(&dmar_global_lock);
5727
5728 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5729         if (dev_is_pci(device)) {
5730                 struct pci_dev *pdev = to_pci_dev(device);
5731
5732                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5733                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5734                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5735                         if (reg)
5736                                 list_add_tail(&reg->list, head);
5737                 }
5738         }
5739 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5740
5741         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5742                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5743                                       0, IOMMU_RESV_MSI);
5744         if (!reg)
5745                 return;
5746         list_add_tail(&reg->list, head);
5747 }
5748
5749 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5750 {
5751         struct device_domain_info *info;
5752         struct context_entry *context;
5753         struct dmar_domain *domain;
5754         unsigned long flags;
5755         u64 ctx_lo;
5756         int ret;
5757
5758         domain = find_domain(dev);
5759         if (!domain)
5760                 return -EINVAL;
5761
5762         spin_lock_irqsave(&device_domain_lock, flags);
5763         spin_lock(&iommu->lock);
5764
5765         ret = -EINVAL;
5766         info = get_domain_info(dev);
5767         if (!info || !info->pasid_supported)
5768                 goto out;
5769
5770         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5771         if (WARN_ON(!context))
5772                 goto out;
5773
5774         ctx_lo = context[0].lo;
5775
5776         if (!(ctx_lo & CONTEXT_PASIDE)) {
5777                 ctx_lo |= CONTEXT_PASIDE;
5778                 context[0].lo = ctx_lo;
5779                 wmb();
5780                 iommu->flush.flush_context(iommu,
5781                                            domain->iommu_did[iommu->seq_id],
5782                                            PCI_DEVID(info->bus, info->devfn),
5783                                            DMA_CCMD_MASK_NOBIT,
5784                                            DMA_CCMD_DEVICE_INVL);
5785         }
5786
5787         /* Enable PASID support in the device, if it wasn't already */
5788         if (!info->pasid_enabled)
5789                 iommu_enable_dev_iotlb(info);
5790
5791         ret = 0;
5792
5793  out:
5794         spin_unlock(&iommu->lock);
5795         spin_unlock_irqrestore(&device_domain_lock, flags);
5796
5797         return ret;
5798 }
5799
5800 static void intel_iommu_apply_resv_region(struct device *dev,
5801                                           struct iommu_domain *domain,
5802                                           struct iommu_resv_region *region)
5803 {
5804         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5805         unsigned long start, end;
5806
5807         start = IOVA_PFN(region->start);
5808         end   = IOVA_PFN(region->start + region->length - 1);
5809
5810         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5811 }
5812
5813 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5814 {
5815         if (dev_is_pci(dev))
5816                 return pci_device_group(dev);
5817         return generic_device_group(dev);
5818 }
5819
5820 #ifdef CONFIG_INTEL_IOMMU_SVM
5821 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5822 {
5823         struct intel_iommu *iommu;
5824         u8 bus, devfn;
5825
5826         if (iommu_dummy(dev)) {
5827                 dev_warn(dev,
5828                          "No IOMMU translation for device; cannot enable SVM\n");
5829                 return NULL;
5830         }
5831
5832         iommu = device_to_iommu(dev, &bus, &devfn);
5833         if (!iommu) {
5834                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5835                 return NULL;
5836         }
5837
5838         return iommu;
5839 }
5840 #endif /* CONFIG_INTEL_IOMMU_SVM */
5841
5842 static int intel_iommu_enable_auxd(struct device *dev)
5843 {
5844         struct device_domain_info *info;
5845         struct intel_iommu *iommu;
5846         unsigned long flags;
5847         u8 bus, devfn;
5848         int ret;
5849
5850         iommu = device_to_iommu(dev, &bus, &devfn);
5851         if (!iommu || dmar_disabled)
5852                 return -EINVAL;
5853
5854         if (!sm_supported(iommu) || !pasid_supported(iommu))
5855                 return -EINVAL;
5856
5857         ret = intel_iommu_enable_pasid(iommu, dev);
5858         if (ret)
5859                 return -ENODEV;
5860
5861         spin_lock_irqsave(&device_domain_lock, flags);
5862         info = get_domain_info(dev);
5863         info->auxd_enabled = 1;
5864         spin_unlock_irqrestore(&device_domain_lock, flags);
5865
5866         return 0;
5867 }
5868
5869 static int intel_iommu_disable_auxd(struct device *dev)
5870 {
5871         struct device_domain_info *info;
5872         unsigned long flags;
5873
5874         spin_lock_irqsave(&device_domain_lock, flags);
5875         info = get_domain_info(dev);
5876         if (!WARN_ON(!info))
5877                 info->auxd_enabled = 0;
5878         spin_unlock_irqrestore(&device_domain_lock, flags);
5879
5880         return 0;
5881 }
5882
5883 /*
5884  * A PCI Express designated vendor specific extended capability (DVSEC) is
5885  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5886  * spec so that system software and tools can detect endpoint devices that
5887  * support Intel Scalable I/O Virtualization without a host driver dependency.
5888  *
5889  * Returns the address of the matching extended capability structure within
5890  * the device's PCI configuration space or 0 if the device does not support
5891  * it.
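 *
 * Per the PCIe DVSEC layout, the vendor ID lives at offset 4 and the DVSEC
 * ID at offset 8 of the capability, which is what the config-space reads
 * below match against (Intel vendor ID, DVSEC ID 5 for Scalable IOV).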
5892  */
5893 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5894 {
5895         int pos;
5896         u16 vendor, id;
5897
5898         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5899         while (pos) {
5900                 pci_read_config_word(pdev, pos + 4, &vendor);
5901                 pci_read_config_word(pdev, pos + 8, &id);
5902                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5903                         return pos;
5904
5905                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5906         }
5907
5908         return 0;
5909 }
5910
5911 static bool
5912 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5913 {
5914         if (feat == IOMMU_DEV_FEAT_AUX) {
5915                 int ret;
5916
5917                 if (!dev_is_pci(dev) || dmar_disabled ||
5918                     !scalable_mode_support() || !iommu_pasid_support())
5919                         return false;
5920
5921                 ret = pci_pasid_features(to_pci_dev(dev));
5922                 if (ret < 0)
5923                         return false;
5924
5925                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5926         }
5927
5928         if (feat == IOMMU_DEV_FEAT_SVA) {
5929                 struct device_domain_info *info = get_domain_info(dev);
5930
5931                 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5932                         info->pasid_supported && info->pri_supported &&
5933                         info->ats_supported;
5934         }
5935
5936         return false;
5937 }
5938
5939 static int
5940 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5941 {
5942         if (feat == IOMMU_DEV_FEAT_AUX)
5943                 return intel_iommu_enable_auxd(dev);
5944
5945         if (feat == IOMMU_DEV_FEAT_SVA) {
5946                 struct device_domain_info *info = get_domain_info(dev);
5947
5948                 if (!info)
5949                         return -EINVAL;
5950
5951                 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5952                         return 0;
5953         }
5954
5955         return -ENODEV;
5956 }
5957
5958 static int
5959 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5960 {
5961         if (feat == IOMMU_DEV_FEAT_AUX)
5962                 return intel_iommu_disable_auxd(dev);
5963
5964         return -ENODEV;
5965 }
5966
5967 static bool
5968 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5969 {
5970         struct device_domain_info *info = get_domain_info(dev);
5971
5972         if (feat == IOMMU_DEV_FEAT_AUX)
5973                 return scalable_mode_support() && info && info->auxd_enabled;
5974
5975         return false;
5976 }
5977
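/*
 * Return the default PASID that was allocated when this aux-domain was
 * first attached to a device, or -EINVAL if none has been allocated.
 */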
5978 static int
5979 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5980 {
5981         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5982
5983         return dmar_domain->default_pasid > 0 ?
5984                         dmar_domain->default_pasid : -EINVAL;
5985 }
5986
5987 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5988                                            struct device *dev)
5989 {
5990         return attach_deferred(dev);
5991 }
5992
5993 static int
5994 intel_iommu_domain_set_attr(struct iommu_domain *domain,
5995                             enum iommu_attr attr, void *data)
5996 {
5997         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5998         unsigned long flags;
5999         int ret = 0;
6000
6001         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6002                 return -EINVAL;
6003
6004         switch (attr) {
6005         case DOMAIN_ATTR_NESTING:
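                /*
                 * Nesting can only be flipped on while no devices are
                 * attached: hardware must support nested translation and
                 * the domain's own mappings will then use second-level
                 * page tables only (first-level use is cleared).
                 */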
6006                 spin_lock_irqsave(&device_domain_lock, flags);
6007                 if (nested_mode_support() &&
6008                     list_empty(&dmar_domain->devices)) {
6009                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6010                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6011                 } else {
6012                         ret = -ENODEV;
6013                 }
6014                 spin_unlock_irqrestore(&device_domain_lock, flags);
6015                 break;
6016         default:
6017                 ret = -EINVAL;
6018                 break;
6019         }
6020
6021         return ret;
6022 }
6023
6024 const struct iommu_ops intel_iommu_ops = {
6025         .capable                = intel_iommu_capable,
6026         .domain_alloc           = intel_iommu_domain_alloc,
6027         .domain_free            = intel_iommu_domain_free,
6028         .domain_set_attr        = intel_iommu_domain_set_attr,
6029         .attach_dev             = intel_iommu_attach_device,
6030         .detach_dev             = intel_iommu_detach_device,
6031         .aux_attach_dev         = intel_iommu_aux_attach_device,
6032         .aux_detach_dev         = intel_iommu_aux_detach_device,
6033         .aux_get_pasid          = intel_iommu_aux_get_pasid,
6034         .map                    = intel_iommu_map,
6035         .unmap                  = intel_iommu_unmap,
6036         .iova_to_phys           = intel_iommu_iova_to_phys,
6037         .probe_device           = intel_iommu_probe_device,
6038         .probe_finalize         = intel_iommu_probe_finalize,
6039         .release_device         = intel_iommu_release_device,
6040         .get_resv_regions       = intel_iommu_get_resv_regions,
6041         .put_resv_regions       = generic_iommu_put_resv_regions,
6042         .apply_resv_region      = intel_iommu_apply_resv_region,
6043         .device_group           = intel_iommu_device_group,
6044         .dev_has_feat           = intel_iommu_dev_has_feat,
6045         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6046         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6047         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6048         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6049         .def_domain_type        = device_def_domain_type,
6050         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6051 #ifdef CONFIG_INTEL_IOMMU_SVM
6052         .cache_invalidate       = intel_iommu_sva_invalidate,
6053         .sva_bind_gpasid        = intel_svm_bind_gpasid,
6054         .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
6055         .sva_bind               = intel_svm_bind,
6056         .sva_unbind             = intel_svm_unbind,
6057         .sva_get_pasid          = intel_svm_get_pasid,
6058 #endif
6059 };
6060
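/*
 * For integrated GPUs whose DMA remapping is known to be broken, clear
 * dmar_map_gfx so the device is left identity-mapped instead of being
 * put behind the IOMMU.
 */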
6061 static void quirk_iommu_igfx(struct pci_dev *dev)
6062 {
6063         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6064         dmar_map_gfx = 0;
6065 }
6066
6067 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6068 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6069 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6070 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6071 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6072 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6073 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6074 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6075
6076 /* Broadwell igfx malfunctions with dmar */
6077 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6078 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6079 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6080 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6081 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6082 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6083 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6084 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6085 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6086 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6087 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6088 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6089 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6090 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6091 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6092 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6093 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6094 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6095 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6096 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6097 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6098 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6099 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6100 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6101
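/*
 * When the write-buffer flush (RWBF) capability is set, software must
 * flush the chipset write buffer before remapping-structure updates are
 * guaranteed visible to hardware.  Some chipsets need that flush but do
 * not advertise the capability, so rwbf_quirk forces it.
 */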
6102 static void quirk_iommu_rwbf(struct pci_dev *dev)
6103 {
6104         /*
6105          * Mobile 4 Series Chipset neglects to set RWBF capability,
6106          * but needs it. Same seems to hold for the desktop versions.
6107          */
6108         pci_info(dev, "Forcing write-buffer flush capability\n");
6109         rwbf_quirk = 1;
6110 }
6111
6112 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6113 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6114 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6115 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6116 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6117 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6118 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6119
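/*
 * GGC is the GMCH Graphics Control register in the host bridge's PCI
 * config space.  Per the definitions below, bits 11:8 encode the GTT
 * stolen memory size, including variants that also reserve space for a
 * VT-d (shadow) GTT.
 */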
6120 #define GGC 0x52
6121 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6122 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6123 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6124 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6125 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6126 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6127 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6128 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6129
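/*
 * On Ironlake/Calpella graphics, remapping GPU DMA only works if the
 * BIOS reserved VT-enabled stolen memory for a shadow GTT; if it did
 * not, keep graphics out of the IOMMU.  Even when it did, batched IOTLB
 * flushing is only safe while the GPU is idle, so force strict
 * (synchronous) invalidation.
 */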
6130 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6131 {
6132         unsigned short ggc;
6133
6134         if (pci_read_config_word(dev, GGC, &ggc))
6135                 return;
6136
6137         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6138                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6139                 dmar_map_gfx = 0;
6140         } else if (dmar_map_gfx) {
6141                 /* we have to ensure the gfx device is idle before we flush */
6142                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6143                 intel_iommu_strict = 1;
6144         }
6145 }
6146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6150
6151 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6152    ISOCH DMAR unit for the Azalia sound device, but not give it any
6153    TLB entries, which causes it to deadlock. Check for that.  We do
6154    this in a function called from init_dmars(), instead of in a PCI
6155    quirk, because we don't want to print the obnoxious "BIOS broken"
6156    message if VT-d is actually disabled.
6157 */
6158 static void __init check_tylersburg_isoch(void)
6159 {
6160         struct pci_dev *pdev;
6161         uint32_t vtisochctrl;
6162
6163         /* If there's no Azalia in the system anyway, forget it. */
6164         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6165         if (!pdev)
6166                 return;
6167         pci_dev_put(pdev);
6168
6169         /* System Management Registers. Might be hidden, in which case
6170            we can't do the sanity check. But that's OK, because the
6171            known-broken BIOSes _don't_ actually hide it, so far. */
6172         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6173         if (!pdev)
6174                 return;
6175
6176         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6177                 pci_dev_put(pdev);
6178                 return;
6179         }
6180
6181         pci_dev_put(pdev);
6182
6183         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6184         if (vtisochctrl & 1)
6185                 return;
6186
6187         /* Drop all bits other than the number of TLB entries */
6188         vtisochctrl &= 0x1c;
6189
6190         /* If we have the recommended number of TLB entries (16), fine. */
6191         if (vtisochctrl == 0x10)
6192                 return;
6193
6194         /* Zero TLB entries? You get to ride the short bus to school. */
6195         if (!vtisochctrl) {
6196                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6197                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6198                      dmi_get_system_info(DMI_BIOS_VENDOR),
6199                      dmi_get_system_info(DMI_BIOS_VERSION),
6200                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6201                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6202                 return;
6203         }
6204
6205         pr_warn("Recommended TLB entry count for ISOCH unit is 16; your BIOS set %d\n",
6206                vtisochctrl);
6207 }