iommu/vt-d: Replace Intel specific PASID allocator with IOASID
drivers/iommu/intel-iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is a power-of-two multiple of the 4KiB page
96  * size and that the mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are a power-of-two multiple of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
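/*
 * Illustrative note: ~0xFFFUL clears bits 0-11 and sets every bit from
 * bit 12 (4KiB) upwards, so the bitmap advertises support for every
 * power-of-two page size of 4KiB and larger.
 */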
105
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
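/*
 * Worked example for the AGAW helpers above: a 48-bit address width gives
 * width_to_agaw(48) = DIV_ROUND_UP(18, 9) = 2, and agaw_to_level(2) = 4,
 * i.e. a 4-level page table; a 39-bit width gives AGAW 1 and a 3-level
 * table.  Conversely, agaw_to_width(2) = min(30 + 2 * 9, 64) = 48.
 */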
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline unsigned long level_mask(int level)
132 {
133         return -1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long level_size(int level)
137 {
138         return 1UL << level_to_offset_bits(level);
139 }
140
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
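/*
 * Note: VTD_PAGE_SHIFT is 12, so with 4KiB MM pages (PAGE_SHIFT == 12, as
 * on x86) the conversions above are an identity.  On configurations with
 * larger MM pages, one MM pfn corresponds to several consecutive VT-d pfns.
 */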
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic the kernel if VT-d can't be successfully enabled
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return (c->hi >> 8) & 0xffff;
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
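/*
 * Summary of the legacy-mode context entry fields as encoded by the
 * helpers above:
 *   lo bit 0       - Present
 *   lo bit 1       - Fault Processing Disable (cleared by
 *                    context_set_fault_enable())
 *   lo bits 3:2    - Translation Type
 *   lo bits 63:12  - address of the second-level page table root
 *   hi bits 2:0    - Address Width (AGAW)
 *   hi bits 23:8   - Domain ID
 * lo bit 11 (tested by context_pasid_enabled()) and hi bit 3 (the software
 * "copied" flag set by context_set_copied()) feed the context_present()
 * logic above.
 */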
289
290 /*
291  * This domain is a static identity mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
301
302 /*
303  * This is a DMA domain allocated through the iommu domain allocation
304  * interface. But one or more devices belonging to this domain have
305  * been chosen to use a private domain. We should avoid using the
306  * map/unmap/iova_to_phys APIs on it.
307  */
308 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
309
310 #define for_each_domain_iommu(idx, domain)                      \
311         for (idx = 0; idx < g_num_of_iommus; idx++)             \
312                 if (domain->iommu_refcnt[idx])
313
314 struct dmar_rmrr_unit {
315         struct list_head list;          /* list of rmrr units   */
316         struct acpi_dmar_header *hdr;   /* ACPI header          */
317         u64     base_address;           /* reserved base address*/
318         u64     end_address;            /* reserved end address */
319         struct dmar_dev_scope *devices; /* target devices */
320         int     devices_cnt;            /* target device count */
321 };
322
323 struct dmar_atsr_unit {
324         struct list_head list;          /* list of ATSR units */
325         struct acpi_dmar_header *hdr;   /* ACPI header */
326         struct dmar_dev_scope *devices; /* target devices */
327         int devices_cnt;                /* target device count */
328         u8 include_all:1;               /* include all ports */
329 };
330
331 static LIST_HEAD(dmar_atsr_units);
332 static LIST_HEAD(dmar_rmrr_units);
333
334 #define for_each_rmrr_units(rmrr) \
335         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
336
337 /* number of registered IOMMU units, used to size and index g_iommus */
338 static int g_num_of_iommus;
339
340 static void domain_exit(struct dmar_domain *domain);
341 static void domain_remove_dev_info(struct dmar_domain *domain);
342 static void dmar_remove_one_dev_info(struct device *dev);
343 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
344 static void domain_context_clear(struct intel_iommu *iommu,
345                                  struct device *dev);
346 static int domain_detach_iommu(struct dmar_domain *domain,
347                                struct intel_iommu *iommu);
348 static bool device_is_rmrr_locked(struct device *dev);
349 static int intel_iommu_attach_device(struct iommu_domain *domain,
350                                      struct device *dev);
351 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
352                                             dma_addr_t iova);
353
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
356 #else
357 int dmar_disabled = 1;
358 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
359
360 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
361 int intel_iommu_sm = 1;
362 #else
363 int intel_iommu_sm;
364 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
365
366 int intel_iommu_enabled = 0;
367 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
368
369 static int dmar_map_gfx = 1;
370 static int dmar_forcedac;
371 static int intel_iommu_strict;
372 static int intel_iommu_superpage = 1;
373 static int iommu_identity_mapping;
374 static int intel_no_bounce;
375
376 #define IDENTMAP_ALL            1
377 #define IDENTMAP_GFX            2
378 #define IDENTMAP_AZALIA         4
379
380 int intel_iommu_gfx_mapped;
381 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
382
383 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
384 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
385 static DEFINE_SPINLOCK(device_domain_lock);
386 static LIST_HEAD(device_domain_list);
387
388 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
389                                 to_pci_dev(d)->untrusted)
390
391 /*
392  * Iterate over elements in device_domain_list and call the specified
393  * callback @fn against each element.
394  */
395 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
396                                      void *data), void *data)
397 {
398         int ret = 0;
399         unsigned long flags;
400         struct device_domain_info *info;
401
402         spin_lock_irqsave(&device_domain_lock, flags);
403         list_for_each_entry(info, &device_domain_list, global) {
404                 ret = fn(info, data);
405                 if (ret) {
406                         spin_unlock_irqrestore(&device_domain_lock, flags);
407                         return ret;
408                 }
409         }
410         spin_unlock_irqrestore(&device_domain_lock, flags);
411
412         return 0;
413 }
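/*
 * Hypothetical usage sketch: count all tracked devices.  The callback
 * returns 0 to keep iterating; a non-zero return stops the walk and is
 * propagated to the caller.
 *
 *	static int count_one(struct device_domain_info *info, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *
 *	int count = 0;
 *	for_each_device_domain(count_one, &count);
 */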
414
415 const struct iommu_ops intel_iommu_ops;
416
417 static bool translation_pre_enabled(struct intel_iommu *iommu)
418 {
419         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
420 }
421
422 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
423 {
424         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
425 }
426
427 static void init_translation_status(struct intel_iommu *iommu)
428 {
429         u32 gsts;
430
431         gsts = readl(iommu->reg + DMAR_GSTS_REG);
432         if (gsts & DMA_GSTS_TES)
433                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
434 }
435
436 /* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
437 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
438 {
439         return container_of(dom, struct dmar_domain, domain);
440 }
441
442 static int __init intel_iommu_setup(char *str)
443 {
444         if (!str)
445                 return -EINVAL;
446         while (*str) {
447                 if (!strncmp(str, "on", 2)) {
448                         dmar_disabled = 0;
449                         pr_info("IOMMU enabled\n");
450                 } else if (!strncmp(str, "off", 3)) {
451                         dmar_disabled = 1;
452                         no_platform_optin = 1;
453                         pr_info("IOMMU disabled\n");
454                 } else if (!strncmp(str, "igfx_off", 8)) {
455                         dmar_map_gfx = 0;
456                         pr_info("Disable GFX device mapping\n");
457                 } else if (!strncmp(str, "forcedac", 8)) {
458                         pr_info("Forcing DAC for PCI devices\n");
459                         dmar_forcedac = 1;
460                 } else if (!strncmp(str, "strict", 6)) {
461                         pr_info("Disable batched IOTLB flush\n");
462                         intel_iommu_strict = 1;
463                 } else if (!strncmp(str, "sp_off", 6)) {
464                         pr_info("Disable supported super page\n");
465                         intel_iommu_superpage = 0;
466                 } else if (!strncmp(str, "sm_on", 5)) {
467                         pr_info("Intel-IOMMU: scalable mode supported\n");
468                         intel_iommu_sm = 1;
469                 } else if (!strncmp(str, "tboot_noforce", 13)) {
470                         printk(KERN_INFO
471                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
472                         intel_iommu_tboot_noforce = 1;
473                 } else if (!strncmp(str, "nobounce", 8)) {
474                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
475                         intel_no_bounce = 1;
476                 }
477
478                 str += strcspn(str, ",");
479                 while (*str == ',')
480                         str++;
481         }
482         return 0;
483 }
484 __setup("intel_iommu=", intel_iommu_setup);
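/*
 * Example (kernel command line): options are comma separated, e.g.
 *
 *	intel_iommu=on,sm_on,strict
 *
 * enables the IOMMU, turns on scalable mode and disables batched IOTLB
 * flushing, per the token handling above.
 */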
485
486 static struct kmem_cache *iommu_domain_cache;
487 static struct kmem_cache *iommu_devinfo_cache;
488
489 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
490 {
491         struct dmar_domain **domains;
492         int idx = did >> 8;
493
494         domains = iommu->domains[idx];
495         if (!domains)
496                 return NULL;
497
498         return domains[did & 0xff];
499 }
500
501 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
502                              struct dmar_domain *domain)
503 {
504         struct dmar_domain **domains;
505         int idx = did >> 8;
506
507         if (!iommu->domains[idx]) {
508                 size_t size = 256 * sizeof(struct dmar_domain *);
509                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
510         }
511
512         domains = iommu->domains[idx];
513         if (WARN_ON(!domains))
514                 return;
515         else
516                 domains[did & 0xff] = domain;
517 }
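/*
 * The two helpers above keep domain pointers in a two-level table indexed
 * by domain id: iommu->domains[did >> 8] points to a 256-entry page of
 * pointers, allocated on demand, and the low 8 bits of the DID select the
 * slot.  This avoids one large contiguous allocation covering the full
 * domain-id space.
 */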
518
519 void *alloc_pgtable_page(int node)
520 {
521         struct page *page;
522         void *vaddr = NULL;
523
524         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
525         if (page)
526                 vaddr = page_address(page);
527         return vaddr;
528 }
529
530 void free_pgtable_page(void *vaddr)
531 {
532         free_page((unsigned long)vaddr);
533 }
534
535 static inline void *alloc_domain_mem(void)
536 {
537         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
538 }
539
540 static void free_domain_mem(void *vaddr)
541 {
542         kmem_cache_free(iommu_domain_cache, vaddr);
543 }
544
545 static inline void *alloc_devinfo_mem(void)
546 {
547         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
548 }
549
550 static inline void free_devinfo_mem(void *vaddr)
551 {
552         kmem_cache_free(iommu_devinfo_cache, vaddr);
553 }
554
555 static inline int domain_type_is_si(struct dmar_domain *domain)
556 {
557         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
558 }
559
560 static inline int domain_pfn_supported(struct dmar_domain *domain,
561                                        unsigned long pfn)
562 {
563         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
564
565         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
566 }
567
568 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
569 {
570         unsigned long sagaw;
571         int agaw = -1;
572
573         sagaw = cap_sagaw(iommu->cap);
574         for (agaw = width_to_agaw(max_gaw);
575              agaw >= 0; agaw--) {
576                 if (test_bit(agaw, &sagaw))
577                         break;
578         }
579
580         return agaw;
581 }
582
583 /*
584  * Calculate max SAGAW for each iommu.
585  */
586 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
587 {
588         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
589 }
590
591 /*
592  * Calculate agaw for each iommu.
593  * "SAGAW" may be different across iommus, so use a default agaw and fall
594  * back to a smaller supported agaw for iommus that don't support it.
595  */
596 int iommu_calculate_agaw(struct intel_iommu *iommu)
597 {
598         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
599 }
600
601 /* This function only returns a single iommu in a domain */
602 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
603 {
604         int iommu_id;
605
606         /* si_domain and vm domain should not get here. */
607         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
608                 return NULL;
609
610         for_each_domain_iommu(iommu_id, domain)
611                 break;
612
613         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
614                 return NULL;
615
616         return g_iommus[iommu_id];
617 }
618
619 static void domain_update_iommu_coherency(struct dmar_domain *domain)
620 {
621         struct dmar_drhd_unit *drhd;
622         struct intel_iommu *iommu;
623         bool found = false;
624         int i;
625
626         domain->iommu_coherency = 1;
627
628         for_each_domain_iommu(i, domain) {
629                 found = true;
630                 if (!ecap_coherent(g_iommus[i]->ecap)) {
631                         domain->iommu_coherency = 0;
632                         break;
633                 }
634         }
635         if (found)
636                 return;
637
638         /* No hardware attached; use lowest common denominator */
639         rcu_read_lock();
640         for_each_active_iommu(iommu, drhd) {
641                 if (!ecap_coherent(iommu->ecap)) {
642                         domain->iommu_coherency = 0;
643                         break;
644                 }
645         }
646         rcu_read_unlock();
647 }
648
649 static int domain_update_iommu_snooping(struct intel_iommu *skip)
650 {
651         struct dmar_drhd_unit *drhd;
652         struct intel_iommu *iommu;
653         int ret = 1;
654
655         rcu_read_lock();
656         for_each_active_iommu(iommu, drhd) {
657                 if (iommu != skip) {
658                         if (!ecap_sc_support(iommu->ecap)) {
659                                 ret = 0;
660                                 break;
661                         }
662                 }
663         }
664         rcu_read_unlock();
665
666         return ret;
667 }
668
669 static int domain_update_iommu_superpage(struct intel_iommu *skip)
670 {
671         struct dmar_drhd_unit *drhd;
672         struct intel_iommu *iommu;
673         int mask = 0xf;
674
675         if (!intel_iommu_superpage) {
676                 return 0;
677         }
678
679         /* set iommu_superpage to the smallest common denominator */
680         rcu_read_lock();
681         for_each_active_iommu(iommu, drhd) {
682                 if (iommu != skip) {
683                         mask &= cap_super_page_val(iommu->cap);
684                         if (!mask)
685                                 break;
686                 }
687         }
688         rcu_read_unlock();
689
690         return fls(mask);
691 }
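/*
 * cap_super_page_val() yields a bitmap of supported super-page sizes (per
 * the VT-d capability SLLPS field, bit 0 ~ 2MiB, bit 1 ~ 1GiB pages).  The
 * loop above ANDs the bitmaps of all IOMMUs, and fls() turns the result
 * into the largest commonly supported super-page level (0 means no
 * super-page support).
 */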
692
693 /* Some capabilities may be different across iommus */
694 static void domain_update_iommu_cap(struct dmar_domain *domain)
695 {
696         domain_update_iommu_coherency(domain);
697         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
698         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
699 }
700
701 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
702                                          u8 devfn, int alloc)
703 {
704         struct root_entry *root = &iommu->root_entry[bus];
705         struct context_entry *context;
706         u64 *entry;
707
708         entry = &root->lo;
709         if (sm_supported(iommu)) {
710                 if (devfn >= 0x80) {
711                         devfn -= 0x80;
712                         entry = &root->hi;
713                 }
714                 devfn *= 2;
715         }
716         if (*entry & 1)
717                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
718         else {
719                 unsigned long phy_addr;
720                 if (!alloc)
721                         return NULL;
722
723                 context = alloc_pgtable_page(iommu->node);
724                 if (!context)
725                         return NULL;
726
727                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
728                 phy_addr = virt_to_phys((void *)context);
729                 *entry = phy_addr | 1;
730                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
731         }
732         return &context[devfn];
733 }
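/*
 * Note on the scalable-mode path above: when SM is supported, each root
 * entry is split in two halves, the low qword covering devfn 0x00-0x7f and
 * the high qword devfn 0x80-0xff, and scalable-mode context entries are
 * twice the size of legacy ones, hence the "devfn *= 2" indexing.
 */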
734
735 static int iommu_dummy(struct device *dev)
736 {
737         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
738 }
739
740 /**
741  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
742  *                               sub-hierarchy of a candidate PCI-PCI bridge
743  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
744  * @bridge: the candidate PCI-PCI bridge
745  *
746  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
747  */
748 static bool
749 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
750 {
751         struct pci_dev *pdev, *pbridge;
752
753         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
754                 return false;
755
756         pdev = to_pci_dev(dev);
757         pbridge = to_pci_dev(bridge);
758
759         if (pbridge->subordinate &&
760             pbridge->subordinate->number <= pdev->bus->number &&
761             pbridge->subordinate->busn_res.end >= pdev->bus->number)
762                 return true;
763
764         return false;
765 }
766
767 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
768 {
769         struct dmar_drhd_unit *drhd = NULL;
770         struct intel_iommu *iommu;
771         struct device *tmp;
772         struct pci_dev *pdev = NULL;
773         u16 segment = 0;
774         int i;
775
776         if (iommu_dummy(dev))
777                 return NULL;
778
779         if (dev_is_pci(dev)) {
780                 struct pci_dev *pf_pdev;
781
782                 pdev = to_pci_dev(dev);
783
784 #ifdef CONFIG_X86
785                 /* VMD child devices currently cannot be handled individually */
786                 if (is_vmd(pdev->bus))
787                         return NULL;
788 #endif
789
790                 /* VFs aren't listed in scope tables; we need to look up
791                  * the PF instead to find the IOMMU. */
792                 pf_pdev = pci_physfn(pdev);
793                 dev = &pf_pdev->dev;
794                 segment = pci_domain_nr(pdev->bus);
795         } else if (has_acpi_companion(dev))
796                 dev = &ACPI_COMPANION(dev)->dev;
797
798         rcu_read_lock();
799         for_each_active_iommu(iommu, drhd) {
800                 if (pdev && segment != drhd->segment)
801                         continue;
802
803                 for_each_active_dev_scope(drhd->devices,
804                                           drhd->devices_cnt, i, tmp) {
805                         if (tmp == dev) {
806                                 /* For a VF use its original BDF# not that of the PF
807                                  * which we used for the IOMMU lookup. Strictly speaking
808                                  * we could do this for all PCI devices; we only need to
809                                  * get the BDF# from the scope table for ACPI matches. */
810                                 if (pdev && pdev->is_virtfn)
811                                         goto got_pdev;
812
813                                 *bus = drhd->devices[i].bus;
814                                 *devfn = drhd->devices[i].devfn;
815                                 goto out;
816                         }
817
818                         if (is_downstream_to_pci_bridge(dev, tmp))
819                                 goto got_pdev;
820                 }
821
822                 if (pdev && drhd->include_all) {
823                 got_pdev:
824                         *bus = pdev->bus->number;
825                         *devfn = pdev->devfn;
826                         goto out;
827                 }
828         }
829         iommu = NULL;
830  out:
831         rcu_read_unlock();
832
833         return iommu;
834 }
835
836 static void domain_flush_cache(struct dmar_domain *domain,
837                                void *addr, int size)
838 {
839         if (!domain->iommu_coherency)
840                 clflush_cache_range(addr, size);
841 }
842
843 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
844 {
845         struct context_entry *context;
846         int ret = 0;
847         unsigned long flags;
848
849         spin_lock_irqsave(&iommu->lock, flags);
850         context = iommu_context_addr(iommu, bus, devfn, 0);
851         if (context)
852                 ret = context_present(context);
853         spin_unlock_irqrestore(&iommu->lock, flags);
854         return ret;
855 }
856
857 static void free_context_table(struct intel_iommu *iommu)
858 {
859         int i;
860         unsigned long flags;
861         struct context_entry *context;
862
863         spin_lock_irqsave(&iommu->lock, flags);
864         if (!iommu->root_entry) {
865                 goto out;
866         }
867         for (i = 0; i < ROOT_ENTRY_NR; i++) {
868                 context = iommu_context_addr(iommu, i, 0, 0);
869                 if (context)
870                         free_pgtable_page(context);
871
872                 if (!sm_supported(iommu))
873                         continue;
874
875                 context = iommu_context_addr(iommu, i, 0x80, 0);
876                 if (context)
877                         free_pgtable_page(context);
878
879         }
880         free_pgtable_page(iommu->root_entry);
881         iommu->root_entry = NULL;
882 out:
883         spin_unlock_irqrestore(&iommu->lock, flags);
884 }
885
886 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
887                                       unsigned long pfn, int *target_level)
888 {
889         struct dma_pte *parent, *pte;
890         int level = agaw_to_level(domain->agaw);
891         int offset;
892
893         BUG_ON(!domain->pgd);
894
895         if (!domain_pfn_supported(domain, pfn))
896                 /* Address beyond IOMMU's addressing capabilities. */
897                 return NULL;
898
899         parent = domain->pgd;
900
901         while (1) {
902                 void *tmp_page;
903
904                 offset = pfn_level_offset(pfn, level);
905                 pte = &parent[offset];
906                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
907                         break;
908                 if (level == *target_level)
909                         break;
910
911                 if (!dma_pte_present(pte)) {
912                         uint64_t pteval;
913
914                         tmp_page = alloc_pgtable_page(domain->nid);
915
916                         if (!tmp_page)
917                                 return NULL;
918
919                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
920                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
921                         if (cmpxchg64(&pte->val, 0ULL, pteval))
922                                 /* Someone else set it while we were thinking; use theirs. */
923                                 free_pgtable_page(tmp_page);
924                         else
925                                 domain_flush_cache(domain, pte, sizeof(*pte));
926                 }
927                 if (level == 1)
928                         break;
929
930                 parent = phys_to_virt(dma_pte_addr(pte));
931                 level--;
932         }
933
934         if (!*target_level)
935                 *target_level = level;
936
937         return pte;
938 }
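/*
 * *target_level semantics for pfn_to_dma_pte(): 0 means "walk down to the
 * existing leaf, whatever its level" and the level found is written back;
 * a non-zero value asks for (and, if needed, allocates down to) a PTE at
 * exactly that level, which is how super-page mappings are requested.
 */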
939
940 /* return address's pte at specific level */
941 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
942                                          unsigned long pfn,
943                                          int level, int *large_page)
944 {
945         struct dma_pte *parent, *pte;
946         int total = agaw_to_level(domain->agaw);
947         int offset;
948
949         parent = domain->pgd;
950         while (level <= total) {
951                 offset = pfn_level_offset(pfn, total);
952                 pte = &parent[offset];
953                 if (level == total)
954                         return pte;
955
956                 if (!dma_pte_present(pte)) {
957                         *large_page = total;
958                         break;
959                 }
960
961                 if (dma_pte_superpage(pte)) {
962                         *large_page = total;
963                         return pte;
964                 }
965
966                 parent = phys_to_virt(dma_pte_addr(pte));
967                 total--;
968         }
969         return NULL;
970 }
971
972 /* clear last level pte, a tlb flush should be followed */
973 static void dma_pte_clear_range(struct dmar_domain *domain,
974                                 unsigned long start_pfn,
975                                 unsigned long last_pfn)
976 {
977         unsigned int large_page;
978         struct dma_pte *first_pte, *pte;
979
980         BUG_ON(!domain_pfn_supported(domain, start_pfn));
981         BUG_ON(!domain_pfn_supported(domain, last_pfn));
982         BUG_ON(start_pfn > last_pfn);
983
984         /* we don't need lock here; nobody else touches the iova range */
985         do {
986                 large_page = 1;
987                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
988                 if (!pte) {
989                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
990                         continue;
991                 }
992                 do {
993                         dma_clear_pte(pte);
994                         start_pfn += lvl_to_nr_pages(large_page);
995                         pte++;
996                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
997
998                 domain_flush_cache(domain, first_pte,
999                                    (void *)pte - (void *)first_pte);
1000
1001         } while (start_pfn && start_pfn <= last_pfn);
1002 }
1003
1004 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1005                                int retain_level, struct dma_pte *pte,
1006                                unsigned long pfn, unsigned long start_pfn,
1007                                unsigned long last_pfn)
1008 {
1009         pfn = max(start_pfn, pfn);
1010         pte = &pte[pfn_level_offset(pfn, level)];
1011
1012         do {
1013                 unsigned long level_pfn;
1014                 struct dma_pte *level_pte;
1015
1016                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1017                         goto next;
1018
1019                 level_pfn = pfn & level_mask(level);
1020                 level_pte = phys_to_virt(dma_pte_addr(pte));
1021
1022                 if (level > 2) {
1023                         dma_pte_free_level(domain, level - 1, retain_level,
1024                                            level_pte, level_pfn, start_pfn,
1025                                            last_pfn);
1026                 }
1027
1028                 /*
1029                  * Free the page table if we're below the level we want to
1030                  * retain and the range covers the entire table.
1031                  */
1032                 if (level < retain_level && !(start_pfn > level_pfn ||
1033                       last_pfn < level_pfn + level_size(level) - 1)) {
1034                         dma_clear_pte(pte);
1035                         domain_flush_cache(domain, pte, sizeof(*pte));
1036                         free_pgtable_page(level_pte);
1037                 }
1038 next:
1039                 pfn += level_size(level);
1040         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1041 }
1042
1043 /*
1044  * clear last level (leaf) ptes and free page table pages below the
1045  * level we wish to keep intact.
1046  */
1047 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1048                                    unsigned long start_pfn,
1049                                    unsigned long last_pfn,
1050                                    int retain_level)
1051 {
1052         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1053         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1054         BUG_ON(start_pfn > last_pfn);
1055
1056         dma_pte_clear_range(domain, start_pfn, last_pfn);
1057
1058         /* We don't need lock here; nobody else touches the iova range */
1059         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1060                            domain->pgd, 0, start_pfn, last_pfn);
1061
1062         /* free pgd */
1063         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1064                 free_pgtable_page(domain->pgd);
1065                 domain->pgd = NULL;
1066         }
1067 }
1068
1069 /* When a page at a given level is being unlinked from its parent, we don't
1070    need to *modify* it at all. All we need to do is make a list of all the
1071    pages which can be freed just as soon as we've flushed the IOTLB and we
1072    know the hardware page-walk will no longer touch them.
1073    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1074    be freed. */
1075 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1076                                             int level, struct dma_pte *pte,
1077                                             struct page *freelist)
1078 {
1079         struct page *pg;
1080
1081         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1082         pg->freelist = freelist;
1083         freelist = pg;
1084
1085         if (level == 1)
1086                 return freelist;
1087
1088         pte = page_address(pg);
1089         do {
1090                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1091                         freelist = dma_pte_list_pagetables(domain, level - 1,
1092                                                            pte, freelist);
1093                 pte++;
1094         } while (!first_pte_in_page(pte));
1095
1096         return freelist;
1097 }
1098
1099 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1100                                         struct dma_pte *pte, unsigned long pfn,
1101                                         unsigned long start_pfn,
1102                                         unsigned long last_pfn,
1103                                         struct page *freelist)
1104 {
1105         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1106
1107         pfn = max(start_pfn, pfn);
1108         pte = &pte[pfn_level_offset(pfn, level)];
1109
1110         do {
1111                 unsigned long level_pfn;
1112
1113                 if (!dma_pte_present(pte))
1114                         goto next;
1115
1116                 level_pfn = pfn & level_mask(level);
1117
1118                 /* If range covers entire pagetable, free it */
1119                 if (start_pfn <= level_pfn &&
1120                     last_pfn >= level_pfn + level_size(level) - 1) {
1121                         /* These subordinate page tables are going away entirely. Don't
1122                            bother to clear them; we're just going to *free* them. */
1123                         if (level > 1 && !dma_pte_superpage(pte))
1124                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1125
1126                         dma_clear_pte(pte);
1127                         if (!first_pte)
1128                                 first_pte = pte;
1129                         last_pte = pte;
1130                 } else if (level > 1) {
1131                         /* Recurse down into a level that isn't *entirely* obsolete */
1132                         freelist = dma_pte_clear_level(domain, level - 1,
1133                                                        phys_to_virt(dma_pte_addr(pte)),
1134                                                        level_pfn, start_pfn, last_pfn,
1135                                                        freelist);
1136                 }
1137 next:
1138                 pfn += level_size(level);
1139         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1140
1141         if (first_pte)
1142                 domain_flush_cache(domain, first_pte,
1143                                    (void *)++last_pte - (void *)first_pte);
1144
1145         return freelist;
1146 }
1147
1148 /* We can't just free the pages because the IOMMU may still be walking
1149    the page tables, and may have cached the intermediate levels. The
1150    pages can only be freed after the IOTLB flush has been done. */
1151 static struct page *domain_unmap(struct dmar_domain *domain,
1152                                  unsigned long start_pfn,
1153                                  unsigned long last_pfn)
1154 {
1155         struct page *freelist;
1156
1157         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1158         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1159         BUG_ON(start_pfn > last_pfn);
1160
1161         /* we don't need lock here; nobody else touches the iova range */
1162         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1163                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1164
1165         /* free pgd */
1166         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1167                 struct page *pgd_page = virt_to_page(domain->pgd);
1168                 pgd_page->freelist = freelist;
1169                 freelist = pgd_page;
1170
1171                 domain->pgd = NULL;
1172         }
1173
1174         return freelist;
1175 }
1176
1177 static void dma_free_pagelist(struct page *freelist)
1178 {
1179         struct page *pg;
1180
1181         while ((pg = freelist)) {
1182                 freelist = pg->freelist;
1183                 free_pgtable_page(page_address(pg));
1184         }
1185 }
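/*
 * Typical unmap sequence: domain_unmap() detaches the page-table pages and
 * chains them through page->freelist, the caller then flushes the IOTLB,
 * and only after that is dma_free_pagelist() called to return the pages,
 * so the hardware never walks freed memory.
 */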
1186
1187 static void iova_entry_free(unsigned long data)
1188 {
1189         struct page *freelist = (struct page *)data;
1190
1191         dma_free_pagelist(freelist);
1192 }
1193
1194 /* iommu handling */
1195 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1196 {
1197         struct root_entry *root;
1198         unsigned long flags;
1199
1200         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1201         if (!root) {
1202                 pr_err("Allocating root entry for %s failed\n",
1203                         iommu->name);
1204                 return -ENOMEM;
1205         }
1206
1207         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1208
1209         spin_lock_irqsave(&iommu->lock, flags);
1210         iommu->root_entry = root;
1211         spin_unlock_irqrestore(&iommu->lock, flags);
1212
1213         return 0;
1214 }
1215
1216 static void iommu_set_root_entry(struct intel_iommu *iommu)
1217 {
1218         u64 addr;
1219         u32 sts;
1220         unsigned long flag;
1221
1222         addr = virt_to_phys(iommu->root_entry);
1223         if (sm_supported(iommu))
1224                 addr |= DMA_RTADDR_SMT;
1225
1226         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1227         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1228
1229         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1230
1231         /* Make sure hardware completes it */
1232         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1233                       readl, (sts & DMA_GSTS_RTPS), sts);
1234
1235         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1236 }
1237
1238 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1239 {
1240         u32 val;
1241         unsigned long flag;
1242
1243         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1244                 return;
1245
1246         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1247         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1248
1249         /* Make sure hardware completes it */
1250         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1251                       readl, (!(val & DMA_GSTS_WBFS)), val);
1252
1253         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1254 }
1255
1256 /* return value determines if we need a write buffer flush */
1257 static void __iommu_flush_context(struct intel_iommu *iommu,
1258                                   u16 did, u16 source_id, u8 function_mask,
1259                                   u64 type)
1260 {
1261         u64 val = 0;
1262         unsigned long flag;
1263
1264         switch (type) {
1265         case DMA_CCMD_GLOBAL_INVL:
1266                 val = DMA_CCMD_GLOBAL_INVL;
1267                 break;
1268         case DMA_CCMD_DOMAIN_INVL:
1269                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1270                 break;
1271         case DMA_CCMD_DEVICE_INVL:
1272                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1273                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1274                 break;
1275         default:
1276                 BUG();
1277         }
1278         val |= DMA_CCMD_ICC;
1279
1280         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1281         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1282
1283         /* Make sure hardware completes it */
1284         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1285                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1286
1287         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1288 }
1289
1290 /* return value determines if we need a write buffer flush */
1291 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1292                                 u64 addr, unsigned int size_order, u64 type)
1293 {
1294         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1295         u64 val = 0, val_iva = 0;
1296         unsigned long flag;
1297
1298         switch (type) {
1299         case DMA_TLB_GLOBAL_FLUSH:
1300                 /* global flush doesn't need to set IVA_REG */
1301                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1302                 break;
1303         case DMA_TLB_DSI_FLUSH:
1304                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1305                 break;
1306         case DMA_TLB_PSI_FLUSH:
1307                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1308                 /* IH bit is passed in as part of address */
1309                 val_iva = size_order | addr;
1310                 break;
1311         default:
1312                 BUG();
1313         }
1314         /* Note: set drain read/write */
1315 #if 0
1316         /*
1317          * This is probably only needed to be extra safe. It looks like we
1318          * can ignore it without any impact.
1319          */
1320         if (cap_read_drain(iommu->cap))
1321                 val |= DMA_TLB_READ_DRAIN;
1322 #endif
1323         if (cap_write_drain(iommu->cap))
1324                 val |= DMA_TLB_WRITE_DRAIN;
1325
1326         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1327         /* Note: Only uses first TLB reg currently */
1328         if (val_iva)
1329                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1330         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1331
1332         /* Make sure hardware completes it */
1333         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1334                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1335
1336         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1337
1338         /* check IOTLB invalidation granularity */
1339         if (DMA_TLB_IAIG(val) == 0)
1340                 pr_err("Flush IOTLB failed\n");
1341         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1342                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1343                         (unsigned long long)DMA_TLB_IIRG(type),
1344                         (unsigned long long)DMA_TLB_IAIG(val));
1345 }
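/*
 * For page-selective flushes the IVA register value is built above as
 * "size_order | addr": the address is 4KiB aligned (with the IH hint
 * already folded in by the caller), so its low bits are free to carry the
 * address-mask field, i.e. the number of low pfn bits to ignore.
 */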
1346
1347 static struct device_domain_info *
1348 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1349                          u8 bus, u8 devfn)
1350 {
1351         struct device_domain_info *info;
1352
1353         assert_spin_locked(&device_domain_lock);
1354
1355         if (!iommu->qi)
1356                 return NULL;
1357
1358         list_for_each_entry(info, &domain->devices, link)
1359                 if (info->iommu == iommu && info->bus == bus &&
1360                     info->devfn == devfn) {
1361                         if (info->ats_supported && info->dev)
1362                                 return info;
1363                         break;
1364                 }
1365
1366         return NULL;
1367 }
1368
1369 static void domain_update_iotlb(struct dmar_domain *domain)
1370 {
1371         struct device_domain_info *info;
1372         bool has_iotlb_device = false;
1373
1374         assert_spin_locked(&device_domain_lock);
1375
1376         list_for_each_entry(info, &domain->devices, link) {
1377                 struct pci_dev *pdev;
1378
1379                 if (!info->dev || !dev_is_pci(info->dev))
1380                         continue;
1381
1382                 pdev = to_pci_dev(info->dev);
1383                 if (pdev->ats_enabled) {
1384                         has_iotlb_device = true;
1385                         break;
1386                 }
1387         }
1388
1389         domain->has_iotlb_device = has_iotlb_device;
1390 }
1391
1392 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1393 {
1394         struct pci_dev *pdev;
1395
1396         assert_spin_locked(&device_domain_lock);
1397
1398         if (!info || !dev_is_pci(info->dev))
1399                 return;
1400
1401         pdev = to_pci_dev(info->dev);
1402         /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1403          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1404          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1405          * reserved, which should be set to 0.
1406          */
1407         if (!ecap_dit(info->iommu->ecap))
1408                 info->pfsid = 0;
1409         else {
1410                 struct pci_dev *pf_pdev;
1411
1412                 /* pdev will be returned if device is not a vf */
1413                 pf_pdev = pci_physfn(pdev);
1414                 info->pfsid = pci_dev_id(pf_pdev);
1415         }
1416
1417 #ifdef CONFIG_INTEL_IOMMU_SVM
1418         /* The PCIe spec, in its wisdom, declares that the behaviour of
1419            the device if you enable PASID support after ATS support is
1420            undefined. So always enable PASID support on devices which
1421            have it, even if we can't yet know if we're ever going to
1422            use it. */
1423         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1424                 info->pasid_enabled = 1;
1425
1426         if (info->pri_supported &&
1427             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1428             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1429                 info->pri_enabled = 1;
1430 #endif
1431         if (!pdev->untrusted && info->ats_supported &&
1432             pci_ats_page_aligned(pdev) &&
1433             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1434                 info->ats_enabled = 1;
1435                 domain_update_iotlb(info->domain);
1436                 info->ats_qdep = pci_ats_queue_depth(pdev);
1437         }
1438 }
1439
1440 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1441 {
1442         struct pci_dev *pdev;
1443
1444         assert_spin_locked(&device_domain_lock);
1445
1446         if (!dev_is_pci(info->dev))
1447                 return;
1448
1449         pdev = to_pci_dev(info->dev);
1450
1451         if (info->ats_enabled) {
1452                 pci_disable_ats(pdev);
1453                 info->ats_enabled = 0;
1454                 domain_update_iotlb(info->domain);
1455         }
1456 #ifdef CONFIG_INTEL_IOMMU_SVM
1457         if (info->pri_enabled) {
1458                 pci_disable_pri(pdev);
1459                 info->pri_enabled = 0;
1460         }
1461         if (info->pasid_enabled) {
1462                 pci_disable_pasid(pdev);
1463                 info->pasid_enabled = 0;
1464         }
1465 #endif
1466 }
1467
1468 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1469                                   u64 addr, unsigned mask)
1470 {
1471         u16 sid, qdep;
1472         unsigned long flags;
1473         struct device_domain_info *info;
1474
1475         if (!domain->has_iotlb_device)
1476                 return;
1477
1478         spin_lock_irqsave(&device_domain_lock, flags);
1479         list_for_each_entry(info, &domain->devices, link) {
1480                 if (!info->ats_enabled)
1481                         continue;
1482
1483                 sid = info->bus << 8 | info->devfn;
1484                 qdep = info->ats_qdep;
1485                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1486                                 qdep, addr, mask);
1487         }
1488         spin_unlock_irqrestore(&device_domain_lock, flags);
1489 }
1490
1491 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1492                                   struct dmar_domain *domain,
1493                                   unsigned long pfn, unsigned int pages,
1494                                   int ih, int map)
1495 {
1496         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1497         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1498         u16 did = domain->iommu_did[iommu->seq_id];
1499
1500         BUG_ON(pages == 0);
1501
1502         if (ih)
1503                 ih = 1 << 6;
1504         /*
1505          * Fall back to domain-selective flush if there is no PSI support or
1506          * the size is too big.
1507          * PSI requires the flush size to be a power of two pages, with the
1508          * base address naturally aligned to that size.
1509          */
1510         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1511                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1512                                                 DMA_TLB_DSI_FLUSH);
1513         else
1514                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1515                                                 DMA_TLB_PSI_FLUSH);
1516
1517         /*
1518          * In caching mode, changes of pages from non-present to present require
1519          * flush. However, device IOTLB doesn't need to be flushed in this case.
1520          */
1521         if (!cap_caching_mode(iommu->cap) || !map)
1522                 iommu_flush_dev_iotlb(domain, addr, mask);
1523 }
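/*
 * Worked example: flushing 9 pages rounds up to 16, so mask =
 * ilog2(16) = 4 and a single PSI covers a 16-page naturally aligned
 * region; if 4 exceeded cap_max_amask_val() the code above would fall
 * back to a domain-selective flush instead.
 */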
1524
1525 /* Notification for newly created mappings */
1526 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1527                                         struct dmar_domain *domain,
1528                                         unsigned long pfn, unsigned int pages)
1529 {
1530         /* It's a non-present to present mapping. Only flush if caching mode is in use */
1531         if (cap_caching_mode(iommu->cap))
1532                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1533         else
1534                 iommu_flush_write_buffer(iommu);
1535 }
1536
1537 static void iommu_flush_iova(struct iova_domain *iovad)
1538 {
1539         struct dmar_domain *domain;
1540         int idx;
1541
1542         domain = container_of(iovad, struct dmar_domain, iovad);
1543
1544         for_each_domain_iommu(idx, domain) {
1545                 struct intel_iommu *iommu = g_iommus[idx];
1546                 u16 did = domain->iommu_did[iommu->seq_id];
1547
1548                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1549
1550                 if (!cap_caching_mode(iommu->cap))
1551                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1552                                               0, MAX_AGAW_PFN_WIDTH);
1553         }
1554 }
1555
1556 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1557 {
1558         u32 pmen;
1559         unsigned long flags;
1560
1561         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1562                 return;
1563
1564         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1565         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1566         pmen &= ~DMA_PMEN_EPM;
1567         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1568
1569         /* wait for the protected region status bit to clear */
1570         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1571                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1572
1573         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1574 }
1575
1576 static void iommu_enable_translation(struct intel_iommu *iommu)
1577 {
1578         u32 sts;
1579         unsigned long flags;
1580
1581         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1582         iommu->gcmd |= DMA_GCMD_TE;
1583         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1584
1585         /* Make sure the hardware completes it */
1586         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1587                       readl, (sts & DMA_GSTS_TES), sts);
1588
1589         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1590 }
1591
1592 static void iommu_disable_translation(struct intel_iommu *iommu)
1593 {
1594         u32 sts;
1595         unsigned long flag;
1596
1597         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1598         iommu->gcmd &= ~DMA_GCMD_TE;
1599         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1600
1601         /* Make sure the hardware completes it */
1602         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1603                       readl, (!(sts & DMA_GSTS_TES)), sts);
1604
1605         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1606 }
1607
1608 static int iommu_init_domains(struct intel_iommu *iommu)
1609 {
1610         u32 ndomains, nlongs;
1611         size_t size;
1612
1613         ndomains = cap_ndoms(iommu->cap);
1614         pr_debug("%s: Number of Domains supported <%d>\n",
1615                  iommu->name, ndomains);
1616         nlongs = BITS_TO_LONGS(ndomains);
1617
1618         spin_lock_init(&iommu->lock);
1619
1620         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1621         if (!iommu->domain_ids) {
1622                 pr_err("%s: Allocating domain id array failed\n",
1623                        iommu->name);
1624                 return -ENOMEM;
1625         }
1626
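        /*
         * iommu->domains is a two-level array: ALIGN(ndomains, 256) >> 8
         * chunk pointers, each chunk covering 256 domain ids. Only chunk 0
         * is allocated here; the remaining chunks are allocated on demand
         * when a domain id in their range is first used (see
         * set_iommu_domain()). E.g. ndomains = 65536 yields 256 chunk
         * pointers.
         */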
1627         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1628         iommu->domains = kzalloc(size, GFP_KERNEL);
1629
1630         if (iommu->domains) {
1631                 size = 256 * sizeof(struct dmar_domain *);
1632                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1633         }
1634
1635         if (!iommu->domains || !iommu->domains[0]) {
1636                 pr_err("%s: Allocating domain array failed\n",
1637                        iommu->name);
1638                 kfree(iommu->domain_ids);
1639                 kfree(iommu->domains);
1640                 iommu->domain_ids = NULL;
1641                 iommu->domains    = NULL;
1642                 return -ENOMEM;
1643         }
1644
1645         /*
1646          * If Caching mode is set, then invalid translations are tagged
1647          * with domain-id 0, hence we need to pre-allocate it. We also
1648          * use domain-id 0 as a marker for non-allocated domain-id, so
1649          * make sure it is not used for a real domain.
1650          */
1651         set_bit(0, iommu->domain_ids);
1652
1653         /*
1654          * The VT-d spec (rev 3.0, section 6.2.3.1) requires that each PASID
1655          * entry for first-level or pass-through translation modes be
1656          * programmed with a domain id different from those used for
1657          * second-level or nested translation. We reserve a domain id for
1658          * this purpose.
1659          */
1660         if (sm_supported(iommu))
1661                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1662
1663         return 0;
1664 }
1665
1666 static void disable_dmar_iommu(struct intel_iommu *iommu)
1667 {
1668         struct device_domain_info *info, *tmp;
1669         unsigned long flags;
1670
1671         if (!iommu->domains || !iommu->domain_ids)
1672                 return;
1673
1674         spin_lock_irqsave(&device_domain_lock, flags);
1675         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1676                 if (info->iommu != iommu)
1677                         continue;
1678
1679                 if (!info->dev || !info->domain)
1680                         continue;
1681
1682                 __dmar_remove_one_dev_info(info);
1683         }
1684         spin_unlock_irqrestore(&device_domain_lock, flags);
1685
1686         if (iommu->gcmd & DMA_GCMD_TE)
1687                 iommu_disable_translation(iommu);
1688 }
1689
1690 static void free_dmar_iommu(struct intel_iommu *iommu)
1691 {
1692         if ((iommu->domains) && (iommu->domain_ids)) {
1693                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1694                 int i;
1695
1696                 for (i = 0; i < elems; i++)
1697                         kfree(iommu->domains[i]);
1698                 kfree(iommu->domains);
1699                 kfree(iommu->domain_ids);
1700                 iommu->domains = NULL;
1701                 iommu->domain_ids = NULL;
1702         }
1703
1704         g_iommus[iommu->seq_id] = NULL;
1705
1706         /* free context mapping */
1707         free_context_table(iommu);
1708
1709 #ifdef CONFIG_INTEL_IOMMU_SVM
1710         if (pasid_supported(iommu)) {
1711                 if (ecap_prs(iommu->ecap))
1712                         intel_svm_finish_prq(iommu);
1713         }
1714 #endif
1715 }
1716
1717 static struct dmar_domain *alloc_domain(int flags)
1718 {
1719         struct dmar_domain *domain;
1720
1721         domain = alloc_domain_mem();
1722         if (!domain)
1723                 return NULL;
1724
1725         memset(domain, 0, sizeof(*domain));
1726         domain->nid = NUMA_NO_NODE;
1727         domain->flags = flags;
1728         domain->has_iotlb_device = false;
1729         INIT_LIST_HEAD(&domain->devices);
1730
1731         return domain;
1732 }
1733
1734 /* Must be called with device_domain_lock and iommu->lock held */
1735 static int domain_attach_iommu(struct dmar_domain *domain,
1736                                struct intel_iommu *iommu)
1737 {
1738         unsigned long ndomains;
1739         int num;
1740
1741         assert_spin_locked(&device_domain_lock);
1742         assert_spin_locked(&iommu->lock);
1743
1744         domain->iommu_refcnt[iommu->seq_id] += 1;
1745         domain->iommu_count += 1;
1746         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1747                 ndomains = cap_ndoms(iommu->cap);
1748                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1749
1750                 if (num >= ndomains) {
1751                         pr_err("%s: No free domain ids\n", iommu->name);
1752                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1753                         domain->iommu_count -= 1;
1754                         return -ENOSPC;
1755                 }
1756
1757                 set_bit(num, iommu->domain_ids);
1758                 set_iommu_domain(iommu, num, domain);
1759
1760                 domain->iommu_did[iommu->seq_id] = num;
1761                 domain->nid                      = iommu->node;
1762
1763                 domain_update_iommu_cap(domain);
1764         }
1765
1766         return 0;
1767 }
1768
1769 static int domain_detach_iommu(struct dmar_domain *domain,
1770                                struct intel_iommu *iommu)
1771 {
1772         int num, count;
1773
1774         assert_spin_locked(&device_domain_lock);
1775         assert_spin_locked(&iommu->lock);
1776
1777         domain->iommu_refcnt[iommu->seq_id] -= 1;
1778         count = --domain->iommu_count;
1779         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1780                 num = domain->iommu_did[iommu->seq_id];
1781                 clear_bit(num, iommu->domain_ids);
1782                 set_iommu_domain(iommu, num, NULL);
1783
1784                 domain_update_iommu_cap(domain);
1785                 domain->iommu_did[iommu->seq_id] = 0;
1786         }
1787
1788         return count;
1789 }
1790
1791 static struct iova_domain reserved_iova_list;
1792 static struct lock_class_key reserved_rbtree_key;
1793
1794 static int dmar_init_reserved_ranges(void)
1795 {
1796         struct pci_dev *pdev = NULL;
1797         struct iova *iova;
1798         int i;
1799
1800         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1801
1802         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1803                 &reserved_rbtree_key);
1804
1805         /* IOAPIC ranges shouldn't be accessed by DMA */
1806         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1807                 IOVA_PFN(IOAPIC_RANGE_END));
1808         if (!iova) {
1809                 pr_err("Reserve IOAPIC range failed\n");
1810                 return -ENODEV;
1811         }
1812
1813         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1814         for_each_pci_dev(pdev) {
1815                 struct resource *r;
1816
1817                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1818                         r = &pdev->resource[i];
1819                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1820                                 continue;
1821                         iova = reserve_iova(&reserved_iova_list,
1822                                             IOVA_PFN(r->start),
1823                                             IOVA_PFN(r->end));
1824                         if (!iova) {
1825                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1826                                 return -ENODEV;
1827                         }
1828                 }
1829         }
1830         return 0;
1831 }
1832
1833 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1834 {
1835         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1836 }
1837
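/*
 * Round a guest address width up so that the bits above the 4KiB page
 * offset form whole 9-bit page-table levels. Illustrative examples:
 * gaw = 48 gives r = 0 and agaw = 48; gaw = 40 gives r = 1 and agaw = 48.
 */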
1838 static inline int guestwidth_to_adjustwidth(int gaw)
1839 {
1840         int agaw;
1841         int r = (gaw - 12) % 9;
1842
1843         if (r == 0)
1844                 agaw = gaw;
1845         else
1846                 agaw = gaw + 9 - r;
1847         if (agaw > 64)
1848                 agaw = 64;
1849         return agaw;
1850 }
1851
1852 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1853                        int guest_width)
1854 {
1855         int adjust_width, agaw;
1856         unsigned long sagaw;
1857         int err;
1858
1859         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1860
1861         err = init_iova_flush_queue(&domain->iovad,
1862                                     iommu_flush_iova, iova_entry_free);
1863         if (err)
1864                 return err;
1865
1866         domain_reserve_special_ranges(domain);
1867
1868         /* calculate AGAW */
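        /*
         * Illustrative example: guest_width = 48 yields adjust_width = 48,
         * i.e. a 4-level table. The resulting agaw must be advertised in
         * the SAGAW capability, otherwise the next larger supported agaw
         * is chosen below.
         */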
1869         if (guest_width > cap_mgaw(iommu->cap))
1870                 guest_width = cap_mgaw(iommu->cap);
1871         domain->gaw = guest_width;
1872         adjust_width = guestwidth_to_adjustwidth(guest_width);
1873         agaw = width_to_agaw(adjust_width);
1874         sagaw = cap_sagaw(iommu->cap);
1875         if (!test_bit(agaw, &sagaw)) {
1876                 /* hardware doesn't support it, choose a bigger one */
1877                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1878                 agaw = find_next_bit(&sagaw, 5, agaw);
1879                 if (agaw >= 5)
1880                         return -ENODEV;
1881         }
1882         domain->agaw = agaw;
1883
1884         if (ecap_coherent(iommu->ecap))
1885                 domain->iommu_coherency = 1;
1886         else
1887                 domain->iommu_coherency = 0;
1888
1889         if (ecap_sc_support(iommu->ecap))
1890                 domain->iommu_snooping = 1;
1891         else
1892                 domain->iommu_snooping = 0;
1893
1894         if (intel_iommu_superpage)
1895                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1896         else
1897                 domain->iommu_superpage = 0;
1898
1899         domain->nid = iommu->node;
1900
1901         /* always allocate the top pgd */
1902         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1903         if (!domain->pgd)
1904                 return -ENOMEM;
1905         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1906         return 0;
1907 }
1908
1909 static void domain_exit(struct dmar_domain *domain)
1910 {
1912         /* Remove associated devices and clear attached or cached domains */
1913         domain_remove_dev_info(domain);
1914
1915         /* destroy iovas */
1916         put_iova_domain(&domain->iovad);
1917
1918         if (domain->pgd) {
1919                 struct page *freelist;
1920
1921                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1922                 dma_free_pagelist(freelist);
1923         }
1924
1925         free_domain_mem(domain);
1926 }
1927
1928 /*
1929  * Get the PASID directory size for scalable mode context entry.
1930  * Value of X in the PDTS field of a scalable mode context entry
1931  * indicates PASID directory with 2^(X + 7) entries.
1932  */
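/*
 * Worked example (illustrative, assuming PASID_PDE_SHIFT == 6, i.e. 64
 * PASID-table entries per directory entry): max_pasid = 0x100000 gives
 * max_pde = 0x4000, so pds = 14 and the function returns 7, which encodes
 * a directory of 2^(7 + 7) = 16384 entries.
 */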
1933 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1934 {
1935         int pds, max_pde;
1936
1937         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1938         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1939         if (pds < 7)
1940                 return 0;
1941
1942         return pds - 7;
1943 }
1944
1945 /*
1946  * Set the RID_PASID field of a scalable mode context entry. The
1947  * IOMMU hardware will use the PASID value set in this field to
1948  * translate DMA requests that arrive without a PASID.
1949  */
1950 static inline void
1951 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1952 {
1953         context->hi |= pasid & ((1 << 20) - 1);
1954         context->hi |= (1 << 20);
1955 }
1956
1957 /*
1958  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1959  * entry.
1960  */
1961 static inline void context_set_sm_dte(struct context_entry *context)
1962 {
1963         context->lo |= (1 << 2);
1964 }
1965
1966 /*
1967  * Set the PRE(Page Request Enable) field of a scalable mode context
1968  * entry.
1969  */
1970 static inline void context_set_sm_pre(struct context_entry *context)
1971 {
1972         context->lo |= (1 << 4);
1973 }
1974
1975 /* Convert value to context PASID directory size field coding. */
1976 #define context_pdts(pds)       (((pds) & 0x7) << 9)
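/*
 * The PASID directory pointer is page aligned, so bits 11:0 of the low
 * qword are free for flags; this macro places PDTS in bits 11:9. E.g.
 * pds = 2 encodes a directory with 2^(2 + 7) = 512 entries.
 */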
1977
1978 static int domain_context_mapping_one(struct dmar_domain *domain,
1979                                       struct intel_iommu *iommu,
1980                                       struct pasid_table *table,
1981                                       u8 bus, u8 devfn)
1982 {
1983         u16 did = domain->iommu_did[iommu->seq_id];
1984         int translation = CONTEXT_TT_MULTI_LEVEL;
1985         struct device_domain_info *info = NULL;
1986         struct context_entry *context;
1987         unsigned long flags;
1988         int ret;
1989
1990         WARN_ON(did == 0);
1991
1992         if (hw_pass_through && domain_type_is_si(domain))
1993                 translation = CONTEXT_TT_PASS_THROUGH;
1994
1995         pr_debug("Set context mapping for %02x:%02x.%d\n",
1996                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1997
1998         BUG_ON(!domain->pgd);
1999
2000         spin_lock_irqsave(&device_domain_lock, flags);
2001         spin_lock(&iommu->lock);
2002
2003         ret = -ENOMEM;
2004         context = iommu_context_addr(iommu, bus, devfn, 1);
2005         if (!context)
2006                 goto out_unlock;
2007
2008         ret = 0;
2009         if (context_present(context))
2010                 goto out_unlock;
2011
2012         /*
2013          * In kdump cases, old valid entries may be cached due to
2014          * in-flight DMA and the copied page tables, but nothing ever
2015          * unmaps them, so we need an explicit cache flush for the
2016          * newly-mapped device. At this point the device is expected to
2017          * have finished its reset during the driver probe stage, so no
2018          * in-flight DMA will exist and we don't need to worry about it
2019          * hereafter.
2020          */
2021         if (context_copied(context)) {
2022                 u16 did_old = context_domain_id(context);
2023
2024                 if (did_old < cap_ndoms(iommu->cap)) {
2025                         iommu->flush.flush_context(iommu, did_old,
2026                                                    (((u16)bus) << 8) | devfn,
2027                                                    DMA_CCMD_MASK_NOBIT,
2028                                                    DMA_CCMD_DEVICE_INVL);
2029                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2030                                                  DMA_TLB_DSI_FLUSH);
2031                 }
2032         }
2033
2034         context_clear_entry(context);
2035
2036         if (sm_supported(iommu)) {
2037                 unsigned long pds;
2038
2039                 WARN_ON(!table);
2040
2041                 /* Setup the PASID DIR pointer: */
2042                 pds = context_get_sm_pds(table);
2043                 context->lo = (u64)virt_to_phys(table->table) |
2044                                 context_pdts(pds);
2045
2046                 /* Setup the RID_PASID field: */
2047                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2048
2049                 /*
2050                  * Setup the Device-TLB enable bit and Page request
2051                  * Enable bit:
2052                  */
2053                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2054                 if (info && info->ats_supported)
2055                         context_set_sm_dte(context);
2056                 if (info && info->pri_supported)
2057                         context_set_sm_pre(context);
2058         } else {
2059                 struct dma_pte *pgd = domain->pgd;
2060                 int agaw;
2061
2062                 context_set_domain_id(context, did);
2063
2064                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2065                         /*
2066                          * Skip top levels of page tables for an IOMMU which has
2067                          * a smaller agaw than the domain's. Unnecessary for PT mode.
2068                          */
2069                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2070                                 ret = -ENOMEM;
2071                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2072                                 if (!dma_pte_present(pgd))
2073                                         goto out_unlock;
2074                         }
2075
2076                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2077                         if (info && info->ats_supported)
2078                                 translation = CONTEXT_TT_DEV_IOTLB;
2079                         else
2080                                 translation = CONTEXT_TT_MULTI_LEVEL;
2081
2082                         context_set_address_root(context, virt_to_phys(pgd));
2083                         context_set_address_width(context, agaw);
2084                 } else {
2085                         /*
2086                          * In pass through mode, AW must be programmed to
2087                          * indicate the largest AGAW value supported by
2088                          * hardware. And ASR is ignored by hardware.
2089                          */
2090                         context_set_address_width(context, iommu->msagaw);
2091                 }
2092
2093                 context_set_translation_type(context, translation);
2094         }
2095
2096         context_set_fault_enable(context);
2097         context_set_present(context);
2098         domain_flush_cache(domain, context, sizeof(*context));
2099
2100         /*
2101          * It's a non-present to present mapping. If hardware doesn't cache
2102          * non-present entries we only need to flush the write-buffer. If it
2103          * _does_ cache non-present entries, then it does so in the special
2104          * domain #0, which we have to flush:
2105          */
2106         if (cap_caching_mode(iommu->cap)) {
2107                 iommu->flush.flush_context(iommu, 0,
2108                                            (((u16)bus) << 8) | devfn,
2109                                            DMA_CCMD_MASK_NOBIT,
2110                                            DMA_CCMD_DEVICE_INVL);
2111                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2112         } else {
2113                 iommu_flush_write_buffer(iommu);
2114         }
2115         iommu_enable_dev_iotlb(info);
2116
2117         ret = 0;
2118
2119 out_unlock:
2120         spin_unlock(&iommu->lock);
2121         spin_unlock_irqrestore(&device_domain_lock, flags);
2122
2123         return ret;
2124 }
2125
2126 struct domain_context_mapping_data {
2127         struct dmar_domain *domain;
2128         struct intel_iommu *iommu;
2129         struct pasid_table *table;
2130 };
2131
2132 static int domain_context_mapping_cb(struct pci_dev *pdev,
2133                                      u16 alias, void *opaque)
2134 {
2135         struct domain_context_mapping_data *data = opaque;
2136
2137         return domain_context_mapping_one(data->domain, data->iommu,
2138                                           data->table, PCI_BUS_NUM(alias),
2139                                           alias & 0xff);
2140 }
2141
2142 static int
2143 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2144 {
2145         struct domain_context_mapping_data data;
2146         struct pasid_table *table;
2147         struct intel_iommu *iommu;
2148         u8 bus, devfn;
2149
2150         iommu = device_to_iommu(dev, &bus, &devfn);
2151         if (!iommu)
2152                 return -ENODEV;
2153
2154         table = intel_pasid_get_table(dev);
2155
2156         if (!dev_is_pci(dev))
2157                 return domain_context_mapping_one(domain, iommu, table,
2158                                                   bus, devfn);
2159
2160         data.domain = domain;
2161         data.iommu = iommu;
2162         data.table = table;
2163
2164         return pci_for_each_dma_alias(to_pci_dev(dev),
2165                                       &domain_context_mapping_cb, &data);
2166 }
2167
2168 static int domain_context_mapped_cb(struct pci_dev *pdev,
2169                                     u16 alias, void *opaque)
2170 {
2171         struct intel_iommu *iommu = opaque;
2172
2173         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2174 }
2175
2176 static int domain_context_mapped(struct device *dev)
2177 {
2178         struct intel_iommu *iommu;
2179         u8 bus, devfn;
2180
2181         iommu = device_to_iommu(dev, &bus, &devfn);
2182         if (!iommu)
2183                 return -ENODEV;
2184
2185         if (!dev_is_pci(dev))
2186                 return device_context_mapped(iommu, bus, devfn);
2187
2188         return !pci_for_each_dma_alias(to_pci_dev(dev),
2189                                        domain_context_mapped_cb, iommu);
2190 }
2191
2192 /* Return the number of VTD pages spanned, rounded up to the MM page size */
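/*
 * Illustrative example with 4KiB MM pages: host_addr = 0x1234 and
 * size = 0x2000 give PAGE_ALIGN(0x234 + 0x2000) = 0x3000, i.e. 3 VTD pages.
 */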
2193 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2194                                             size_t size)
2195 {
2196         host_addr &= ~PAGE_MASK;
2197         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2198 }
2199
2200 /* Return largest possible superpage level for a given mapping */
2201 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2202                                           unsigned long iov_pfn,
2203                                           unsigned long phy_pfn,
2204                                           unsigned long pages)
2205 {
2206         int support, level = 1;
2207         unsigned long pfnmerge;
2208
2209         support = domain->iommu_superpage;
2210
2211         /* To use a large page, the virtual *and* physical addresses
2212            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2213            of them will mean we have to use smaller pages. So just
2214            merge them and check both at once. */
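        /*
         * Illustrative example: iov_pfn = 0x400, phy_pfn = 0x800 and
         * pages = 0x400 (both 2MiB aligned, 4MiB long) yield level 2,
         * i.e. 2MiB superpages, provided domain->iommu_superpage >= 1.
         */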
2215         pfnmerge = iov_pfn | phy_pfn;
2216
2217         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2218                 pages >>= VTD_STRIDE_SHIFT;
2219                 if (!pages)
2220                         break;
2221                 pfnmerge >>= VTD_STRIDE_SHIFT;
2222                 level++;
2223                 support--;
2224         }
2225         return level;
2226 }
2227
2228 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2229                             struct scatterlist *sg, unsigned long phys_pfn,
2230                             unsigned long nr_pages, int prot)
2231 {
2232         struct dma_pte *first_pte = NULL, *pte = NULL;
2233         phys_addr_t uninitialized_var(pteval);
2234         unsigned long sg_res = 0;
2235         unsigned int largepage_lvl = 0;
2236         unsigned long lvl_pages = 0;
2237
2238         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2239
2240         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2241                 return -EINVAL;
2242
2243         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2244
2245         if (!sg) {
2246                 sg_res = nr_pages;
2247                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2248         }
2249
2250         while (nr_pages > 0) {
2251                 uint64_t tmp;
2252
2253                 if (!sg_res) {
2254                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2255
2256                         sg_res = aligned_nrpages(sg->offset, sg->length);
2257                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2258                         sg->dma_length = sg->length;
2259                         pteval = (sg_phys(sg) - pgoff) | prot;
2260                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2261                 }
2262
2263                 if (!pte) {
2264                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2265
2266                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2267                         if (!pte)
2268                                 return -ENOMEM;
2269                         /* It is a large page */
2270                         if (largepage_lvl > 1) {
2271                                 unsigned long nr_superpages, end_pfn;
2272
2273                                 pteval |= DMA_PTE_LARGE_PAGE;
2274                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2275
2276                                 nr_superpages = sg_res / lvl_pages;
2277                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2278
2279                                 /*
2280                                  * Ensure that old small page tables are
2281                                  * removed to make room for superpage(s).
2282                                  * We're adding new large pages, so make sure
2283                                  * we don't remove their parent tables.
2284                                  */
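                                /*
                                 * E.g. sg_res = 1024 small pages mapped at
                                 * a 2MiB level (lvl_pages = 512) gives
                                 * nr_superpages = 2, with end_pfn covering
                                 * exactly those two superpages.
                                 */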
2285                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2286                                                        largepage_lvl + 1);
2287                         } else {
2288                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2289                         }
2290
2291                 }
2292                 /* We don't need a lock here; nobody else
2293                  * touches this IOVA range.
2294                  */
2295                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2296                 if (tmp) {
2297                         static int dumps = 5;
2298                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2299                                 iov_pfn, tmp, (unsigned long long)pteval);
2300                         if (dumps) {
2301                                 dumps--;
2302                                 debug_dma_dump_mappings(NULL);
2303                         }
2304                         WARN_ON(1);
2305                 }
2306
2307                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2308
2309                 BUG_ON(nr_pages < lvl_pages);
2310                 BUG_ON(sg_res < lvl_pages);
2311
2312                 nr_pages -= lvl_pages;
2313                 iov_pfn += lvl_pages;
2314                 phys_pfn += lvl_pages;
2315                 pteval += lvl_pages * VTD_PAGE_SIZE;
2316                 sg_res -= lvl_pages;
2317
2318                 /* If the next PTE would be the first in a new page, then we
2319                    need to flush the cache on the entries we've just written.
2320                    And then we'll need to recalculate 'pte', so clear it and
2321                    let it get set again in the if (!pte) block above.
2322
2323                    If we're done (!nr_pages) we need to flush the cache too.
2324
2325                    Also if we've been setting superpages, we may need to
2326                    recalculate 'pte' and switch back to smaller pages for the
2327                    end of the mapping, if the trailing size is not enough to
2328                    use another superpage (i.e. sg_res < lvl_pages). */
2329                 pte++;
2330                 if (!nr_pages || first_pte_in_page(pte) ||
2331                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2332                         domain_flush_cache(domain, first_pte,
2333                                            (void *)pte - (void *)first_pte);
2334                         pte = NULL;
2335                 }
2336
2337                 if (!sg_res && nr_pages)
2338                         sg = sg_next(sg);
2339         }
2340         return 0;
2341 }
2342
2343 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2344                           struct scatterlist *sg, unsigned long phys_pfn,
2345                           unsigned long nr_pages, int prot)
2346 {
2347         int iommu_id, ret;
2348         struct intel_iommu *iommu;
2349
2350         /* Do the real mapping first */
2351         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2352         if (ret)
2353                 return ret;
2354
2355         for_each_domain_iommu(iommu_id, domain) {
2356                 iommu = g_iommus[iommu_id];
2357                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2358         }
2359
2360         return 0;
2361 }
2362
2363 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2364                                     struct scatterlist *sg, unsigned long nr_pages,
2365                                     int prot)
2366 {
2367         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2368 }
2369
2370 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2371                                      unsigned long phys_pfn, unsigned long nr_pages,
2372                                      int prot)
2373 {
2374         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2375 }
2376
2377 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2378 {
2379         unsigned long flags;
2380         struct context_entry *context;
2381         u16 did_old;
2382
2383         if (!iommu)
2384                 return;
2385
2386         spin_lock_irqsave(&iommu->lock, flags);
2387         context = iommu_context_addr(iommu, bus, devfn, 0);
2388         if (!context) {
2389                 spin_unlock_irqrestore(&iommu->lock, flags);
2390                 return;
2391         }
2392         did_old = context_domain_id(context);
2393         context_clear_entry(context);
2394         __iommu_flush_cache(iommu, context, sizeof(*context));
2395         spin_unlock_irqrestore(&iommu->lock, flags);
2396         iommu->flush.flush_context(iommu,
2397                                    did_old,
2398                                    (((u16)bus) << 8) | devfn,
2399                                    DMA_CCMD_MASK_NOBIT,
2400                                    DMA_CCMD_DEVICE_INVL);
2401         iommu->flush.flush_iotlb(iommu,
2402                                  did_old,
2403                                  0,
2404                                  0,
2405                                  DMA_TLB_DSI_FLUSH);
2406 }
2407
2408 static inline void unlink_domain_info(struct device_domain_info *info)
2409 {
2410         assert_spin_locked(&device_domain_lock);
2411         list_del(&info->link);
2412         list_del(&info->global);
2413         if (info->dev)
2414                 info->dev->archdata.iommu = NULL;
2415 }
2416
2417 static void domain_remove_dev_info(struct dmar_domain *domain)
2418 {
2419         struct device_domain_info *info, *tmp;
2420         unsigned long flags;
2421
2422         spin_lock_irqsave(&device_domain_lock, flags);
2423         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2424                 __dmar_remove_one_dev_info(info);
2425         spin_unlock_irqrestore(&device_domain_lock, flags);
2426 }
2427
2428 static struct dmar_domain *find_domain(struct device *dev)
2429 {
2430         struct device_domain_info *info;
2431
2432         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO ||
2433                      dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO))
2434                 return NULL;
2435
2436         /* No lock here, assumes no domain exit in normal case */
2437         info = dev->archdata.iommu;
2438         if (likely(info))
2439                 return info->domain;
2440
2441         return NULL;
2442 }
2443
2444 static struct dmar_domain *deferred_attach_domain(struct device *dev)
2445 {
2446         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2447                 struct iommu_domain *domain;
2448
2449                 dev->archdata.iommu = NULL;
2450                 domain = iommu_get_domain_for_dev(dev);
2451                 if (domain)
2452                         intel_iommu_attach_device(domain, dev);
2453         }
2454
2455         return find_domain(dev);
2456 }
2457
2458 static inline struct device_domain_info *
2459 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2460 {
2461         struct device_domain_info *info;
2462
2463         list_for_each_entry(info, &device_domain_list, global)
2464                 if (info->iommu->segment == segment && info->bus == bus &&
2465                     info->devfn == devfn)
2466                         return info;
2467
2468         return NULL;
2469 }
2470
2471 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2472                                                     int bus, int devfn,
2473                                                     struct device *dev,
2474                                                     struct dmar_domain *domain)
2475 {
2476         struct dmar_domain *found = NULL;
2477         struct device_domain_info *info;
2478         unsigned long flags;
2479         int ret;
2480
2481         info = alloc_devinfo_mem();
2482         if (!info)
2483                 return NULL;
2484
2485         info->bus = bus;
2486         info->devfn = devfn;
2487         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2488         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2489         info->ats_qdep = 0;
2490         info->dev = dev;
2491         info->domain = domain;
2492         info->iommu = iommu;
2493         info->pasid_table = NULL;
2494         info->auxd_enabled = 0;
2495         INIT_LIST_HEAD(&info->auxiliary_domains);
2496
2497         if (dev && dev_is_pci(dev)) {
2498                 struct pci_dev *pdev = to_pci_dev(info->dev);
2499
2500                 if (!pdev->untrusted &&
2501                     !pci_ats_disabled() &&
2502                     ecap_dev_iotlb_support(iommu->ecap) &&
2503                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2504                     dmar_find_matched_atsr_unit(pdev))
2505                         info->ats_supported = 1;
2506
2507                 if (sm_supported(iommu)) {
2508                         if (pasid_supported(iommu)) {
2509                                 int features = pci_pasid_features(pdev);
2510                                 if (features >= 0)
2511                                         info->pasid_supported = features | 1;
2512                         }
2513
2514                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2515                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2516                                 info->pri_supported = 1;
2517                 }
2518         }
2519
2520         spin_lock_irqsave(&device_domain_lock, flags);
2521         if (dev)
2522                 found = find_domain(dev);
2523
2524         if (!found) {
2525                 struct device_domain_info *info2;
2526                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2527                 if (info2) {
2528                         found      = info2->domain;
2529                         info2->dev = dev;
2530                 }
2531         }
2532
2533         if (found) {
2534                 spin_unlock_irqrestore(&device_domain_lock, flags);
2535                 free_devinfo_mem(info);
2536                 /* Caller must free the original domain */
2537                 return found;
2538         }
2539
2540         spin_lock(&iommu->lock);
2541         ret = domain_attach_iommu(domain, iommu);
2542         spin_unlock(&iommu->lock);
2543
2544         if (ret) {
2545                 spin_unlock_irqrestore(&device_domain_lock, flags);
2546                 free_devinfo_mem(info);
2547                 return NULL;
2548         }
2549
2550         list_add(&info->link, &domain->devices);
2551         list_add(&info->global, &device_domain_list);
2552         if (dev)
2553                 dev->archdata.iommu = info;
2554         spin_unlock_irqrestore(&device_domain_lock, flags);
2555
2556         /* PASID table is mandatory for a PCI device in scalable mode. */
2557         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2558                 ret = intel_pasid_alloc_table(dev);
2559                 if (ret) {
2560                         dev_err(dev, "PASID table allocation failed\n");
2561                         dmar_remove_one_dev_info(dev);
2562                         return NULL;
2563                 }
2564
2565                 /* Setup the PASID entry for requests without PASID: */
2566                 spin_lock(&iommu->lock);
2567                 if (hw_pass_through && domain_type_is_si(domain))
2568                         ret = intel_pasid_setup_pass_through(iommu, domain,
2569                                         dev, PASID_RID2PASID);
2570                 else
2571                         ret = intel_pasid_setup_second_level(iommu, domain,
2572                                         dev, PASID_RID2PASID);
2573                 spin_unlock(&iommu->lock);
2574                 if (ret) {
2575                         dev_err(dev, "Setup RID2PASID failed\n");
2576                         dmar_remove_one_dev_info(dev);
2577                         return NULL;
2578                 }
2579         }
2580
2581         if (dev && domain_context_mapping(domain, dev)) {
2582                 dev_err(dev, "Domain context map failed\n");
2583                 dmar_remove_one_dev_info(dev);
2584                 return NULL;
2585         }
2586
2587         return domain;
2588 }
2589
2590 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2591 {
2592         *(u16 *)opaque = alias;
2593         return 0;
2594 }
2595
2596 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2597 {
2598         struct device_domain_info *info;
2599         struct dmar_domain *domain = NULL;
2600         struct intel_iommu *iommu;
2601         u16 dma_alias;
2602         unsigned long flags;
2603         u8 bus, devfn;
2604
2605         iommu = device_to_iommu(dev, &bus, &devfn);
2606         if (!iommu)
2607                 return NULL;
2608
2609         if (dev_is_pci(dev)) {
2610                 struct pci_dev *pdev = to_pci_dev(dev);
2611
2612                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2613
2614                 spin_lock_irqsave(&device_domain_lock, flags);
2615                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2616                                                       PCI_BUS_NUM(dma_alias),
2617                                                       dma_alias & 0xff);
2618                 if (info) {
2619                         iommu = info->iommu;
2620                         domain = info->domain;
2621                 }
2622                 spin_unlock_irqrestore(&device_domain_lock, flags);
2623
2624                 /* DMA alias already has a domain, use it */
2625                 if (info)
2626                         goto out;
2627         }
2628
2629         /* Allocate and initialize new domain for the device */
2630         domain = alloc_domain(0);
2631         if (!domain)
2632                 return NULL;
2633         if (domain_init(domain, iommu, gaw)) {
2634                 domain_exit(domain);
2635                 return NULL;
2636         }
2637
2638 out:
2639         return domain;
2640 }
2641
2642 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2643                                               struct dmar_domain *domain)
2644 {
2645         struct intel_iommu *iommu;
2646         struct dmar_domain *tmp;
2647         u16 req_id, dma_alias;
2648         u8 bus, devfn;
2649
2650         iommu = device_to_iommu(dev, &bus, &devfn);
2651         if (!iommu)
2652                 return NULL;
2653
2654         req_id = ((u16)bus << 8) | devfn;
2655
2656         if (dev_is_pci(dev)) {
2657                 struct pci_dev *pdev = to_pci_dev(dev);
2658
2659                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2660
2661                 /* register PCI DMA alias device */
2662                 if (req_id != dma_alias) {
2663                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2664                                         dma_alias & 0xff, NULL, domain);
2665
2666                         if (!tmp || tmp != domain)
2667                                 return tmp;
2668                 }
2669         }
2670
2671         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2672         if (!tmp || tmp != domain)
2673                 return tmp;
2674
2675         return domain;
2676 }
2677
2678 static int iommu_domain_identity_map(struct dmar_domain *domain,
2679                                      unsigned long long start,
2680                                      unsigned long long end)
2681 {
2682         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2683         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2684
2685         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2686                           dma_to_mm_pfn(last_vpfn))) {
2687                 pr_err("Reserving iova failed\n");
2688                 return -ENOMEM;
2689         }
2690
2691         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2692         /*
2693          * The RMRR range might overlap with a physical memory range,
2694          * so clear it first.
2695          */
2696         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2697
2698         return __domain_mapping(domain, first_vpfn, NULL,
2699                                 first_vpfn, last_vpfn - first_vpfn + 1,
2700                                 DMA_PTE_READ|DMA_PTE_WRITE);
2701 }
2702
2703 static int domain_prepare_identity_map(struct device *dev,
2704                                        struct dmar_domain *domain,
2705                                        unsigned long long start,
2706                                        unsigned long long end)
2707 {
2708         /* For _hardware_ passthrough, don't bother. But for software
2709            passthrough, we do it anyway -- it may indicate a memory
2710            range which is reserved in E820 and therefore didn't get set
2711            up in si_domain to start with */
2712         if (domain == si_domain && hw_pass_through) {
2713                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2714                          start, end);
2715                 return 0;
2716         }
2717
2718         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2719
2720         if (end < start) {
2721                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2722                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2723                         dmi_get_system_info(DMI_BIOS_VENDOR),
2724                         dmi_get_system_info(DMI_BIOS_VERSION),
2725                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2726                 return -EIO;
2727         }
2728
2729         if (end >> agaw_to_width(domain->agaw)) {
2730                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2731                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2732                      agaw_to_width(domain->agaw),
2733                      dmi_get_system_info(DMI_BIOS_VENDOR),
2734                      dmi_get_system_info(DMI_BIOS_VERSION),
2735                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2736                 return -EIO;
2737         }
2738
2739         return iommu_domain_identity_map(domain, start, end);
2740 }
2741
2742 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2743
2744 static int __init si_domain_init(int hw)
2745 {
2746         struct dmar_rmrr_unit *rmrr;
2747         struct device *dev;
2748         int i, nid, ret;
2749
2750         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2751         if (!si_domain)
2752                 return -EFAULT;
2753
2754         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2755                 domain_exit(si_domain);
2756                 return -EFAULT;
2757         }
2758
2759         if (hw)
2760                 return 0;
2761
2762         for_each_online_node(nid) {
2763                 unsigned long start_pfn, end_pfn;
2764                 int i;
2765
2766                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2767                         ret = iommu_domain_identity_map(si_domain,
2768                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2769                         if (ret)
2770                                 return ret;
2771                 }
2772         }
2773
2774         /*
2775          * Normally we use DMA domains for devices which have RMRRs. But we
2776          * relax this requirement for graphics and USB devices. Identity-map
2777          * the RMRRs for graphics and USB devices so that they can use the
2778          * si_domain.
2779          */
2780         for_each_rmrr_units(rmrr) {
2781                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2782                                           i, dev) {
2783                         unsigned long long start = rmrr->base_address;
2784                         unsigned long long end = rmrr->end_address;
2785
2786                         if (device_is_rmrr_locked(dev))
2787                                 continue;
2788
2789                         if (WARN_ON(end < start ||
2790                                     end >> agaw_to_width(si_domain->agaw)))
2791                                 continue;
2792
2793                         ret = iommu_domain_identity_map(si_domain, start, end);
2794                         if (ret)
2795                                 return ret;
2796                 }
2797         }
2798
2799         return 0;
2800 }
2801
2802 static int identity_mapping(struct device *dev)
2803 {
2804         struct device_domain_info *info;
2805
2806         info = dev->archdata.iommu;
2807         if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2808                 return (info->domain == si_domain);
2809
2810         return 0;
2811 }
2812
2813 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2814 {
2815         struct dmar_domain *ndomain;
2816         struct intel_iommu *iommu;
2817         u8 bus, devfn;
2818
2819         iommu = device_to_iommu(dev, &bus, &devfn);
2820         if (!iommu)
2821                 return -ENODEV;
2822
2823         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2824         if (ndomain != domain)
2825                 return -EBUSY;
2826
2827         return 0;
2828 }
2829
2830 static bool device_has_rmrr(struct device *dev)
2831 {
2832         struct dmar_rmrr_unit *rmrr;
2833         struct device *tmp;
2834         int i;
2835
2836         rcu_read_lock();
2837         for_each_rmrr_units(rmrr) {
2838                 /*
2839                  * Return TRUE if this RMRR contains the device that
2840                  * is passed in.
2841                  */
2842                 for_each_active_dev_scope(rmrr->devices,
2843                                           rmrr->devices_cnt, i, tmp)
2844                         if (tmp == dev ||
2845                             is_downstream_to_pci_bridge(dev, tmp)) {
2846                                 rcu_read_unlock();
2847                                 return true;
2848                         }
2849         }
2850         rcu_read_unlock();
2851         return false;
2852 }
2853
2854 /**
2855  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2856  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2857  * @dev: device handle
2858  *
2859  * We assume that PCI USB devices with RMRRs have them largely
2860  * for historical reasons and that the RMRR space is not actively used post
2861  * boot.  This exclusion may change if vendors begin to abuse it.
2862  *
2863  * The same exception is made for graphics devices, with the requirement that
2864  * any use of the RMRR regions will be torn down before assigning the device
2865  * to a guest.
2866  *
2867  * Return: true if the RMRR is relaxable, false otherwise
2868  */
2869 static bool device_rmrr_is_relaxable(struct device *dev)
2870 {
2871         struct pci_dev *pdev;
2872
2873         if (!dev_is_pci(dev))
2874                 return false;
2875
2876         pdev = to_pci_dev(dev);
2877         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2878                 return true;
2879         else
2880                 return false;
2881 }
2882
2883 /*
2884  * There are a couple of cases where we need to restrict the functionality of
2885  * devices associated with RMRRs.  The first is when evaluating a device for
2886  * identity mapping because problems exist when devices are moved in and out
2887  * of domains and their respective RMRR information is lost.  This means that
2888  * a device with associated RMRRs will never be in a "passthrough" domain.
2889  * The second is use of the device through the IOMMU API.  This interface
2890  * expects to have full control of the IOVA space for the device.  We cannot
2891  * satisfy both the requirement that RMRR access is maintained and have an
2892  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2893  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2894  * We therefore prevent devices associated with an RMRR from participating in
2895  * the IOMMU API, which eliminates them from device assignment.
2896  *
2897  * In both cases, devices which have relaxable RMRRs are not concerned by this
2898  * restriction. See device_rmrr_is_relaxable comment.
2899  */
2900 static bool device_is_rmrr_locked(struct device *dev)
2901 {
2902         if (!device_has_rmrr(dev))
2903                 return false;
2904
2905         if (device_rmrr_is_relaxable(dev))
2906                 return false;
2907
2908         return true;
2909 }
2910
2911 /*
2912  * Return the required default domain type for a specific device.
2913  *
2914  * @dev: the device in question
2916  *
2917  * Returns:
2918  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2919  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2920  *  - 0: both identity and dynamic domains work for this device
2921  */
2922 static int device_def_domain_type(struct device *dev)
2923 {
2924         if (dev_is_pci(dev)) {
2925                 struct pci_dev *pdev = to_pci_dev(dev);
2926
2927                 if (device_is_rmrr_locked(dev))
2928                         return IOMMU_DOMAIN_DMA;
2929
2930                 /*
2931                  * Prevent any device marked as untrusted from getting
2932                  * placed into the static identity mapping domain.
2933                  */
2934                 if (pdev->untrusted)
2935                         return IOMMU_DOMAIN_DMA;
2936
2937                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2938                         return IOMMU_DOMAIN_IDENTITY;
2939
2940                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2941                         return IOMMU_DOMAIN_IDENTITY;
2942
2943                 /*
2944                  * We want to start off with all devices in the 1:1 domain, and
2945                  * take them out later if we find they can't access all of memory.
2946                  *
2947                  * However, we can't do this for PCI devices behind bridges,
2948                  * because all PCI devices behind the same bridge will end up
2949                  * with the same source-id on their transactions.
2950                  *
2951                  * Practically speaking, we can't change things around for these
2952                  * devices at run-time, because we can't be sure there'll be no
2953                  * DMA transactions in flight for any of their siblings.
2954                  *
2955                  * So PCI devices (unless they're on the root bus) as well as
2956                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2957                  * the 1:1 domain, just in _case_ one of their siblings turns out
2958                  * not to be able to map all of memory.
2959                  */
2960                 if (!pci_is_pcie(pdev)) {
2961                         if (!pci_is_root_bus(pdev->bus))
2962                                 return IOMMU_DOMAIN_DMA;
2963                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2964                                 return IOMMU_DOMAIN_DMA;
2965                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2966                         return IOMMU_DOMAIN_DMA;
2967         } else {
2968                 if (device_has_rmrr(dev))
2969                         return IOMMU_DOMAIN_DMA;
2970         }
2971
2972         return (iommu_identity_mapping & IDENTMAP_ALL) ?
2973                         IOMMU_DOMAIN_IDENTITY : 0;
2974 }
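
/*
 * With the checks above, for example: an untrusted device (typically one
 * behind an externally exposed port) or a device with a non-relaxable
 * RMRR is forced into IOMMU_DOMAIN_DMA; a conventional PCI device that
 * is not on the root bus is likewise kept out of the identity domain,
 * since it shares a source-id with its siblings; Azalia audio and
 * graphics devices get IOMMU_DOMAIN_IDENTITY when the corresponding
 * IDENTMAP_* flag is set.
 */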
2975
2976 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2977 {
2978         /*
2979          * Start from a sane IOMMU hardware state.
2980          * If queued invalidation was already initialized by us
2981          * (for example, while enabling interrupt-remapping), then
2982          * things are already rolling from a sane state.
2983          */
2984         if (!iommu->qi) {
2985                 /*
2986                  * Clear any previous faults.
2987                  */
2988                 dmar_fault(-1, iommu);
2989                 /*
2990                  * Disable queued invalidation if supported and already enabled
2991                  * before OS handover.
2992                  */
2993                 dmar_disable_qi(iommu);
2994         }
2995
2996         if (dmar_enable_qi(iommu)) {
2997                 /*
2998                  * Queued invalidation is not enabled; use register-based invalidation.
2999                  */
3000                 iommu->flush.flush_context = __iommu_flush_context;
3001                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3002                 pr_info("%s: Using Register based invalidation\n",
3003                         iommu->name);
3004         } else {
3005                 iommu->flush.flush_context = qi_flush_context;
3006                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3007                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3008         }
3009 }
3010
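/*
 * Copy the context table(s) for one bus from the root entry inherited
 * from the previous (crashed) kernel.  With extended root entries each
 * bus has a lower (devfn 0x00-0x7f) and an upper (devfn 0x80-0xff)
 * context table, so @tbl holds two slots per bus; for bus 3 in extended
 * mode, for example, the copies land in tbl[6] and tbl[7].  Domain IDs
 * found in present entries are reserved in iommu->domain_ids so the new
 * kernel does not hand them out again.
 */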
3011 static int copy_context_table(struct intel_iommu *iommu,
3012                               struct root_entry *old_re,
3013                               struct context_entry **tbl,
3014                               int bus, bool ext)
3015 {
3016         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3017         struct context_entry *new_ce = NULL, ce;
3018         struct context_entry *old_ce = NULL;
3019         struct root_entry re;
3020         phys_addr_t old_ce_phys;
3021
3022         tbl_idx = ext ? bus * 2 : bus;
3023         memcpy(&re, old_re, sizeof(re));
3024
3025         for (devfn = 0; devfn < 256; devfn++) {
3026                 /* First calculate the correct index */
3027                 idx = (ext ? devfn * 2 : devfn) % 256;
3028
3029                 if (idx == 0) {
3030                         /* First save what we may have and clean up */
3031                         if (new_ce) {
3032                                 tbl[tbl_idx] = new_ce;
3033                                 __iommu_flush_cache(iommu, new_ce,
3034                                                     VTD_PAGE_SIZE);
3035                                 pos = 1;
3036                         }
3037
3038                         if (old_ce)
3039                                 memunmap(old_ce);
3040
3041                         ret = 0;
3042                         if (devfn < 0x80)
3043                                 old_ce_phys = root_entry_lctp(&re);
3044                         else
3045                                 old_ce_phys = root_entry_uctp(&re);
3046
3047                         if (!old_ce_phys) {
3048                                 if (ext && devfn == 0) {
3049                                         /* No LCTP, try UCTP */
3050                                         devfn = 0x7f;
3051                                         continue;
3052                                 } else {
3053                                         goto out;
3054                                 }
3055                         }
3056
3057                         ret = -ENOMEM;
3058                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3059                                         MEMREMAP_WB);
3060                         if (!old_ce)
3061                                 goto out;
3062
3063                         new_ce = alloc_pgtable_page(iommu->node);
3064                         if (!new_ce)
3065                                 goto out_unmap;
3066
3067                         ret = 0;
3068                 }
3069
3070                 /* Now copy the context entry */
3071                 memcpy(&ce, old_ce + idx, sizeof(ce));
3072
3073                 if (!__context_present(&ce))
3074                         continue;
3075
3076                 did = context_domain_id(&ce);
3077                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3078                         set_bit(did, iommu->domain_ids);
3079
3080                 /*
3081                  * We need a marker for copied context entries. This
3082                  * marker needs to work for the old format as well as
3083                  * for extended context entries.
3084                  *
3085                  * Bit 67 of the context entry is used. In the old
3086                  * format this bit is available to software, in the
3087                  * extended format it is the PGE bit, but PGE is ignored
3088                  * by HW if PASIDs are disabled (and thus still
3089                  * available).
3090                  *
3091                  * So disable PASIDs first and then mark the entry
3092                  * copied. This means that we don't copy PASID
3093                  * translations from the old kernel, but this is fine as
3094                  * faults there are not fatal.
3095                  */
3096                 context_clear_pasid_enable(&ce);
3097                 context_set_copied(&ce);
3098
3099                 new_ce[idx] = ce;
3100         }
3101
3102         tbl[tbl_idx + pos] = new_ce;
3103
3104         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3105
3106 out_unmap:
3107         memunmap(old_ce);
3108
3109 out:
3110         return ret;
3111 }
3112
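/*
 * Copy the translation tables left behind by the previous kernel (the
 * kdump case): read the old root table address from DMAR_RTADDR_REG,
 * map it, copy every per-bus context table, and then point the new root
 * entries at the copies.  Nothing is copied if the old and new kernels
 * disagree on the extended root/context table (RTT) format, because
 * flipping that bit would require disabling translation.
 */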
3113 static int copy_translation_tables(struct intel_iommu *iommu)
3114 {
3115         struct context_entry **ctxt_tbls;
3116         struct root_entry *old_rt;
3117         phys_addr_t old_rt_phys;
3118         int ctxt_table_entries;
3119         unsigned long flags;
3120         u64 rtaddr_reg;
3121         int bus, ret;
3122         bool new_ext, ext;
3123
3124         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3125         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3126         new_ext    = !!ecap_ecs(iommu->ecap);
3127
3128         /*
3129          * The RTT bit can only be changed when translation is disabled,
3130          * but disabling translation would open a window for data
3131          * corruption. So bail out and don't copy anything if we would
3132          * have to change the bit.
3133          */
3134         if (new_ext != ext)
3135                 return -EINVAL;
3136
3137         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3138         if (!old_rt_phys)
3139                 return -EINVAL;
3140
3141         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3142         if (!old_rt)
3143                 return -ENOMEM;
3144
3145         /* This is too big for the stack - allocate it from slab */
3146         ctxt_table_entries = ext ? 512 : 256;
3147         ret = -ENOMEM;
3148         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3149         if (!ctxt_tbls)
3150                 goto out_unmap;
3151
3152         for (bus = 0; bus < 256; bus++) {
3153                 ret = copy_context_table(iommu, &old_rt[bus],
3154                                          ctxt_tbls, bus, ext);
3155                 if (ret) {
3156                         pr_err("%s: Failed to copy context table for bus %d\n",
3157                                 iommu->name, bus);
3158                         continue;
3159                 }
3160         }
3161
3162         spin_lock_irqsave(&iommu->lock, flags);
3163
3164         /* Context tables are copied, now write them to the root_entry table */
3165         for (bus = 0; bus < 256; bus++) {
3166                 int idx = ext ? bus * 2 : bus;
3167                 u64 val;
3168
3169                 if (ctxt_tbls[idx]) {
3170                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3171                         iommu->root_entry[bus].lo = val;
3172                 }
3173
3174                 if (!ext || !ctxt_tbls[idx + 1])
3175                         continue;
3176
3177                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3178                 iommu->root_entry[bus].hi = val;
3179         }
3180
3181         spin_unlock_irqrestore(&iommu->lock, flags);
3182
3183         kfree(ctxt_tbls);
3184
3185         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3186
3187         ret = 0;
3188
3189 out_unmap:
3190         memunmap(old_rt);
3191
3192         return ret;
3193 }
3194
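/*
 * init_dmars() brings up every DMAR unit at boot: it sizes the global
 * g_iommus array, sets up queued invalidation and the domain-ID space
 * for each unit, allocates a root entry table (or copies the old one in
 * a kdump kernel), installs the root entries with global context and
 * IOTLB flushes, and finally enables fault interrupts and, where
 * supported, the SVM page-request queue.
 */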
3195 static int __init init_dmars(void)
3196 {
3197         struct dmar_drhd_unit *drhd;
3198         struct intel_iommu *iommu;
3199         int ret;
3200
3201         /*
3202          * for each drhd
3203          *    allocate root
3204          *    initialize and program root entry to not present
3205          * endfor
3206          */
3207         for_each_drhd_unit(drhd) {
3208                 /*
3209                  * No lock is needed: this is only incremented in the
3210                  * single-threaded kernel __init code path; all other
3211                  * accesses are read only.
3212                  */
3213                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3214                         g_num_of_iommus++;
3215                         continue;
3216                 }
3217                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3218         }
3219
3220         /* Preallocate enough resources for IOMMU hot-addition */
3221         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3222                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3223
3224         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3225                         GFP_KERNEL);
3226         if (!g_iommus) {
3227                 pr_err("Allocating global iommu array failed\n");
3228                 ret = -ENOMEM;
3229                 goto error;
3230         }
3231
3232         for_each_iommu(iommu, drhd) {
3233                 if (drhd->ignored) {
3234                         iommu_disable_translation(iommu);
3235                         continue;
3236                 }
3237
3238                 /*
3239                  * Find the max PASID size of all IOMMUs in the system.
3240                  * We need to ensure the system-wide PASID table is no
3241                  * bigger than the smallest size any IOMMU supports.
3242                  */
3243                 if (pasid_supported(iommu)) {
3244                         u32 temp = 2 << ecap_pss(iommu->ecap);
3245
3246                         intel_pasid_max_id = min_t(u32, temp,
3247                                                    intel_pasid_max_id);
3248                 }
3249
3250                 g_iommus[iommu->seq_id] = iommu;
3251
3252                 intel_iommu_init_qi(iommu);
3253
3254                 ret = iommu_init_domains(iommu);
3255                 if (ret)
3256                         goto free_iommu;
3257
3258                 init_translation_status(iommu);
3259
3260                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3261                         iommu_disable_translation(iommu);
3262                         clear_translation_pre_enabled(iommu);
3263                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3264                                 iommu->name);
3265                 }
3266
3267                 /*
3268                  * TBD:
3269                  * we could share the same root & context tables
3270                  * among all IOMMUs; this needs to be split out later.
3271                  */
3272                 ret = iommu_alloc_root_entry(iommu);
3273                 if (ret)
3274                         goto free_iommu;
3275
3276                 if (translation_pre_enabled(iommu)) {
3277                         pr_info("Translation already enabled - trying to copy translation structures\n");
3278
3279                         ret = copy_translation_tables(iommu);
3280                         if (ret) {
3281                                 /*
3282                                  * We found the IOMMU with translation
3283                                  * enabled - but failed to copy over the
3284                                  * old root-entry table. Try to proceed
3285                                  * by disabling translation now and
3286                                  * allocating a clean root-entry table.
3287                                  * This might cause DMAR faults, but
3288                                  * probably the dump will still succeed.
3289                                  */
3290                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3291                                        iommu->name);
3292                                 iommu_disable_translation(iommu);
3293                                 clear_translation_pre_enabled(iommu);
3294                         } else {
3295                                 pr_info("Copied translation tables from previous kernel for %s\n",
3296                                         iommu->name);
3297                         }
3298                 }
3299
3300                 if (!ecap_pass_through(iommu->ecap))
3301                         hw_pass_through = 0;
3302                 intel_svm_check(iommu);
3303         }
3304
3305         /*
3306          * Now that qi is enabled on all iommus, set the root entry and flush
3307          * caches. This is required on some Intel X58 chipsets, otherwise the
3308          * flush_context function will loop forever and the boot hangs.
3309          */
3310         for_each_active_iommu(iommu, drhd) {
3311                 iommu_flush_write_buffer(iommu);
3312                 iommu_set_root_entry(iommu);
3313                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3314                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3315         }
3316
3317         if (iommu_default_passthrough())
3318                 iommu_identity_mapping |= IDENTMAP_ALL;
3319
3320 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3321         dmar_map_gfx = 0;
3322 #endif
3323
3324         if (!dmar_map_gfx)
3325                 iommu_identity_mapping |= IDENTMAP_GFX;
3326
3327         check_tylersburg_isoch();
3328
3329         ret = si_domain_init(hw_pass_through);
3330         if (ret)
3331                 goto free_iommu;
3332
3333         /*
3334          * for each drhd
3335          *   enable fault log
3336          *   global invalidate context cache
3337          *   global invalidate iotlb
3338          *   enable translation
3339          */
3340         for_each_iommu(iommu, drhd) {
3341                 if (drhd->ignored) {
3342                         /*
3343                          * we always have to disable PMRs or DMA may fail on
3344                          * this device
3345                          */
3346                         if (force_on)
3347                                 iommu_disable_protect_mem_regions(iommu);
3348                         continue;
3349                 }
3350
3351                 iommu_flush_write_buffer(iommu);
3352
3353 #ifdef CONFIG_INTEL_IOMMU_SVM
3354                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3355                         /*
3356                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3357                          * could cause a lock race, so drop the lock around this call.
3358                          */
3359                         up_write(&dmar_global_lock);
3360                         ret = intel_svm_enable_prq(iommu);
3361                         down_write(&dmar_global_lock);
3362                         if (ret)
3363                                 goto free_iommu;
3364                 }
3365 #endif
3366                 ret = dmar_set_interrupt(iommu);
3367                 if (ret)
3368                         goto free_iommu;
3369         }
3370
3371         return 0;
3372
3373 free_iommu:
3374         for_each_active_iommu(iommu, drhd) {
3375                 disable_dmar_iommu(iommu);
3376                 free_dmar_iommu(iommu);
3377         }
3378
3379         kfree(g_iommus);
3380
3381 error:
3382         return ret;
3383 }
3384
3385 /* This takes a number of _MM_ pages, not VTD pages */
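/*
 * The request is rounded up to a power of two so the IOVA range is
 * size-aligned (a 3-page request reserves 4 pages, for instance).  For
 * a device with a DMA mask wider than 32 bits we first try to allocate
 * below 4GiB, unless forcedac is set, and only then fall back to the
 * full range the domain supports.
 */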
3386 static unsigned long intel_alloc_iova(struct device *dev,
3387                                      struct dmar_domain *domain,
3388                                      unsigned long nrpages, uint64_t dma_mask)
3389 {
3390         unsigned long iova_pfn;
3391
3392         /* Restrict dma_mask to the width that the iommu can handle */
3393         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3394         /* Ensure we reserve the whole size-aligned region */
3395         nrpages = __roundup_pow_of_two(nrpages);
3396
3397         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3398                 /*
3399                  * First try to allocate an IO virtual address within
3400                  * DMA_BIT_MASK(32); if that fails, try allocating from
3401                  * the higher range.
3402                  */
3403                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3404                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3405                 if (iova_pfn)
3406                         return iova_pfn;
3407         }
3408         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3409                                    IOVA_PFN(dma_mask), true);
3410         if (unlikely(!iova_pfn)) {
3411                 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3412                 return 0;
3413         }
3414
3415         return iova_pfn;
3416 }
3417
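/*
 * Allocate a private DMA-API domain for a device whose group could not
 * switch to a DMA default domain (see iommu_need_mapping()).  Any RMRR
 * regions covering the device are identity-mapped into the new domain
 * before it is attached.
 */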
3418 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3419 {
3420         struct dmar_domain *domain, *tmp;
3421         struct dmar_rmrr_unit *rmrr;
3422         struct device *i_dev;
3423         int i, ret;
3424
3425         /* The device should not already be attached to any domain. */
3426         domain = find_domain(dev);
3427         if (domain)
3428                 return NULL;
3429
3430         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3431         if (!domain)
3432                 goto out;
3433
3434         /* We have a new domain - setup possible RMRRs for the device */
3435         rcu_read_lock();
3436         for_each_rmrr_units(rmrr) {
3437                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3438                                           i, i_dev) {
3439                         if (i_dev != dev)
3440                                 continue;
3441
3442                         ret = domain_prepare_identity_map(dev, domain,
3443                                                           rmrr->base_address,
3444                                                           rmrr->end_address);
3445                         if (ret)
3446                                 dev_err(dev, "Mapping reserved region failed\n");
3447                 }
3448         }
3449         rcu_read_unlock();
3450
3451         tmp = set_domain_for_dev(dev, domain);
3452         if (!tmp || domain != tmp) {
3453                 domain_exit(domain);
3454                 domain = tmp;
3455         }
3456
3457 out:
3458         if (!domain)
3459                 dev_err(dev, "Allocating domain failed\n");
3460         else
3461                 domain->domain.type = IOMMU_DOMAIN_DMA;
3462
3463         return domain;
3464 }
3465
3466 /* Check if the device needs to go through the non-identity map/unmap path. */
3467 static bool iommu_need_mapping(struct device *dev)
3468 {
3469         int ret;
3470
3471         if (iommu_dummy(dev))
3472                 return false;
3473
3474         ret = identity_mapping(dev);
3475         if (ret) {
3476                 u64 dma_mask = *dev->dma_mask;
3477
3478                 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3479                         dma_mask = dev->coherent_dma_mask;
3480
3481                 if (dma_mask >= dma_direct_get_required_mask(dev))
3482                         return false;
3483
3484                 /*
3485                  * 32 bit DMA devices are removed from si_domain and fall
3486                  * back to non-identity mapping.
3487                  */
3488                 dmar_remove_one_dev_info(dev);
3489                 ret = iommu_request_dma_domain_for_dev(dev);
3490                 if (ret) {
3491                         struct iommu_domain *domain;
3492                         struct dmar_domain *dmar_domain;
3493
3494                         domain = iommu_get_domain_for_dev(dev);
3495                         if (domain) {
3496                                 dmar_domain = to_dmar_domain(domain);
3497                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3498                         }
3499                         dmar_remove_one_dev_info(dev);
3500                         get_private_domain_for_dev(dev);
3501                 }
3502
3503                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3504         }
3505
3506         return true;
3507 }
3508
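/*
 * Map @size bytes at @paddr for DMA: allocate an IOVA range that fits
 * within @dma_mask, derive read/write permissions from @dir (forcing
 * read access when the hardware cannot do zero-length reads), map whole
 * VT-d pages, and return the IOVA plus the sub-page offset of @paddr.
 */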
3509 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3510                                      size_t size, int dir, u64 dma_mask)
3511 {
3512         struct dmar_domain *domain;
3513         phys_addr_t start_paddr;
3514         unsigned long iova_pfn;
3515         int prot = 0;
3516         int ret;
3517         struct intel_iommu *iommu;
3518         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3519
3520         BUG_ON(dir == DMA_NONE);
3521
3522         domain = deferred_attach_domain(dev);
3523         if (!domain)
3524                 return DMA_MAPPING_ERROR;
3525
3526         iommu = domain_get_iommu(domain);
3527         size = aligned_nrpages(paddr, size);
3528
3529         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3530         if (!iova_pfn)
3531                 goto error;
3532
3533         /*
3534          * Check if DMAR supports zero-length reads on write-only
3535          * mappings.
3536          */
3537         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3538                         !cap_zlr(iommu->cap))
3539                 prot |= DMA_PTE_READ;
3540         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3541                 prot |= DMA_PTE_WRITE;
3542         /*
3543          * paddr .. (paddr + size) might span only part of a page, but we should
3544          * map the whole page.  Note: if two parts of one page are mapped
3545          * separately, we might end up with two guest addresses mapping to the
3546          * same host paddr, but this is not a big problem.
3547          */
3548         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3549                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3550         if (ret)
3551                 goto error;
3552
3553         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3554         start_paddr += paddr & ~PAGE_MASK;
3555
3556         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3557
3558         return start_paddr;
3559
3560 error:
3561         if (iova_pfn)
3562                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3563         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3564                 size, (unsigned long long)paddr, dir);
3565         return DMA_MAPPING_ERROR;
3566 }
3567
3568 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3569                                  unsigned long offset, size_t size,
3570                                  enum dma_data_direction dir,
3571                                  unsigned long attrs)
3572 {
3573         if (iommu_need_mapping(dev))
3574                 return __intel_map_single(dev, page_to_phys(page) + offset,
3575                                 size, dir, *dev->dma_mask);
3576         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3577 }
3578
3579 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3580                                      size_t size, enum dma_data_direction dir,
3581                                      unsigned long attrs)
3582 {
3583         if (iommu_need_mapping(dev))
3584                 return __intel_map_single(dev, phys_addr, size, dir,
3585                                 *dev->dma_mask);
3586         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3587 }
3588
3589 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3590 {
3591         struct dmar_domain *domain;
3592         unsigned long start_pfn, last_pfn;
3593         unsigned long nrpages;
3594         unsigned long iova_pfn;
3595         struct intel_iommu *iommu;
3596         struct page *freelist;
3597         struct pci_dev *pdev = NULL;
3598
3599         domain = find_domain(dev);
3600         BUG_ON(!domain);
3601
3602         iommu = domain_get_iommu(domain);
3603
3604         iova_pfn = IOVA_PFN(dev_addr);
3605
3606         nrpages = aligned_nrpages(dev_addr, size);
3607         start_pfn = mm_to_dma_pfn(iova_pfn);
3608         last_pfn = start_pfn + nrpages - 1;
3609
3610         if (dev_is_pci(dev))
3611                 pdev = to_pci_dev(dev);
3612
3613         freelist = domain_unmap(domain, start_pfn, last_pfn);
3614         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3615                         !has_iova_flush_queue(&domain->iovad)) {
3616                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3617                                       nrpages, !freelist, 0);
3618                 /* free iova */
3619                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3620                 dma_free_pagelist(freelist);
3621         } else {
3622                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3623                            (unsigned long)freelist);
3624                 /*
3625                  * Queue up the release of the unmapped range to save the
3626                  * roughly 1/6th of CPU time consumed by a synchronous IOTLB flush.
3627                  */
3628         }
3629
3630         trace_unmap_single(dev, dev_addr, size);
3631 }
3632
3633 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3634                              size_t size, enum dma_data_direction dir,
3635                              unsigned long attrs)
3636 {
3637         if (iommu_need_mapping(dev))
3638                 intel_unmap(dev, dev_addr, size);
3639         else
3640                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3641 }
3642
3643 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3644                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3645 {
3646         if (iommu_need_mapping(dev))
3647                 intel_unmap(dev, dev_addr, size);
3648 }
3649
3650 static void *intel_alloc_coherent(struct device *dev, size_t size,
3651                                   dma_addr_t *dma_handle, gfp_t flags,
3652                                   unsigned long attrs)
3653 {
3654         struct page *page = NULL;
3655         int order;
3656
3657         if (!iommu_need_mapping(dev))
3658                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3659
3660         size = PAGE_ALIGN(size);
3661         order = get_order(size);
3662
3663         if (gfpflags_allow_blocking(flags)) {
3664                 unsigned int count = size >> PAGE_SHIFT;
3665
3666                 page = dma_alloc_from_contiguous(dev, count, order,
3667                                                  flags & __GFP_NOWARN);
3668         }
3669
3670         if (!page)
3671                 page = alloc_pages(flags, order);
3672         if (!page)
3673                 return NULL;
3674         memset(page_address(page), 0, size);
3675
3676         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3677                                          DMA_BIDIRECTIONAL,
3678                                          dev->coherent_dma_mask);
3679         if (*dma_handle != DMA_MAPPING_ERROR)
3680                 return page_address(page);
3681         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3682                 __free_pages(page, order);
3683
3684         return NULL;
3685 }
3686
3687 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3688                                 dma_addr_t dma_handle, unsigned long attrs)
3689 {
3690         int order;
3691         struct page *page = virt_to_page(vaddr);
3692
3693         if (!iommu_need_mapping(dev))
3694                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3695
3696         size = PAGE_ALIGN(size);
3697         order = get_order(size);
3698
3699         intel_unmap(dev, dma_handle, size);
3700         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3701                 __free_pages(page, order);
3702 }
3703
3704 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3705                            int nelems, enum dma_data_direction dir,
3706                            unsigned long attrs)
3707 {
3708         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3709         unsigned long nrpages = 0;
3710         struct scatterlist *sg;
3711         int i;
3712
3713         if (!iommu_need_mapping(dev))
3714                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3715
3716         for_each_sg(sglist, sg, nelems, i) {
3717                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3718         }
3719
3720         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3721
3722         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3723 }
3724
3725 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3726                         enum dma_data_direction dir, unsigned long attrs)
3727 {
3728         int i;
3729         struct dmar_domain *domain;
3730         size_t size = 0;
3731         int prot = 0;
3732         unsigned long iova_pfn;
3733         int ret;
3734         struct scatterlist *sg;
3735         unsigned long start_vpfn;
3736         struct intel_iommu *iommu;
3737
3738         BUG_ON(dir == DMA_NONE);
3739         if (!iommu_need_mapping(dev))
3740                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3741
3742         domain = deferred_attach_domain(dev);
3743         if (!domain)
3744                 return 0;
3745
3746         iommu = domain_get_iommu(domain);
3747
3748         for_each_sg(sglist, sg, nelems, i)
3749                 size += aligned_nrpages(sg->offset, sg->length);
3750
3751         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3752                                 *dev->dma_mask);
3753         if (!iova_pfn) {
3754                 sglist->dma_length = 0;
3755                 return 0;
3756         }
3757
3758         /*
3759          * Check if DMAR supports zero-length reads on write-only
3760          * mappings.
3761          */
3762         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3763                         !cap_zlr(iommu->cap))
3764                 prot |= DMA_PTE_READ;
3765         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3766                 prot |= DMA_PTE_WRITE;
3767
3768         start_vpfn = mm_to_dma_pfn(iova_pfn);
3769
3770         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3771         if (unlikely(ret)) {
3772                 dma_pte_free_pagetable(domain, start_vpfn,
3773                                        start_vpfn + size - 1,
3774                                        agaw_to_level(domain->agaw) + 1);
3775                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3776                 return 0;
3777         }
3778
3779         trace_map_sg(dev, iova_pfn << PAGE_SHIFT,
3780                      sg_phys(sglist), size << VTD_PAGE_SHIFT);
3781
3782         return nelems;
3783 }
3784
3785 static u64 intel_get_required_mask(struct device *dev)
3786 {
3787         if (!iommu_need_mapping(dev))
3788                 return dma_direct_get_required_mask(dev);
3789         return DMA_BIT_MASK(32);
3790 }
3791
3792 static const struct dma_map_ops intel_dma_ops = {
3793         .alloc = intel_alloc_coherent,
3794         .free = intel_free_coherent,
3795         .map_sg = intel_map_sg,
3796         .unmap_sg = intel_unmap_sg,
3797         .map_page = intel_map_page,
3798         .unmap_page = intel_unmap_page,
3799         .map_resource = intel_map_resource,
3800         .unmap_resource = intel_unmap_resource,
3801         .dma_supported = dma_direct_supported,
3802         .mmap = dma_common_mmap,
3803         .get_sgtable = dma_common_get_sgtable,
3804         .get_required_mask = intel_get_required_mask,
3805 };
3806
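/*
 * Drivers do not call these entry points directly; they go through the
 * generic DMA API once a device's dma_ops point at one of the tables in
 * this file.  A typical driver call such as
 *
 *	dma_addr_t handle = dma_map_page(dev, page, 0, len, DMA_TO_DEVICE);
 *
 * ends up in intel_map_page() (or bounce_map_page() for a device using
 * the bounce ops below) and falls through to the dma-direct path when
 * iommu_need_mapping() decides the device is identity mapped.
 *
 * The bounce_* helpers below use swiotlb to bounce any buffer whose
 * start or size is not VT-d page aligned, so that mapping whole VT-d
 * pages never exposes unrelated data to the device.
 */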
3807 static void
3808 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3809                    enum dma_data_direction dir, enum dma_sync_target target)
3810 {
3811         struct dmar_domain *domain;
3812         phys_addr_t tlb_addr;
3813
3814         domain = find_domain(dev);
3815         if (WARN_ON(!domain))
3816                 return;
3817
3818         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3819         if (is_swiotlb_buffer(tlb_addr))
3820                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3821 }
3822
3823 static dma_addr_t
3824 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3825                   enum dma_data_direction dir, unsigned long attrs,
3826                   u64 dma_mask)
3827 {
3828         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3829         struct dmar_domain *domain;
3830         struct intel_iommu *iommu;
3831         unsigned long iova_pfn;
3832         unsigned long nrpages;
3833         phys_addr_t tlb_addr;
3834         int prot = 0;
3835         int ret;
3836
3837         domain = deferred_attach_domain(dev);
3838         if (WARN_ON(dir == DMA_NONE || !domain))
3839                 return DMA_MAPPING_ERROR;
3840
3841         iommu = domain_get_iommu(domain);
3842         if (WARN_ON(!iommu))
3843                 return DMA_MAPPING_ERROR;
3844
3845         nrpages = aligned_nrpages(0, size);
3846         iova_pfn = intel_alloc_iova(dev, domain,
3847                                     dma_to_mm_pfn(nrpages), dma_mask);
3848         if (!iova_pfn)
3849                 return DMA_MAPPING_ERROR;
3850
3851         /*
3852          * Check if DMAR supports zero-length reads on write-only
3853          * mappings.
3854          */
3855         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3856                         !cap_zlr(iommu->cap))
3857                 prot |= DMA_PTE_READ;
3858         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3859                 prot |= DMA_PTE_WRITE;
3860
3861         /*
3862          * If both the physical buffer start address and size are
3863          * page aligned, we don't need to use a bounce page.
3864          */
3865         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3866                 tlb_addr = swiotlb_tbl_map_single(dev,
3867                                 __phys_to_dma(dev, io_tlb_start),
3868                                 paddr, size, aligned_size, dir, attrs);
3869                 if (tlb_addr == DMA_MAPPING_ERROR) {
3870                         goto swiotlb_error;
3871                 } else {
3872                         /* Cleanup the padding area. */
3873                         void *padding_start = phys_to_virt(tlb_addr);
3874                         size_t padding_size = aligned_size;
3875
3876                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3877                             (dir == DMA_TO_DEVICE ||
3878                              dir == DMA_BIDIRECTIONAL)) {
3879                                 padding_start += size;
3880                                 padding_size -= size;
3881                         }
3882
3883                         memset(padding_start, 0, padding_size);
3884                 }
3885         } else {
3886                 tlb_addr = paddr;
3887         }
3888
3889         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3890                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3891         if (ret)
3892                 goto mapping_error;
3893
3894         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3895
3896         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3897
3898 mapping_error:
3899         if (is_swiotlb_buffer(tlb_addr))
3900                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3901                                          aligned_size, dir, attrs);
3902 swiotlb_error:
3903         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3904         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3905                 size, (unsigned long long)paddr, dir);
3906
3907         return DMA_MAPPING_ERROR;
3908 }
3909
3910 static void
3911 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3912                     enum dma_data_direction dir, unsigned long attrs)
3913 {
3914         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3915         struct dmar_domain *domain;
3916         phys_addr_t tlb_addr;
3917
3918         domain = find_domain(dev);
3919         if (WARN_ON(!domain))
3920                 return;
3921
3922         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3923         if (WARN_ON(!tlb_addr))
3924                 return;
3925
3926         intel_unmap(dev, dev_addr, size);
3927         if (is_swiotlb_buffer(tlb_addr))
3928                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3929                                          aligned_size, dir, attrs);
3930
3931         trace_bounce_unmap_single(dev, dev_addr, size);
3932 }
3933
3934 static dma_addr_t
3935 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3936                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3937 {
3938         return bounce_map_single(dev, page_to_phys(page) + offset,
3939                                  size, dir, attrs, *dev->dma_mask);
3940 }
3941
3942 static dma_addr_t
3943 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3944                     enum dma_data_direction dir, unsigned long attrs)
3945 {
3946         return bounce_map_single(dev, phys_addr, size,
3947                                  dir, attrs, *dev->dma_mask);
3948 }
3949
3950 static void
3951 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3952                   enum dma_data_direction dir, unsigned long attrs)
3953 {
3954         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3955 }
3956
3957 static void
3958 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3959                       enum dma_data_direction dir, unsigned long attrs)
3960 {
3961         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3962 }
3963
3964 static void
3965 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3966                 enum dma_data_direction dir, unsigned long attrs)
3967 {
3968         struct scatterlist *sg;
3969         int i;
3970
3971         for_each_sg(sglist, sg, nelems, i)
3972                 bounce_unmap_page(dev, sg->dma_address,
3973                                   sg_dma_len(sg), dir, attrs);
3974 }
3975
3976 static int
3977 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3978               enum dma_data_direction dir, unsigned long attrs)
3979 {
3980         int i;
3981         struct scatterlist *sg;
3982
3983         for_each_sg(sglist, sg, nelems, i) {
3984                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3985                                                   sg->offset, sg->length,
3986                                                   dir, attrs);
3987                 if (sg->dma_address == DMA_MAPPING_ERROR)
3988                         goto out_unmap;
3989                 sg_dma_len(sg) = sg->length;
3990         }
3991
3992         return nelems;
3993
3994 out_unmap:
3995         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3996         return 0;
3997 }
3998
3999 static void
4000 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4001                            size_t size, enum dma_data_direction dir)
4002 {
4003         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4004 }
4005
4006 static void
4007 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4008                               size_t size, enum dma_data_direction dir)
4009 {
4010         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4011 }
4012
4013 static void
4014 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4015                        int nelems, enum dma_data_direction dir)
4016 {
4017         struct scatterlist *sg;
4018         int i;
4019
4020         for_each_sg(sglist, sg, nelems, i)
4021                 bounce_sync_single(dev, sg_dma_address(sg),
4022                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
4023 }
4024
4025 static void
4026 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4027                           int nelems, enum dma_data_direction dir)
4028 {
4029         struct scatterlist *sg;
4030         int i;
4031
4032         for_each_sg(sglist, sg, nelems, i)
4033                 bounce_sync_single(dev, sg_dma_address(sg),
4034                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4035 }
4036
4037 static const struct dma_map_ops bounce_dma_ops = {
4038         .alloc                  = intel_alloc_coherent,
4039         .free                   = intel_free_coherent,
4040         .map_sg                 = bounce_map_sg,
4041         .unmap_sg               = bounce_unmap_sg,
4042         .map_page               = bounce_map_page,
4043         .unmap_page             = bounce_unmap_page,
4044         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
4045         .sync_single_for_device = bounce_sync_single_for_device,
4046         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4047         .sync_sg_for_device     = bounce_sync_sg_for_device,
4048         .map_resource           = bounce_map_resource,
4049         .unmap_resource         = bounce_unmap_resource,
4050         .dma_supported          = dma_direct_supported,
4051 };
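
/*
 * Unlike intel_dma_ops, the bounce ops also provide sync_* callbacks:
 * when a buffer is bounced through a swiotlb slot, the CPU-visible copy
 * and the device-visible copy must be synchronized explicitly around
 * each transfer.  Coherent allocations go through the same
 * intel_*_coherent helpers as above, since those already work on whole,
 * page-aligned pages.
 */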
4052
4053 static inline int iommu_domain_cache_init(void)
4054 {
4055         int ret = 0;
4056
4057         iommu_domain_cache = kmem_cache_create("iommu_domain",
4058                                          sizeof(struct dmar_domain),
4059                                          0,
4060                                          SLAB_HWCACHE_ALIGN,
4061
4062                                          NULL);
4063         if (!iommu_domain_cache) {
4064                 pr_err("Couldn't create iommu_domain cache\n");
4065                 ret = -ENOMEM;
4066         }
4067
4068         return ret;
4069 }
4070
4071 static inline int iommu_devinfo_cache_init(void)
4072 {
4073         int ret = 0;
4074
4075         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4076                                          sizeof(struct device_domain_info),
4077                                          0,
4078                                          SLAB_HWCACHE_ALIGN,
4079                                          NULL);
4080         if (!iommu_devinfo_cache) {
4081                 pr_err("Couldn't create devinfo cache\n");
4082                 ret = -ENOMEM;
4083         }
4084
4085         return ret;
4086 }
4087
4088 static int __init iommu_init_mempool(void)
4089 {
4090         int ret;
4091         ret = iova_cache_get();
4092         if (ret)
4093                 return ret;
4094
4095         ret = iommu_domain_cache_init();
4096         if (ret)
4097                 goto domain_error;
4098
4099         ret = iommu_devinfo_cache_init();
4100         if (!ret)
4101                 return ret;
4102
4103         kmem_cache_destroy(iommu_domain_cache);
4104 domain_error:
4105         iova_cache_put();
4106
4107         return -ENOMEM;
4108 }
4109
4110 static void __init iommu_exit_mempool(void)
4111 {
4112         kmem_cache_destroy(iommu_devinfo_cache);
4113         kmem_cache_destroy(iommu_domain_cache);
4114         iova_cache_put();
4115 }
4116
4117 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4118 {
4119         struct dmar_drhd_unit *drhd;
4120         u32 vtbar;
4121         int rc;
4122
4123         /* We know that this device on this chipset has its own IOMMU.
4124          * If we find it under a different IOMMU, then the BIOS is lying
4125          * to us. Hope that the IOMMU for this device is actually
4126          * disabled, and it needs no translation...
4127          */
4128         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4129         if (rc) {
4130                 /* "can't" happen */
4131                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4132                 return;
4133         }
4134         vtbar &= 0xffff0000;
4135
4136         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
4137         drhd = dmar_find_matched_drhd_unit(pdev);
4138         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4139                             TAINT_FIRMWARE_WORKAROUND,
4140                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4141                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4142 }
4143 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4144
4145 static void __init init_no_remapping_devices(void)
4146 {
4147         struct dmar_drhd_unit *drhd;
4148         struct device *dev;
4149         int i;
4150
4151         for_each_drhd_unit(drhd) {
4152                 if (!drhd->include_all) {
4153                         for_each_active_dev_scope(drhd->devices,
4154                                                   drhd->devices_cnt, i, dev)
4155                                 break;
4156                         /* ignore DMAR unit if no devices exist */
4157                         if (i == drhd->devices_cnt)
4158                                 drhd->ignored = 1;
4159                 }
4160         }
4161
4162         for_each_active_drhd_unit(drhd) {
4163                 if (drhd->include_all)
4164                         continue;
4165
4166                 for_each_active_dev_scope(drhd->devices,
4167                                           drhd->devices_cnt, i, dev)
4168                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4169                                 break;
4170                 if (i < drhd->devices_cnt)
4171                         continue;
4172
4173                 /* This IOMMU has *only* gfx devices. If graphics mapping is
4174                    disabled, bypass the whole DMAR unit. */
4175                 if (!dmar_map_gfx) {
4176                         drhd->ignored = 1;
4177                         for_each_active_dev_scope(drhd->devices,
4178                                                   drhd->devices_cnt, i, dev)
4179                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4180                 }
4181         }
4182 }
4183
4184 #ifdef CONFIG_SUSPEND
4185 static int init_iommu_hw(void)
4186 {
4187         struct dmar_drhd_unit *drhd;
4188         struct intel_iommu *iommu = NULL;
4189
4190         for_each_active_iommu(iommu, drhd)
4191                 if (iommu->qi)
4192                         dmar_reenable_qi(iommu);
4193
4194         for_each_iommu(iommu, drhd) {
4195                 if (drhd->ignored) {
4196                         /*
4197                          * we always have to disable PMRs or DMA may fail on
4198                          * this device
4199                          */
4200                         if (force_on)
4201                                 iommu_disable_protect_mem_regions(iommu);
4202                         continue;
4203                 }
4204
4205                 iommu_flush_write_buffer(iommu);
4206
4207                 iommu_set_root_entry(iommu);
4208
4209                 iommu->flush.flush_context(iommu, 0, 0, 0,
4210                                            DMA_CCMD_GLOBAL_INVL);
4211                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4212                 iommu_enable_translation(iommu);
4213                 iommu_disable_protect_mem_regions(iommu);
4214         }
4215
4216         return 0;
4217 }
4218
4219 static void iommu_flush_all(void)
4220 {
4221         struct dmar_drhd_unit *drhd;
4222         struct intel_iommu *iommu;
4223
4224         for_each_active_iommu(iommu, drhd) {
4225                 iommu->flush.flush_context(iommu, 0, 0, 0,
4226                                            DMA_CCMD_GLOBAL_INVL);
4227                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4228                                          DMA_TLB_GLOBAL_FLUSH);
4229         }
4230 }
4231
4232 static int iommu_suspend(void)
4233 {
4234         struct dmar_drhd_unit *drhd;
4235         struct intel_iommu *iommu = NULL;
4236         unsigned long flag;
4237
4238         for_each_active_iommu(iommu, drhd) {
4239                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4240                                                  GFP_ATOMIC);
4241                 if (!iommu->iommu_state)
4242                         goto nomem;
4243         }
4244
4245         iommu_flush_all();
4246
4247         for_each_active_iommu(iommu, drhd) {
4248                 iommu_disable_translation(iommu);
4249
4250                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4251
4252                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4253                         readl(iommu->reg + DMAR_FECTL_REG);
4254                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4255                         readl(iommu->reg + DMAR_FEDATA_REG);
4256                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4257                         readl(iommu->reg + DMAR_FEADDR_REG);
4258                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4259                         readl(iommu->reg + DMAR_FEUADDR_REG);
4260
4261                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4262         }
4263         return 0;
4264
4265 nomem:
4266         for_each_active_iommu(iommu, drhd)
4267                 kfree(iommu->iommu_state);
4268
4269         return -ENOMEM;
4270 }
4271
4272 static void iommu_resume(void)
4273 {
4274         struct dmar_drhd_unit *drhd;
4275         struct intel_iommu *iommu = NULL;
4276         unsigned long flag;
4277
4278         if (init_iommu_hw()) {
4279                 if (force_on)
4280                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4281                 else
4282                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4283                 return;
4284         }
4285
4286         for_each_active_iommu(iommu, drhd) {
4287
4288                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4289
4290                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4291                         iommu->reg + DMAR_FECTL_REG);
4292                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4293                         iommu->reg + DMAR_FEDATA_REG);
4294                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4295                         iommu->reg + DMAR_FEADDR_REG);
4296                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4297                         iommu->reg + DMAR_FEUADDR_REG);
4298
4299                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4300         }
4301
4302         for_each_active_iommu(iommu, drhd)
4303                 kfree(iommu->iommu_state);
4304 }
4305
4306 static struct syscore_ops iommu_syscore_ops = {
4307         .resume         = iommu_resume,
4308         .suspend        = iommu_suspend,
4309 };
4310
4311 static void __init init_iommu_pm_ops(void)
4312 {
4313         register_syscore_ops(&iommu_syscore_ops);
4314 }
4315
4316 #else
4317 static inline void init_iommu_pm_ops(void) {}
4318 #endif  /* CONFIG_SUSPEND */
4319
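/*
 * Parse one ACPI RMRR structure into a dmar_rmrr_unit: record the
 * reserved range and the device scope it applies to, and add it to
 * dmar_rmrr_units so the region can later be identity-mapped for the
 * affected devices.
 */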
4320 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4321 {
4322         struct acpi_dmar_reserved_memory *rmrr;
4323         struct dmar_rmrr_unit *rmrru;
4324         int ret;
4325
4326         rmrr = (struct acpi_dmar_reserved_memory *)header;
4327         ret = arch_rmrr_sanity_check(rmrr);
4328         if (ret)
4329                 return ret;
4330
4331         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4332         if (!rmrru)
4333                 goto out;
4334
4335         rmrru->hdr = header;
4336
4337         rmrru->base_address = rmrr->base_address;
4338         rmrru->end_address = rmrr->end_address;
4339
4340         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4341                                 ((void *)rmrr) + rmrr->header.length,
4342                                 &rmrru->devices_cnt);
4343         if (rmrru->devices_cnt && rmrru->devices == NULL)
4344                 goto free_rmrru;
4345
4346         list_add(&rmrru->list, &dmar_rmrr_units);
4347
4348         return 0;
4349 free_rmrru:
4350         kfree(rmrru);
4351 out:
4352         return -ENOMEM;
4353 }
4354
4355 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4356 {
4357         struct dmar_atsr_unit *atsru;
4358         struct acpi_dmar_atsr *tmp;
4359
4360         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4361                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4362                 if (atsr->segment != tmp->segment)
4363                         continue;
4364                 if (atsr->header.length != tmp->header.length)
4365                         continue;
4366                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4367                         return atsru;
4368         }
4369
4370         return NULL;
4371 }
4372
4373 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4374 {
4375         struct acpi_dmar_atsr *atsr;
4376         struct dmar_atsr_unit *atsru;
4377
4378         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4379                 return 0;
4380
4381         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4382         atsru = dmar_find_atsr(atsr);
4383         if (atsru)
4384                 return 0;
4385
4386         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4387         if (!atsru)
4388                 return -ENOMEM;
4389
4390         /*
4391          * If memory is allocated from slab by ACPI _DSM method, we need to
4392          * copy the memory content because the memory buffer will be freed
4393          * on return.
4394          */
4395         atsru->hdr = (void *)(atsru + 1);
4396         memcpy(atsru->hdr, hdr, hdr->length);
4397         atsru->include_all = atsr->flags & 0x1;
4398         if (!atsru->include_all) {
4399                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4400                                 (void *)atsr + atsr->header.length,
4401                                 &atsru->devices_cnt);
4402                 if (atsru->devices_cnt && atsru->devices == NULL) {
4403                         kfree(atsru);
4404                         return -ENOMEM;
4405                 }
4406         }
4407
4408         list_add_rcu(&atsru->list, &dmar_atsr_units);
4409
4410         return 0;
4411 }
4412
4413 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4414 {
4415         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4416         kfree(atsru);
4417 }
4418
4419 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4420 {
4421         struct acpi_dmar_atsr *atsr;
4422         struct dmar_atsr_unit *atsru;
4423
4424         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4425         atsru = dmar_find_atsr(atsr);
4426         if (atsru) {
4427                 list_del_rcu(&atsru->list);
4428                 synchronize_rcu();
4429                 intel_iommu_free_atsr(atsru);
4430         }
4431
4432         return 0;
4433 }
4434
4435 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4436 {
4437         int i;
4438         struct device *dev;
4439         struct acpi_dmar_atsr *atsr;
4440         struct dmar_atsr_unit *atsru;
4441
4442         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4443         atsru = dmar_find_atsr(atsr);
4444         if (!atsru)
4445                 return 0;
4446
4447         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4448                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4449                                           i, dev)
4450                         return -EBUSY;
4451         }
4452
4453         return 0;
4454 }
4455
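     /*
      * Bring up a hot-added DMAR unit: check that it provides the
      * capabilities the running configuration already depends on
      * (pass-through, snooping, super pages), disable any translation left
      * enabled by firmware, allocate domains and a root entry, set up QI,
      * the page request queue (with SVM) and the fault interrupt, and
      * finally enable translation with PMRs disabled.
      */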
4456 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4457 {
4458         int sp, ret;
4459         struct intel_iommu *iommu = dmaru->iommu;
4460
4461         if (g_iommus[iommu->seq_id])
4462                 return 0;
4463
4464         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4465                 pr_warn("%s: Doesn't support hardware pass through.\n",
4466                         iommu->name);
4467                 return -ENXIO;
4468         }
4469         if (!ecap_sc_support(iommu->ecap) &&
4470             domain_update_iommu_snooping(iommu)) {
4471                 pr_warn("%s: Doesn't support snooping.\n",
4472                         iommu->name);
4473                 return -ENXIO;
4474         }
4475         sp = domain_update_iommu_superpage(iommu) - 1;
4476         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4477                 pr_warn("%s: Doesn't support large page.\n",
4478                         iommu->name);
4479                 return -ENXIO;
4480         }
4481
4482         /*
4483          * Disable translation if already enabled prior to OS handover.
4484          */
4485         if (iommu->gcmd & DMA_GCMD_TE)
4486                 iommu_disable_translation(iommu);
4487
4488         g_iommus[iommu->seq_id] = iommu;
4489         ret = iommu_init_domains(iommu);
4490         if (ret == 0)
4491                 ret = iommu_alloc_root_entry(iommu);
4492         if (ret)
4493                 goto out;
4494
4495         intel_svm_check(iommu);
4496
4497         if (dmaru->ignored) {
4498                 /*
4499                  * we always have to disable PMRs or DMA may fail on this device
4500                  */
4501                 if (force_on)
4502                         iommu_disable_protect_mem_regions(iommu);
4503                 return 0;
4504         }
4505
4506         intel_iommu_init_qi(iommu);
4507         iommu_flush_write_buffer(iommu);
4508
4509 #ifdef CONFIG_INTEL_IOMMU_SVM
4510         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4511                 ret = intel_svm_enable_prq(iommu);
4512                 if (ret)
4513                         goto disable_iommu;
4514         }
4515 #endif
4516         ret = dmar_set_interrupt(iommu);
4517         if (ret)
4518                 goto disable_iommu;
4519
4520         iommu_set_root_entry(iommu);
4521         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4522         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4523         iommu_enable_translation(iommu);
4524
4525         iommu_disable_protect_mem_regions(iommu);
4526         return 0;
4527
4528 disable_iommu:
4529         disable_dmar_iommu(iommu);
4530 out:
4531         free_dmar_iommu(iommu);
4532         return ret;
4533 }
4534
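     /*
      * DMAR unit hot-plug entry point: initialize and enable the IOMMU on
      * insertion, disable and free it on removal. No-op until the Intel
      * IOMMU driver itself is enabled.
      */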
4535 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4536 {
4537         int ret = 0;
4538         struct intel_iommu *iommu = dmaru->iommu;
4539
4540         if (!intel_iommu_enabled)
4541                 return 0;
4542         if (iommu == NULL)
4543                 return -EINVAL;
4544
4545         if (insert) {
4546                 ret = intel_iommu_add(dmaru);
4547         } else {
4548                 disable_dmar_iommu(iommu);
4549                 free_dmar_iommu(iommu);
4550         }
4551
4552         return ret;
4553 }
4554
4555 static void intel_iommu_free_dmars(void)
4556 {
4557         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4558         struct dmar_atsr_unit *atsru, *atsr_n;
4559
4560         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4561                 list_del(&rmrru->list);
4562                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4563                 kfree(rmrru);
4564         }
4565
4566         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4567                 list_del(&atsru->list);
4568                 intel_iommu_free_atsr(atsru);
4569         }
4570 }
4571
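     /*
      * Decide whether ATS may be used for @dev: integrated devices (no
      * parent bridge) always may, devices reached through conventional PCI
      * never may, and for everything else the PCIe root port is looked up
      * in the ATSR device scopes. Returns 1 if ATS is permitted, 0 if not.
      */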
4572 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4573 {
4574         int i, ret = 1;
4575         struct pci_bus *bus;
4576         struct pci_dev *bridge = NULL;
4577         struct device *tmp;
4578         struct acpi_dmar_atsr *atsr;
4579         struct dmar_atsr_unit *atsru;
4580
4581         dev = pci_physfn(dev);
4582         for (bus = dev->bus; bus; bus = bus->parent) {
4583                 bridge = bus->self;
4584                 /* If it's an integrated device, allow ATS */
4585                 if (!bridge)
4586                         return 1;
4587                 /* Connected via non-PCIe: no ATS */
4588                 if (!pci_is_pcie(bridge) ||
4589                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4590                         return 0;
4591                 /* If we found the root port, look it up in the ATSR */
4592                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4593                         break;
4594         }
4595
4596         rcu_read_lock();
4597         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4598                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4599                 if (atsr->segment != pci_domain_nr(dev->bus))
4600                         continue;
4601
4602                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4603                         if (tmp == &bridge->dev)
4604                                 goto out;
4605
4606                 if (atsru->include_all)
4607                         goto out;
4608         }
4609         ret = 0;
4610 out:
4611         rcu_read_unlock();
4612
4613         return ret;
4614 }
4615
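     /*
      * PCI bus notifier helper: on device addition or removal, update the
      * cached device scopes of every RMRR and ATSR unit so that later
      * lookups reflect the current topology.
      */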
4616 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4617 {
4618         int ret;
4619         struct dmar_rmrr_unit *rmrru;
4620         struct dmar_atsr_unit *atsru;
4621         struct acpi_dmar_atsr *atsr;
4622         struct acpi_dmar_reserved_memory *rmrr;
4623
4624         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4625                 return 0;
4626
4627         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4628                 rmrr = container_of(rmrru->hdr,
4629                                     struct acpi_dmar_reserved_memory, header);
4630                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4631                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4632                                 ((void *)rmrr) + rmrr->header.length,
4633                                 rmrr->segment, rmrru->devices,
4634                                 rmrru->devices_cnt);
4635                         if (ret < 0)
4636                                 return ret;
4637                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4638                         dmar_remove_dev_scope(info, rmrr->segment,
4639                                 rmrru->devices, rmrru->devices_cnt);
4640                 }
4641         }
4642
4643         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4644                 if (atsru->include_all)
4645                         continue;
4646
4647                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4648                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4649                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4650                                         (void *)atsr + atsr->header.length,
4651                                         atsr->segment, atsru->devices,
4652                                         atsru->devices_cnt);
4653                         if (ret > 0)
4654                                 break;
4655                         else if (ret < 0)
4656                                 return ret;
4657                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4658                         if (dmar_remove_dev_scope(info, atsr->segment,
4659                                         atsru->devices, atsru->devices_cnt))
4660                                 break;
4661                 }
4662         }
4663
4664         return 0;
4665 }
4666
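     /*
      * Memory hotplug notifier for the static identity (si) domain: extend
      * the identity map when a memory block goes online, and unmap the
      * corresponding IOVA range (flushing the IOTLB of every active IOMMU)
      * when it goes offline again.
      */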
4667 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4668                                        unsigned long val, void *v)
4669 {
4670         struct memory_notify *mhp = v;
4671         unsigned long long start, end;
4672         unsigned long start_vpfn, last_vpfn;
4673
4674         switch (val) {
4675         case MEM_GOING_ONLINE:
4676                 start = mhp->start_pfn << PAGE_SHIFT;
4677                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4678                 if (iommu_domain_identity_map(si_domain, start, end)) {
4679                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4680                                 start, end);
4681                         return NOTIFY_BAD;
4682                 }
4683                 break;
4684
4685         case MEM_OFFLINE:
4686         case MEM_CANCEL_ONLINE:
4687                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4688                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4689                 while (start_vpfn <= last_vpfn) {
4690                         struct iova *iova;
4691                         struct dmar_drhd_unit *drhd;
4692                         struct intel_iommu *iommu;
4693                         struct page *freelist;
4694
4695                         iova = find_iova(&si_domain->iovad, start_vpfn);
4696                         if (iova == NULL) {
4697                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4698                                          start_vpfn);
4699                                 break;
4700                         }
4701
4702                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4703                                                      start_vpfn, last_vpfn);
4704                         if (iova == NULL) {
4705                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4706                                         start_vpfn, last_vpfn);
4707                                 return NOTIFY_BAD;
4708                         }
4709
4710                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4711                                                iova->pfn_hi);
4712
4713                         rcu_read_lock();
4714                         for_each_active_iommu(iommu, drhd)
4715                                 iommu_flush_iotlb_psi(iommu, si_domain,
4716                                         iova->pfn_lo, iova_size(iova),
4717                                         !freelist, 0);
4718                         rcu_read_unlock();
4719                         dma_free_pagelist(freelist);
4720
4721                         start_vpfn = iova->pfn_hi + 1;
4722                         free_iova_mem(iova);
4723                 }
4724                 break;
4725         }
4726
4727         return NOTIFY_OK;
4728 }
4729
4730 static struct notifier_block intel_iommu_memory_nb = {
4731         .notifier_call = intel_iommu_memory_notifier,
4732         .priority = 0
4733 };
4734
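     /*
      * Return the per-CPU cached IOVAs of a dead CPU to the global caches
      * of every domain on every IOMMU; wired up below as the
      * CPUHP_IOMMU_INTEL_DEAD teardown callback.
      */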
4735 static void free_all_cpu_cached_iovas(unsigned int cpu)
4736 {
4737         int i;
4738
4739         for (i = 0; i < g_num_of_iommus; i++) {
4740                 struct intel_iommu *iommu = g_iommus[i];
4741                 struct dmar_domain *domain;
4742                 int did;
4743
4744                 if (!iommu)
4745                         continue;
4746
4747                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4748                         domain = get_iommu_domain(iommu, (u16)did);
4749
4750                         if (!domain)
4751                                 continue;
4752                         free_cpu_cached_iovas(cpu, &domain->iovad);
4753                 }
4754         }
4755 }
4756
4757 static int intel_iommu_cpu_dead(unsigned int cpu)
4758 {
4759         free_all_cpu_cached_iovas(cpu);
4760         return 0;
4761 }
4762
4763 static void intel_disable_iommus(void)
4764 {
4765         struct intel_iommu *iommu = NULL;
4766         struct dmar_drhd_unit *drhd;
4767
4768         for_each_iommu(iommu, drhd)
4769                 iommu_disable_translation(iommu);
4770 }
4771
4772 void intel_iommu_shutdown(void)
4773 {
4774         struct dmar_drhd_unit *drhd;
4775         struct intel_iommu *iommu = NULL;
4776
4777         if (no_iommu || dmar_disabled)
4778                 return;
4779
4780         down_write(&dmar_global_lock);
4781
4782         /* Disable PMRs explicitly here. */
4783         for_each_iommu(iommu, drhd)
4784                 iommu_disable_protect_mem_regions(iommu);
4785
4786         /* Make sure the IOMMUs are switched off */
4787         intel_disable_iommus();
4788
4789         up_write(&dmar_global_lock);
4790 }
4791
4792 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4793 {
4794         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4795
4796         return container_of(iommu_dev, struct intel_iommu, iommu);
4797 }
4798
4799 static ssize_t intel_iommu_show_version(struct device *dev,
4800                                         struct device_attribute *attr,
4801                                         char *buf)
4802 {
4803         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4804         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4805         return sprintf(buf, "%d:%d\n",
4806                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4807 }
4808 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4809
4810 static ssize_t intel_iommu_show_address(struct device *dev,
4811                                         struct device_attribute *attr,
4812                                         char *buf)
4813 {
4814         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4815         return sprintf(buf, "%llx\n", iommu->reg_phys);
4816 }
4817 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4818
4819 static ssize_t intel_iommu_show_cap(struct device *dev,
4820                                     struct device_attribute *attr,
4821                                     char *buf)
4822 {
4823         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4824         return sprintf(buf, "%llx\n", iommu->cap);
4825 }
4826 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4827
4828 static ssize_t intel_iommu_show_ecap(struct device *dev,
4829                                     struct device_attribute *attr,
4830                                     char *buf)
4831 {
4832         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4833         return sprintf(buf, "%llx\n", iommu->ecap);
4834 }
4835 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4836
4837 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4838                                       struct device_attribute *attr,
4839                                       char *buf)
4840 {
4841         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4842         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4843 }
4844 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4845
4846 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4847                                            struct device_attribute *attr,
4848                                            char *buf)
4849 {
4850         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4851         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4852                                                   cap_ndoms(iommu->cap)));
4853 }
4854 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4855
4856 static struct attribute *intel_iommu_attrs[] = {
4857         &dev_attr_version.attr,
4858         &dev_attr_address.attr,
4859         &dev_attr_cap.attr,
4860         &dev_attr_ecap.attr,
4861         &dev_attr_domains_supported.attr,
4862         &dev_attr_domains_used.attr,
4863         NULL,
4864 };
4865
4866 static struct attribute_group intel_iommu_group = {
4867         .name = "intel-iommu",
4868         .attrs = intel_iommu_attrs,
4869 };
4870
4871 const struct attribute_group *intel_iommu_groups[] = {
4872         &intel_iommu_group,
4873         NULL,
4874 };
4875
4876 static inline bool has_untrusted_dev(void)
4877 {
4878         struct pci_dev *pdev = NULL;
4879
4880         for_each_pci_dev(pdev)
4881                 if (pdev->untrusted)
4882                         return true;
4883
4884         return false;
4885 }
4886
4887 static int __init platform_optin_force_iommu(void)
4888 {
4889         if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4890                 return 0;
4891
4892         if (no_iommu || dmar_disabled)
4893                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4894
4895         /*
4896          * If Intel-IOMMU is disabled by default, we will apply identity
4897          * map for all devices except those marked as being untrusted.
4898          */
4899         if (dmar_disabled)
4900                 iommu_identity_mapping |= IDENTMAP_ALL;
4901
4902         dmar_disabled = 0;
4903         no_iommu = 0;
4904
4905         return 1;
4906 }
4907
4908 static int __init probe_acpi_namespace_devices(void)
4909 {
4910         struct dmar_drhd_unit *drhd;
4911         /* To avoid a -Wunused-but-set-variable warning. */
4912         struct intel_iommu *iommu __maybe_unused;
4913         struct device *dev;
4914         int i, ret = 0;
4915
4916         for_each_active_iommu(iommu, drhd) {
4917                 for_each_active_dev_scope(drhd->devices,
4918                                           drhd->devices_cnt, i, dev) {
4919                         struct acpi_device_physical_node *pn;
4920                         struct iommu_group *group;
4921                         struct acpi_device *adev;
4922
4923                         if (dev->bus != &acpi_bus_type)
4924                                 continue;
4925
4926                         adev = to_acpi_device(dev);
4927                         mutex_lock(&adev->physical_node_lock);
4928                         list_for_each_entry(pn,
4929                                             &adev->physical_node_list, node) {
4930                                 group = iommu_group_get(pn->dev);
4931                                 if (group) {
4932                                         iommu_group_put(group);
4933                                         continue;
4934                                 }
4935
4936                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4937                                 ret = iommu_probe_device(pn->dev);
4938                                 if (ret)
4939                                         break;
4940                         }
4941                         mutex_unlock(&adev->physical_node_lock);
4942
4943                         if (ret)
4944                                 return ret;
4945                 }
4946         }
4947
4948         return 0;
4949 }
4950
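     /*
      * Main initialization entry point: parse the DMAR table and device
      * scopes (honoring tboot and platform opt-in forcing), reserve the
      * special IOVA ranges, set up DMA remapping via init_dmars(), register
      * the sysfs attributes, bus ops and hotplug notifiers, probe ACPI
      * namespace devices, and finally enable translation on every unit that
      * is neither ignored nor already enabled by firmware.
      */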
4951 int __init intel_iommu_init(void)
4952 {
4953         int ret = -ENODEV;
4954         struct dmar_drhd_unit *drhd;
4955         struct intel_iommu *iommu;
4956
4957         /*
4958          * Intel IOMMU is required for a TXT/tboot launch or platform
4959          * opt in, so enforce that.
4960          */
4961         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4962
4963         if (iommu_init_mempool()) {
4964                 if (force_on)
4965                         panic("tboot: Failed to initialize iommu memory\n");
4966                 return -ENOMEM;
4967         }
4968
4969         down_write(&dmar_global_lock);
4970         if (dmar_table_init()) {
4971                 if (force_on)
4972                         panic("tboot: Failed to initialize DMAR table\n");
4973                 goto out_free_dmar;
4974         }
4975
4976         if (dmar_dev_scope_init() < 0) {
4977                 if (force_on)
4978                         panic("tboot: Failed to initialize DMAR device scope\n");
4979                 goto out_free_dmar;
4980         }
4981
4982         up_write(&dmar_global_lock);
4983
4984         /*
4985          * The bus notifier takes the dmar_global_lock, so lockdep will
4986          * complain later when we register it under the lock.
4987          */
4988         dmar_register_bus_notifier();
4989
4990         down_write(&dmar_global_lock);
4991
4992         if (no_iommu || dmar_disabled) {
4993                 /*
4994                  * We exit the function here to ensure IOMMU's remapping and
4995                  * mempool aren't setup, which means that the IOMMU's PMRs
4996                  * won't be disabled via the call to init_dmars(). So disable
4997                  * it explicitly here. The PMRs were setup by tboot prior to
4998                  * calling SENTER, but the kernel is expected to reset/tear
4999                  * down the PMRs.
5000                  */
5001                 if (intel_iommu_tboot_noforce) {
5002                         for_each_iommu(iommu, drhd)
5003                                 iommu_disable_protect_mem_regions(iommu);
5004                 }
5005
5006                 /*
5007                  * Make sure the IOMMUs are switched off, even when we
5008                  * boot into a kexec kernel and the previous kernel left
5009                  * them enabled
5010                  */
5011                 intel_disable_iommus();
5012                 goto out_free_dmar;
5013         }
5014
5015         if (list_empty(&dmar_rmrr_units))
5016                 pr_info("No RMRR found\n");
5017
5018         if (list_empty(&dmar_atsr_units))
5019                 pr_info("No ATSR found\n");
5020
5021         if (dmar_init_reserved_ranges()) {
5022                 if (force_on)
5023                         panic("tboot: Failed to reserve iommu ranges\n");
5024                 goto out_free_reserved_range;
5025         }
5026
5027         if (dmar_map_gfx)
5028                 intel_iommu_gfx_mapped = 1;
5029
5030         init_no_remapping_devices();
5031
5032         ret = init_dmars();
5033         if (ret) {
5034                 if (force_on)
5035                         panic("tboot: Failed to initialize DMARs\n");
5036                 pr_err("Initialization failed\n");
5037                 goto out_free_reserved_range;
5038         }
5039         up_write(&dmar_global_lock);
5040
5041 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5042         /*
5043          * If the system has no untrusted device or the user has decided
5044          * to disable the bounce page mechanisms, we don't need swiotlb.
5045          * Mark this and the pre-allocated bounce pages will be released
5046          * later.
5047          */
5048         if (!has_untrusted_dev() || intel_no_bounce)
5049                 swiotlb = 0;
5050 #endif
5051         dma_ops = &intel_dma_ops;
5052
5053         init_iommu_pm_ops();
5054
5055         for_each_active_iommu(iommu, drhd) {
5056                 iommu_device_sysfs_add(&iommu->iommu, NULL,
5057                                        intel_iommu_groups,
5058                                        "%s", iommu->name);
5059                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5060                 iommu_device_register(&iommu->iommu);
5061         }
5062
5063         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5064         if (si_domain && !hw_pass_through)
5065                 register_memory_notifier(&intel_iommu_memory_nb);
5066         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5067                           intel_iommu_cpu_dead);
5068
5069         down_read(&dmar_global_lock);
5070         if (probe_acpi_namespace_devices())
5071                 pr_warn("ACPI namespace devices didn't probe correctly\n");
5072         up_read(&dmar_global_lock);
5073
5074         /* Finally, we enable the DMA remapping hardware. */
5075         for_each_iommu(iommu, drhd) {
5076                 if (!drhd->ignored && !translation_pre_enabled(iommu))
5077                         iommu_enable_translation(iommu);
5078
5079                 iommu_disable_protect_mem_regions(iommu);
5080         }
5081         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5082
5083         intel_iommu_enabled = 1;
5084         intel_iommu_debugfs_init();
5085
5086         return 0;
5087
5088 out_free_reserved_range:
5089         put_iova_domain(&reserved_iova_list);
5090 out_free_dmar:
5091         intel_iommu_free_dmars();
5092         up_write(&dmar_global_lock);
5093         iommu_exit_mempool();
5094         return ret;
5095 }
5096
5097 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5098 {
5099         struct intel_iommu *iommu = opaque;
5100
5101         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5102         return 0;
5103 }
5104
5105 /*
5106  * NB - intel-iommu lacks any sort of reference counting for the users of
5107  * dependent devices.  If multiple endpoints have intersecting dependent
5108  * devices, unbinding the driver from any one of them will possibly leave
5109  * the others unable to operate.
5110  */
5111 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5112 {
5113         if (!iommu || !dev || !dev_is_pci(dev))
5114                 return;
5115
5116         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5117 }
5118
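     /*
      * Tear down a device's attachment with device_domain_lock held: clear
      * its RID2PASID entry in scalable mode, disable its device-TLB, clear
      * the context entries of all its DMA aliases, free its PASID table,
      * detach the domain from the IOMMU and release the domain if it was a
      * now-empty private one.
      */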
5119 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5120 {
5121         struct dmar_domain *domain;
5122         struct intel_iommu *iommu;
5123         unsigned long flags;
5124
5125         assert_spin_locked(&device_domain_lock);
5126
5127         if (WARN_ON(!info))
5128                 return;
5129
5130         iommu = info->iommu;
5131         domain = info->domain;
5132
5133         if (info->dev) {
5134                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5135                         intel_pasid_tear_down_entry(iommu, info->dev,
5136                                         PASID_RID2PASID);
5137
5138                 iommu_disable_dev_iotlb(info);
5139                 domain_context_clear(iommu, info->dev);
5140                 intel_pasid_free_table(info->dev);
5141         }
5142
5143         unlink_domain_info(info);
5144
5145         spin_lock_irqsave(&iommu->lock, flags);
5146         domain_detach_iommu(domain, iommu);
5147         spin_unlock_irqrestore(&iommu->lock, flags);
5148
5149         /* free the private domain */
5150         if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5151             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5152             list_empty(&domain->devices))
5153                 domain_exit(info->domain);
5154
5155         free_devinfo_mem(info);
5156 }
5157
5158 static void dmar_remove_one_dev_info(struct device *dev)
5159 {
5160         struct device_domain_info *info;
5161         unsigned long flags;
5162
5163         spin_lock_irqsave(&device_domain_lock, flags);
5164         info = dev->archdata.iommu;
5165         if (info)
5166                 __dmar_remove_one_dev_info(info);
5167         spin_unlock_irqrestore(&device_domain_lock, flags);
5168 }
5169
5170 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5171 {
5172         int adjust_width;
5173
5174         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5175         domain_reserve_special_ranges(domain);
5176
5177         /* calculate AGAW */
5178         domain->gaw = guest_width;
5179         adjust_width = guestwidth_to_adjustwidth(guest_width);
5180         domain->agaw = width_to_agaw(adjust_width);
5181
5182         domain->iommu_coherency = 0;
5183         domain->iommu_snooping = 0;
5184         domain->iommu_superpage = 0;
5185         domain->max_addr = 0;
5186
5187         /* always allocate the top pgd */
5188         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5189         if (!domain->pgd)
5190                 return -ENOMEM;
5191         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5192         return 0;
5193 }
5194
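     /*
      * domain_alloc callback: DMA and unmanaged domains get a freshly
      * initialized dmar_domain (DMA domains additionally get an IOVA flush
      * queue, falling back to strict invalidation if that fails); identity
      * requests return the shared si_domain.
      */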
5195 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5196 {
5197         struct dmar_domain *dmar_domain;
5198         struct iommu_domain *domain;
5199
5200         switch (type) {
5201         case IOMMU_DOMAIN_DMA:
5202         /* fallthrough */
5203         case IOMMU_DOMAIN_UNMANAGED:
5204                 dmar_domain = alloc_domain(0);
5205                 if (!dmar_domain) {
5206                         pr_err("Can't allocate dmar_domain\n");
5207                         return NULL;
5208                 }
5209                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5210                         pr_err("Domain initialization failed\n");
5211                         domain_exit(dmar_domain);
5212                         return NULL;
5213                 }
5214
5215                 if (type == IOMMU_DOMAIN_DMA &&
5216                     init_iova_flush_queue(&dmar_domain->iovad,
5217                                           iommu_flush_iova, iova_entry_free)) {
5218                         pr_warn("iova flush queue initialization failed\n");
5219                         intel_iommu_strict = 1;
5220                 }
5221
5222                 domain_update_iommu_cap(dmar_domain);
5223
5224                 domain = &dmar_domain->domain;
5225                 domain->geometry.aperture_start = 0;
5226                 domain->geometry.aperture_end   =
5227                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5228                 domain->geometry.force_aperture = true;
5229
5230                 return domain;
5231         case IOMMU_DOMAIN_IDENTITY:
5232                 return &si_domain->domain;
5233         default:
5234                 return NULL;
5235         }
5236
5237         return NULL;
5238 }
5239
5240 static void intel_iommu_domain_free(struct iommu_domain *domain)
5241 {
5242         if (domain != &si_domain->domain)
5243                 domain_exit(to_dmar_domain(domain));
5244 }
5245
5246 /*
5247  * Check whether a @domain could be attached to the @dev through the
5248  * aux-domain attach/detach APIs.
5249  */
5250 static inline bool
5251 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5252 {
5253         struct device_domain_info *info = dev->archdata.iommu;
5254
5255         return info && info->auxd_enabled &&
5256                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5257 }
5258
5259 static void auxiliary_link_device(struct dmar_domain *domain,
5260                                   struct device *dev)
5261 {
5262         struct device_domain_info *info = dev->archdata.iommu;
5263
5264         assert_spin_locked(&device_domain_lock);
5265         if (WARN_ON(!info))
5266                 return;
5267
5268         domain->auxd_refcnt++;
5269         list_add(&domain->auxd, &info->auxiliary_domains);
5270 }
5271
5272 static void auxiliary_unlink_device(struct dmar_domain *domain,
5273                                     struct device *dev)
5274 {
5275         struct device_domain_info *info = dev->archdata.iommu;
5276
5277         assert_spin_locked(&device_domain_lock);
5278         if (WARN_ON(!info))
5279                 return;
5280
5281         list_del(&domain->auxd);
5282         domain->auxd_refcnt--;
5283
5284         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5285                 ioasid_free(domain->default_pasid);
5286 }
5287
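     /*
      * Attach @domain to @dev as an auxiliary domain. The first auxiliary
      * attach allocates the domain's default PASID from the shared IOASID
      * allocator, bounded by the device's PASID capability, and then
      * installs a second-level PASID table entry for it under
      * device_domain_lock and iommu->lock.
      */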
5288 static int aux_domain_add_dev(struct dmar_domain *domain,
5289                               struct device *dev)
5290 {
5291         int ret;
5292         u8 bus, devfn;
5293         unsigned long flags;
5294         struct intel_iommu *iommu;
5295
5296         iommu = device_to_iommu(dev, &bus, &devfn);
5297         if (!iommu)
5298                 return -ENODEV;
5299
5300         if (domain->default_pasid <= 0) {
5301                 int pasid;
5302
5303                 /* No private data needed for the default pasid */
5304                 pasid = ioasid_alloc(NULL, PASID_MIN,
5305                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5306                                      NULL);
5307                 if (pasid == INVALID_IOASID) {
5308                         pr_err("Can't allocate default pasid\n");
5309                         return -ENODEV;
5310                 }
5311                 domain->default_pasid = pasid;
5312         }
5313
5314         spin_lock_irqsave(&device_domain_lock, flags);
5315         /*
5316          * iommu->lock must be held to attach domain to iommu and setup the
5317          * pasid entry for second level translation.
5318          */
5319         spin_lock(&iommu->lock);
5320         ret = domain_attach_iommu(domain, iommu);
5321         if (ret)
5322                 goto attach_failed;
5323
5324         /* Set up the PASID entry for mediated devices. */
5325         ret = intel_pasid_setup_second_level(iommu, domain, dev,
5326                                              domain->default_pasid);
5327         if (ret)
5328                 goto table_failed;
5329         spin_unlock(&iommu->lock);
5330
5331         auxiliary_link_device(domain, dev);
5332
5333         spin_unlock_irqrestore(&device_domain_lock, flags);
5334
5335         return 0;
5336
5337 table_failed:
5338         domain_detach_iommu(domain, iommu);
5339 attach_failed:
5340         spin_unlock(&iommu->lock);
5341         spin_unlock_irqrestore(&device_domain_lock, flags);
5342         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5343                 ioasid_free(domain->default_pasid);
5344
5345         return ret;
5346 }
5347
5348 static void aux_domain_remove_dev(struct dmar_domain *domain,
5349                                   struct device *dev)
5350 {
5351         struct device_domain_info *info;
5352         struct intel_iommu *iommu;
5353         unsigned long flags;
5354
5355         if (!is_aux_domain(dev, &domain->domain))
5356                 return;
5357
5358         spin_lock_irqsave(&device_domain_lock, flags);
5359         info = dev->archdata.iommu;
5360         iommu = info->iommu;
5361
5362         auxiliary_unlink_device(domain, dev);
5363
5364         spin_lock(&iommu->lock);
5365         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5366         domain_detach_iommu(domain, iommu);
5367         spin_unlock(&iommu->lock);
5368
5369         spin_unlock_irqrestore(&device_domain_lock, flags);
5370 }
5371
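     /*
      * Before an attach, make the domain fit the IOMMU: clamp the address
      * width to what the unit's AGAW/MGAW supports, fail if already-mapped
      * addresses would no longer fit, and strip unused upper page-table
      * levels so the domain's AGAW matches the IOMMU's.
      */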
5372 static int prepare_domain_attach_device(struct iommu_domain *domain,
5373                                         struct device *dev)
5374 {
5375         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5376         struct intel_iommu *iommu;
5377         int addr_width;
5378         u8 bus, devfn;
5379
5380         iommu = device_to_iommu(dev, &bus, &devfn);
5381         if (!iommu)
5382                 return -ENODEV;
5383
5384         /* check if this iommu agaw is sufficient for max mapped address */
5385         addr_width = agaw_to_width(iommu->agaw);
5386         if (addr_width > cap_mgaw(iommu->cap))
5387                 addr_width = cap_mgaw(iommu->cap);
5388
5389         if (dmar_domain->max_addr > (1LL << addr_width)) {
5390                 dev_err(dev, "%s: iommu width (%d) is not "
5391                         "sufficient for the mapped address (%llx)\n",
5392                         __func__, addr_width, dmar_domain->max_addr);
5393                 return -EFAULT;
5394         }
5395         dmar_domain->gaw = addr_width;
5396
5397         /*
5398          * Knock out extra levels of page tables if necessary
5399          */
5400         while (iommu->agaw < dmar_domain->agaw) {
5401                 struct dma_pte *pte;
5402
5403                 pte = dmar_domain->pgd;
5404                 if (dma_pte_present(pte)) {
5405                         dmar_domain->pgd = (struct dma_pte *)
5406                                 phys_to_virt(dma_pte_addr(pte));
5407                         free_pgtable_page(pte);
5408                 }
5409                 dmar_domain->agaw--;
5410         }
5411
5412         return 0;
5413 }
5414
5415 static int intel_iommu_attach_device(struct iommu_domain *domain,
5416                                      struct device *dev)
5417 {
5418         int ret;
5419
5420         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5421             device_is_rmrr_locked(dev)) {
5422                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5423                 return -EPERM;
5424         }
5425
5426         if (is_aux_domain(dev, domain))
5427                 return -EPERM;
5428
5429         /* normally dev is not mapped */
5430         if (unlikely(domain_context_mapped(dev))) {
5431                 struct dmar_domain *old_domain;
5432
5433                 old_domain = find_domain(dev);
5434                 if (old_domain)
5435                         dmar_remove_one_dev_info(dev);
5436         }
5437
5438         ret = prepare_domain_attach_device(domain, dev);
5439         if (ret)
5440                 return ret;
5441
5442         return domain_add_dev_info(to_dmar_domain(domain), dev);
5443 }
5444
5445 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5446                                          struct device *dev)
5447 {
5448         int ret;
5449
5450         if (!is_aux_domain(dev, domain))
5451                 return -EPERM;
5452
5453         ret = prepare_domain_attach_device(domain, dev);
5454         if (ret)
5455                 return ret;
5456
5457         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5458 }
5459
5460 static void intel_iommu_detach_device(struct iommu_domain *domain,
5461                                       struct device *dev)
5462 {
5463         dmar_remove_one_dev_info(dev);
5464 }
5465
5466 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5467                                           struct device *dev)
5468 {
5469         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5470 }
5471
5472 static int intel_iommu_map(struct iommu_domain *domain,
5473                            unsigned long iova, phys_addr_t hpa,
5474                            size_t size, int iommu_prot, gfp_t gfp)
5475 {
5476         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5477         u64 max_addr;
5478         int prot = 0;
5479         int ret;
5480
5481         if (iommu_prot & IOMMU_READ)
5482                 prot |= DMA_PTE_READ;
5483         if (iommu_prot & IOMMU_WRITE)
5484                 prot |= DMA_PTE_WRITE;
5485         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5486                 prot |= DMA_PTE_SNP;
5487
5488         max_addr = iova + size;
5489         if (dmar_domain->max_addr < max_addr) {
5490                 u64 end;
5491
5492                 /* check if minimum agaw is sufficient for mapped address */
5493                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5494                 if (end < max_addr) {
5495                         pr_err("%s: iommu width (%d) is not "
5496                                "sufficient for the mapped address (%llx)\n",
5497                                __func__, dmar_domain->gaw, max_addr);
5498                         return -EFAULT;
5499                 }
5500                 dmar_domain->max_addr = max_addr;
5501         }
5502         /* Round up size to next multiple of PAGE_SIZE, if it and
5503            the low bits of hpa would take us onto the next page */
5504         size = aligned_nrpages(hpa, size);
5505         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5506                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5507         return ret;
5508 }
5509
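     /*
      * Unmap @size bytes at @iova. If the range is covered by a large-page
      * PTE, the whole large page is unmapped and the larger size returned,
      * as the iommu core expects. Freed page-table pages are only released
      * after the IOTLB of every IOMMU using this domain has been flushed.
      */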
5510 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5511                                 unsigned long iova, size_t size,
5512                                 struct iommu_iotlb_gather *gather)
5513 {
5514         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5515         struct page *freelist = NULL;
5516         unsigned long start_pfn, last_pfn;
5517         unsigned int npages;
5518         int iommu_id, level = 0;
5519
5520         /* Cope with horrid API which requires us to unmap more than the
5521            size argument if it happens to be a large-page mapping. */
5522         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5523
5524         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5525                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5526
5527         start_pfn = iova >> VTD_PAGE_SHIFT;
5528         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5529
5530         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5531
5532         npages = last_pfn - start_pfn + 1;
5533
5534         for_each_domain_iommu(iommu_id, dmar_domain)
5535                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5536                                       start_pfn, npages, !freelist, 0);
5537
5538         dma_free_pagelist(freelist);
5539
5540         if (dmar_domain->max_addr == iova + size)
5541                 dmar_domain->max_addr = iova;
5542
5543         return size;
5544 }
5545
5546 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5547                                             dma_addr_t iova)
5548 {
5549         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5550         struct dma_pte *pte;
5551         int level = 0;
5552         u64 phys = 0;
5553
5554         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5555         if (pte)
5556                 phys = dma_pte_addr(pte);
5557
5558         return phys;
5559 }
5560
5561 static inline bool scalable_mode_support(void)
5562 {
5563         struct dmar_drhd_unit *drhd;
5564         struct intel_iommu *iommu;
5565         bool ret = true;
5566
5567         rcu_read_lock();
5568         for_each_active_iommu(iommu, drhd) {
5569                 if (!sm_supported(iommu)) {
5570                         ret = false;
5571                         break;
5572                 }
5573         }
5574         rcu_read_unlock();
5575
5576         return ret;
5577 }
5578
5579 static inline bool iommu_pasid_support(void)
5580 {
5581         struct dmar_drhd_unit *drhd;
5582         struct intel_iommu *iommu;
5583         bool ret = true;
5584
5585         rcu_read_lock();
5586         for_each_active_iommu(iommu, drhd) {
5587                 if (!pasid_supported(iommu)) {
5588                         ret = false;
5589                         break;
5590                 }
5591         }
5592         rcu_read_unlock();
5593
5594         return ret;
5595 }
5596
5597 static bool intel_iommu_capable(enum iommu_cap cap)
5598 {
5599         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5600                 return domain_update_iommu_snooping(NULL) == 1;
5601         if (cap == IOMMU_CAP_INTR_REMAP)
5602                 return irq_remapping_enabled == 1;
5603
5604         return false;
5605 }
5606
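     /*
      * add_device callback: link the device to its IOMMU in sysfs, mark its
      * attachment as deferred if firmware left translation enabled, place
      * it in an IOMMU group, and reconcile the group's default domain with
      * the device's own preference, falling back to a private identity or
      * DMA domain when the group-wide switch fails. Devices for which
      * device_needs_bounce() is true get the bounce-buffer DMA ops.
      */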
5607 static int intel_iommu_add_device(struct device *dev)
5608 {
5609         struct dmar_domain *dmar_domain;
5610         struct iommu_domain *domain;
5611         struct intel_iommu *iommu;
5612         struct iommu_group *group;
5613         u8 bus, devfn;
5614         int ret;
5615
5616         iommu = device_to_iommu(dev, &bus, &devfn);
5617         if (!iommu)
5618                 return -ENODEV;
5619
5620         iommu_device_link(&iommu->iommu, dev);
5621
5622         if (translation_pre_enabled(iommu))
5623                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5624
5625         group = iommu_group_get_for_dev(dev);
5626
5627         if (IS_ERR(group))
5628                 return PTR_ERR(group);
5629
5630         iommu_group_put(group);
5631
5632         domain = iommu_get_domain_for_dev(dev);
5633         dmar_domain = to_dmar_domain(domain);
5634         if (domain->type == IOMMU_DOMAIN_DMA) {
5635                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5636                         ret = iommu_request_dm_for_dev(dev);
5637                         if (ret) {
5638                                 dmar_remove_one_dev_info(dev);
5639                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5640                                 domain_add_dev_info(si_domain, dev);
5641                                 dev_info(dev,
5642                                          "Device uses a private identity domain.\n");
5643                         }
5644                 }
5645         } else {
5646                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5647                         ret = iommu_request_dma_domain_for_dev(dev);
5648                         if (ret) {
5649                                 dmar_remove_one_dev_info(dev);
5650                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5651                                 if (!get_private_domain_for_dev(dev)) {
5652                                         dev_warn(dev,
5653                                                  "Failed to get a private domain.\n");
5654                                         return -ENOMEM;
5655                                 }
5656
5657                                 dev_info(dev,
5658                                          "Device uses a private dma domain.\n");
5659                         }
5660                 }
5661         }
5662
5663         if (device_needs_bounce(dev)) {
5664                 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5665                 set_dma_ops(dev, &bounce_dma_ops);
5666         }
5667
5668         return 0;
5669 }
5670
5671 static void intel_iommu_remove_device(struct device *dev)
5672 {
5673         struct intel_iommu *iommu;
5674         u8 bus, devfn;
5675
5676         iommu = device_to_iommu(dev, &bus, &devfn);
5677         if (!iommu)
5678                 return;
5679
5680         dmar_remove_one_dev_info(dev);
5681
5682         iommu_group_remove_device(dev);
5683
5684         iommu_device_unlink(&iommu->iommu, dev);
5685
5686         if (device_needs_bounce(dev))
5687                 set_dma_ops(dev, NULL);
5688 }
5689
5690 static void intel_iommu_get_resv_regions(struct device *device,
5691                                          struct list_head *head)
5692 {
5693         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5694         struct iommu_resv_region *reg;
5695         struct dmar_rmrr_unit *rmrr;
5696         struct device *i_dev;
5697         int i;
5698
5699         down_read(&dmar_global_lock);
5700         for_each_rmrr_units(rmrr) {
5701                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5702                                           i, i_dev) {
5703                         struct iommu_resv_region *resv;
5704                         enum iommu_resv_type type;
5705                         size_t length;
5706
5707                         if (i_dev != device &&
5708                             !is_downstream_to_pci_bridge(device, i_dev))
5709                                 continue;
5710
5711                         length = rmrr->end_address - rmrr->base_address + 1;
5712
5713                         type = device_rmrr_is_relaxable(device) ?
5714                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5715
5716                         resv = iommu_alloc_resv_region(rmrr->base_address,
5717                                                        length, prot, type);
5718                         if (!resv)
5719                                 break;
5720
5721                         list_add_tail(&resv->list, head);
5722                 }
5723         }
5724         up_read(&dmar_global_lock);
5725
5726 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5727         if (dev_is_pci(device)) {
5728                 struct pci_dev *pdev = to_pci_dev(device);
5729
5730                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5731                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5732                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5733                         if (reg)
5734                                 list_add_tail(&reg->list, head);
5735                 }
5736         }
5737 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5738
5739         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5740                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5741                                       0, IOMMU_RESV_MSI);
5742         if (!reg)
5743                 return;
5744         list_add_tail(&reg->list, head);
5745 }
5746
5747 static void intel_iommu_put_resv_regions(struct device *dev,
5748                                          struct list_head *head)
5749 {
5750         struct iommu_resv_region *entry, *next;
5751
5752         list_for_each_entry_safe(entry, next, head, list)
5753                 kfree(entry);
5754 }
5755
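     /*
      * Enable PASID support for @dev behind @iommu: set the PASID-enable
      * bit in the device's context entry if it is clear (invalidating that
      * device's context-cache entry afterwards) and, if not already done,
      * enable PASID support in the endpoint via iommu_enable_dev_iotlb().
      */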
5756 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5757 {
5758         struct device_domain_info *info;
5759         struct context_entry *context;
5760         struct dmar_domain *domain;
5761         unsigned long flags;
5762         u64 ctx_lo;
5763         int ret;
5764
5765         domain = find_domain(dev);
5766         if (!domain)
5767                 return -EINVAL;
5768
5769         spin_lock_irqsave(&device_domain_lock, flags);
5770         spin_lock(&iommu->lock);
5771
5772         ret = -EINVAL;
5773         info = dev->archdata.iommu;
5774         if (!info || !info->pasid_supported)
5775                 goto out;
5776
5777         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5778         if (WARN_ON(!context))
5779                 goto out;
5780
5781         ctx_lo = context[0].lo;
5782
5783         if (!(ctx_lo & CONTEXT_PASIDE)) {
5784                 ctx_lo |= CONTEXT_PASIDE;
5785                 context[0].lo = ctx_lo;
5786                 wmb();
5787                 iommu->flush.flush_context(iommu,
5788                                            domain->iommu_did[iommu->seq_id],
5789                                            PCI_DEVID(info->bus, info->devfn),
5790                                            DMA_CCMD_MASK_NOBIT,
5791                                            DMA_CCMD_DEVICE_INVL);
5792         }
5793
5794         /* Enable PASID support in the device, if it wasn't already */
5795         if (!info->pasid_enabled)
5796                 iommu_enable_dev_iotlb(info);
5797
5798         ret = 0;
5799
5800  out:
5801         spin_unlock(&iommu->lock);
5802         spin_unlock_irqrestore(&device_domain_lock, flags);
5803
5804         return ret;
5805 }
5806
5807 static void intel_iommu_apply_resv_region(struct device *dev,
5808                                           struct iommu_domain *domain,
5809                                           struct iommu_resv_region *region)
5810 {
5811         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5812         unsigned long start, end;
5813
5814         start = IOVA_PFN(region->start);
5815         end   = IOVA_PFN(region->start + region->length - 1);
5816
5817         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5818 }
5819
5820 #ifdef CONFIG_INTEL_IOMMU_SVM
5821 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5822 {
5823         struct intel_iommu *iommu;
5824         u8 bus, devfn;
5825
5826         if (iommu_dummy(dev)) {
5827                 dev_warn(dev,
5828                          "No IOMMU translation for device; cannot enable SVM\n");
5829                 return NULL;
5830         }
5831
5832         iommu = device_to_iommu(dev, &bus, &devfn);
5833         if (!iommu) {
5834                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5835                 return NULL;
5836         }
5837
5838         return iommu;
5839 }
5840 #endif /* CONFIG_INTEL_IOMMU_SVM */
5841
5842 static int intel_iommu_enable_auxd(struct device *dev)
5843 {
5844         struct device_domain_info *info;
5845         struct intel_iommu *iommu;
5846         unsigned long flags;
5847         u8 bus, devfn;
5848         int ret;
5849
5850         iommu = device_to_iommu(dev, &bus, &devfn);
5851         if (!iommu || dmar_disabled)
5852                 return -EINVAL;
5853
5854         if (!sm_supported(iommu) || !pasid_supported(iommu))
5855                 return -EINVAL;
5856
5857         ret = intel_iommu_enable_pasid(iommu, dev);
5858         if (ret)
5859                 return -ENODEV;
5860
5861         spin_lock_irqsave(&device_domain_lock, flags);
5862         info = dev->archdata.iommu;
5863         info->auxd_enabled = 1;
5864         spin_unlock_irqrestore(&device_domain_lock, flags);
5865
5866         return 0;
5867 }
5868
5869 static int intel_iommu_disable_auxd(struct device *dev)
5870 {
5871         struct device_domain_info *info;
5872         unsigned long flags;
5873
5874         spin_lock_irqsave(&device_domain_lock, flags);
5875         info = dev->archdata.iommu;
5876         if (!WARN_ON(!info))
5877                 info->auxd_enabled = 0;
5878         spin_unlock_irqrestore(&device_domain_lock, flags);
5879
5880         return 0;
5881 }
5882
5883 /*
5884  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5885  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5886  * spec so that system software and tools can detect endpoint devices that
5887  * support Intel Scalable I/O Virtualization without a host driver dependency.
5888  *
5889  * Returns the address of the matching extended capability structure within
5890  * the device's PCI configuration space or 0 if the device does not support
5891  * it.
5892  */
5893 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5894 {
5895         int pos;
5896         u16 vendor, id;
5897
5898         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5899         while (pos) {
5900                 pci_read_config_word(pdev, pos + 4, &vendor);
5901                 pci_read_config_word(pdev, pos + 8, &id);
5902                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5903                         return pos;
5904
5905                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5906         }
5907
5908         return 0;
5909 }
5910
5911 static bool
5912 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5913 {
5914         if (feat == IOMMU_DEV_FEAT_AUX) {
5915                 int ret;
5916
5917                 if (!dev_is_pci(dev) || dmar_disabled ||
5918                     !scalable_mode_support() || !iommu_pasid_support())
5919                         return false;
5920
5921                 ret = pci_pasid_features(to_pci_dev(dev));
5922                 if (ret < 0)
5923                         return false;
5924
5925                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5926         }
5927
5928         return false;
5929 }
5930
5931 static int
5932 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5933 {
5934         if (feat == IOMMU_DEV_FEAT_AUX)
5935                 return intel_iommu_enable_auxd(dev);
5936
5937         return -ENODEV;
5938 }
5939
5940 static int
5941 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5942 {
5943         if (feat == IOMMU_DEV_FEAT_AUX)
5944                 return intel_iommu_disable_auxd(dev);
5945
5946         return -ENODEV;
5947 }
5948
5949 static bool
5950 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5951 {
5952         struct device_domain_info *info = dev->archdata.iommu;
5953
5954         if (feat == IOMMU_DEV_FEAT_AUX)
5955                 return scalable_mode_support() && info && info->auxd_enabled;
5956
5957         return false;
5958 }
5959
5960 static int
5961 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5962 {
5963         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5964
5965         return dmar_domain->default_pasid > 0 ?
5966                         dmar_domain->default_pasid : -EINVAL;
5967 }
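
/*
 * Illustrative sketch, not part of the driver: roughly how a caller such as
 * a mediated-device driver is expected to reach the aux-domain hooks above
 * through the generic IOMMU wrappers (iommu_dev_has_feature(),
 * iommu_dev_enable_feature(), iommu_aux_attach_device() and
 * iommu_aux_get_pasid()).  The function name example_aux_bind() is made up;
 * error unwinding is omitted for brevity.
 */
#if 0	/* not compiled */
static int example_aux_bind(struct iommu_domain *domain, struct device *dev)
{
	int ret, pasid;

	if (!iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX))
		return -ENODEV;

	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
	if (ret)
		return ret;

	ret = iommu_aux_attach_device(domain, dev);
	if (ret)
		return ret;

	/* PASID that tags DMA for this aux domain (default_pasid above). */
	pasid = iommu_aux_get_pasid(domain, dev);

	return pasid < 0 ? pasid : 0;
}
#endif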
5968
5969 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5970                                            struct device *dev)
5971 {
5972         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5973 }
5974
5975 const struct iommu_ops intel_iommu_ops = {
5976         .capable                = intel_iommu_capable,
5977         .domain_alloc           = intel_iommu_domain_alloc,
5978         .domain_free            = intel_iommu_domain_free,
5979         .attach_dev             = intel_iommu_attach_device,
5980         .detach_dev             = intel_iommu_detach_device,
5981         .aux_attach_dev         = intel_iommu_aux_attach_device,
5982         .aux_detach_dev         = intel_iommu_aux_detach_device,
5983         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5984         .map                    = intel_iommu_map,
5985         .unmap                  = intel_iommu_unmap,
5986         .iova_to_phys           = intel_iommu_iova_to_phys,
5987         .add_device             = intel_iommu_add_device,
5988         .remove_device          = intel_iommu_remove_device,
5989         .get_resv_regions       = intel_iommu_get_resv_regions,
5990         .put_resv_regions       = intel_iommu_put_resv_regions,
5991         .apply_resv_region      = intel_iommu_apply_resv_region,
5992         .device_group           = pci_device_group,
5993         .dev_has_feat           = intel_iommu_dev_has_feat,
5994         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5995         .dev_enable_feat        = intel_iommu_dev_enable_feat,
5996         .dev_disable_feat       = intel_iommu_dev_disable_feat,
5997         .is_attach_deferred     = intel_iommu_is_attach_deferred,
5998         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5999 };
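
/*
 * For orientation only: the ops table above is handed to the IOMMU core
 * elsewhere in this file; a simplified sketch of that hookup (assuming the
 * iommu_device_set_ops()/bus_set_iommu() interfaces used by this driver)
 * might look like the fragment below.  example_register_ops() is a made-up
 * name, not a real function in this file.
 */
#if 0	/* not compiled */
static void example_register_ops(struct intel_iommu *iommu)
{
	/* Per-IOMMU ops hookup for the core's sysfs/device handling ... */
	iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
	/* ... and bus-wide registration so PCI devices get probed. */
	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
}
#endif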
6000
6001 static void quirk_iommu_igfx(struct pci_dev *dev)
6002 {
6003         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6004         dmar_map_gfx = 0;
6005 }
6006
6007 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6008 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6009 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6010 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6011 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6012 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6013 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6014 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6015
6016 /* Broadwell igfx malfunctions with dmar */
6017 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6018 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6019 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6020 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6021 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6022 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6023 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6024 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6025 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6026 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6027 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6028 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6029 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6030 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6031 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6032 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6033 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6034 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6036 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6037 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6038 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6039 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6040 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6041
6042 static void quirk_iommu_rwbf(struct pci_dev *dev)
6043 {
6044         /*
6045          * Mobile 4 Series Chipset neglects to set RWBF capability,
6046          * but needs it. Same seems to hold for the desktop versions.
6047          */
6048         pci_info(dev, "Forcing write-buffer flush capability\n");
6049         rwbf_quirk = 1;
6050 }
6051
6052 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6053 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6054 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6055 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6056 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6057 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6058 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6059
6060 #define GGC 0x52
6061 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6062 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6063 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6064 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6065 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6066 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6067 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6068 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6069
6070 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6071 {
6072         unsigned short ggc;
6073
6074         if (pci_read_config_word(dev, GGC, &ggc))
6075                 return;
6076
6077         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6078                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6079                 dmar_map_gfx = 0;
6080         } else if (dmar_map_gfx) {
6081                 /* we have to ensure the gfx device is idle before we flush */
6082                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6083                 intel_iommu_strict = 1;
6084         }
6085 }
6086 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6087 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6088 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6089 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6090
6091 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6092    ISOCH DMAR unit for the Azalia sound device, but not give it any
6093    TLB entries, which causes it to deadlock. Check for that.  We do
6094    this in a function called from init_dmars(), instead of in a PCI
6095    quirk, because we don't want to print the obnoxious "BIOS broken"
6096    message if VT-d is actually disabled.
6097 */
6098 static void __init check_tylersburg_isoch(void)
6099 {
6100         struct pci_dev *pdev;
6101         uint32_t vtisochctrl;
6102
6103         /* If there's no Azalia in the system anyway, forget it. */
6104         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6105         if (!pdev)
6106                 return;
6107         pci_dev_put(pdev);
6108
6109         /* System Management Registers. Might be hidden, in which case
6110            we can't do the sanity check. But that's OK, because the
6111            known-broken BIOSes _don't_ actually hide it, so far. */
6112         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6113         if (!pdev)
6114                 return;
6115
6116         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6117                 pci_dev_put(pdev);
6118                 return;
6119         }
6120
6121         pci_dev_put(pdev);
6122
6123         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6124         if (vtisochctrl & 1)
6125                 return;
6126
6127         /* Drop all bits other than the number of TLB entries */
6128         vtisochctrl &= 0x1c;
6129
6130         /* If we have the recommended number of TLB entries (16), fine. */
6131         if (vtisochctrl == 0x10)
6132                 return;
6133
6134         /* Zero TLB entries? You get to ride the short bus to school. */
6135         if (!vtisochctrl) {
6136                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6137                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6138                      dmi_get_system_info(DMI_BIOS_VENDOR),
6139                      dmi_get_system_info(DMI_BIOS_VERSION),
6140                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6141                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6142                 return;
6143         }
6144
6145         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6146                vtisochctrl);
6147 }