Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
[linux-2.6-microblaze.git] / drivers / iommu / intel-iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware support
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
105
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline unsigned long level_mask(int level)
132 {
133         return -1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long level_size(int level)
137 {
138         return 1UL << level_to_offset_bits(level);
139 }
140
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic kernel if can't successfully enable VT-d
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
289
290 /*
291  * This domain is a statically identity mapping domain.
292  *      1. This domain creats a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu mapps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 /* si_domain contains mulitple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
301
302 /*
303  * This is a DMA domain allocated through the iommu domain allocation
304  * interface. But one or more devices belonging to this domain have
305  * been chosen to use a private domain. We should avoid to use the
306  * map/unmap/iova_to_phys APIs on it.
307  */
308 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
309
310 #define for_each_domain_iommu(idx, domain)                      \
311         for (idx = 0; idx < g_num_of_iommus; idx++)             \
312                 if (domain->iommu_refcnt[idx])
313
314 struct dmar_rmrr_unit {
315         struct list_head list;          /* list of rmrr units   */
316         struct acpi_dmar_header *hdr;   /* ACPI header          */
317         u64     base_address;           /* reserved base address*/
318         u64     end_address;            /* reserved end address */
319         struct dmar_dev_scope *devices; /* target devices */
320         int     devices_cnt;            /* target device count */
321 };
322
323 struct dmar_atsr_unit {
324         struct list_head list;          /* list of ATSR units */
325         struct acpi_dmar_header *hdr;   /* ACPI header */
326         struct dmar_dev_scope *devices; /* target devices */
327         int devices_cnt;                /* target device count */
328         u8 include_all:1;               /* include all ports */
329 };
330
331 static LIST_HEAD(dmar_atsr_units);
332 static LIST_HEAD(dmar_rmrr_units);
333
334 #define for_each_rmrr_units(rmrr) \
335         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
336
337 /* bitmap for indexing intel_iommus */
338 static int g_num_of_iommus;
339
340 static void domain_exit(struct dmar_domain *domain);
341 static void domain_remove_dev_info(struct dmar_domain *domain);
342 static void dmar_remove_one_dev_info(struct device *dev);
343 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
344 static void domain_context_clear(struct intel_iommu *iommu,
345                                  struct device *dev);
346 static int domain_detach_iommu(struct dmar_domain *domain,
347                                struct intel_iommu *iommu);
348 static bool device_is_rmrr_locked(struct device *dev);
349 static int intel_iommu_attach_device(struct iommu_domain *domain,
350                                      struct device *dev);
351 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
352                                             dma_addr_t iova);
353
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
356 #else
357 int dmar_disabled = 1;
358 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
359
360 int intel_iommu_sm;
361 int intel_iommu_enabled = 0;
362 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
363
364 static int dmar_map_gfx = 1;
365 static int dmar_forcedac;
366 static int intel_iommu_strict;
367 static int intel_iommu_superpage = 1;
368 static int iommu_identity_mapping;
369 static int intel_no_bounce;
370
371 #define IDENTMAP_ALL            1
372 #define IDENTMAP_GFX            2
373 #define IDENTMAP_AZALIA         4
374
375 int intel_iommu_gfx_mapped;
376 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
377
378 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
379 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
380 static DEFINE_SPINLOCK(device_domain_lock);
381 static LIST_HEAD(device_domain_list);
382
383 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
384                                 to_pci_dev(d)->untrusted)
385
386 /*
387  * Iterate over elements in device_domain_list and call the specified
388  * callback @fn against each element.
389  */
390 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
391                                      void *data), void *data)
392 {
393         int ret = 0;
394         unsigned long flags;
395         struct device_domain_info *info;
396
397         spin_lock_irqsave(&device_domain_lock, flags);
398         list_for_each_entry(info, &device_domain_list, global) {
399                 ret = fn(info, data);
400                 if (ret) {
401                         spin_unlock_irqrestore(&device_domain_lock, flags);
402                         return ret;
403                 }
404         }
405         spin_unlock_irqrestore(&device_domain_lock, flags);
406
407         return 0;
408 }
409
410 const struct iommu_ops intel_iommu_ops;
411
412 static bool translation_pre_enabled(struct intel_iommu *iommu)
413 {
414         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
415 }
416
417 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
418 {
419         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
420 }
421
422 static void init_translation_status(struct intel_iommu *iommu)
423 {
424         u32 gsts;
425
426         gsts = readl(iommu->reg + DMAR_GSTS_REG);
427         if (gsts & DMA_GSTS_TES)
428                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
429 }
430
431 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
432 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
433 {
434         return container_of(dom, struct dmar_domain, domain);
435 }
436
437 static int __init intel_iommu_setup(char *str)
438 {
439         if (!str)
440                 return -EINVAL;
441         while (*str) {
442                 if (!strncmp(str, "on", 2)) {
443                         dmar_disabled = 0;
444                         pr_info("IOMMU enabled\n");
445                 } else if (!strncmp(str, "off", 3)) {
446                         dmar_disabled = 1;
447                         no_platform_optin = 1;
448                         pr_info("IOMMU disabled\n");
449                 } else if (!strncmp(str, "igfx_off", 8)) {
450                         dmar_map_gfx = 0;
451                         pr_info("Disable GFX device mapping\n");
452                 } else if (!strncmp(str, "forcedac", 8)) {
453                         pr_info("Forcing DAC for PCI devices\n");
454                         dmar_forcedac = 1;
455                 } else if (!strncmp(str, "strict", 6)) {
456                         pr_info("Disable batched IOTLB flush\n");
457                         intel_iommu_strict = 1;
458                 } else if (!strncmp(str, "sp_off", 6)) {
459                         pr_info("Disable supported super page\n");
460                         intel_iommu_superpage = 0;
461                 } else if (!strncmp(str, "sm_on", 5)) {
462                         pr_info("Intel-IOMMU: scalable mode supported\n");
463                         intel_iommu_sm = 1;
464                 } else if (!strncmp(str, "tboot_noforce", 13)) {
465                         printk(KERN_INFO
466                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467                         intel_iommu_tboot_noforce = 1;
468                 } else if (!strncmp(str, "nobounce", 8)) {
469                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
470                         intel_no_bounce = 1;
471                 }
472
473                 str += strcspn(str, ",");
474                 while (*str == ',')
475                         str++;
476         }
477         return 0;
478 }
479 __setup("intel_iommu=", intel_iommu_setup);
480
481 static struct kmem_cache *iommu_domain_cache;
482 static struct kmem_cache *iommu_devinfo_cache;
483
484 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
485 {
486         struct dmar_domain **domains;
487         int idx = did >> 8;
488
489         domains = iommu->domains[idx];
490         if (!domains)
491                 return NULL;
492
493         return domains[did & 0xff];
494 }
495
496 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
497                              struct dmar_domain *domain)
498 {
499         struct dmar_domain **domains;
500         int idx = did >> 8;
501
502         if (!iommu->domains[idx]) {
503                 size_t size = 256 * sizeof(struct dmar_domain *);
504                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
505         }
506
507         domains = iommu->domains[idx];
508         if (WARN_ON(!domains))
509                 return;
510         else
511                 domains[did & 0xff] = domain;
512 }
513
514 void *alloc_pgtable_page(int node)
515 {
516         struct page *page;
517         void *vaddr = NULL;
518
519         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
520         if (page)
521                 vaddr = page_address(page);
522         return vaddr;
523 }
524
525 void free_pgtable_page(void *vaddr)
526 {
527         free_page((unsigned long)vaddr);
528 }
529
530 static inline void *alloc_domain_mem(void)
531 {
532         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
533 }
534
535 static void free_domain_mem(void *vaddr)
536 {
537         kmem_cache_free(iommu_domain_cache, vaddr);
538 }
539
540 static inline void * alloc_devinfo_mem(void)
541 {
542         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
543 }
544
545 static inline void free_devinfo_mem(void *vaddr)
546 {
547         kmem_cache_free(iommu_devinfo_cache, vaddr);
548 }
549
550 static inline int domain_type_is_si(struct dmar_domain *domain)
551 {
552         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
553 }
554
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556                                        unsigned long pfn)
557 {
558         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559
560         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564 {
565         unsigned long sagaw;
566         int agaw = -1;
567
568         sagaw = cap_sagaw(iommu->cap);
569         for (agaw = width_to_agaw(max_gaw);
570              agaw >= 0; agaw--) {
571                 if (test_bit(agaw, &sagaw))
572                         break;
573         }
574
575         return agaw;
576 }
577
578 /*
579  * Calculate max SAGAW for each iommu.
580  */
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582 {
583         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584 }
585
586 /*
587  * calculate agaw for each iommu.
588  * "SAGAW" may be different across iommus, use a default agaw, and
589  * get a supported less agaw for iommus that don't support the default agaw.
590  */
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
592 {
593         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594 }
595
596 /* This functionin only returns single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598 {
599         int iommu_id;
600
601         /* si_domain and vm domain should not get here. */
602         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603                 return NULL;
604
605         for_each_domain_iommu(iommu_id, domain)
606                 break;
607
608         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609                 return NULL;
610
611         return g_iommus[iommu_id];
612 }
613
614 static void domain_update_iommu_coherency(struct dmar_domain *domain)
615 {
616         struct dmar_drhd_unit *drhd;
617         struct intel_iommu *iommu;
618         bool found = false;
619         int i;
620
621         domain->iommu_coherency = 1;
622
623         for_each_domain_iommu(i, domain) {
624                 found = true;
625                 if (!ecap_coherent(g_iommus[i]->ecap)) {
626                         domain->iommu_coherency = 0;
627                         break;
628                 }
629         }
630         if (found)
631                 return;
632
633         /* No hardware attached; use lowest common denominator */
634         rcu_read_lock();
635         for_each_active_iommu(iommu, drhd) {
636                 if (!ecap_coherent(iommu->ecap)) {
637                         domain->iommu_coherency = 0;
638                         break;
639                 }
640         }
641         rcu_read_unlock();
642 }
643
644 static int domain_update_iommu_snooping(struct intel_iommu *skip)
645 {
646         struct dmar_drhd_unit *drhd;
647         struct intel_iommu *iommu;
648         int ret = 1;
649
650         rcu_read_lock();
651         for_each_active_iommu(iommu, drhd) {
652                 if (iommu != skip) {
653                         if (!ecap_sc_support(iommu->ecap)) {
654                                 ret = 0;
655                                 break;
656                         }
657                 }
658         }
659         rcu_read_unlock();
660
661         return ret;
662 }
663
664 static int domain_update_iommu_superpage(struct intel_iommu *skip)
665 {
666         struct dmar_drhd_unit *drhd;
667         struct intel_iommu *iommu;
668         int mask = 0xf;
669
670         if (!intel_iommu_superpage) {
671                 return 0;
672         }
673
674         /* set iommu_superpage to the smallest common denominator */
675         rcu_read_lock();
676         for_each_active_iommu(iommu, drhd) {
677                 if (iommu != skip) {
678                         mask &= cap_super_page_val(iommu->cap);
679                         if (!mask)
680                                 break;
681                 }
682         }
683         rcu_read_unlock();
684
685         return fls(mask);
686 }
687
688 /* Some capabilities may be different across iommus */
689 static void domain_update_iommu_cap(struct dmar_domain *domain)
690 {
691         domain_update_iommu_coherency(domain);
692         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
693         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
694 }
695
696 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
697                                          u8 devfn, int alloc)
698 {
699         struct root_entry *root = &iommu->root_entry[bus];
700         struct context_entry *context;
701         u64 *entry;
702
703         entry = &root->lo;
704         if (sm_supported(iommu)) {
705                 if (devfn >= 0x80) {
706                         devfn -= 0x80;
707                         entry = &root->hi;
708                 }
709                 devfn *= 2;
710         }
711         if (*entry & 1)
712                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
713         else {
714                 unsigned long phy_addr;
715                 if (!alloc)
716                         return NULL;
717
718                 context = alloc_pgtable_page(iommu->node);
719                 if (!context)
720                         return NULL;
721
722                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
723                 phy_addr = virt_to_phys((void *)context);
724                 *entry = phy_addr | 1;
725                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
726         }
727         return &context[devfn];
728 }
729
730 static int iommu_dummy(struct device *dev)
731 {
732         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
733 }
734
735 /**
736  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
737  *                               sub-hierarchy of a candidate PCI-PCI bridge
738  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
739  * @bridge: the candidate PCI-PCI bridge
740  *
741  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
742  */
743 static bool
744 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
745 {
746         struct pci_dev *pdev, *pbridge;
747
748         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
749                 return false;
750
751         pdev = to_pci_dev(dev);
752         pbridge = to_pci_dev(bridge);
753
754         if (pbridge->subordinate &&
755             pbridge->subordinate->number <= pdev->bus->number &&
756             pbridge->subordinate->busn_res.end >= pdev->bus->number)
757                 return true;
758
759         return false;
760 }
761
762 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
763 {
764         struct dmar_drhd_unit *drhd = NULL;
765         struct intel_iommu *iommu;
766         struct device *tmp;
767         struct pci_dev *pdev = NULL;
768         u16 segment = 0;
769         int i;
770
771         if (iommu_dummy(dev))
772                 return NULL;
773
774         if (dev_is_pci(dev)) {
775                 struct pci_dev *pf_pdev;
776
777                 pdev = to_pci_dev(dev);
778
779 #ifdef CONFIG_X86
780                 /* VMD child devices currently cannot be handled individually */
781                 if (is_vmd(pdev->bus))
782                         return NULL;
783 #endif
784
785                 /* VFs aren't listed in scope tables; we need to look up
786                  * the PF instead to find the IOMMU. */
787                 pf_pdev = pci_physfn(pdev);
788                 dev = &pf_pdev->dev;
789                 segment = pci_domain_nr(pdev->bus);
790         } else if (has_acpi_companion(dev))
791                 dev = &ACPI_COMPANION(dev)->dev;
792
793         rcu_read_lock();
794         for_each_active_iommu(iommu, drhd) {
795                 if (pdev && segment != drhd->segment)
796                         continue;
797
798                 for_each_active_dev_scope(drhd->devices,
799                                           drhd->devices_cnt, i, tmp) {
800                         if (tmp == dev) {
801                                 /* For a VF use its original BDF# not that of the PF
802                                  * which we used for the IOMMU lookup. Strictly speaking
803                                  * we could do this for all PCI devices; we only need to
804                                  * get the BDF# from the scope table for ACPI matches. */
805                                 if (pdev && pdev->is_virtfn)
806                                         goto got_pdev;
807
808                                 *bus = drhd->devices[i].bus;
809                                 *devfn = drhd->devices[i].devfn;
810                                 goto out;
811                         }
812
813                         if (is_downstream_to_pci_bridge(dev, tmp))
814                                 goto got_pdev;
815                 }
816
817                 if (pdev && drhd->include_all) {
818                 got_pdev:
819                         *bus = pdev->bus->number;
820                         *devfn = pdev->devfn;
821                         goto out;
822                 }
823         }
824         iommu = NULL;
825  out:
826         rcu_read_unlock();
827
828         return iommu;
829 }
830
831 static void domain_flush_cache(struct dmar_domain *domain,
832                                void *addr, int size)
833 {
834         if (!domain->iommu_coherency)
835                 clflush_cache_range(addr, size);
836 }
837
838 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
839 {
840         struct context_entry *context;
841         int ret = 0;
842         unsigned long flags;
843
844         spin_lock_irqsave(&iommu->lock, flags);
845         context = iommu_context_addr(iommu, bus, devfn, 0);
846         if (context)
847                 ret = context_present(context);
848         spin_unlock_irqrestore(&iommu->lock, flags);
849         return ret;
850 }
851
852 static void free_context_table(struct intel_iommu *iommu)
853 {
854         int i;
855         unsigned long flags;
856         struct context_entry *context;
857
858         spin_lock_irqsave(&iommu->lock, flags);
859         if (!iommu->root_entry) {
860                 goto out;
861         }
862         for (i = 0; i < ROOT_ENTRY_NR; i++) {
863                 context = iommu_context_addr(iommu, i, 0, 0);
864                 if (context)
865                         free_pgtable_page(context);
866
867                 if (!sm_supported(iommu))
868                         continue;
869
870                 context = iommu_context_addr(iommu, i, 0x80, 0);
871                 if (context)
872                         free_pgtable_page(context);
873
874         }
875         free_pgtable_page(iommu->root_entry);
876         iommu->root_entry = NULL;
877 out:
878         spin_unlock_irqrestore(&iommu->lock, flags);
879 }
880
881 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
882                                       unsigned long pfn, int *target_level)
883 {
884         struct dma_pte *parent, *pte;
885         int level = agaw_to_level(domain->agaw);
886         int offset;
887
888         BUG_ON(!domain->pgd);
889
890         if (!domain_pfn_supported(domain, pfn))
891                 /* Address beyond IOMMU's addressing capabilities. */
892                 return NULL;
893
894         parent = domain->pgd;
895
896         while (1) {
897                 void *tmp_page;
898
899                 offset = pfn_level_offset(pfn, level);
900                 pte = &parent[offset];
901                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
902                         break;
903                 if (level == *target_level)
904                         break;
905
906                 if (!dma_pte_present(pte)) {
907                         uint64_t pteval;
908
909                         tmp_page = alloc_pgtable_page(domain->nid);
910
911                         if (!tmp_page)
912                                 return NULL;
913
914                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
915                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
916                         if (cmpxchg64(&pte->val, 0ULL, pteval))
917                                 /* Someone else set it while we were thinking; use theirs. */
918                                 free_pgtable_page(tmp_page);
919                         else
920                                 domain_flush_cache(domain, pte, sizeof(*pte));
921                 }
922                 if (level == 1)
923                         break;
924
925                 parent = phys_to_virt(dma_pte_addr(pte));
926                 level--;
927         }
928
929         if (!*target_level)
930                 *target_level = level;
931
932         return pte;
933 }
934
935 /* return address's pte at specific level */
936 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
937                                          unsigned long pfn,
938                                          int level, int *large_page)
939 {
940         struct dma_pte *parent, *pte;
941         int total = agaw_to_level(domain->agaw);
942         int offset;
943
944         parent = domain->pgd;
945         while (level <= total) {
946                 offset = pfn_level_offset(pfn, total);
947                 pte = &parent[offset];
948                 if (level == total)
949                         return pte;
950
951                 if (!dma_pte_present(pte)) {
952                         *large_page = total;
953                         break;
954                 }
955
956                 if (dma_pte_superpage(pte)) {
957                         *large_page = total;
958                         return pte;
959                 }
960
961                 parent = phys_to_virt(dma_pte_addr(pte));
962                 total--;
963         }
964         return NULL;
965 }
966
967 /* clear last level pte, a tlb flush should be followed */
968 static void dma_pte_clear_range(struct dmar_domain *domain,
969                                 unsigned long start_pfn,
970                                 unsigned long last_pfn)
971 {
972         unsigned int large_page;
973         struct dma_pte *first_pte, *pte;
974
975         BUG_ON(!domain_pfn_supported(domain, start_pfn));
976         BUG_ON(!domain_pfn_supported(domain, last_pfn));
977         BUG_ON(start_pfn > last_pfn);
978
979         /* we don't need lock here; nobody else touches the iova range */
980         do {
981                 large_page = 1;
982                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
983                 if (!pte) {
984                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
985                         continue;
986                 }
987                 do {
988                         dma_clear_pte(pte);
989                         start_pfn += lvl_to_nr_pages(large_page);
990                         pte++;
991                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
992
993                 domain_flush_cache(domain, first_pte,
994                                    (void *)pte - (void *)first_pte);
995
996         } while (start_pfn && start_pfn <= last_pfn);
997 }
998
999 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1000                                int retain_level, struct dma_pte *pte,
1001                                unsigned long pfn, unsigned long start_pfn,
1002                                unsigned long last_pfn)
1003 {
1004         pfn = max(start_pfn, pfn);
1005         pte = &pte[pfn_level_offset(pfn, level)];
1006
1007         do {
1008                 unsigned long level_pfn;
1009                 struct dma_pte *level_pte;
1010
1011                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1012                         goto next;
1013
1014                 level_pfn = pfn & level_mask(level);
1015                 level_pte = phys_to_virt(dma_pte_addr(pte));
1016
1017                 if (level > 2) {
1018                         dma_pte_free_level(domain, level - 1, retain_level,
1019                                            level_pte, level_pfn, start_pfn,
1020                                            last_pfn);
1021                 }
1022
1023                 /*
1024                  * Free the page table if we're below the level we want to
1025                  * retain and the range covers the entire table.
1026                  */
1027                 if (level < retain_level && !(start_pfn > level_pfn ||
1028                       last_pfn < level_pfn + level_size(level) - 1)) {
1029                         dma_clear_pte(pte);
1030                         domain_flush_cache(domain, pte, sizeof(*pte));
1031                         free_pgtable_page(level_pte);
1032                 }
1033 next:
1034                 pfn += level_size(level);
1035         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1036 }
1037
1038 /*
1039  * clear last level (leaf) ptes and free page table pages below the
1040  * level we wish to keep intact.
1041  */
1042 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1043                                    unsigned long start_pfn,
1044                                    unsigned long last_pfn,
1045                                    int retain_level)
1046 {
1047         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1048         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1049         BUG_ON(start_pfn > last_pfn);
1050
1051         dma_pte_clear_range(domain, start_pfn, last_pfn);
1052
1053         /* We don't need lock here; nobody else touches the iova range */
1054         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1055                            domain->pgd, 0, start_pfn, last_pfn);
1056
1057         /* free pgd */
1058         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1059                 free_pgtable_page(domain->pgd);
1060                 domain->pgd = NULL;
1061         }
1062 }
1063
1064 /* When a page at a given level is being unlinked from its parent, we don't
1065    need to *modify* it at all. All we need to do is make a list of all the
1066    pages which can be freed just as soon as we've flushed the IOTLB and we
1067    know the hardware page-walk will no longer touch them.
1068    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1069    be freed. */
1070 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1071                                             int level, struct dma_pte *pte,
1072                                             struct page *freelist)
1073 {
1074         struct page *pg;
1075
1076         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1077         pg->freelist = freelist;
1078         freelist = pg;
1079
1080         if (level == 1)
1081                 return freelist;
1082
1083         pte = page_address(pg);
1084         do {
1085                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1086                         freelist = dma_pte_list_pagetables(domain, level - 1,
1087                                                            pte, freelist);
1088                 pte++;
1089         } while (!first_pte_in_page(pte));
1090
1091         return freelist;
1092 }
1093
1094 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1095                                         struct dma_pte *pte, unsigned long pfn,
1096                                         unsigned long start_pfn,
1097                                         unsigned long last_pfn,
1098                                         struct page *freelist)
1099 {
1100         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1101
1102         pfn = max(start_pfn, pfn);
1103         pte = &pte[pfn_level_offset(pfn, level)];
1104
1105         do {
1106                 unsigned long level_pfn;
1107
1108                 if (!dma_pte_present(pte))
1109                         goto next;
1110
1111                 level_pfn = pfn & level_mask(level);
1112
1113                 /* If range covers entire pagetable, free it */
1114                 if (start_pfn <= level_pfn &&
1115                     last_pfn >= level_pfn + level_size(level) - 1) {
1116                         /* These suborbinate page tables are going away entirely. Don't
1117                            bother to clear them; we're just going to *free* them. */
1118                         if (level > 1 && !dma_pte_superpage(pte))
1119                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1120
1121                         dma_clear_pte(pte);
1122                         if (!first_pte)
1123                                 first_pte = pte;
1124                         last_pte = pte;
1125                 } else if (level > 1) {
1126                         /* Recurse down into a level that isn't *entirely* obsolete */
1127                         freelist = dma_pte_clear_level(domain, level - 1,
1128                                                        phys_to_virt(dma_pte_addr(pte)),
1129                                                        level_pfn, start_pfn, last_pfn,
1130                                                        freelist);
1131                 }
1132 next:
1133                 pfn += level_size(level);
1134         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1135
1136         if (first_pte)
1137                 domain_flush_cache(domain, first_pte,
1138                                    (void *)++last_pte - (void *)first_pte);
1139
1140         return freelist;
1141 }
1142
1143 /* We can't just free the pages because the IOMMU may still be walking
1144    the page tables, and may have cached the intermediate levels. The
1145    pages can only be freed after the IOTLB flush has been done. */
1146 static struct page *domain_unmap(struct dmar_domain *domain,
1147                                  unsigned long start_pfn,
1148                                  unsigned long last_pfn)
1149 {
1150         struct page *freelist;
1151
1152         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1153         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1154         BUG_ON(start_pfn > last_pfn);
1155
1156         /* we don't need lock here; nobody else touches the iova range */
1157         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1158                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1159
1160         /* free pgd */
1161         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1162                 struct page *pgd_page = virt_to_page(domain->pgd);
1163                 pgd_page->freelist = freelist;
1164                 freelist = pgd_page;
1165
1166                 domain->pgd = NULL;
1167         }
1168
1169         return freelist;
1170 }
1171
1172 static void dma_free_pagelist(struct page *freelist)
1173 {
1174         struct page *pg;
1175
1176         while ((pg = freelist)) {
1177                 freelist = pg->freelist;
1178                 free_pgtable_page(page_address(pg));
1179         }
1180 }
1181
1182 static void iova_entry_free(unsigned long data)
1183 {
1184         struct page *freelist = (struct page *)data;
1185
1186         dma_free_pagelist(freelist);
1187 }
1188
1189 /* iommu handling */
1190 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1191 {
1192         struct root_entry *root;
1193         unsigned long flags;
1194
1195         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1196         if (!root) {
1197                 pr_err("Allocating root entry for %s failed\n",
1198                         iommu->name);
1199                 return -ENOMEM;
1200         }
1201
1202         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1203
1204         spin_lock_irqsave(&iommu->lock, flags);
1205         iommu->root_entry = root;
1206         spin_unlock_irqrestore(&iommu->lock, flags);
1207
1208         return 0;
1209 }
1210
1211 static void iommu_set_root_entry(struct intel_iommu *iommu)
1212 {
1213         u64 addr;
1214         u32 sts;
1215         unsigned long flag;
1216
1217         addr = virt_to_phys(iommu->root_entry);
1218         if (sm_supported(iommu))
1219                 addr |= DMA_RTADDR_SMT;
1220
1221         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1222         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1223
1224         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1225
1226         /* Make sure hardware complete it */
1227         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1228                       readl, (sts & DMA_GSTS_RTPS), sts);
1229
1230         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1231 }
1232
1233 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1234 {
1235         u32 val;
1236         unsigned long flag;
1237
1238         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1239                 return;
1240
1241         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1242         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1243
1244         /* Make sure hardware complete it */
1245         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1246                       readl, (!(val & DMA_GSTS_WBFS)), val);
1247
1248         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1249 }
1250
1251 /* return value determine if we need a write buffer flush */
1252 static void __iommu_flush_context(struct intel_iommu *iommu,
1253                                   u16 did, u16 source_id, u8 function_mask,
1254                                   u64 type)
1255 {
1256         u64 val = 0;
1257         unsigned long flag;
1258
1259         switch (type) {
1260         case DMA_CCMD_GLOBAL_INVL:
1261                 val = DMA_CCMD_GLOBAL_INVL;
1262                 break;
1263         case DMA_CCMD_DOMAIN_INVL:
1264                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1265                 break;
1266         case DMA_CCMD_DEVICE_INVL:
1267                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1268                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1269                 break;
1270         default:
1271                 BUG();
1272         }
1273         val |= DMA_CCMD_ICC;
1274
1275         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1276         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1277
1278         /* Make sure hardware complete it */
1279         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1280                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1281
1282         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1283 }
1284
1285 /* return value determine if we need a write buffer flush */
1286 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1287                                 u64 addr, unsigned int size_order, u64 type)
1288 {
1289         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1290         u64 val = 0, val_iva = 0;
1291         unsigned long flag;
1292
1293         switch (type) {
1294         case DMA_TLB_GLOBAL_FLUSH:
1295                 /* global flush doesn't need set IVA_REG */
1296                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1297                 break;
1298         case DMA_TLB_DSI_FLUSH:
1299                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1300                 break;
1301         case DMA_TLB_PSI_FLUSH:
1302                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1303                 /* IH bit is passed in as part of address */
1304                 val_iva = size_order | addr;
1305                 break;
1306         default:
1307                 BUG();
1308         }
1309         /* Note: set drain read/write */
1310 #if 0
1311         /*
1312          * This is probably to be super secure.. Looks like we can
1313          * ignore it without any impact.
1314          */
1315         if (cap_read_drain(iommu->cap))
1316                 val |= DMA_TLB_READ_DRAIN;
1317 #endif
1318         if (cap_write_drain(iommu->cap))
1319                 val |= DMA_TLB_WRITE_DRAIN;
1320
1321         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1322         /* Note: Only uses first TLB reg currently */
1323         if (val_iva)
1324                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1325         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1326
1327         /* Make sure hardware complete it */
1328         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1329                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1330
1331         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1332
1333         /* check IOTLB invalidation granularity */
1334         if (DMA_TLB_IAIG(val) == 0)
1335                 pr_err("Flush IOTLB failed\n");
1336         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1337                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1338                         (unsigned long long)DMA_TLB_IIRG(type),
1339                         (unsigned long long)DMA_TLB_IAIG(val));
1340 }
1341
1342 static struct device_domain_info *
1343 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1344                          u8 bus, u8 devfn)
1345 {
1346         struct device_domain_info *info;
1347
1348         assert_spin_locked(&device_domain_lock);
1349
1350         if (!iommu->qi)
1351                 return NULL;
1352
1353         list_for_each_entry(info, &domain->devices, link)
1354                 if (info->iommu == iommu && info->bus == bus &&
1355                     info->devfn == devfn) {
1356                         if (info->ats_supported && info->dev)
1357                                 return info;
1358                         break;
1359                 }
1360
1361         return NULL;
1362 }
1363
1364 static void domain_update_iotlb(struct dmar_domain *domain)
1365 {
1366         struct device_domain_info *info;
1367         bool has_iotlb_device = false;
1368
1369         assert_spin_locked(&device_domain_lock);
1370
1371         list_for_each_entry(info, &domain->devices, link) {
1372                 struct pci_dev *pdev;
1373
1374                 if (!info->dev || !dev_is_pci(info->dev))
1375                         continue;
1376
1377                 pdev = to_pci_dev(info->dev);
1378                 if (pdev->ats_enabled) {
1379                         has_iotlb_device = true;
1380                         break;
1381                 }
1382         }
1383
1384         domain->has_iotlb_device = has_iotlb_device;
1385 }
1386
1387 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1388 {
1389         struct pci_dev *pdev;
1390
1391         assert_spin_locked(&device_domain_lock);
1392
1393         if (!info || !dev_is_pci(info->dev))
1394                 return;
1395
1396         pdev = to_pci_dev(info->dev);
1397         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1398          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1399          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1400          * reserved, which should be set to 0.
1401          */
1402         if (!ecap_dit(info->iommu->ecap))
1403                 info->pfsid = 0;
1404         else {
1405                 struct pci_dev *pf_pdev;
1406
1407                 /* pdev will be returned if device is not a vf */
1408                 pf_pdev = pci_physfn(pdev);
1409                 info->pfsid = pci_dev_id(pf_pdev);
1410         }
1411
1412 #ifdef CONFIG_INTEL_IOMMU_SVM
1413         /* The PCIe spec, in its wisdom, declares that the behaviour of
1414            the device if you enable PASID support after ATS support is
1415            undefined. So always enable PASID support on devices which
1416            have it, even if we can't yet know if we're ever going to
1417            use it. */
1418         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1419                 info->pasid_enabled = 1;
1420
1421         if (info->pri_supported &&
1422             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1423             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1424                 info->pri_enabled = 1;
1425 #endif
1426         if (!pdev->untrusted && info->ats_supported &&
1427             pci_ats_page_aligned(pdev) &&
1428             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1429                 info->ats_enabled = 1;
1430                 domain_update_iotlb(info->domain);
1431                 info->ats_qdep = pci_ats_queue_depth(pdev);
1432         }
1433 }
1434
1435 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1436 {
1437         struct pci_dev *pdev;
1438
1439         assert_spin_locked(&device_domain_lock);
1440
1441         if (!dev_is_pci(info->dev))
1442                 return;
1443
1444         pdev = to_pci_dev(info->dev);
1445
1446         if (info->ats_enabled) {
1447                 pci_disable_ats(pdev);
1448                 info->ats_enabled = 0;
1449                 domain_update_iotlb(info->domain);
1450         }
1451 #ifdef CONFIG_INTEL_IOMMU_SVM
1452         if (info->pri_enabled) {
1453                 pci_disable_pri(pdev);
1454                 info->pri_enabled = 0;
1455         }
1456         if (info->pasid_enabled) {
1457                 pci_disable_pasid(pdev);
1458                 info->pasid_enabled = 0;
1459         }
1460 #endif
1461 }
1462
1463 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1464                                   u64 addr, unsigned mask)
1465 {
1466         u16 sid, qdep;
1467         unsigned long flags;
1468         struct device_domain_info *info;
1469
1470         if (!domain->has_iotlb_device)
1471                 return;
1472
1473         spin_lock_irqsave(&device_domain_lock, flags);
1474         list_for_each_entry(info, &domain->devices, link) {
1475                 if (!info->ats_enabled)
1476                         continue;
1477
1478                 sid = info->bus << 8 | info->devfn;
1479                 qdep = info->ats_qdep;
1480                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1481                                 qdep, addr, mask);
1482         }
1483         spin_unlock_irqrestore(&device_domain_lock, flags);
1484 }
1485
1486 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1487                                   struct dmar_domain *domain,
1488                                   unsigned long pfn, unsigned int pages,
1489                                   int ih, int map)
1490 {
1491         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1492         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1493         u16 did = domain->iommu_did[iommu->seq_id];
1494
1495         BUG_ON(pages == 0);
1496
1497         if (ih)
1498                 ih = 1 << 6;
1499         /*
1500          * Fallback to domain selective flush if no PSI support or the size is
1501          * too big.
1502          * PSI requires page size to be 2 ^ x, and the base address is naturally
1503          * aligned to the size
1504          */
1505         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1506                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1507                                                 DMA_TLB_DSI_FLUSH);
1508         else
1509                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1510                                                 DMA_TLB_PSI_FLUSH);
1511
1512         /*
1513          * In caching mode, changes of pages from non-present to present require
1514          * flush. However, device IOTLB doesn't need to be flushed in this case.
1515          */
1516         if (!cap_caching_mode(iommu->cap) || !map)
1517                 iommu_flush_dev_iotlb(domain, addr, mask);
1518 }
1519
1520 /* Notification for newly created mappings */
1521 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1522                                         struct dmar_domain *domain,
1523                                         unsigned long pfn, unsigned int pages)
1524 {
1525         /* It's a non-present to present mapping. Only flush if caching mode */
1526         if (cap_caching_mode(iommu->cap))
1527                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1528         else
1529                 iommu_flush_write_buffer(iommu);
1530 }
1531
1532 static void iommu_flush_iova(struct iova_domain *iovad)
1533 {
1534         struct dmar_domain *domain;
1535         int idx;
1536
1537         domain = container_of(iovad, struct dmar_domain, iovad);
1538
1539         for_each_domain_iommu(idx, domain) {
1540                 struct intel_iommu *iommu = g_iommus[idx];
1541                 u16 did = domain->iommu_did[iommu->seq_id];
1542
1543                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1544
1545                 if (!cap_caching_mode(iommu->cap))
1546                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1547                                               0, MAX_AGAW_PFN_WIDTH);
1548         }
1549 }
1550
1551 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1552 {
1553         u32 pmen;
1554         unsigned long flags;
1555
1556         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1557                 return;
1558
1559         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1560         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1561         pmen &= ~DMA_PMEN_EPM;
1562         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1563
1564         /* wait for the protected region status bit to clear */
1565         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1566                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1567
1568         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1569 }
1570
1571 static void iommu_enable_translation(struct intel_iommu *iommu)
1572 {
1573         u32 sts;
1574         unsigned long flags;
1575
1576         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1577         iommu->gcmd |= DMA_GCMD_TE;
1578         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1579
1580         /* Make sure hardware complete it */
1581         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1582                       readl, (sts & DMA_GSTS_TES), sts);
1583
1584         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1585 }
1586
1587 static void iommu_disable_translation(struct intel_iommu *iommu)
1588 {
1589         u32 sts;
1590         unsigned long flag;
1591
1592         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1593         iommu->gcmd &= ~DMA_GCMD_TE;
1594         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1595
1596         /* Make sure hardware complete it */
1597         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1598                       readl, (!(sts & DMA_GSTS_TES)), sts);
1599
1600         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1601 }
1602
1603 static int iommu_init_domains(struct intel_iommu *iommu)
1604 {
1605         u32 ndomains, nlongs;
1606         size_t size;
1607
1608         ndomains = cap_ndoms(iommu->cap);
1609         pr_debug("%s: Number of Domains supported <%d>\n",
1610                  iommu->name, ndomains);
1611         nlongs = BITS_TO_LONGS(ndomains);
1612
1613         spin_lock_init(&iommu->lock);
1614
1615         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1616         if (!iommu->domain_ids) {
1617                 pr_err("%s: Allocating domain id array failed\n",
1618                        iommu->name);
1619                 return -ENOMEM;
1620         }
1621
1622         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1623         iommu->domains = kzalloc(size, GFP_KERNEL);
1624
1625         if (iommu->domains) {
1626                 size = 256 * sizeof(struct dmar_domain *);
1627                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1628         }
1629
1630         if (!iommu->domains || !iommu->domains[0]) {
1631                 pr_err("%s: Allocating domain array failed\n",
1632                        iommu->name);
1633                 kfree(iommu->domain_ids);
1634                 kfree(iommu->domains);
1635                 iommu->domain_ids = NULL;
1636                 iommu->domains    = NULL;
1637                 return -ENOMEM;
1638         }
1639
1640         /*
1641          * If Caching mode is set, then invalid translations are tagged
1642          * with domain-id 0, hence we need to pre-allocate it. We also
1643          * use domain-id 0 as a marker for non-allocated domain-id, so
1644          * make sure it is not used for a real domain.
1645          */
1646         set_bit(0, iommu->domain_ids);
1647
1648         /*
1649          * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1650          * entry for first-level or pass-through translation modes should
1651          * be programmed with a domain id different from those used for
1652          * second-level or nested translation. We reserve a domain id for
1653          * this purpose.
1654          */
1655         if (sm_supported(iommu))
1656                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1657
1658         return 0;
1659 }
1660
1661 static void disable_dmar_iommu(struct intel_iommu *iommu)
1662 {
1663         struct device_domain_info *info, *tmp;
1664         unsigned long flags;
1665
1666         if (!iommu->domains || !iommu->domain_ids)
1667                 return;
1668
1669         spin_lock_irqsave(&device_domain_lock, flags);
1670         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1671                 if (info->iommu != iommu)
1672                         continue;
1673
1674                 if (!info->dev || !info->domain)
1675                         continue;
1676
1677                 __dmar_remove_one_dev_info(info);
1678         }
1679         spin_unlock_irqrestore(&device_domain_lock, flags);
1680
1681         if (iommu->gcmd & DMA_GCMD_TE)
1682                 iommu_disable_translation(iommu);
1683 }
1684
1685 static void free_dmar_iommu(struct intel_iommu *iommu)
1686 {
1687         if ((iommu->domains) && (iommu->domain_ids)) {
1688                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1689                 int i;
1690
1691                 for (i = 0; i < elems; i++)
1692                         kfree(iommu->domains[i]);
1693                 kfree(iommu->domains);
1694                 kfree(iommu->domain_ids);
1695                 iommu->domains = NULL;
1696                 iommu->domain_ids = NULL;
1697         }
1698
1699         g_iommus[iommu->seq_id] = NULL;
1700
1701         /* free context mapping */
1702         free_context_table(iommu);
1703
1704 #ifdef CONFIG_INTEL_IOMMU_SVM
1705         if (pasid_supported(iommu)) {
1706                 if (ecap_prs(iommu->ecap))
1707                         intel_svm_finish_prq(iommu);
1708         }
1709 #endif
1710 }
1711
1712 static struct dmar_domain *alloc_domain(int flags)
1713 {
1714         struct dmar_domain *domain;
1715
1716         domain = alloc_domain_mem();
1717         if (!domain)
1718                 return NULL;
1719
1720         memset(domain, 0, sizeof(*domain));
1721         domain->nid = NUMA_NO_NODE;
1722         domain->flags = flags;
1723         domain->has_iotlb_device = false;
1724         INIT_LIST_HEAD(&domain->devices);
1725
1726         return domain;
1727 }
1728
1729 /* Must be called with iommu->lock */
1730 static int domain_attach_iommu(struct dmar_domain *domain,
1731                                struct intel_iommu *iommu)
1732 {
1733         unsigned long ndomains;
1734         int num;
1735
1736         assert_spin_locked(&device_domain_lock);
1737         assert_spin_locked(&iommu->lock);
1738
1739         domain->iommu_refcnt[iommu->seq_id] += 1;
1740         domain->iommu_count += 1;
1741         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1742                 ndomains = cap_ndoms(iommu->cap);
1743                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1744
1745                 if (num >= ndomains) {
1746                         pr_err("%s: No free domain ids\n", iommu->name);
1747                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1748                         domain->iommu_count -= 1;
1749                         return -ENOSPC;
1750                 }
1751
1752                 set_bit(num, iommu->domain_ids);
1753                 set_iommu_domain(iommu, num, domain);
1754
1755                 domain->iommu_did[iommu->seq_id] = num;
1756                 domain->nid                      = iommu->node;
1757
1758                 domain_update_iommu_cap(domain);
1759         }
1760
1761         return 0;
1762 }
1763
1764 static int domain_detach_iommu(struct dmar_domain *domain,
1765                                struct intel_iommu *iommu)
1766 {
1767         int num, count;
1768
1769         assert_spin_locked(&device_domain_lock);
1770         assert_spin_locked(&iommu->lock);
1771
1772         domain->iommu_refcnt[iommu->seq_id] -= 1;
1773         count = --domain->iommu_count;
1774         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1775                 num = domain->iommu_did[iommu->seq_id];
1776                 clear_bit(num, iommu->domain_ids);
1777                 set_iommu_domain(iommu, num, NULL);
1778
1779                 domain_update_iommu_cap(domain);
1780                 domain->iommu_did[iommu->seq_id] = 0;
1781         }
1782
1783         return count;
1784 }
1785
1786 static struct iova_domain reserved_iova_list;
1787 static struct lock_class_key reserved_rbtree_key;
1788
1789 static int dmar_init_reserved_ranges(void)
1790 {
1791         struct pci_dev *pdev = NULL;
1792         struct iova *iova;
1793         int i;
1794
1795         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1796
1797         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1798                 &reserved_rbtree_key);
1799
1800         /* IOAPIC ranges shouldn't be accessed by DMA */
1801         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1802                 IOVA_PFN(IOAPIC_RANGE_END));
1803         if (!iova) {
1804                 pr_err("Reserve IOAPIC range failed\n");
1805                 return -ENODEV;
1806         }
1807
1808         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1809         for_each_pci_dev(pdev) {
1810                 struct resource *r;
1811
1812                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1813                         r = &pdev->resource[i];
1814                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1815                                 continue;
1816                         iova = reserve_iova(&reserved_iova_list,
1817                                             IOVA_PFN(r->start),
1818                                             IOVA_PFN(r->end));
1819                         if (!iova) {
1820                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1821                                 return -ENODEV;
1822                         }
1823                 }
1824         }
1825         return 0;
1826 }
1827
1828 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1829 {
1830         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1831 }
1832
1833 static inline int guestwidth_to_adjustwidth(int gaw)
1834 {
1835         int agaw;
1836         int r = (gaw - 12) % 9;
1837
1838         if (r == 0)
1839                 agaw = gaw;
1840         else
1841                 agaw = gaw + 9 - r;
1842         if (agaw > 64)
1843                 agaw = 64;
1844         return agaw;
1845 }
1846
1847 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1848                        int guest_width)
1849 {
1850         int adjust_width, agaw;
1851         unsigned long sagaw;
1852         int err;
1853
1854         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1855
1856         err = init_iova_flush_queue(&domain->iovad,
1857                                     iommu_flush_iova, iova_entry_free);
1858         if (err)
1859                 return err;
1860
1861         domain_reserve_special_ranges(domain);
1862
1863         /* calculate AGAW */
1864         if (guest_width > cap_mgaw(iommu->cap))
1865                 guest_width = cap_mgaw(iommu->cap);
1866         domain->gaw = guest_width;
1867         adjust_width = guestwidth_to_adjustwidth(guest_width);
1868         agaw = width_to_agaw(adjust_width);
1869         sagaw = cap_sagaw(iommu->cap);
1870         if (!test_bit(agaw, &sagaw)) {
1871                 /* hardware doesn't support it, choose a bigger one */
1872                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1873                 agaw = find_next_bit(&sagaw, 5, agaw);
1874                 if (agaw >= 5)
1875                         return -ENODEV;
1876         }
1877         domain->agaw = agaw;
1878
1879         if (ecap_coherent(iommu->ecap))
1880                 domain->iommu_coherency = 1;
1881         else
1882                 domain->iommu_coherency = 0;
1883
1884         if (ecap_sc_support(iommu->ecap))
1885                 domain->iommu_snooping = 1;
1886         else
1887                 domain->iommu_snooping = 0;
1888
1889         if (intel_iommu_superpage)
1890                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1891         else
1892                 domain->iommu_superpage = 0;
1893
1894         domain->nid = iommu->node;
1895
1896         /* always allocate the top pgd */
1897         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1898         if (!domain->pgd)
1899                 return -ENOMEM;
1900         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1901         return 0;
1902 }
1903
1904 static void domain_exit(struct dmar_domain *domain)
1905 {
1906
1907         /* Remove associated devices and clear attached or cached domains */
1908         domain_remove_dev_info(domain);
1909
1910         /* destroy iovas */
1911         put_iova_domain(&domain->iovad);
1912
1913         if (domain->pgd) {
1914                 struct page *freelist;
1915
1916                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1917                 dma_free_pagelist(freelist);
1918         }
1919
1920         free_domain_mem(domain);
1921 }
1922
1923 /*
1924  * Get the PASID directory size for scalable mode context entry.
1925  * Value of X in the PDTS field of a scalable mode context entry
1926  * indicates PASID directory with 2^(X + 7) entries.
1927  */
1928 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1929 {
1930         int pds, max_pde;
1931
1932         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1933         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1934         if (pds < 7)
1935                 return 0;
1936
1937         return pds - 7;
1938 }
1939
1940 /*
1941  * Set the RID_PASID field of a scalable mode context entry. The
1942  * IOMMU hardware will use the PASID value set in this field for
1943  * DMA translations of DMA requests without PASID.
1944  */
1945 static inline void
1946 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1947 {
1948         context->hi |= pasid & ((1 << 20) - 1);
1949         context->hi |= (1 << 20);
1950 }
1951
1952 /*
1953  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1954  * entry.
1955  */
1956 static inline void context_set_sm_dte(struct context_entry *context)
1957 {
1958         context->lo |= (1 << 2);
1959 }
1960
1961 /*
1962  * Set the PRE(Page Request Enable) field of a scalable mode context
1963  * entry.
1964  */
1965 static inline void context_set_sm_pre(struct context_entry *context)
1966 {
1967         context->lo |= (1 << 4);
1968 }
1969
1970 /* Convert value to context PASID directory size field coding. */
1971 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1972
1973 static int domain_context_mapping_one(struct dmar_domain *domain,
1974                                       struct intel_iommu *iommu,
1975                                       struct pasid_table *table,
1976                                       u8 bus, u8 devfn)
1977 {
1978         u16 did = domain->iommu_did[iommu->seq_id];
1979         int translation = CONTEXT_TT_MULTI_LEVEL;
1980         struct device_domain_info *info = NULL;
1981         struct context_entry *context;
1982         unsigned long flags;
1983         int ret;
1984
1985         WARN_ON(did == 0);
1986
1987         if (hw_pass_through && domain_type_is_si(domain))
1988                 translation = CONTEXT_TT_PASS_THROUGH;
1989
1990         pr_debug("Set context mapping for %02x:%02x.%d\n",
1991                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1992
1993         BUG_ON(!domain->pgd);
1994
1995         spin_lock_irqsave(&device_domain_lock, flags);
1996         spin_lock(&iommu->lock);
1997
1998         ret = -ENOMEM;
1999         context = iommu_context_addr(iommu, bus, devfn, 1);
2000         if (!context)
2001                 goto out_unlock;
2002
2003         ret = 0;
2004         if (context_present(context))
2005                 goto out_unlock;
2006
2007         /*
2008          * For kdump cases, old valid entries may be cached due to the
2009          * in-flight DMA and copied pgtable, but there is no unmapping
2010          * behaviour for them, thus we need an explicit cache flush for
2011          * the newly-mapped device. For kdump, at this point, the device
2012          * is supposed to finish reset at its driver probe stage, so no
2013          * in-flight DMA will exist, and we don't need to worry anymore
2014          * hereafter.
2015          */
2016         if (context_copied(context)) {
2017                 u16 did_old = context_domain_id(context);
2018
2019                 if (did_old < cap_ndoms(iommu->cap)) {
2020                         iommu->flush.flush_context(iommu, did_old,
2021                                                    (((u16)bus) << 8) | devfn,
2022                                                    DMA_CCMD_MASK_NOBIT,
2023                                                    DMA_CCMD_DEVICE_INVL);
2024                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2025                                                  DMA_TLB_DSI_FLUSH);
2026                 }
2027         }
2028
2029         context_clear_entry(context);
2030
2031         if (sm_supported(iommu)) {
2032                 unsigned long pds;
2033
2034                 WARN_ON(!table);
2035
2036                 /* Setup the PASID DIR pointer: */
2037                 pds = context_get_sm_pds(table);
2038                 context->lo = (u64)virt_to_phys(table->table) |
2039                                 context_pdts(pds);
2040
2041                 /* Setup the RID_PASID field: */
2042                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2043
2044                 /*
2045                  * Setup the Device-TLB enable bit and Page request
2046                  * Enable bit:
2047                  */
2048                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2049                 if (info && info->ats_supported)
2050                         context_set_sm_dte(context);
2051                 if (info && info->pri_supported)
2052                         context_set_sm_pre(context);
2053         } else {
2054                 struct dma_pte *pgd = domain->pgd;
2055                 int agaw;
2056
2057                 context_set_domain_id(context, did);
2058
2059                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2060                         /*
2061                          * Skip top levels of page tables for iommu which has
2062                          * less agaw than default. Unnecessary for PT mode.
2063                          */
2064                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2065                                 ret = -ENOMEM;
2066                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2067                                 if (!dma_pte_present(pgd))
2068                                         goto out_unlock;
2069                         }
2070
2071                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2072                         if (info && info->ats_supported)
2073                                 translation = CONTEXT_TT_DEV_IOTLB;
2074                         else
2075                                 translation = CONTEXT_TT_MULTI_LEVEL;
2076
2077                         context_set_address_root(context, virt_to_phys(pgd));
2078                         context_set_address_width(context, agaw);
2079                 } else {
2080                         /*
2081                          * In pass through mode, AW must be programmed to
2082                          * indicate the largest AGAW value supported by
2083                          * hardware. And ASR is ignored by hardware.
2084                          */
2085                         context_set_address_width(context, iommu->msagaw);
2086                 }
2087
2088                 context_set_translation_type(context, translation);
2089         }
2090
2091         context_set_fault_enable(context);
2092         context_set_present(context);
2093         domain_flush_cache(domain, context, sizeof(*context));
2094
2095         /*
2096          * It's a non-present to present mapping. If hardware doesn't cache
2097          * non-present entry we only need to flush the write-buffer. If the
2098          * _does_ cache non-present entries, then it does so in the special
2099          * domain #0, which we have to flush:
2100          */
2101         if (cap_caching_mode(iommu->cap)) {
2102                 iommu->flush.flush_context(iommu, 0,
2103                                            (((u16)bus) << 8) | devfn,
2104                                            DMA_CCMD_MASK_NOBIT,
2105                                            DMA_CCMD_DEVICE_INVL);
2106                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2107         } else {
2108                 iommu_flush_write_buffer(iommu);
2109         }
2110         iommu_enable_dev_iotlb(info);
2111
2112         ret = 0;
2113
2114 out_unlock:
2115         spin_unlock(&iommu->lock);
2116         spin_unlock_irqrestore(&device_domain_lock, flags);
2117
2118         return ret;
2119 }
2120
2121 struct domain_context_mapping_data {
2122         struct dmar_domain *domain;
2123         struct intel_iommu *iommu;
2124         struct pasid_table *table;
2125 };
2126
2127 static int domain_context_mapping_cb(struct pci_dev *pdev,
2128                                      u16 alias, void *opaque)
2129 {
2130         struct domain_context_mapping_data *data = opaque;
2131
2132         return domain_context_mapping_one(data->domain, data->iommu,
2133                                           data->table, PCI_BUS_NUM(alias),
2134                                           alias & 0xff);
2135 }
2136
2137 static int
2138 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2139 {
2140         struct domain_context_mapping_data data;
2141         struct pasid_table *table;
2142         struct intel_iommu *iommu;
2143         u8 bus, devfn;
2144
2145         iommu = device_to_iommu(dev, &bus, &devfn);
2146         if (!iommu)
2147                 return -ENODEV;
2148
2149         table = intel_pasid_get_table(dev);
2150
2151         if (!dev_is_pci(dev))
2152                 return domain_context_mapping_one(domain, iommu, table,
2153                                                   bus, devfn);
2154
2155         data.domain = domain;
2156         data.iommu = iommu;
2157         data.table = table;
2158
2159         return pci_for_each_dma_alias(to_pci_dev(dev),
2160                                       &domain_context_mapping_cb, &data);
2161 }
2162
2163 static int domain_context_mapped_cb(struct pci_dev *pdev,
2164                                     u16 alias, void *opaque)
2165 {
2166         struct intel_iommu *iommu = opaque;
2167
2168         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2169 }
2170
2171 static int domain_context_mapped(struct device *dev)
2172 {
2173         struct intel_iommu *iommu;
2174         u8 bus, devfn;
2175
2176         iommu = device_to_iommu(dev, &bus, &devfn);
2177         if (!iommu)
2178                 return -ENODEV;
2179
2180         if (!dev_is_pci(dev))
2181                 return device_context_mapped(iommu, bus, devfn);
2182
2183         return !pci_for_each_dma_alias(to_pci_dev(dev),
2184                                        domain_context_mapped_cb, iommu);
2185 }
2186
2187 /* Returns a number of VTD pages, but aligned to MM page size */
2188 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2189                                             size_t size)
2190 {
2191         host_addr &= ~PAGE_MASK;
2192         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2193 }
2194
2195 /* Return largest possible superpage level for a given mapping */
2196 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2197                                           unsigned long iov_pfn,
2198                                           unsigned long phy_pfn,
2199                                           unsigned long pages)
2200 {
2201         int support, level = 1;
2202         unsigned long pfnmerge;
2203
2204         support = domain->iommu_superpage;
2205
2206         /* To use a large page, the virtual *and* physical addresses
2207            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2208            of them will mean we have to use smaller pages. So just
2209            merge them and check both at once. */
2210         pfnmerge = iov_pfn | phy_pfn;
2211
2212         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2213                 pages >>= VTD_STRIDE_SHIFT;
2214                 if (!pages)
2215                         break;
2216                 pfnmerge >>= VTD_STRIDE_SHIFT;
2217                 level++;
2218                 support--;
2219         }
2220         return level;
2221 }
2222
2223 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2224                             struct scatterlist *sg, unsigned long phys_pfn,
2225                             unsigned long nr_pages, int prot)
2226 {
2227         struct dma_pte *first_pte = NULL, *pte = NULL;
2228         phys_addr_t uninitialized_var(pteval);
2229         unsigned long sg_res = 0;
2230         unsigned int largepage_lvl = 0;
2231         unsigned long lvl_pages = 0;
2232
2233         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2234
2235         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2236                 return -EINVAL;
2237
2238         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2239
2240         if (!sg) {
2241                 sg_res = nr_pages;
2242                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2243         }
2244
2245         while (nr_pages > 0) {
2246                 uint64_t tmp;
2247
2248                 if (!sg_res) {
2249                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2250
2251                         sg_res = aligned_nrpages(sg->offset, sg->length);
2252                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2253                         sg->dma_length = sg->length;
2254                         pteval = (sg_phys(sg) - pgoff) | prot;
2255                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2256                 }
2257
2258                 if (!pte) {
2259                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2260
2261                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2262                         if (!pte)
2263                                 return -ENOMEM;
2264                         /* It is large page*/
2265                         if (largepage_lvl > 1) {
2266                                 unsigned long nr_superpages, end_pfn;
2267
2268                                 pteval |= DMA_PTE_LARGE_PAGE;
2269                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2270
2271                                 nr_superpages = sg_res / lvl_pages;
2272                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2273
2274                                 /*
2275                                  * Ensure that old small page tables are
2276                                  * removed to make room for superpage(s).
2277                                  * We're adding new large pages, so make sure
2278                                  * we don't remove their parent tables.
2279                                  */
2280                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2281                                                        largepage_lvl + 1);
2282                         } else {
2283                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2284                         }
2285
2286                 }
2287                 /* We don't need lock here, nobody else
2288                  * touches the iova range
2289                  */
2290                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2291                 if (tmp) {
2292                         static int dumps = 5;
2293                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2294                                 iov_pfn, tmp, (unsigned long long)pteval);
2295                         if (dumps) {
2296                                 dumps--;
2297                                 debug_dma_dump_mappings(NULL);
2298                         }
2299                         WARN_ON(1);
2300                 }
2301
2302                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2303
2304                 BUG_ON(nr_pages < lvl_pages);
2305                 BUG_ON(sg_res < lvl_pages);
2306
2307                 nr_pages -= lvl_pages;
2308                 iov_pfn += lvl_pages;
2309                 phys_pfn += lvl_pages;
2310                 pteval += lvl_pages * VTD_PAGE_SIZE;
2311                 sg_res -= lvl_pages;
2312
2313                 /* If the next PTE would be the first in a new page, then we
2314                    need to flush the cache on the entries we've just written.
2315                    And then we'll need to recalculate 'pte', so clear it and
2316                    let it get set again in the if (!pte) block above.
2317
2318                    If we're done (!nr_pages) we need to flush the cache too.
2319
2320                    Also if we've been setting superpages, we may need to
2321                    recalculate 'pte' and switch back to smaller pages for the
2322                    end of the mapping, if the trailing size is not enough to
2323                    use another superpage (i.e. sg_res < lvl_pages). */
2324                 pte++;
2325                 if (!nr_pages || first_pte_in_page(pte) ||
2326                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2327                         domain_flush_cache(domain, first_pte,
2328                                            (void *)pte - (void *)first_pte);
2329                         pte = NULL;
2330                 }
2331
2332                 if (!sg_res && nr_pages)
2333                         sg = sg_next(sg);
2334         }
2335         return 0;
2336 }
2337
2338 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2339                           struct scatterlist *sg, unsigned long phys_pfn,
2340                           unsigned long nr_pages, int prot)
2341 {
2342         int iommu_id, ret;
2343         struct intel_iommu *iommu;
2344
2345         /* Do the real mapping first */
2346         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2347         if (ret)
2348                 return ret;
2349
2350         for_each_domain_iommu(iommu_id, domain) {
2351                 iommu = g_iommus[iommu_id];
2352                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2353         }
2354
2355         return 0;
2356 }
2357
2358 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2359                                     struct scatterlist *sg, unsigned long nr_pages,
2360                                     int prot)
2361 {
2362         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2363 }
2364
2365 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2366                                      unsigned long phys_pfn, unsigned long nr_pages,
2367                                      int prot)
2368 {
2369         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2370 }
2371
2372 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2373 {
2374         unsigned long flags;
2375         struct context_entry *context;
2376         u16 did_old;
2377
2378         if (!iommu)
2379                 return;
2380
2381         spin_lock_irqsave(&iommu->lock, flags);
2382         context = iommu_context_addr(iommu, bus, devfn, 0);
2383         if (!context) {
2384                 spin_unlock_irqrestore(&iommu->lock, flags);
2385                 return;
2386         }
2387         did_old = context_domain_id(context);
2388         context_clear_entry(context);
2389         __iommu_flush_cache(iommu, context, sizeof(*context));
2390         spin_unlock_irqrestore(&iommu->lock, flags);
2391         iommu->flush.flush_context(iommu,
2392                                    did_old,
2393                                    (((u16)bus) << 8) | devfn,
2394                                    DMA_CCMD_MASK_NOBIT,
2395                                    DMA_CCMD_DEVICE_INVL);
2396         iommu->flush.flush_iotlb(iommu,
2397                                  did_old,
2398                                  0,
2399                                  0,
2400                                  DMA_TLB_DSI_FLUSH);
2401 }
2402
2403 static inline void unlink_domain_info(struct device_domain_info *info)
2404 {
2405         assert_spin_locked(&device_domain_lock);
2406         list_del(&info->link);
2407         list_del(&info->global);
2408         if (info->dev)
2409                 info->dev->archdata.iommu = NULL;
2410 }
2411
2412 static void domain_remove_dev_info(struct dmar_domain *domain)
2413 {
2414         struct device_domain_info *info, *tmp;
2415         unsigned long flags;
2416
2417         spin_lock_irqsave(&device_domain_lock, flags);
2418         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2419                 __dmar_remove_one_dev_info(info);
2420         spin_unlock_irqrestore(&device_domain_lock, flags);
2421 }
2422
2423 static struct dmar_domain *find_domain(struct device *dev)
2424 {
2425         struct device_domain_info *info;
2426
2427         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO ||
2428                      dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO))
2429                 return NULL;
2430
2431         /* No lock here, assumes no domain exit in normal case */
2432         info = dev->archdata.iommu;
2433         if (likely(info))
2434                 return info->domain;
2435
2436         return NULL;
2437 }
2438
2439 static struct dmar_domain *deferred_attach_domain(struct device *dev)
2440 {
2441         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2442                 struct iommu_domain *domain;
2443
2444                 dev->archdata.iommu = NULL;
2445                 domain = iommu_get_domain_for_dev(dev);
2446                 if (domain)
2447                         intel_iommu_attach_device(domain, dev);
2448         }
2449
2450         return find_domain(dev);
2451 }
2452
2453 static inline struct device_domain_info *
2454 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2455 {
2456         struct device_domain_info *info;
2457
2458         list_for_each_entry(info, &device_domain_list, global)
2459                 if (info->iommu->segment == segment && info->bus == bus &&
2460                     info->devfn == devfn)
2461                         return info;
2462
2463         return NULL;
2464 }
2465
2466 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2467                                                     int bus, int devfn,
2468                                                     struct device *dev,
2469                                                     struct dmar_domain *domain)
2470 {
2471         struct dmar_domain *found = NULL;
2472         struct device_domain_info *info;
2473         unsigned long flags;
2474         int ret;
2475
2476         info = alloc_devinfo_mem();
2477         if (!info)
2478                 return NULL;
2479
2480         info->bus = bus;
2481         info->devfn = devfn;
2482         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2483         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2484         info->ats_qdep = 0;
2485         info->dev = dev;
2486         info->domain = domain;
2487         info->iommu = iommu;
2488         info->pasid_table = NULL;
2489         info->auxd_enabled = 0;
2490         INIT_LIST_HEAD(&info->auxiliary_domains);
2491
2492         if (dev && dev_is_pci(dev)) {
2493                 struct pci_dev *pdev = to_pci_dev(info->dev);
2494
2495                 if (!pdev->untrusted &&
2496                     !pci_ats_disabled() &&
2497                     ecap_dev_iotlb_support(iommu->ecap) &&
2498                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2499                     dmar_find_matched_atsr_unit(pdev))
2500                         info->ats_supported = 1;
2501
2502                 if (sm_supported(iommu)) {
2503                         if (pasid_supported(iommu)) {
2504                                 int features = pci_pasid_features(pdev);
2505                                 if (features >= 0)
2506                                         info->pasid_supported = features | 1;
2507                         }
2508
2509                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2510                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2511                                 info->pri_supported = 1;
2512                 }
2513         }
2514
2515         spin_lock_irqsave(&device_domain_lock, flags);
2516         if (dev)
2517                 found = find_domain(dev);
2518
2519         if (!found) {
2520                 struct device_domain_info *info2;
2521                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2522                 if (info2) {
2523                         found      = info2->domain;
2524                         info2->dev = dev;
2525                 }
2526         }
2527
2528         if (found) {
2529                 spin_unlock_irqrestore(&device_domain_lock, flags);
2530                 free_devinfo_mem(info);
2531                 /* Caller must free the original domain */
2532                 return found;
2533         }
2534
2535         spin_lock(&iommu->lock);
2536         ret = domain_attach_iommu(domain, iommu);
2537         spin_unlock(&iommu->lock);
2538
2539         if (ret) {
2540                 spin_unlock_irqrestore(&device_domain_lock, flags);
2541                 free_devinfo_mem(info);
2542                 return NULL;
2543         }
2544
2545         list_add(&info->link, &domain->devices);
2546         list_add(&info->global, &device_domain_list);
2547         if (dev)
2548                 dev->archdata.iommu = info;
2549         spin_unlock_irqrestore(&device_domain_lock, flags);
2550
2551         /* PASID table is mandatory for a PCI device in scalable mode. */
2552         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2553                 ret = intel_pasid_alloc_table(dev);
2554                 if (ret) {
2555                         dev_err(dev, "PASID table allocation failed\n");
2556                         dmar_remove_one_dev_info(dev);
2557                         return NULL;
2558                 }
2559
2560                 /* Setup the PASID entry for requests without PASID: */
2561                 spin_lock(&iommu->lock);
2562                 if (hw_pass_through && domain_type_is_si(domain))
2563                         ret = intel_pasid_setup_pass_through(iommu, domain,
2564                                         dev, PASID_RID2PASID);
2565                 else
2566                         ret = intel_pasid_setup_second_level(iommu, domain,
2567                                         dev, PASID_RID2PASID);
2568                 spin_unlock(&iommu->lock);
2569                 if (ret) {
2570                         dev_err(dev, "Setup RID2PASID failed\n");
2571                         dmar_remove_one_dev_info(dev);
2572                         return NULL;
2573                 }
2574         }
2575
2576         if (dev && domain_context_mapping(domain, dev)) {
2577                 dev_err(dev, "Domain context map failed\n");
2578                 dmar_remove_one_dev_info(dev);
2579                 return NULL;
2580         }
2581
2582         return domain;
2583 }
2584
2585 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2586 {
2587         *(u16 *)opaque = alias;
2588         return 0;
2589 }
2590
2591 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2592 {
2593         struct device_domain_info *info;
2594         struct dmar_domain *domain = NULL;
2595         struct intel_iommu *iommu;
2596         u16 dma_alias;
2597         unsigned long flags;
2598         u8 bus, devfn;
2599
2600         iommu = device_to_iommu(dev, &bus, &devfn);
2601         if (!iommu)
2602                 return NULL;
2603
2604         if (dev_is_pci(dev)) {
2605                 struct pci_dev *pdev = to_pci_dev(dev);
2606
2607                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2608
2609                 spin_lock_irqsave(&device_domain_lock, flags);
2610                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2611                                                       PCI_BUS_NUM(dma_alias),
2612                                                       dma_alias & 0xff);
2613                 if (info) {
2614                         iommu = info->iommu;
2615                         domain = info->domain;
2616                 }
2617                 spin_unlock_irqrestore(&device_domain_lock, flags);
2618
2619                 /* DMA alias already has a domain, use it */
2620                 if (info)
2621                         goto out;
2622         }
2623
2624         /* Allocate and initialize new domain for the device */
2625         domain = alloc_domain(0);
2626         if (!domain)
2627                 return NULL;
2628         if (domain_init(domain, iommu, gaw)) {
2629                 domain_exit(domain);
2630                 return NULL;
2631         }
2632
2633 out:
2634         return domain;
2635 }
2636
2637 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2638                                               struct dmar_domain *domain)
2639 {
2640         struct intel_iommu *iommu;
2641         struct dmar_domain *tmp;
2642         u16 req_id, dma_alias;
2643         u8 bus, devfn;
2644
2645         iommu = device_to_iommu(dev, &bus, &devfn);
2646         if (!iommu)
2647                 return NULL;
2648
2649         req_id = ((u16)bus << 8) | devfn;
2650
2651         if (dev_is_pci(dev)) {
2652                 struct pci_dev *pdev = to_pci_dev(dev);
2653
2654                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2655
2656                 /* register PCI DMA alias device */
2657                 if (req_id != dma_alias) {
2658                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2659                                         dma_alias & 0xff, NULL, domain);
2660
2661                         if (!tmp || tmp != domain)
2662                                 return tmp;
2663                 }
2664         }
2665
2666         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2667         if (!tmp || tmp != domain)
2668                 return tmp;
2669
2670         return domain;
2671 }
2672
2673 static int iommu_domain_identity_map(struct dmar_domain *domain,
2674                                      unsigned long long start,
2675                                      unsigned long long end)
2676 {
2677         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2678         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2679
2680         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2681                           dma_to_mm_pfn(last_vpfn))) {
2682                 pr_err("Reserving iova failed\n");
2683                 return -ENOMEM;
2684         }
2685
2686         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2687         /*
2688          * RMRR range might have overlap with physical memory range,
2689          * clear it first
2690          */
2691         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2692
2693         return __domain_mapping(domain, first_vpfn, NULL,
2694                                 first_vpfn, last_vpfn - first_vpfn + 1,
2695                                 DMA_PTE_READ|DMA_PTE_WRITE);
2696 }
2697
2698 static int domain_prepare_identity_map(struct device *dev,
2699                                        struct dmar_domain *domain,
2700                                        unsigned long long start,
2701                                        unsigned long long end)
2702 {
2703         /* For _hardware_ passthrough, don't bother. But for software
2704            passthrough, we do it anyway -- it may indicate a memory
2705            range which is reserved in E820, so which didn't get set
2706            up to start with in si_domain */
2707         if (domain == si_domain && hw_pass_through) {
2708                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2709                          start, end);
2710                 return 0;
2711         }
2712
2713         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2714
2715         if (end < start) {
2716                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2717                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2718                         dmi_get_system_info(DMI_BIOS_VENDOR),
2719                         dmi_get_system_info(DMI_BIOS_VERSION),
2720                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2721                 return -EIO;
2722         }
2723
2724         if (end >> agaw_to_width(domain->agaw)) {
2725                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2726                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2727                      agaw_to_width(domain->agaw),
2728                      dmi_get_system_info(DMI_BIOS_VENDOR),
2729                      dmi_get_system_info(DMI_BIOS_VERSION),
2730                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2731                 return -EIO;
2732         }
2733
2734         return iommu_domain_identity_map(domain, start, end);
2735 }
2736
2737 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2738
2739 static int __init si_domain_init(int hw)
2740 {
2741         struct dmar_rmrr_unit *rmrr;
2742         struct device *dev;
2743         int i, nid, ret;
2744
2745         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2746         if (!si_domain)
2747                 return -EFAULT;
2748
2749         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2750                 domain_exit(si_domain);
2751                 return -EFAULT;
2752         }
2753
2754         if (hw)
2755                 return 0;
2756
2757         for_each_online_node(nid) {
2758                 unsigned long start_pfn, end_pfn;
2759                 int i;
2760
2761                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2762                         ret = iommu_domain_identity_map(si_domain,
2763                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2764                         if (ret)
2765                                 return ret;
2766                 }
2767         }
2768
2769         /*
2770          * Normally we use DMA domains for devices which have RMRRs. But we
2771          * loose this requirement for graphic and usb devices. Identity map
2772          * the RMRRs for graphic and USB devices so that they could use the
2773          * si_domain.
2774          */
2775         for_each_rmrr_units(rmrr) {
2776                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2777                                           i, dev) {
2778                         unsigned long long start = rmrr->base_address;
2779                         unsigned long long end = rmrr->end_address;
2780
2781                         if (device_is_rmrr_locked(dev))
2782                                 continue;
2783
2784                         if (WARN_ON(end < start ||
2785                                     end >> agaw_to_width(si_domain->agaw)))
2786                                 continue;
2787
2788                         ret = iommu_domain_identity_map(si_domain, start, end);
2789                         if (ret)
2790                                 return ret;
2791                 }
2792         }
2793
2794         return 0;
2795 }
2796
2797 static int identity_mapping(struct device *dev)
2798 {
2799         struct device_domain_info *info;
2800
2801         info = dev->archdata.iommu;
2802         if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2803                 return (info->domain == si_domain);
2804
2805         return 0;
2806 }
2807
2808 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2809 {
2810         struct dmar_domain *ndomain;
2811         struct intel_iommu *iommu;
2812         u8 bus, devfn;
2813
2814         iommu = device_to_iommu(dev, &bus, &devfn);
2815         if (!iommu)
2816                 return -ENODEV;
2817
2818         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2819         if (ndomain != domain)
2820                 return -EBUSY;
2821
2822         return 0;
2823 }
2824
2825 static bool device_has_rmrr(struct device *dev)
2826 {
2827         struct dmar_rmrr_unit *rmrr;
2828         struct device *tmp;
2829         int i;
2830
2831         rcu_read_lock();
2832         for_each_rmrr_units(rmrr) {
2833                 /*
2834                  * Return TRUE if this RMRR contains the device that
2835                  * is passed in.
2836                  */
2837                 for_each_active_dev_scope(rmrr->devices,
2838                                           rmrr->devices_cnt, i, tmp)
2839                         if (tmp == dev ||
2840                             is_downstream_to_pci_bridge(dev, tmp)) {
2841                                 rcu_read_unlock();
2842                                 return true;
2843                         }
2844         }
2845         rcu_read_unlock();
2846         return false;
2847 }
2848
2849 /**
2850  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2851  * is relaxable (ie. is allowed to be not enforced under some conditions)
2852  * @dev: device handle
2853  *
2854  * We assume that PCI USB devices with RMRRs have them largely
2855  * for historical reasons and that the RMRR space is not actively used post
2856  * boot.  This exclusion may change if vendors begin to abuse it.
2857  *
2858  * The same exception is made for graphics devices, with the requirement that
2859  * any use of the RMRR regions will be torn down before assigning the device
2860  * to a guest.
2861  *
2862  * Return: true if the RMRR is relaxable, false otherwise
2863  */
2864 static bool device_rmrr_is_relaxable(struct device *dev)
2865 {
2866         struct pci_dev *pdev;
2867
2868         if (!dev_is_pci(dev))
2869                 return false;
2870
2871         pdev = to_pci_dev(dev);
2872         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2873                 return true;
2874         else
2875                 return false;
2876 }
2877
2878 /*
2879  * There are a couple cases where we need to restrict the functionality of
2880  * devices associated with RMRRs.  The first is when evaluating a device for
2881  * identity mapping because problems exist when devices are moved in and out
2882  * of domains and their respective RMRR information is lost.  This means that
2883  * a device with associated RMRRs will never be in a "passthrough" domain.
2884  * The second is use of the device through the IOMMU API.  This interface
2885  * expects to have full control of the IOVA space for the device.  We cannot
2886  * satisfy both the requirement that RMRR access is maintained and have an
2887  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2888  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2889  * We therefore prevent devices associated with an RMRR from participating in
2890  * the IOMMU API, which eliminates them from device assignment.
2891  *
2892  * In both cases, devices which have relaxable RMRRs are not concerned by this
2893  * restriction. See device_rmrr_is_relaxable comment.
2894  */
2895 static bool device_is_rmrr_locked(struct device *dev)
2896 {
2897         if (!device_has_rmrr(dev))
2898                 return false;
2899
2900         if (device_rmrr_is_relaxable(dev))
2901                 return false;
2902
2903         return true;
2904 }
2905
2906 /*
2907  * Return the required default domain type for a specific device.
2908  *
2909  * @dev: the device in query
2910  * @startup: true if this is during early boot
2911  *
2912  * Returns:
2913  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2914  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2915  *  - 0: both identity and dynamic domains work for this device
2916  */
2917 static int device_def_domain_type(struct device *dev)
2918 {
2919         if (dev_is_pci(dev)) {
2920                 struct pci_dev *pdev = to_pci_dev(dev);
2921
2922                 if (device_is_rmrr_locked(dev))
2923                         return IOMMU_DOMAIN_DMA;
2924
2925                 /*
2926                  * Prevent any device marked as untrusted from getting
2927                  * placed into the statically identity mapping domain.
2928                  */
2929                 if (pdev->untrusted)
2930                         return IOMMU_DOMAIN_DMA;
2931
2932                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2933                         return IOMMU_DOMAIN_IDENTITY;
2934
2935                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2936                         return IOMMU_DOMAIN_IDENTITY;
2937
2938                 /*
2939                  * We want to start off with all devices in the 1:1 domain, and
2940                  * take them out later if we find they can't access all of memory.
2941                  *
2942                  * However, we can't do this for PCI devices behind bridges,
2943                  * because all PCI devices behind the same bridge will end up
2944                  * with the same source-id on their transactions.
2945                  *
2946                  * Practically speaking, we can't change things around for these
2947                  * devices at run-time, because we can't be sure there'll be no
2948                  * DMA transactions in flight for any of their siblings.
2949                  *
2950                  * So PCI devices (unless they're on the root bus) as well as
2951                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2952                  * the 1:1 domain, just in _case_ one of their siblings turns out
2953                  * not to be able to map all of memory.
2954                  */
2955                 if (!pci_is_pcie(pdev)) {
2956                         if (!pci_is_root_bus(pdev->bus))
2957                                 return IOMMU_DOMAIN_DMA;
2958                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2959                                 return IOMMU_DOMAIN_DMA;
2960                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2961                         return IOMMU_DOMAIN_DMA;
2962         } else {
2963                 if (device_has_rmrr(dev))
2964                         return IOMMU_DOMAIN_DMA;
2965         }
2966
2967         return (iommu_identity_mapping & IDENTMAP_ALL) ?
2968                         IOMMU_DOMAIN_IDENTITY : 0;
2969 }
2970
2971 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2972 {
2973         /*
2974          * Start from the sane iommu hardware state.
2975          * If the queued invalidation is already initialized by us
2976          * (for example, while enabling interrupt-remapping) then
2977          * we got the things already rolling from a sane state.
2978          */
2979         if (!iommu->qi) {
2980                 /*
2981                  * Clear any previous faults.
2982                  */
2983                 dmar_fault(-1, iommu);
2984                 /*
2985                  * Disable queued invalidation if supported and already enabled
2986                  * before OS handover.
2987                  */
2988                 dmar_disable_qi(iommu);
2989         }
2990
2991         if (dmar_enable_qi(iommu)) {
2992                 /*
2993                  * Queued Invalidate not enabled, use Register Based Invalidate
2994                  */
2995                 iommu->flush.flush_context = __iommu_flush_context;
2996                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2997                 pr_info("%s: Using Register based invalidation\n",
2998                         iommu->name);
2999         } else {
3000                 iommu->flush.flush_context = qi_flush_context;
3001                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3002                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3003         }
3004 }
3005
3006 static int copy_context_table(struct intel_iommu *iommu,
3007                               struct root_entry *old_re,
3008                               struct context_entry **tbl,
3009                               int bus, bool ext)
3010 {
3011         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3012         struct context_entry *new_ce = NULL, ce;
3013         struct context_entry *old_ce = NULL;
3014         struct root_entry re;
3015         phys_addr_t old_ce_phys;
3016
3017         tbl_idx = ext ? bus * 2 : bus;
3018         memcpy(&re, old_re, sizeof(re));
3019
3020         for (devfn = 0; devfn < 256; devfn++) {
3021                 /* First calculate the correct index */
3022                 idx = (ext ? devfn * 2 : devfn) % 256;
3023
3024                 if (idx == 0) {
3025                         /* First save what we may have and clean up */
3026                         if (new_ce) {
3027                                 tbl[tbl_idx] = new_ce;
3028                                 __iommu_flush_cache(iommu, new_ce,
3029                                                     VTD_PAGE_SIZE);
3030                                 pos = 1;
3031                         }
3032
3033                         if (old_ce)
3034                                 memunmap(old_ce);
3035
3036                         ret = 0;
3037                         if (devfn < 0x80)
3038                                 old_ce_phys = root_entry_lctp(&re);
3039                         else
3040                                 old_ce_phys = root_entry_uctp(&re);
3041
3042                         if (!old_ce_phys) {
3043                                 if (ext && devfn == 0) {
3044                                         /* No LCTP, try UCTP */
3045                                         devfn = 0x7f;
3046                                         continue;
3047                                 } else {
3048                                         goto out;
3049                                 }
3050                         }
3051
3052                         ret = -ENOMEM;
3053                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3054                                         MEMREMAP_WB);
3055                         if (!old_ce)
3056                                 goto out;
3057
3058                         new_ce = alloc_pgtable_page(iommu->node);
3059                         if (!new_ce)
3060                                 goto out_unmap;
3061
3062                         ret = 0;
3063                 }
3064
3065                 /* Now copy the context entry */
3066                 memcpy(&ce, old_ce + idx, sizeof(ce));
3067
3068                 if (!__context_present(&ce))
3069                         continue;
3070
3071                 did = context_domain_id(&ce);
3072                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3073                         set_bit(did, iommu->domain_ids);
3074
3075                 /*
3076                  * We need a marker for copied context entries. This
3077                  * marker needs to work for the old format as well as
3078                  * for extended context entries.
3079                  *
3080                  * Bit 67 of the context entry is used. In the old
3081                  * format this bit is available to software, in the
3082                  * extended format it is the PGE bit, but PGE is ignored
3083                  * by HW if PASIDs are disabled (and thus still
3084                  * available).
3085                  *
3086                  * So disable PASIDs first and then mark the entry
3087                  * copied. This means that we don't copy PASID
3088                  * translations from the old kernel, but this is fine as
3089                  * faults there are not fatal.
3090                  */
3091                 context_clear_pasid_enable(&ce);
3092                 context_set_copied(&ce);
3093
3094                 new_ce[idx] = ce;
3095         }
3096
3097         tbl[tbl_idx + pos] = new_ce;
3098
3099         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3100
3101 out_unmap:
3102         memunmap(old_ce);
3103
3104 out:
3105         return ret;
3106 }
3107
3108 static int copy_translation_tables(struct intel_iommu *iommu)
3109 {
3110         struct context_entry **ctxt_tbls;
3111         struct root_entry *old_rt;
3112         phys_addr_t old_rt_phys;
3113         int ctxt_table_entries;
3114         unsigned long flags;
3115         u64 rtaddr_reg;
3116         int bus, ret;
3117         bool new_ext, ext;
3118
3119         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3120         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3121         new_ext    = !!ecap_ecs(iommu->ecap);
3122
3123         /*
3124          * The RTT bit can only be changed when translation is disabled,
3125          * but disabling translation means to open a window for data
3126          * corruption. So bail out and don't copy anything if we would
3127          * have to change the bit.
3128          */
3129         if (new_ext != ext)
3130                 return -EINVAL;
3131
3132         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3133         if (!old_rt_phys)
3134                 return -EINVAL;
3135
3136         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3137         if (!old_rt)
3138                 return -ENOMEM;
3139
3140         /* This is too big for the stack - allocate it from slab */
3141         ctxt_table_entries = ext ? 512 : 256;
3142         ret = -ENOMEM;
3143         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3144         if (!ctxt_tbls)
3145                 goto out_unmap;
3146
3147         for (bus = 0; bus < 256; bus++) {
3148                 ret = copy_context_table(iommu, &old_rt[bus],
3149                                          ctxt_tbls, bus, ext);
3150                 if (ret) {
3151                         pr_err("%s: Failed to copy context table for bus %d\n",
3152                                 iommu->name, bus);
3153                         continue;
3154                 }
3155         }
3156
3157         spin_lock_irqsave(&iommu->lock, flags);
3158
3159         /* Context tables are copied, now write them to the root_entry table */
3160         for (bus = 0; bus < 256; bus++) {
3161                 int idx = ext ? bus * 2 : bus;
3162                 u64 val;
3163
3164                 if (ctxt_tbls[idx]) {
3165                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3166                         iommu->root_entry[bus].lo = val;
3167                 }
3168
3169                 if (!ext || !ctxt_tbls[idx + 1])
3170                         continue;
3171
3172                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3173                 iommu->root_entry[bus].hi = val;
3174         }
3175
3176         spin_unlock_irqrestore(&iommu->lock, flags);
3177
3178         kfree(ctxt_tbls);
3179
3180         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3181
3182         ret = 0;
3183
3184 out_unmap:
3185         memunmap(old_rt);
3186
3187         return ret;
3188 }
3189
3190 static int __init init_dmars(void)
3191 {
3192         struct dmar_drhd_unit *drhd;
3193         struct intel_iommu *iommu;
3194         int ret;
3195
3196         /*
3197          * for each drhd
3198          *    allocate root
3199          *    initialize and program root entry to not present
3200          * endfor
3201          */
3202         for_each_drhd_unit(drhd) {
3203                 /*
3204                  * lock not needed as this is only incremented in the single
3205                  * threaded kernel __init code path all other access are read
3206                  * only
3207                  */
3208                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3209                         g_num_of_iommus++;
3210                         continue;
3211                 }
3212                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3213         }
3214
3215         /* Preallocate enough resources for IOMMU hot-addition */
3216         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3217                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3218
3219         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3220                         GFP_KERNEL);
3221         if (!g_iommus) {
3222                 pr_err("Allocating global iommu array failed\n");
3223                 ret = -ENOMEM;
3224                 goto error;
3225         }
3226
3227         for_each_iommu(iommu, drhd) {
3228                 if (drhd->ignored) {
3229                         iommu_disable_translation(iommu);
3230                         continue;
3231                 }
3232
3233                 /*
3234                  * Find the max pasid size of all IOMMU's in the system.
3235                  * We need to ensure the system pasid table is no bigger
3236                  * than the smallest supported.
3237                  */
3238                 if (pasid_supported(iommu)) {
3239                         u32 temp = 2 << ecap_pss(iommu->ecap);
3240
3241                         intel_pasid_max_id = min_t(u32, temp,
3242                                                    intel_pasid_max_id);
3243                 }
3244
3245                 g_iommus[iommu->seq_id] = iommu;
3246
3247                 intel_iommu_init_qi(iommu);
3248
3249                 ret = iommu_init_domains(iommu);
3250                 if (ret)
3251                         goto free_iommu;
3252
3253                 init_translation_status(iommu);
3254
3255                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3256                         iommu_disable_translation(iommu);
3257                         clear_translation_pre_enabled(iommu);
3258                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3259                                 iommu->name);
3260                 }
3261
3262                 /*
3263                  * TBD:
3264                  * we could share the same root & context tables
3265                  * among all IOMMU's. Need to Split it later.
3266                  */
3267                 ret = iommu_alloc_root_entry(iommu);
3268                 if (ret)
3269                         goto free_iommu;
3270
3271                 if (translation_pre_enabled(iommu)) {
3272                         pr_info("Translation already enabled - trying to copy translation structures\n");
3273
3274                         ret = copy_translation_tables(iommu);
3275                         if (ret) {
3276                                 /*
3277                                  * We found the IOMMU with translation
3278                                  * enabled - but failed to copy over the
3279                                  * old root-entry table. Try to proceed
3280                                  * by disabling translation now and
3281                                  * allocating a clean root-entry table.
3282                                  * This might cause DMAR faults, but
3283                                  * probably the dump will still succeed.
3284                                  */
3285                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3286                                        iommu->name);
3287                                 iommu_disable_translation(iommu);
3288                                 clear_translation_pre_enabled(iommu);
3289                         } else {
3290                                 pr_info("Copied translation tables from previous kernel for %s\n",
3291                                         iommu->name);
3292                         }
3293                 }
3294
3295                 if (!ecap_pass_through(iommu->ecap))
3296                         hw_pass_through = 0;
3297 #ifdef CONFIG_INTEL_IOMMU_SVM
3298                 if (pasid_supported(iommu))
3299                         intel_svm_init(iommu);
3300 #endif
3301         }
3302
3303         /*
3304          * Now that qi is enabled on all iommus, set the root entry and flush
3305          * caches. This is required on some Intel X58 chipsets, otherwise the
3306          * flush_context function will loop forever and the boot hangs.
3307          */
3308         for_each_active_iommu(iommu, drhd) {
3309                 iommu_flush_write_buffer(iommu);
3310                 iommu_set_root_entry(iommu);
3311                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3312                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3313         }
3314
3315         if (iommu_default_passthrough())
3316                 iommu_identity_mapping |= IDENTMAP_ALL;
3317
3318 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3319         dmar_map_gfx = 0;
3320 #endif
3321
3322         if (!dmar_map_gfx)
3323                 iommu_identity_mapping |= IDENTMAP_GFX;
3324
3325         check_tylersburg_isoch();
3326
3327         ret = si_domain_init(hw_pass_through);
3328         if (ret)
3329                 goto free_iommu;
3330
3331         /*
3332          * for each drhd
3333          *   enable fault log
3334          *   global invalidate context cache
3335          *   global invalidate iotlb
3336          *   enable translation
3337          */
3338         for_each_iommu(iommu, drhd) {
3339                 if (drhd->ignored) {
3340                         /*
3341                          * we always have to disable PMRs or DMA may fail on
3342                          * this device
3343                          */
3344                         if (force_on)
3345                                 iommu_disable_protect_mem_regions(iommu);
3346                         continue;
3347                 }
3348
3349                 iommu_flush_write_buffer(iommu);
3350
3351 #ifdef CONFIG_INTEL_IOMMU_SVM
3352                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3353                         /*
3354                          * Call dmar_alloc_hwirq() with dmar_global_lock held,
3355                          * could cause possible lock race condition.
3356                          */
3357                         up_write(&dmar_global_lock);
3358                         ret = intel_svm_enable_prq(iommu);
3359                         down_write(&dmar_global_lock);
3360                         if (ret)
3361                                 goto free_iommu;
3362                 }
3363 #endif
3364                 ret = dmar_set_interrupt(iommu);
3365                 if (ret)
3366                         goto free_iommu;
3367         }
3368
3369         return 0;
3370
3371 free_iommu:
3372         for_each_active_iommu(iommu, drhd) {
3373                 disable_dmar_iommu(iommu);
3374                 free_dmar_iommu(iommu);
3375         }
3376
3377         kfree(g_iommus);
3378
3379 error:
3380         return ret;
3381 }
3382
3383 /* This takes a number of _MM_ pages, not VTD pages */
3384 static unsigned long intel_alloc_iova(struct device *dev,
3385                                      struct dmar_domain *domain,
3386                                      unsigned long nrpages, uint64_t dma_mask)
3387 {
3388         unsigned long iova_pfn;
3389
3390         /* Restrict dma_mask to the width that the iommu can handle */
3391         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3392         /* Ensure we reserve the whole size-aligned region */
3393         nrpages = __roundup_pow_of_two(nrpages);
3394
3395         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3396                 /*
3397                  * First try to allocate an io virtual address in
3398                  * DMA_BIT_MASK(32) and if that fails then try allocating
3399                  * from higher range
3400                  */
3401                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3402                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3403                 if (iova_pfn)
3404                         return iova_pfn;
3405         }
3406         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3407                                    IOVA_PFN(dma_mask), true);
3408         if (unlikely(!iova_pfn)) {
3409                 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3410                 return 0;
3411         }
3412
3413         return iova_pfn;
3414 }
3415
3416 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3417 {
3418         struct dmar_domain *domain, *tmp;
3419         struct dmar_rmrr_unit *rmrr;
3420         struct device *i_dev;
3421         int i, ret;
3422
3423         /* Device shouldn't be attached by any domains. */
3424         domain = find_domain(dev);
3425         if (domain)
3426                 return NULL;
3427
3428         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3429         if (!domain)
3430                 goto out;
3431
3432         /* We have a new domain - setup possible RMRRs for the device */
3433         rcu_read_lock();
3434         for_each_rmrr_units(rmrr) {
3435                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3436                                           i, i_dev) {
3437                         if (i_dev != dev)
3438                                 continue;
3439
3440                         ret = domain_prepare_identity_map(dev, domain,
3441                                                           rmrr->base_address,
3442                                                           rmrr->end_address);
3443                         if (ret)
3444                                 dev_err(dev, "Mapping reserved region failed\n");
3445                 }
3446         }
3447         rcu_read_unlock();
3448
3449         tmp = set_domain_for_dev(dev, domain);
3450         if (!tmp || domain != tmp) {
3451                 domain_exit(domain);
3452                 domain = tmp;
3453         }
3454
3455 out:
3456         if (!domain)
3457                 dev_err(dev, "Allocating domain failed\n");
3458         else
3459                 domain->domain.type = IOMMU_DOMAIN_DMA;
3460
3461         return domain;
3462 }
3463
3464 /* Check if the dev needs to go through non-identity map and unmap process.*/
3465 static bool iommu_need_mapping(struct device *dev)
3466 {
3467         int ret;
3468
3469         if (iommu_dummy(dev))
3470                 return false;
3471
3472         ret = identity_mapping(dev);
3473         if (ret) {
3474                 u64 dma_mask = *dev->dma_mask;
3475
3476                 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3477                         dma_mask = dev->coherent_dma_mask;
3478
3479                 if (dma_mask >= dma_direct_get_required_mask(dev))
3480                         return false;
3481
3482                 /*
3483                  * 32 bit DMA is removed from si_domain and fall back to
3484                  * non-identity mapping.
3485                  */
3486                 dmar_remove_one_dev_info(dev);
3487                 ret = iommu_request_dma_domain_for_dev(dev);
3488                 if (ret) {
3489                         struct iommu_domain *domain;
3490                         struct dmar_domain *dmar_domain;
3491
3492                         domain = iommu_get_domain_for_dev(dev);
3493                         if (domain) {
3494                                 dmar_domain = to_dmar_domain(domain);
3495                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3496                         }
3497                         dmar_remove_one_dev_info(dev);
3498                         get_private_domain_for_dev(dev);
3499                 }
3500
3501                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3502         }
3503
3504         return true;
3505 }
3506
3507 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3508                                      size_t size, int dir, u64 dma_mask)
3509 {
3510         struct dmar_domain *domain;
3511         phys_addr_t start_paddr;
3512         unsigned long iova_pfn;
3513         int prot = 0;
3514         int ret;
3515         struct intel_iommu *iommu;
3516         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3517
3518         BUG_ON(dir == DMA_NONE);
3519
3520         domain = deferred_attach_domain(dev);
3521         if (!domain)
3522                 return DMA_MAPPING_ERROR;
3523
3524         iommu = domain_get_iommu(domain);
3525         size = aligned_nrpages(paddr, size);
3526
3527         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3528         if (!iova_pfn)
3529                 goto error;
3530
3531         /*
3532          * Check if DMAR supports zero-length reads on write only
3533          * mappings..
3534          */
3535         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3536                         !cap_zlr(iommu->cap))
3537                 prot |= DMA_PTE_READ;
3538         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3539                 prot |= DMA_PTE_WRITE;
3540         /*
3541          * paddr - (paddr + size) might be partial page, we should map the whole
3542          * page.  Note: if two part of one page are separately mapped, we
3543          * might have two guest_addr mapping to the same host paddr, but this
3544          * is not a big problem
3545          */
3546         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3547                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3548         if (ret)
3549                 goto error;
3550
3551         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3552         start_paddr += paddr & ~PAGE_MASK;
3553
3554         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3555
3556         return start_paddr;
3557
3558 error:
3559         if (iova_pfn)
3560                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3561         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3562                 size, (unsigned long long)paddr, dir);
3563         return DMA_MAPPING_ERROR;
3564 }
3565
3566 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3567                                  unsigned long offset, size_t size,
3568                                  enum dma_data_direction dir,
3569                                  unsigned long attrs)
3570 {
3571         if (iommu_need_mapping(dev))
3572                 return __intel_map_single(dev, page_to_phys(page) + offset,
3573                                 size, dir, *dev->dma_mask);
3574         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3575 }
3576
3577 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3578                                      size_t size, enum dma_data_direction dir,
3579                                      unsigned long attrs)
3580 {
3581         if (iommu_need_mapping(dev))
3582                 return __intel_map_single(dev, phys_addr, size, dir,
3583                                 *dev->dma_mask);
3584         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3585 }
3586
3587 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3588 {
3589         struct dmar_domain *domain;
3590         unsigned long start_pfn, last_pfn;
3591         unsigned long nrpages;
3592         unsigned long iova_pfn;
3593         struct intel_iommu *iommu;
3594         struct page *freelist;
3595         struct pci_dev *pdev = NULL;
3596
3597         domain = find_domain(dev);
3598         BUG_ON(!domain);
3599
3600         iommu = domain_get_iommu(domain);
3601
3602         iova_pfn = IOVA_PFN(dev_addr);
3603
3604         nrpages = aligned_nrpages(dev_addr, size);
3605         start_pfn = mm_to_dma_pfn(iova_pfn);
3606         last_pfn = start_pfn + nrpages - 1;
3607
3608         if (dev_is_pci(dev))
3609                 pdev = to_pci_dev(dev);
3610
3611         freelist = domain_unmap(domain, start_pfn, last_pfn);
3612         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3613                         !has_iova_flush_queue(&domain->iovad)) {
3614                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3615                                       nrpages, !freelist, 0);
3616                 /* free iova */
3617                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3618                 dma_free_pagelist(freelist);
3619         } else {
3620                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3621                            (unsigned long)freelist);
3622                 /*
3623                  * queue up the release of the unmap to save the 1/6th of the
3624                  * cpu used up by the iotlb flush operation...
3625                  */
3626         }
3627
3628         trace_unmap_single(dev, dev_addr, size);
3629 }
3630
3631 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3632                              size_t size, enum dma_data_direction dir,
3633                              unsigned long attrs)
3634 {
3635         if (iommu_need_mapping(dev))
3636                 intel_unmap(dev, dev_addr, size);
3637         else
3638                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3639 }
3640
3641 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3642                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3643 {
3644         if (iommu_need_mapping(dev))
3645                 intel_unmap(dev, dev_addr, size);
3646 }
3647
3648 static void *intel_alloc_coherent(struct device *dev, size_t size,
3649                                   dma_addr_t *dma_handle, gfp_t flags,
3650                                   unsigned long attrs)
3651 {
3652         struct page *page = NULL;
3653         int order;
3654
3655         if (!iommu_need_mapping(dev))
3656                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3657
3658         size = PAGE_ALIGN(size);
3659         order = get_order(size);
3660
3661         if (gfpflags_allow_blocking(flags)) {
3662                 unsigned int count = size >> PAGE_SHIFT;
3663
3664                 page = dma_alloc_from_contiguous(dev, count, order,
3665                                                  flags & __GFP_NOWARN);
3666         }
3667
3668         if (!page)
3669                 page = alloc_pages(flags, order);
3670         if (!page)
3671                 return NULL;
3672         memset(page_address(page), 0, size);
3673
3674         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3675                                          DMA_BIDIRECTIONAL,
3676                                          dev->coherent_dma_mask);
3677         if (*dma_handle != DMA_MAPPING_ERROR)
3678                 return page_address(page);
3679         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3680                 __free_pages(page, order);
3681
3682         return NULL;
3683 }
3684
3685 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3686                                 dma_addr_t dma_handle, unsigned long attrs)
3687 {
3688         int order;
3689         struct page *page = virt_to_page(vaddr);
3690
3691         if (!iommu_need_mapping(dev))
3692                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3693
3694         size = PAGE_ALIGN(size);
3695         order = get_order(size);
3696
3697         intel_unmap(dev, dma_handle, size);
3698         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3699                 __free_pages(page, order);
3700 }
3701
3702 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3703                            int nelems, enum dma_data_direction dir,
3704                            unsigned long attrs)
3705 {
3706         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3707         unsigned long nrpages = 0;
3708         struct scatterlist *sg;
3709         int i;
3710
3711         if (!iommu_need_mapping(dev))
3712                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3713
3714         for_each_sg(sglist, sg, nelems, i) {
3715                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3716         }
3717
3718         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3719
3720         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3721 }
3722
3723 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3724                         enum dma_data_direction dir, unsigned long attrs)
3725 {
3726         int i;
3727         struct dmar_domain *domain;
3728         size_t size = 0;
3729         int prot = 0;
3730         unsigned long iova_pfn;
3731         int ret;
3732         struct scatterlist *sg;
3733         unsigned long start_vpfn;
3734         struct intel_iommu *iommu;
3735
3736         BUG_ON(dir == DMA_NONE);
3737         if (!iommu_need_mapping(dev))
3738                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3739
3740         domain = deferred_attach_domain(dev);
3741         if (!domain)
3742                 return 0;
3743
3744         iommu = domain_get_iommu(domain);
3745
3746         for_each_sg(sglist, sg, nelems, i)
3747                 size += aligned_nrpages(sg->offset, sg->length);
3748
3749         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3750                                 *dev->dma_mask);
3751         if (!iova_pfn) {
3752                 sglist->dma_length = 0;
3753                 return 0;
3754         }
3755
3756         /*
3757          * Check if DMAR supports zero-length reads on write only
3758          * mappings..
3759          */
3760         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3761                         !cap_zlr(iommu->cap))
3762                 prot |= DMA_PTE_READ;
3763         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3764                 prot |= DMA_PTE_WRITE;
3765
3766         start_vpfn = mm_to_dma_pfn(iova_pfn);
3767
3768         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3769         if (unlikely(ret)) {
3770                 dma_pte_free_pagetable(domain, start_vpfn,
3771                                        start_vpfn + size - 1,
3772                                        agaw_to_level(domain->agaw) + 1);
3773                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3774                 return 0;
3775         }
3776
3777         trace_map_sg(dev, iova_pfn << PAGE_SHIFT,
3778                      sg_phys(sglist), size << VTD_PAGE_SHIFT);
3779
3780         return nelems;
3781 }
3782
3783 static u64 intel_get_required_mask(struct device *dev)
3784 {
3785         if (!iommu_need_mapping(dev))
3786                 return dma_direct_get_required_mask(dev);
3787         return DMA_BIT_MASK(32);
3788 }
3789
3790 static const struct dma_map_ops intel_dma_ops = {
3791         .alloc = intel_alloc_coherent,
3792         .free = intel_free_coherent,
3793         .map_sg = intel_map_sg,
3794         .unmap_sg = intel_unmap_sg,
3795         .map_page = intel_map_page,
3796         .unmap_page = intel_unmap_page,
3797         .map_resource = intel_map_resource,
3798         .unmap_resource = intel_unmap_resource,
3799         .dma_supported = dma_direct_supported,
3800         .mmap = dma_common_mmap,
3801         .get_sgtable = dma_common_get_sgtable,
3802         .get_required_mask = intel_get_required_mask,
3803 };
3804
3805 static void
3806 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3807                    enum dma_data_direction dir, enum dma_sync_target target)
3808 {
3809         struct dmar_domain *domain;
3810         phys_addr_t tlb_addr;
3811
3812         domain = find_domain(dev);
3813         if (WARN_ON(!domain))
3814                 return;
3815
3816         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3817         if (is_swiotlb_buffer(tlb_addr))
3818                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3819 }
3820
3821 static dma_addr_t
3822 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3823                   enum dma_data_direction dir, unsigned long attrs,
3824                   u64 dma_mask)
3825 {
3826         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3827         struct dmar_domain *domain;
3828         struct intel_iommu *iommu;
3829         unsigned long iova_pfn;
3830         unsigned long nrpages;
3831         phys_addr_t tlb_addr;
3832         int prot = 0;
3833         int ret;
3834
3835         domain = deferred_attach_domain(dev);
3836         if (WARN_ON(dir == DMA_NONE || !domain))
3837                 return DMA_MAPPING_ERROR;
3838
3839         iommu = domain_get_iommu(domain);
3840         if (WARN_ON(!iommu))
3841                 return DMA_MAPPING_ERROR;
3842
3843         nrpages = aligned_nrpages(0, size);
3844         iova_pfn = intel_alloc_iova(dev, domain,
3845                                     dma_to_mm_pfn(nrpages), dma_mask);
3846         if (!iova_pfn)
3847                 return DMA_MAPPING_ERROR;
3848
3849         /*
3850          * Check if DMAR supports zero-length reads on write only
3851          * mappings..
3852          */
3853         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3854                         !cap_zlr(iommu->cap))
3855                 prot |= DMA_PTE_READ;
3856         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3857                 prot |= DMA_PTE_WRITE;
3858
3859         /*
3860          * If both the physical buffer start address and size are
3861          * page aligned, we don't need to use a bounce page.
3862          */
3863         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3864                 tlb_addr = swiotlb_tbl_map_single(dev,
3865                                 __phys_to_dma(dev, io_tlb_start),
3866                                 paddr, size, aligned_size, dir, attrs);
3867                 if (tlb_addr == DMA_MAPPING_ERROR) {
3868                         goto swiotlb_error;
3869                 } else {
3870                         /* Cleanup the padding area. */
3871                         void *padding_start = phys_to_virt(tlb_addr);
3872                         size_t padding_size = aligned_size;
3873
3874                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3875                             (dir == DMA_TO_DEVICE ||
3876                              dir == DMA_BIDIRECTIONAL)) {
3877                                 padding_start += size;
3878                                 padding_size -= size;
3879                         }
3880
3881                         memset(padding_start, 0, padding_size);
3882                 }
3883         } else {
3884                 tlb_addr = paddr;
3885         }
3886
3887         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3888                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3889         if (ret)
3890                 goto mapping_error;
3891
3892         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3893
3894         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3895
3896 mapping_error:
3897         if (is_swiotlb_buffer(tlb_addr))
3898                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3899                                          aligned_size, dir, attrs);
3900 swiotlb_error:
3901         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3902         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3903                 size, (unsigned long long)paddr, dir);
3904
3905         return DMA_MAPPING_ERROR;
3906 }
3907
3908 static void
3909 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3910                     enum dma_data_direction dir, unsigned long attrs)
3911 {
3912         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3913         struct dmar_domain *domain;
3914         phys_addr_t tlb_addr;
3915
3916         domain = find_domain(dev);
3917         if (WARN_ON(!domain))
3918                 return;
3919
3920         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3921         if (WARN_ON(!tlb_addr))
3922                 return;
3923
3924         intel_unmap(dev, dev_addr, size);
3925         if (is_swiotlb_buffer(tlb_addr))
3926                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3927                                          aligned_size, dir, attrs);
3928
3929         trace_bounce_unmap_single(dev, dev_addr, size);
3930 }
3931
3932 static dma_addr_t
3933 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3934                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3935 {
3936         return bounce_map_single(dev, page_to_phys(page) + offset,
3937                                  size, dir, attrs, *dev->dma_mask);
3938 }
3939
3940 static dma_addr_t
3941 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3942                     enum dma_data_direction dir, unsigned long attrs)
3943 {
3944         return bounce_map_single(dev, phys_addr, size,
3945                                  dir, attrs, *dev->dma_mask);
3946 }
3947
3948 static void
3949 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3950                   enum dma_data_direction dir, unsigned long attrs)
3951 {
3952         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3953 }
3954
3955 static void
3956 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3957                       enum dma_data_direction dir, unsigned long attrs)
3958 {
3959         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3960 }
3961
3962 static void
3963 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3964                 enum dma_data_direction dir, unsigned long attrs)
3965 {
3966         struct scatterlist *sg;
3967         int i;
3968
3969         for_each_sg(sglist, sg, nelems, i)
3970                 bounce_unmap_page(dev, sg->dma_address,
3971                                   sg_dma_len(sg), dir, attrs);
3972 }
3973
3974 static int
3975 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3976               enum dma_data_direction dir, unsigned long attrs)
3977 {
3978         int i;
3979         struct scatterlist *sg;
3980
3981         for_each_sg(sglist, sg, nelems, i) {
3982                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3983                                                   sg->offset, sg->length,
3984                                                   dir, attrs);
3985                 if (sg->dma_address == DMA_MAPPING_ERROR)
3986                         goto out_unmap;
3987                 sg_dma_len(sg) = sg->length;
3988         }
3989
3990         return nelems;
3991
3992 out_unmap:
3993         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3994         return 0;
3995 }
3996
3997 static void
3998 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3999                            size_t size, enum dma_data_direction dir)
4000 {
4001         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4002 }
4003
4004 static void
4005 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4006                               size_t size, enum dma_data_direction dir)
4007 {
4008         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4009 }
4010
4011 static void
4012 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4013                        int nelems, enum dma_data_direction dir)
4014 {
4015         struct scatterlist *sg;
4016         int i;
4017
4018         for_each_sg(sglist, sg, nelems, i)
4019                 bounce_sync_single(dev, sg_dma_address(sg),
4020                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
4021 }
4022
4023 static void
4024 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4025                           int nelems, enum dma_data_direction dir)
4026 {
4027         struct scatterlist *sg;
4028         int i;
4029
4030         for_each_sg(sglist, sg, nelems, i)
4031                 bounce_sync_single(dev, sg_dma_address(sg),
4032                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4033 }
4034
4035 static const struct dma_map_ops bounce_dma_ops = {
4036         .alloc                  = intel_alloc_coherent,
4037         .free                   = intel_free_coherent,
4038         .map_sg                 = bounce_map_sg,
4039         .unmap_sg               = bounce_unmap_sg,
4040         .map_page               = bounce_map_page,
4041         .unmap_page             = bounce_unmap_page,
4042         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
4043         .sync_single_for_device = bounce_sync_single_for_device,
4044         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4045         .sync_sg_for_device     = bounce_sync_sg_for_device,
4046         .map_resource           = bounce_map_resource,
4047         .unmap_resource         = bounce_unmap_resource,
4048         .dma_supported          = dma_direct_supported,
4049 };
4050
4051 static inline int iommu_domain_cache_init(void)
4052 {
4053         int ret = 0;
4054
4055         iommu_domain_cache = kmem_cache_create("iommu_domain",
4056                                          sizeof(struct dmar_domain),
4057                                          0,
4058                                          SLAB_HWCACHE_ALIGN,
4059
4060                                          NULL);
4061         if (!iommu_domain_cache) {
4062                 pr_err("Couldn't create iommu_domain cache\n");
4063                 ret = -ENOMEM;
4064         }
4065
4066         return ret;
4067 }
4068
4069 static inline int iommu_devinfo_cache_init(void)
4070 {
4071         int ret = 0;
4072
4073         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4074                                          sizeof(struct device_domain_info),
4075                                          0,
4076                                          SLAB_HWCACHE_ALIGN,
4077                                          NULL);
4078         if (!iommu_devinfo_cache) {
4079                 pr_err("Couldn't create devinfo cache\n");
4080                 ret = -ENOMEM;
4081         }
4082
4083         return ret;
4084 }
4085
4086 static int __init iommu_init_mempool(void)
4087 {
4088         int ret;
4089         ret = iova_cache_get();
4090         if (ret)
4091                 return ret;
4092
4093         ret = iommu_domain_cache_init();
4094         if (ret)
4095                 goto domain_error;
4096
4097         ret = iommu_devinfo_cache_init();
4098         if (!ret)
4099                 return ret;
4100
4101         kmem_cache_destroy(iommu_domain_cache);
4102 domain_error:
4103         iova_cache_put();
4104
4105         return -ENOMEM;
4106 }
4107
4108 static void __init iommu_exit_mempool(void)
4109 {
4110         kmem_cache_destroy(iommu_devinfo_cache);
4111         kmem_cache_destroy(iommu_domain_cache);
4112         iova_cache_put();
4113 }
4114
4115 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4116 {
4117         struct dmar_drhd_unit *drhd;
4118         u32 vtbar;
4119         int rc;
4120
4121         /* We know that this device on this chipset has its own IOMMU.
4122          * If we find it under a different IOMMU, then the BIOS is lying
4123          * to us. Hope that the IOMMU for this device is actually
4124          * disabled, and it needs no translation...
4125          */
4126         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4127         if (rc) {
4128                 /* "can't" happen */
4129                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4130                 return;
4131         }
4132         vtbar &= 0xffff0000;
4133
4134         /* we know that the this iommu should be at offset 0xa000 from vtbar */
4135         drhd = dmar_find_matched_drhd_unit(pdev);
4136         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4137                             TAINT_FIRMWARE_WORKAROUND,
4138                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4139                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4140 }
4141 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4142
4143 static void __init init_no_remapping_devices(void)
4144 {
4145         struct dmar_drhd_unit *drhd;
4146         struct device *dev;
4147         int i;
4148
4149         for_each_drhd_unit(drhd) {
4150                 if (!drhd->include_all) {
4151                         for_each_active_dev_scope(drhd->devices,
4152                                                   drhd->devices_cnt, i, dev)
4153                                 break;
4154                         /* ignore DMAR unit if no devices exist */
4155                         if (i == drhd->devices_cnt)
4156                                 drhd->ignored = 1;
4157                 }
4158         }
4159
4160         for_each_active_drhd_unit(drhd) {
4161                 if (drhd->include_all)
4162                         continue;
4163
4164                 for_each_active_dev_scope(drhd->devices,
4165                                           drhd->devices_cnt, i, dev)
4166                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4167                                 break;
4168                 if (i < drhd->devices_cnt)
4169                         continue;
4170
4171                 /* This IOMMU has *only* gfx devices. Either bypass it or
4172                    set the gfx_mapped flag, as appropriate */
4173                 if (!dmar_map_gfx) {
4174                         drhd->ignored = 1;
4175                         for_each_active_dev_scope(drhd->devices,
4176                                                   drhd->devices_cnt, i, dev)
4177                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4178                 }
4179         }
4180 }
4181
4182 #ifdef CONFIG_SUSPEND
4183 static int init_iommu_hw(void)
4184 {
4185         struct dmar_drhd_unit *drhd;
4186         struct intel_iommu *iommu = NULL;
4187
4188         for_each_active_iommu(iommu, drhd)
4189                 if (iommu->qi)
4190                         dmar_reenable_qi(iommu);
4191
4192         for_each_iommu(iommu, drhd) {
4193                 if (drhd->ignored) {
4194                         /*
4195                          * we always have to disable PMRs or DMA may fail on
4196                          * this device
4197                          */
4198                         if (force_on)
4199                                 iommu_disable_protect_mem_regions(iommu);
4200                         continue;
4201                 }
4202
4203                 iommu_flush_write_buffer(iommu);
4204
4205                 iommu_set_root_entry(iommu);
4206
4207                 iommu->flush.flush_context(iommu, 0, 0, 0,
4208                                            DMA_CCMD_GLOBAL_INVL);
4209                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4210                 iommu_enable_translation(iommu);
4211                 iommu_disable_protect_mem_regions(iommu);
4212         }
4213
4214         return 0;
4215 }
4216
4217 static void iommu_flush_all(void)
4218 {
4219         struct dmar_drhd_unit *drhd;
4220         struct intel_iommu *iommu;
4221
4222         for_each_active_iommu(iommu, drhd) {
4223                 iommu->flush.flush_context(iommu, 0, 0, 0,
4224                                            DMA_CCMD_GLOBAL_INVL);
4225                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4226                                          DMA_TLB_GLOBAL_FLUSH);
4227         }
4228 }
4229
4230 static int iommu_suspend(void)
4231 {
4232         struct dmar_drhd_unit *drhd;
4233         struct intel_iommu *iommu = NULL;
4234         unsigned long flag;
4235
4236         for_each_active_iommu(iommu, drhd) {
4237                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4238                                                  GFP_ATOMIC);
4239                 if (!iommu->iommu_state)
4240                         goto nomem;
4241         }
4242
4243         iommu_flush_all();
4244
4245         for_each_active_iommu(iommu, drhd) {
4246                 iommu_disable_translation(iommu);
4247
4248                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4249
4250                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4251                         readl(iommu->reg + DMAR_FECTL_REG);
4252                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4253                         readl(iommu->reg + DMAR_FEDATA_REG);
4254                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4255                         readl(iommu->reg + DMAR_FEADDR_REG);
4256                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4257                         readl(iommu->reg + DMAR_FEUADDR_REG);
4258
4259                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4260         }
4261         return 0;
4262
4263 nomem:
4264         for_each_active_iommu(iommu, drhd)
4265                 kfree(iommu->iommu_state);
4266
4267         return -ENOMEM;
4268 }
4269
4270 static void iommu_resume(void)
4271 {
4272         struct dmar_drhd_unit *drhd;
4273         struct intel_iommu *iommu = NULL;
4274         unsigned long flag;
4275
4276         if (init_iommu_hw()) {
4277                 if (force_on)
4278                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4279                 else
4280                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4281                 return;
4282         }
4283
4284         for_each_active_iommu(iommu, drhd) {
4285
4286                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4287
4288                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4289                         iommu->reg + DMAR_FECTL_REG);
4290                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4291                         iommu->reg + DMAR_FEDATA_REG);
4292                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4293                         iommu->reg + DMAR_FEADDR_REG);
4294                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4295                         iommu->reg + DMAR_FEUADDR_REG);
4296
4297                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4298         }
4299
4300         for_each_active_iommu(iommu, drhd)
4301                 kfree(iommu->iommu_state);
4302 }
4303
4304 static struct syscore_ops iommu_syscore_ops = {
4305         .resume         = iommu_resume,
4306         .suspend        = iommu_suspend,
4307 };
4308
4309 static void __init init_iommu_pm_ops(void)
4310 {
4311         register_syscore_ops(&iommu_syscore_ops);
4312 }
4313
4314 #else
4315 static inline void init_iommu_pm_ops(void) {}
4316 #endif  /* CONFIG_PM */
4317
4318 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4319 {
4320         struct acpi_dmar_reserved_memory *rmrr;
4321         struct dmar_rmrr_unit *rmrru;
4322         int ret;
4323
4324         rmrr = (struct acpi_dmar_reserved_memory *)header;
4325         ret = arch_rmrr_sanity_check(rmrr);
4326         if (ret)
4327                 return ret;
4328
4329         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4330         if (!rmrru)
4331                 goto out;
4332
4333         rmrru->hdr = header;
4334
4335         rmrru->base_address = rmrr->base_address;
4336         rmrru->end_address = rmrr->end_address;
4337
4338         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4339                                 ((void *)rmrr) + rmrr->header.length,
4340                                 &rmrru->devices_cnt);
4341         if (rmrru->devices_cnt && rmrru->devices == NULL)
4342                 goto free_rmrru;
4343
4344         list_add(&rmrru->list, &dmar_rmrr_units);
4345
4346         return 0;
4347 free_rmrru:
4348         kfree(rmrru);
4349 out:
4350         return -ENOMEM;
4351 }
4352
4353 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4354 {
4355         struct dmar_atsr_unit *atsru;
4356         struct acpi_dmar_atsr *tmp;
4357
4358         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4359                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4360                 if (atsr->segment != tmp->segment)
4361                         continue;
4362                 if (atsr->header.length != tmp->header.length)
4363                         continue;
4364                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4365                         return atsru;
4366         }
4367
4368         return NULL;
4369 }
4370
4371 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4372 {
4373         struct acpi_dmar_atsr *atsr;
4374         struct dmar_atsr_unit *atsru;
4375
4376         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4377                 return 0;
4378
4379         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4380         atsru = dmar_find_atsr(atsr);
4381         if (atsru)
4382                 return 0;
4383
4384         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4385         if (!atsru)
4386                 return -ENOMEM;
4387
4388         /*
4389          * If memory is allocated from slab by ACPI _DSM method, we need to
4390          * copy the memory content because the memory buffer will be freed
4391          * on return.
4392          */
4393         atsru->hdr = (void *)(atsru + 1);
4394         memcpy(atsru->hdr, hdr, hdr->length);
4395         atsru->include_all = atsr->flags & 0x1;
4396         if (!atsru->include_all) {
4397                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4398                                 (void *)atsr + atsr->header.length,
4399                                 &atsru->devices_cnt);
4400                 if (atsru->devices_cnt && atsru->devices == NULL) {
4401                         kfree(atsru);
4402                         return -ENOMEM;
4403                 }
4404         }
4405
4406         list_add_rcu(&atsru->list, &dmar_atsr_units);
4407
4408         return 0;
4409 }
4410
4411 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4412 {
4413         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4414         kfree(atsru);
4415 }
4416
4417 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4418 {
4419         struct acpi_dmar_atsr *atsr;
4420         struct dmar_atsr_unit *atsru;
4421
4422         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4423         atsru = dmar_find_atsr(atsr);
4424         if (atsru) {
4425                 list_del_rcu(&atsru->list);
4426                 synchronize_rcu();
4427                 intel_iommu_free_atsr(atsru);
4428         }
4429
4430         return 0;
4431 }
4432
4433 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4434 {
4435         int i;
4436         struct device *dev;
4437         struct acpi_dmar_atsr *atsr;
4438         struct dmar_atsr_unit *atsru;
4439
4440         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4441         atsru = dmar_find_atsr(atsr);
4442         if (!atsru)
4443                 return 0;
4444
4445         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4446                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4447                                           i, dev)
4448                         return -EBUSY;
4449         }
4450
4451         return 0;
4452 }
4453
4454 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4455 {
4456         int sp, ret;
4457         struct intel_iommu *iommu = dmaru->iommu;
4458
4459         if (g_iommus[iommu->seq_id])
4460                 return 0;
4461
4462         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4463                 pr_warn("%s: Doesn't support hardware pass through.\n",
4464                         iommu->name);
4465                 return -ENXIO;
4466         }
4467         if (!ecap_sc_support(iommu->ecap) &&
4468             domain_update_iommu_snooping(iommu)) {
4469                 pr_warn("%s: Doesn't support snooping.\n",
4470                         iommu->name);
4471                 return -ENXIO;
4472         }
4473         sp = domain_update_iommu_superpage(iommu) - 1;
4474         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4475                 pr_warn("%s: Doesn't support large page.\n",
4476                         iommu->name);
4477                 return -ENXIO;
4478         }
4479
4480         /*
4481          * Disable translation if already enabled prior to OS handover.
4482          */
4483         if (iommu->gcmd & DMA_GCMD_TE)
4484                 iommu_disable_translation(iommu);
4485
4486         g_iommus[iommu->seq_id] = iommu;
4487         ret = iommu_init_domains(iommu);
4488         if (ret == 0)
4489                 ret = iommu_alloc_root_entry(iommu);
4490         if (ret)
4491                 goto out;
4492
4493 #ifdef CONFIG_INTEL_IOMMU_SVM
4494         if (pasid_supported(iommu))
4495                 intel_svm_init(iommu);
4496 #endif
4497
4498         if (dmaru->ignored) {
4499                 /*
4500                  * we always have to disable PMRs or DMA may fail on this device
4501                  */
4502                 if (force_on)
4503                         iommu_disable_protect_mem_regions(iommu);
4504                 return 0;
4505         }
4506
4507         intel_iommu_init_qi(iommu);
4508         iommu_flush_write_buffer(iommu);
4509
4510 #ifdef CONFIG_INTEL_IOMMU_SVM
4511         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4512                 ret = intel_svm_enable_prq(iommu);
4513                 if (ret)
4514                         goto disable_iommu;
4515         }
4516 #endif
4517         ret = dmar_set_interrupt(iommu);
4518         if (ret)
4519                 goto disable_iommu;
4520
4521         iommu_set_root_entry(iommu);
4522         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4523         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4524         iommu_enable_translation(iommu);
4525
4526         iommu_disable_protect_mem_regions(iommu);
4527         return 0;
4528
4529 disable_iommu:
4530         disable_dmar_iommu(iommu);
4531 out:
4532         free_dmar_iommu(iommu);
4533         return ret;
4534 }
4535
4536 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4537 {
4538         int ret = 0;
4539         struct intel_iommu *iommu = dmaru->iommu;
4540
4541         if (!intel_iommu_enabled)
4542                 return 0;
4543         if (iommu == NULL)
4544                 return -EINVAL;
4545
4546         if (insert) {
4547                 ret = intel_iommu_add(dmaru);
4548         } else {
4549                 disable_dmar_iommu(iommu);
4550                 free_dmar_iommu(iommu);
4551         }
4552
4553         return ret;
4554 }
4555
4556 static void intel_iommu_free_dmars(void)
4557 {
4558         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4559         struct dmar_atsr_unit *atsru, *atsr_n;
4560
4561         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4562                 list_del(&rmrru->list);
4563                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4564                 kfree(rmrru);
4565         }
4566
4567         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4568                 list_del(&atsru->list);
4569                 intel_iommu_free_atsr(atsru);
4570         }
4571 }
4572
4573 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4574 {
4575         int i, ret = 1;
4576         struct pci_bus *bus;
4577         struct pci_dev *bridge = NULL;
4578         struct device *tmp;
4579         struct acpi_dmar_atsr *atsr;
4580         struct dmar_atsr_unit *atsru;
4581
4582         dev = pci_physfn(dev);
4583         for (bus = dev->bus; bus; bus = bus->parent) {
4584                 bridge = bus->self;
4585                 /* If it's an integrated device, allow ATS */
4586                 if (!bridge)
4587                         return 1;
4588                 /* Connected via non-PCIe: no ATS */
4589                 if (!pci_is_pcie(bridge) ||
4590                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4591                         return 0;
4592                 /* If we found the root port, look it up in the ATSR */
4593                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4594                         break;
4595         }
4596
4597         rcu_read_lock();
4598         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4599                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4600                 if (atsr->segment != pci_domain_nr(dev->bus))
4601                         continue;
4602
4603                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4604                         if (tmp == &bridge->dev)
4605                                 goto out;
4606
4607                 if (atsru->include_all)
4608                         goto out;
4609         }
4610         ret = 0;
4611 out:
4612         rcu_read_unlock();
4613
4614         return ret;
4615 }
4616
4617 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4618 {
4619         int ret;
4620         struct dmar_rmrr_unit *rmrru;
4621         struct dmar_atsr_unit *atsru;
4622         struct acpi_dmar_atsr *atsr;
4623         struct acpi_dmar_reserved_memory *rmrr;
4624
4625         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4626                 return 0;
4627
4628         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4629                 rmrr = container_of(rmrru->hdr,
4630                                     struct acpi_dmar_reserved_memory, header);
4631                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4632                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4633                                 ((void *)rmrr) + rmrr->header.length,
4634                                 rmrr->segment, rmrru->devices,
4635                                 rmrru->devices_cnt);
4636                         if (ret < 0)
4637                                 return ret;
4638                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4639                         dmar_remove_dev_scope(info, rmrr->segment,
4640                                 rmrru->devices, rmrru->devices_cnt);
4641                 }
4642         }
4643
4644         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4645                 if (atsru->include_all)
4646                         continue;
4647
4648                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4649                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4650                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4651                                         (void *)atsr + atsr->header.length,
4652                                         atsr->segment, atsru->devices,
4653                                         atsru->devices_cnt);
4654                         if (ret > 0)
4655                                 break;
4656                         else if (ret < 0)
4657                                 return ret;
4658                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4659                         if (dmar_remove_dev_scope(info, atsr->segment,
4660                                         atsru->devices, atsru->devices_cnt))
4661                                 break;
4662                 }
4663         }
4664
4665         return 0;
4666 }
4667
4668 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4669                                        unsigned long val, void *v)
4670 {
4671         struct memory_notify *mhp = v;
4672         unsigned long long start, end;
4673         unsigned long start_vpfn, last_vpfn;
4674
4675         switch (val) {
4676         case MEM_GOING_ONLINE:
4677                 start = mhp->start_pfn << PAGE_SHIFT;
4678                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4679                 if (iommu_domain_identity_map(si_domain, start, end)) {
4680                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4681                                 start, end);
4682                         return NOTIFY_BAD;
4683                 }
4684                 break;
4685
4686         case MEM_OFFLINE:
4687         case MEM_CANCEL_ONLINE:
4688                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4689                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4690                 while (start_vpfn <= last_vpfn) {
4691                         struct iova *iova;
4692                         struct dmar_drhd_unit *drhd;
4693                         struct intel_iommu *iommu;
4694                         struct page *freelist;
4695
4696                         iova = find_iova(&si_domain->iovad, start_vpfn);
4697                         if (iova == NULL) {
4698                                 pr_debug("Failed get IOVA for PFN %lx\n",
4699                                          start_vpfn);
4700                                 break;
4701                         }
4702
4703                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4704                                                      start_vpfn, last_vpfn);
4705                         if (iova == NULL) {
4706                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4707                                         start_vpfn, last_vpfn);
4708                                 return NOTIFY_BAD;
4709                         }
4710
4711                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4712                                                iova->pfn_hi);
4713
4714                         rcu_read_lock();
4715                         for_each_active_iommu(iommu, drhd)
4716                                 iommu_flush_iotlb_psi(iommu, si_domain,
4717                                         iova->pfn_lo, iova_size(iova),
4718                                         !freelist, 0);
4719                         rcu_read_unlock();
4720                         dma_free_pagelist(freelist);
4721
4722                         start_vpfn = iova->pfn_hi + 1;
4723                         free_iova_mem(iova);
4724                 }
4725                 break;
4726         }
4727
4728         return NOTIFY_OK;
4729 }
4730
4731 static struct notifier_block intel_iommu_memory_nb = {
4732         .notifier_call = intel_iommu_memory_notifier,
4733         .priority = 0
4734 };
4735
4736 static void free_all_cpu_cached_iovas(unsigned int cpu)
4737 {
4738         int i;
4739
4740         for (i = 0; i < g_num_of_iommus; i++) {
4741                 struct intel_iommu *iommu = g_iommus[i];
4742                 struct dmar_domain *domain;
4743                 int did;
4744
4745                 if (!iommu)
4746                         continue;
4747
4748                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4749                         domain = get_iommu_domain(iommu, (u16)did);
4750
4751                         if (!domain)
4752                                 continue;
4753                         free_cpu_cached_iovas(cpu, &domain->iovad);
4754                 }
4755         }
4756 }
4757
4758 static int intel_iommu_cpu_dead(unsigned int cpu)
4759 {
4760         free_all_cpu_cached_iovas(cpu);
4761         return 0;
4762 }
4763
4764 static void intel_disable_iommus(void)
4765 {
4766         struct intel_iommu *iommu = NULL;
4767         struct dmar_drhd_unit *drhd;
4768
4769         for_each_iommu(iommu, drhd)
4770                 iommu_disable_translation(iommu);
4771 }
4772
4773 void intel_iommu_shutdown(void)
4774 {
4775         struct dmar_drhd_unit *drhd;
4776         struct intel_iommu *iommu = NULL;
4777
4778         if (no_iommu || dmar_disabled)
4779                 return;
4780
4781         down_write(&dmar_global_lock);
4782
4783         /* Disable PMRs explicitly here. */
4784         for_each_iommu(iommu, drhd)
4785                 iommu_disable_protect_mem_regions(iommu);
4786
4787         /* Make sure the IOMMUs are switched off */
4788         intel_disable_iommus();
4789
4790         up_write(&dmar_global_lock);
4791 }
4792
4793 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4794 {
4795         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4796
4797         return container_of(iommu_dev, struct intel_iommu, iommu);
4798 }
4799
4800 static ssize_t intel_iommu_show_version(struct device *dev,
4801                                         struct device_attribute *attr,
4802                                         char *buf)
4803 {
4804         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4805         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4806         return sprintf(buf, "%d:%d\n",
4807                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4808 }
4809 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4810
4811 static ssize_t intel_iommu_show_address(struct device *dev,
4812                                         struct device_attribute *attr,
4813                                         char *buf)
4814 {
4815         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4816         return sprintf(buf, "%llx\n", iommu->reg_phys);
4817 }
4818 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4819
4820 static ssize_t intel_iommu_show_cap(struct device *dev,
4821                                     struct device_attribute *attr,
4822                                     char *buf)
4823 {
4824         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4825         return sprintf(buf, "%llx\n", iommu->cap);
4826 }
4827 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4828
4829 static ssize_t intel_iommu_show_ecap(struct device *dev,
4830                                     struct device_attribute *attr,
4831                                     char *buf)
4832 {
4833         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4834         return sprintf(buf, "%llx\n", iommu->ecap);
4835 }
4836 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4837
4838 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4839                                       struct device_attribute *attr,
4840                                       char *buf)
4841 {
4842         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4843         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4844 }
4845 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4846
4847 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4848                                            struct device_attribute *attr,
4849                                            char *buf)
4850 {
4851         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4852         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4853                                                   cap_ndoms(iommu->cap)));
4854 }
4855 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4856
4857 static struct attribute *intel_iommu_attrs[] = {
4858         &dev_attr_version.attr,
4859         &dev_attr_address.attr,
4860         &dev_attr_cap.attr,
4861         &dev_attr_ecap.attr,
4862         &dev_attr_domains_supported.attr,
4863         &dev_attr_domains_used.attr,
4864         NULL,
4865 };
4866
4867 static struct attribute_group intel_iommu_group = {
4868         .name = "intel-iommu",
4869         .attrs = intel_iommu_attrs,
4870 };
4871
4872 const struct attribute_group *intel_iommu_groups[] = {
4873         &intel_iommu_group,
4874         NULL,
4875 };
4876
4877 static inline bool has_untrusted_dev(void)
4878 {
4879         struct pci_dev *pdev = NULL;
4880
4881         for_each_pci_dev(pdev)
4882                 if (pdev->untrusted)
4883                         return true;
4884
4885         return false;
4886 }
4887
4888 static int __init platform_optin_force_iommu(void)
4889 {
4890         if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4891                 return 0;
4892
4893         if (no_iommu || dmar_disabled)
4894                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4895
4896         /*
4897          * If Intel-IOMMU is disabled by default, we will apply identity
4898          * map for all devices except those marked as being untrusted.
4899          */
4900         if (dmar_disabled)
4901                 iommu_identity_mapping |= IDENTMAP_ALL;
4902
4903         dmar_disabled = 0;
4904         no_iommu = 0;
4905
4906         return 1;
4907 }
4908
4909 static int __init probe_acpi_namespace_devices(void)
4910 {
4911         struct dmar_drhd_unit *drhd;
4912         /* To avoid a -Wunused-but-set-variable warning. */
4913         struct intel_iommu *iommu __maybe_unused;
4914         struct device *dev;
4915         int i, ret = 0;
4916
4917         for_each_active_iommu(iommu, drhd) {
4918                 for_each_active_dev_scope(drhd->devices,
4919                                           drhd->devices_cnt, i, dev) {
4920                         struct acpi_device_physical_node *pn;
4921                         struct iommu_group *group;
4922                         struct acpi_device *adev;
4923
4924                         if (dev->bus != &acpi_bus_type)
4925                                 continue;
4926
4927                         adev = to_acpi_device(dev);
4928                         mutex_lock(&adev->physical_node_lock);
4929                         list_for_each_entry(pn,
4930                                             &adev->physical_node_list, node) {
4931                                 group = iommu_group_get(pn->dev);
4932                                 if (group) {
4933                                         iommu_group_put(group);
4934                                         continue;
4935                                 }
4936
4937                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4938                                 ret = iommu_probe_device(pn->dev);
4939                                 if (ret)
4940                                         break;
4941                         }
4942                         mutex_unlock(&adev->physical_node_lock);
4943
4944                         if (ret)
4945                                 return ret;
4946                 }
4947         }
4948
4949         return 0;
4950 }
4951
4952 int __init intel_iommu_init(void)
4953 {
4954         int ret = -ENODEV;
4955         struct dmar_drhd_unit *drhd;
4956         struct intel_iommu *iommu;
4957
4958         /*
4959          * Intel IOMMU is required for a TXT/tboot launch or platform
4960          * opt in, so enforce that.
4961          */
4962         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4963
4964         if (iommu_init_mempool()) {
4965                 if (force_on)
4966                         panic("tboot: Failed to initialize iommu memory\n");
4967                 return -ENOMEM;
4968         }
4969
4970         down_write(&dmar_global_lock);
4971         if (dmar_table_init()) {
4972                 if (force_on)
4973                         panic("tboot: Failed to initialize DMAR table\n");
4974                 goto out_free_dmar;
4975         }
4976
4977         if (dmar_dev_scope_init() < 0) {
4978                 if (force_on)
4979                         panic("tboot: Failed to initialize DMAR device scope\n");
4980                 goto out_free_dmar;
4981         }
4982
4983         up_write(&dmar_global_lock);
4984
4985         /*
4986          * The bus notifier takes the dmar_global_lock, so lockdep will
4987          * complain later when we register it under the lock.
4988          */
4989         dmar_register_bus_notifier();
4990
4991         down_write(&dmar_global_lock);
4992
4993         if (no_iommu || dmar_disabled) {
4994                 /*
4995                  * We exit the function here to ensure IOMMU's remapping and
4996                  * mempool aren't setup, which means that the IOMMU's PMRs
4997                  * won't be disabled via the call to init_dmars(). So disable
4998                  * it explicitly here. The PMRs were setup by tboot prior to
4999                  * calling SENTER, but the kernel is expected to reset/tear
5000                  * down the PMRs.
5001                  */
5002                 if (intel_iommu_tboot_noforce) {
5003                         for_each_iommu(iommu, drhd)
5004                                 iommu_disable_protect_mem_regions(iommu);
5005                 }
5006
5007                 /*
5008                  * Make sure the IOMMUs are switched off, even when we
5009                  * boot into a kexec kernel and the previous kernel left
5010                  * them enabled
5011                  */
5012                 intel_disable_iommus();
5013                 goto out_free_dmar;
5014         }
5015
5016         if (list_empty(&dmar_rmrr_units))
5017                 pr_info("No RMRR found\n");
5018
5019         if (list_empty(&dmar_atsr_units))
5020                 pr_info("No ATSR found\n");
5021
5022         if (dmar_init_reserved_ranges()) {
5023                 if (force_on)
5024                         panic("tboot: Failed to reserve iommu ranges\n");
5025                 goto out_free_reserved_range;
5026         }
5027
5028         if (dmar_map_gfx)
5029                 intel_iommu_gfx_mapped = 1;
5030
5031         init_no_remapping_devices();
5032
5033         ret = init_dmars();
5034         if (ret) {
5035                 if (force_on)
5036                         panic("tboot: Failed to initialize DMARs\n");
5037                 pr_err("Initialization failed\n");
5038                 goto out_free_reserved_range;
5039         }
5040         up_write(&dmar_global_lock);
5041
5042 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5043         /*
5044          * If the system has no untrusted device or the user has decided
5045          * to disable the bounce page mechanisms, we don't need swiotlb.
5046          * Mark this and the pre-allocated bounce pages will be released
5047          * later.
5048          */
5049         if (!has_untrusted_dev() || intel_no_bounce)
5050                 swiotlb = 0;
5051 #endif
5052         dma_ops = &intel_dma_ops;
5053
5054         init_iommu_pm_ops();
5055
5056         for_each_active_iommu(iommu, drhd) {
5057                 iommu_device_sysfs_add(&iommu->iommu, NULL,
5058                                        intel_iommu_groups,
5059                                        "%s", iommu->name);
5060                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5061                 iommu_device_register(&iommu->iommu);
5062         }
5063
5064         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5065         if (si_domain && !hw_pass_through)
5066                 register_memory_notifier(&intel_iommu_memory_nb);
5067         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5068                           intel_iommu_cpu_dead);
5069
5070         down_read(&dmar_global_lock);
5071         if (probe_acpi_namespace_devices())
5072                 pr_warn("ACPI name space devices didn't probe correctly\n");
5073         up_read(&dmar_global_lock);
5074
5075         /* Finally, we enable the DMA remapping hardware. */
5076         for_each_iommu(iommu, drhd) {
5077                 if (!drhd->ignored && !translation_pre_enabled(iommu))
5078                         iommu_enable_translation(iommu);
5079
5080                 iommu_disable_protect_mem_regions(iommu);
5081         }
5082         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5083
5084         intel_iommu_enabled = 1;
5085         intel_iommu_debugfs_init();
5086
5087         return 0;
5088
5089 out_free_reserved_range:
5090         put_iova_domain(&reserved_iova_list);
5091 out_free_dmar:
5092         intel_iommu_free_dmars();
5093         up_write(&dmar_global_lock);
5094         iommu_exit_mempool();
5095         return ret;
5096 }
5097
5098 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5099 {
5100         struct intel_iommu *iommu = opaque;
5101
5102         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5103         return 0;
5104 }
5105
5106 /*
5107  * NB - intel-iommu lacks any sort of reference counting for the users of
5108  * dependent devices.  If multiple endpoints have intersecting dependent
5109  * devices, unbinding the driver from any one of them will possibly leave
5110  * the others unable to operate.
5111  */
5112 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5113 {
5114         if (!iommu || !dev || !dev_is_pci(dev))
5115                 return;
5116
5117         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5118 }
5119
5120 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5121 {
5122         struct dmar_domain *domain;
5123         struct intel_iommu *iommu;
5124         unsigned long flags;
5125
5126         assert_spin_locked(&device_domain_lock);
5127
5128         if (WARN_ON(!info))
5129                 return;
5130
5131         iommu = info->iommu;
5132         domain = info->domain;
5133
5134         if (info->dev) {
5135                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5136                         intel_pasid_tear_down_entry(iommu, info->dev,
5137                                         PASID_RID2PASID);
5138
5139                 iommu_disable_dev_iotlb(info);
5140                 domain_context_clear(iommu, info->dev);
5141                 intel_pasid_free_table(info->dev);
5142         }
5143
5144         unlink_domain_info(info);
5145
5146         spin_lock_irqsave(&iommu->lock, flags);
5147         domain_detach_iommu(domain, iommu);
5148         spin_unlock_irqrestore(&iommu->lock, flags);
5149
5150         /* free the private domain */
5151         if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5152             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5153             list_empty(&domain->devices))
5154                 domain_exit(info->domain);
5155
5156         free_devinfo_mem(info);
5157 }
5158
5159 static void dmar_remove_one_dev_info(struct device *dev)
5160 {
5161         struct device_domain_info *info;
5162         unsigned long flags;
5163
5164         spin_lock_irqsave(&device_domain_lock, flags);
5165         info = dev->archdata.iommu;
5166         if (info && info != DEFER_DEVICE_DOMAIN_INFO
5167             && info != DUMMY_DEVICE_DOMAIN_INFO)
5168                 __dmar_remove_one_dev_info(info);
5169         spin_unlock_irqrestore(&device_domain_lock, flags);
5170 }
5171
5172 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5173 {
5174         int adjust_width;
5175
5176         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5177         domain_reserve_special_ranges(domain);
5178
5179         /* calculate AGAW */
5180         domain->gaw = guest_width;
5181         adjust_width = guestwidth_to_adjustwidth(guest_width);
5182         domain->agaw = width_to_agaw(adjust_width);
5183
5184         domain->iommu_coherency = 0;
5185         domain->iommu_snooping = 0;
5186         domain->iommu_superpage = 0;
5187         domain->max_addr = 0;
5188
5189         /* always allocate the top pgd */
5190         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5191         if (!domain->pgd)
5192                 return -ENOMEM;
5193         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5194         return 0;
5195 }
5196
5197 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5198 {
5199         struct dmar_domain *dmar_domain;
5200         struct iommu_domain *domain;
5201
5202         switch (type) {
5203         case IOMMU_DOMAIN_DMA:
5204         /* fallthrough */
5205         case IOMMU_DOMAIN_UNMANAGED:
5206                 dmar_domain = alloc_domain(0);
5207                 if (!dmar_domain) {
5208                         pr_err("Can't allocate dmar_domain\n");
5209                         return NULL;
5210                 }
5211                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5212                         pr_err("Domain initialization failed\n");
5213                         domain_exit(dmar_domain);
5214                         return NULL;
5215                 }
5216
5217                 if (type == IOMMU_DOMAIN_DMA &&
5218                     init_iova_flush_queue(&dmar_domain->iovad,
5219                                           iommu_flush_iova, iova_entry_free)) {
5220                         pr_warn("iova flush queue initialization failed\n");
5221                         intel_iommu_strict = 1;
5222                 }
5223
5224                 domain_update_iommu_cap(dmar_domain);
5225
5226                 domain = &dmar_domain->domain;
5227                 domain->geometry.aperture_start = 0;
5228                 domain->geometry.aperture_end   =
5229                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5230                 domain->geometry.force_aperture = true;
5231
5232                 return domain;
5233         case IOMMU_DOMAIN_IDENTITY:
5234                 return &si_domain->domain;
5235         default:
5236                 return NULL;
5237         }
5238
5239         return NULL;
5240 }
5241
5242 static void intel_iommu_domain_free(struct iommu_domain *domain)
5243 {
5244         if (domain != &si_domain->domain)
5245                 domain_exit(to_dmar_domain(domain));
5246 }
5247
5248 /*
5249  * Check whether a @domain could be attached to the @dev through the
5250  * aux-domain attach/detach APIs.
5251  */
5252 static inline bool
5253 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5254 {
5255         struct device_domain_info *info = dev->archdata.iommu;
5256
5257         return info && info->auxd_enabled &&
5258                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5259 }
5260
5261 static void auxiliary_link_device(struct dmar_domain *domain,
5262                                   struct device *dev)
5263 {
5264         struct device_domain_info *info = dev->archdata.iommu;
5265
5266         assert_spin_locked(&device_domain_lock);
5267         if (WARN_ON(!info))
5268                 return;
5269
5270         domain->auxd_refcnt++;
5271         list_add(&domain->auxd, &info->auxiliary_domains);
5272 }
5273
5274 static void auxiliary_unlink_device(struct dmar_domain *domain,
5275                                     struct device *dev)
5276 {
5277         struct device_domain_info *info = dev->archdata.iommu;
5278
5279         assert_spin_locked(&device_domain_lock);
5280         if (WARN_ON(!info))
5281                 return;
5282
5283         list_del(&domain->auxd);
5284         domain->auxd_refcnt--;
5285
5286         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5287                 intel_pasid_free_id(domain->default_pasid);
5288 }
5289
5290 static int aux_domain_add_dev(struct dmar_domain *domain,
5291                               struct device *dev)
5292 {
5293         int ret;
5294         u8 bus, devfn;
5295         unsigned long flags;
5296         struct intel_iommu *iommu;
5297
5298         iommu = device_to_iommu(dev, &bus, &devfn);
5299         if (!iommu)
5300                 return -ENODEV;
5301
5302         if (domain->default_pasid <= 0) {
5303                 int pasid;
5304
5305                 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5306                                              pci_max_pasids(to_pci_dev(dev)),
5307                                              GFP_KERNEL);
5308                 if (pasid <= 0) {
5309                         pr_err("Can't allocate default pasid\n");
5310                         return -ENODEV;
5311                 }
5312                 domain->default_pasid = pasid;
5313         }
5314
5315         spin_lock_irqsave(&device_domain_lock, flags);
5316         /*
5317          * iommu->lock must be held to attach domain to iommu and setup the
5318          * pasid entry for second level translation.
5319          */
5320         spin_lock(&iommu->lock);
5321         ret = domain_attach_iommu(domain, iommu);
5322         if (ret)
5323                 goto attach_failed;
5324
5325         /* Setup the PASID entry for mediated devices: */
5326         ret = intel_pasid_setup_second_level(iommu, domain, dev,
5327                                              domain->default_pasid);
5328         if (ret)
5329                 goto table_failed;
5330         spin_unlock(&iommu->lock);
5331
5332         auxiliary_link_device(domain, dev);
5333
5334         spin_unlock_irqrestore(&device_domain_lock, flags);
5335
5336         return 0;
5337
5338 table_failed:
5339         domain_detach_iommu(domain, iommu);
5340 attach_failed:
5341         spin_unlock(&iommu->lock);
5342         spin_unlock_irqrestore(&device_domain_lock, flags);
5343         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5344                 intel_pasid_free_id(domain->default_pasid);
5345
5346         return ret;
5347 }
5348
5349 static void aux_domain_remove_dev(struct dmar_domain *domain,
5350                                   struct device *dev)
5351 {
5352         struct device_domain_info *info;
5353         struct intel_iommu *iommu;
5354         unsigned long flags;
5355
5356         if (!is_aux_domain(dev, &domain->domain))
5357                 return;
5358
5359         spin_lock_irqsave(&device_domain_lock, flags);
5360         info = dev->archdata.iommu;
5361         iommu = info->iommu;
5362
5363         auxiliary_unlink_device(domain, dev);
5364
5365         spin_lock(&iommu->lock);
5366         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5367         domain_detach_iommu(domain, iommu);
5368         spin_unlock(&iommu->lock);
5369
5370         spin_unlock_irqrestore(&device_domain_lock, flags);
5371 }
5372
5373 static int prepare_domain_attach_device(struct iommu_domain *domain,
5374                                         struct device *dev)
5375 {
5376         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5377         struct intel_iommu *iommu;
5378         int addr_width;
5379         u8 bus, devfn;
5380
5381         iommu = device_to_iommu(dev, &bus, &devfn);
5382         if (!iommu)
5383                 return -ENODEV;
5384
5385         /* check if this iommu agaw is sufficient for max mapped address */
5386         addr_width = agaw_to_width(iommu->agaw);
5387         if (addr_width > cap_mgaw(iommu->cap))
5388                 addr_width = cap_mgaw(iommu->cap);
5389
5390         if (dmar_domain->max_addr > (1LL << addr_width)) {
5391                 dev_err(dev, "%s: iommu width (%d) is not "
5392                         "sufficient for the mapped address (%llx)\n",
5393                         __func__, addr_width, dmar_domain->max_addr);
5394                 return -EFAULT;
5395         }
5396         dmar_domain->gaw = addr_width;
5397
5398         /*
5399          * Knock out extra levels of page tables if necessary
5400          */
5401         while (iommu->agaw < dmar_domain->agaw) {
5402                 struct dma_pte *pte;
5403
5404                 pte = dmar_domain->pgd;
5405                 if (dma_pte_present(pte)) {
5406                         dmar_domain->pgd = (struct dma_pte *)
5407                                 phys_to_virt(dma_pte_addr(pte));
5408                         free_pgtable_page(pte);
5409                 }
5410                 dmar_domain->agaw--;
5411         }
5412
5413         return 0;
5414 }
5415
5416 static int intel_iommu_attach_device(struct iommu_domain *domain,
5417                                      struct device *dev)
5418 {
5419         int ret;
5420
5421         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5422             device_is_rmrr_locked(dev)) {
5423                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5424                 return -EPERM;
5425         }
5426
5427         if (is_aux_domain(dev, domain))
5428                 return -EPERM;
5429
5430         /* normally dev is not mapped */
5431         if (unlikely(domain_context_mapped(dev))) {
5432                 struct dmar_domain *old_domain;
5433
5434                 old_domain = find_domain(dev);
5435                 if (old_domain)
5436                         dmar_remove_one_dev_info(dev);
5437         }
5438
5439         ret = prepare_domain_attach_device(domain, dev);
5440         if (ret)
5441                 return ret;
5442
5443         return domain_add_dev_info(to_dmar_domain(domain), dev);
5444 }
5445
5446 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5447                                          struct device *dev)
5448 {
5449         int ret;
5450
5451         if (!is_aux_domain(dev, domain))
5452                 return -EPERM;
5453
5454         ret = prepare_domain_attach_device(domain, dev);
5455         if (ret)
5456                 return ret;
5457
5458         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5459 }
5460
5461 static void intel_iommu_detach_device(struct iommu_domain *domain,
5462                                       struct device *dev)
5463 {
5464         dmar_remove_one_dev_info(dev);
5465 }
5466
5467 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5468                                           struct device *dev)
5469 {
5470         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5471 }
5472
5473 static int intel_iommu_map(struct iommu_domain *domain,
5474                            unsigned long iova, phys_addr_t hpa,
5475                            size_t size, int iommu_prot, gfp_t gfp)
5476 {
5477         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5478         u64 max_addr;
5479         int prot = 0;
5480         int ret;
5481
5482         if (iommu_prot & IOMMU_READ)
5483                 prot |= DMA_PTE_READ;
5484         if (iommu_prot & IOMMU_WRITE)
5485                 prot |= DMA_PTE_WRITE;
5486         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5487                 prot |= DMA_PTE_SNP;
5488
5489         max_addr = iova + size;
5490         if (dmar_domain->max_addr < max_addr) {
5491                 u64 end;
5492
5493                 /* check if minimum agaw is sufficient for mapped address */
5494                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5495                 if (end < max_addr) {
5496                         pr_err("%s: iommu width (%d) is not "
5497                                "sufficient for the mapped address (%llx)\n",
5498                                __func__, dmar_domain->gaw, max_addr);
5499                         return -EFAULT;
5500                 }
5501                 dmar_domain->max_addr = max_addr;
5502         }
5503         /* Round up size to next multiple of PAGE_SIZE, if it and
5504            the low bits of hpa would take us onto the next page */
5505         size = aligned_nrpages(hpa, size);
5506         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5507                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5508         return ret;
5509 }
5510
5511 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5512                                 unsigned long iova, size_t size,
5513                                 struct iommu_iotlb_gather *gather)
5514 {
5515         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5516         struct page *freelist = NULL;
5517         unsigned long start_pfn, last_pfn;
5518         unsigned int npages;
5519         int iommu_id, level = 0;
5520
5521         /* Cope with horrid API which requires us to unmap more than the
5522            size argument if it happens to be a large-page mapping. */
5523         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5524
5525         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5526                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5527
5528         start_pfn = iova >> VTD_PAGE_SHIFT;
5529         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5530
5531         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5532
5533         npages = last_pfn - start_pfn + 1;
5534
5535         for_each_domain_iommu(iommu_id, dmar_domain)
5536                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5537                                       start_pfn, npages, !freelist, 0);
5538
5539         dma_free_pagelist(freelist);
5540
5541         if (dmar_domain->max_addr == iova + size)
5542                 dmar_domain->max_addr = iova;
5543
5544         return size;
5545 }
5546
5547 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5548                                             dma_addr_t iova)
5549 {
5550         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5551         struct dma_pte *pte;
5552         int level = 0;
5553         u64 phys = 0;
5554
5555         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5556         if (pte)
5557                 phys = dma_pte_addr(pte);
5558
5559         return phys;
5560 }
5561
5562 static inline bool scalable_mode_support(void)
5563 {
5564         struct dmar_drhd_unit *drhd;
5565         struct intel_iommu *iommu;
5566         bool ret = true;
5567
5568         rcu_read_lock();
5569         for_each_active_iommu(iommu, drhd) {
5570                 if (!sm_supported(iommu)) {
5571                         ret = false;
5572                         break;
5573                 }
5574         }
5575         rcu_read_unlock();
5576
5577         return ret;
5578 }
5579
5580 static inline bool iommu_pasid_support(void)
5581 {
5582         struct dmar_drhd_unit *drhd;
5583         struct intel_iommu *iommu;
5584         bool ret = true;
5585
5586         rcu_read_lock();
5587         for_each_active_iommu(iommu, drhd) {
5588                 if (!pasid_supported(iommu)) {
5589                         ret = false;
5590                         break;
5591                 }
5592         }
5593         rcu_read_unlock();
5594
5595         return ret;
5596 }
5597
5598 static bool intel_iommu_capable(enum iommu_cap cap)
5599 {
5600         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5601                 return domain_update_iommu_snooping(NULL) == 1;
5602         if (cap == IOMMU_CAP_INTR_REMAP)
5603                 return irq_remapping_enabled == 1;
5604
5605         return false;
5606 }
5607
5608 static int intel_iommu_add_device(struct device *dev)
5609 {
5610         struct dmar_domain *dmar_domain;
5611         struct iommu_domain *domain;
5612         struct intel_iommu *iommu;
5613         struct iommu_group *group;
5614         u8 bus, devfn;
5615         int ret;
5616
5617         iommu = device_to_iommu(dev, &bus, &devfn);
5618         if (!iommu)
5619                 return -ENODEV;
5620
5621         iommu_device_link(&iommu->iommu, dev);
5622
5623         if (translation_pre_enabled(iommu))
5624                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5625
5626         group = iommu_group_get_for_dev(dev);
5627
5628         if (IS_ERR(group)) {
5629                 ret = PTR_ERR(group);
5630                 goto unlink;
5631         }
5632
5633         iommu_group_put(group);
5634
5635         domain = iommu_get_domain_for_dev(dev);
5636         dmar_domain = to_dmar_domain(domain);
5637         if (domain->type == IOMMU_DOMAIN_DMA) {
5638                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5639                         ret = iommu_request_dm_for_dev(dev);
5640                         if (ret) {
5641                                 dmar_remove_one_dev_info(dev);
5642                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5643                                 domain_add_dev_info(si_domain, dev);
5644                                 dev_info(dev,
5645                                          "Device uses a private identity domain.\n");
5646                         }
5647                 }
5648         } else {
5649                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5650                         ret = iommu_request_dma_domain_for_dev(dev);
5651                         if (ret) {
5652                                 dmar_remove_one_dev_info(dev);
5653                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5654                                 if (!get_private_domain_for_dev(dev)) {
5655                                         dev_warn(dev,
5656                                                  "Failed to get a private domain.\n");
5657                                         ret = -ENOMEM;
5658                                         goto unlink;
5659                                 }
5660
5661                                 dev_info(dev,
5662                                          "Device uses a private dma domain.\n");
5663                         }
5664                 }
5665         }
5666
5667         if (device_needs_bounce(dev)) {
5668                 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5669                 set_dma_ops(dev, &bounce_dma_ops);
5670         }
5671
5672         return 0;
5673
5674 unlink:
5675         iommu_device_unlink(&iommu->iommu, dev);
5676         return ret;
5677 }
5678
5679 static void intel_iommu_remove_device(struct device *dev)
5680 {
5681         struct intel_iommu *iommu;
5682         u8 bus, devfn;
5683
5684         iommu = device_to_iommu(dev, &bus, &devfn);
5685         if (!iommu)
5686                 return;
5687
5688         dmar_remove_one_dev_info(dev);
5689
5690         iommu_group_remove_device(dev);
5691
5692         iommu_device_unlink(&iommu->iommu, dev);
5693
5694         if (device_needs_bounce(dev))
5695                 set_dma_ops(dev, NULL);
5696 }
5697
5698 static void intel_iommu_get_resv_regions(struct device *device,
5699                                          struct list_head *head)
5700 {
5701         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5702         struct iommu_resv_region *reg;
5703         struct dmar_rmrr_unit *rmrr;
5704         struct device *i_dev;
5705         int i;
5706
5707         down_read(&dmar_global_lock);
5708         for_each_rmrr_units(rmrr) {
5709                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5710                                           i, i_dev) {
5711                         struct iommu_resv_region *resv;
5712                         enum iommu_resv_type type;
5713                         size_t length;
5714
5715                         if (i_dev != device &&
5716                             !is_downstream_to_pci_bridge(device, i_dev))
5717                                 continue;
5718
5719                         length = rmrr->end_address - rmrr->base_address + 1;
5720
5721                         type = device_rmrr_is_relaxable(device) ?
5722                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5723
5724                         resv = iommu_alloc_resv_region(rmrr->base_address,
5725                                                        length, prot, type);
5726                         if (!resv)
5727                                 break;
5728
5729                         list_add_tail(&resv->list, head);
5730                 }
5731         }
5732         up_read(&dmar_global_lock);
5733
5734 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5735         if (dev_is_pci(device)) {
5736                 struct pci_dev *pdev = to_pci_dev(device);
5737
5738                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5739                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5740                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5741                         if (reg)
5742                                 list_add_tail(&reg->list, head);
5743                 }
5744         }
5745 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5746
5747         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5748                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5749                                       0, IOMMU_RESV_MSI);
5750         if (!reg)
5751                 return;
5752         list_add_tail(&reg->list, head);
5753 }
5754
5755 static void intel_iommu_put_resv_regions(struct device *dev,
5756                                          struct list_head *head)
5757 {
5758         struct iommu_resv_region *entry, *next;
5759
5760         list_for_each_entry_safe(entry, next, head, list)
5761                 kfree(entry);
5762 }
5763
5764 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5765 {
5766         struct device_domain_info *info;
5767         struct context_entry *context;
5768         struct dmar_domain *domain;
5769         unsigned long flags;
5770         u64 ctx_lo;
5771         int ret;
5772
5773         domain = find_domain(dev);
5774         if (!domain)
5775                 return -EINVAL;
5776
5777         spin_lock_irqsave(&device_domain_lock, flags);
5778         spin_lock(&iommu->lock);
5779
5780         ret = -EINVAL;
5781         info = dev->archdata.iommu;
5782         if (!info || !info->pasid_supported)
5783                 goto out;
5784
5785         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5786         if (WARN_ON(!context))
5787                 goto out;
5788
5789         ctx_lo = context[0].lo;
5790
5791         if (!(ctx_lo & CONTEXT_PASIDE)) {
5792                 ctx_lo |= CONTEXT_PASIDE;
5793                 context[0].lo = ctx_lo;
5794                 wmb();
5795                 iommu->flush.flush_context(iommu,
5796                                            domain->iommu_did[iommu->seq_id],
5797                                            PCI_DEVID(info->bus, info->devfn),
5798                                            DMA_CCMD_MASK_NOBIT,
5799                                            DMA_CCMD_DEVICE_INVL);
5800         }
5801
5802         /* Enable PASID support in the device, if it wasn't already */
5803         if (!info->pasid_enabled)
5804                 iommu_enable_dev_iotlb(info);
5805
5806         ret = 0;
5807
5808  out:
5809         spin_unlock(&iommu->lock);
5810         spin_unlock_irqrestore(&device_domain_lock, flags);
5811
5812         return ret;
5813 }
5814
5815 static void intel_iommu_apply_resv_region(struct device *dev,
5816                                           struct iommu_domain *domain,
5817                                           struct iommu_resv_region *region)
5818 {
5819         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5820         unsigned long start, end;
5821
5822         start = IOVA_PFN(region->start);
5823         end   = IOVA_PFN(region->start + region->length - 1);
5824
5825         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5826 }
5827
5828 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5829 {
5830         if (dev_is_pci(dev))
5831                 return pci_device_group(dev);
5832         return generic_device_group(dev);
5833 }
5834
5835 #ifdef CONFIG_INTEL_IOMMU_SVM
5836 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5837 {
5838         struct intel_iommu *iommu;
5839         u8 bus, devfn;
5840
5841         if (iommu_dummy(dev)) {
5842                 dev_warn(dev,
5843                          "No IOMMU translation for device; cannot enable SVM\n");
5844                 return NULL;
5845         }
5846
5847         iommu = device_to_iommu(dev, &bus, &devfn);
5848         if ((!iommu)) {
5849                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5850                 return NULL;
5851         }
5852
5853         return iommu;
5854 }
5855 #endif /* CONFIG_INTEL_IOMMU_SVM */
5856
5857 static int intel_iommu_enable_auxd(struct device *dev)
5858 {
5859         struct device_domain_info *info;
5860         struct intel_iommu *iommu;
5861         unsigned long flags;
5862         u8 bus, devfn;
5863         int ret;
5864
5865         iommu = device_to_iommu(dev, &bus, &devfn);
5866         if (!iommu || dmar_disabled)
5867                 return -EINVAL;
5868
5869         if (!sm_supported(iommu) || !pasid_supported(iommu))
5870                 return -EINVAL;
5871
5872         ret = intel_iommu_enable_pasid(iommu, dev);
5873         if (ret)
5874                 return -ENODEV;
5875
5876         spin_lock_irqsave(&device_domain_lock, flags);
5877         info = dev->archdata.iommu;
5878         info->auxd_enabled = 1;
5879         spin_unlock_irqrestore(&device_domain_lock, flags);
5880
5881         return 0;
5882 }
5883
5884 static int intel_iommu_disable_auxd(struct device *dev)
5885 {
5886         struct device_domain_info *info;
5887         unsigned long flags;
5888
5889         spin_lock_irqsave(&device_domain_lock, flags);
5890         info = dev->archdata.iommu;
5891         if (!WARN_ON(!info))
5892                 info->auxd_enabled = 0;
5893         spin_unlock_irqrestore(&device_domain_lock, flags);
5894
5895         return 0;
5896 }
5897
5898 /*
5899  * A PCI express designated vendor specific extended capability is defined
5900  * in the section 3.7 of Intel scalable I/O virtualization technical spec
5901  * for system software and tools to detect endpoint devices supporting the
5902  * Intel scalable IO virtualization without host driver dependency.
5903  *
5904  * Returns the address of the matching extended capability structure within
5905  * the device's PCI configuration space or 0 if the device does not support
5906  * it.
5907  */
5908 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5909 {
5910         int pos;
5911         u16 vendor, id;
5912
5913         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5914         while (pos) {
5915                 pci_read_config_word(pdev, pos + 4, &vendor);
5916                 pci_read_config_word(pdev, pos + 8, &id);
5917                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5918                         return pos;
5919
5920                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5921         }
5922
5923         return 0;
5924 }
5925
5926 static bool
5927 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5928 {
5929         if (feat == IOMMU_DEV_FEAT_AUX) {
5930                 int ret;
5931
5932                 if (!dev_is_pci(dev) || dmar_disabled ||
5933                     !scalable_mode_support() || !iommu_pasid_support())
5934                         return false;
5935
5936                 ret = pci_pasid_features(to_pci_dev(dev));
5937                 if (ret < 0)
5938                         return false;
5939
5940                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5941         }
5942
5943         return false;
5944 }
5945
5946 static int
5947 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5948 {
5949         if (feat == IOMMU_DEV_FEAT_AUX)
5950                 return intel_iommu_enable_auxd(dev);
5951
5952         return -ENODEV;
5953 }
5954
5955 static int
5956 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5957 {
5958         if (feat == IOMMU_DEV_FEAT_AUX)
5959                 return intel_iommu_disable_auxd(dev);
5960
5961         return -ENODEV;
5962 }
5963
5964 static bool
5965 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5966 {
5967         struct device_domain_info *info = dev->archdata.iommu;
5968
5969         if (feat == IOMMU_DEV_FEAT_AUX)
5970                 return scalable_mode_support() && info && info->auxd_enabled;
5971
5972         return false;
5973 }
5974
5975 static int
5976 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5977 {
5978         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5979
5980         return dmar_domain->default_pasid > 0 ?
5981                         dmar_domain->default_pasid : -EINVAL;
5982 }
5983
5984 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5985                                            struct device *dev)
5986 {
5987         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5988 }
5989
5990 const struct iommu_ops intel_iommu_ops = {
5991         .capable                = intel_iommu_capable,
5992         .domain_alloc           = intel_iommu_domain_alloc,
5993         .domain_free            = intel_iommu_domain_free,
5994         .attach_dev             = intel_iommu_attach_device,
5995         .detach_dev             = intel_iommu_detach_device,
5996         .aux_attach_dev         = intel_iommu_aux_attach_device,
5997         .aux_detach_dev         = intel_iommu_aux_detach_device,
5998         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5999         .map                    = intel_iommu_map,
6000         .unmap                  = intel_iommu_unmap,
6001         .iova_to_phys           = intel_iommu_iova_to_phys,
6002         .add_device             = intel_iommu_add_device,
6003         .remove_device          = intel_iommu_remove_device,
6004         .get_resv_regions       = intel_iommu_get_resv_regions,
6005         .put_resv_regions       = intel_iommu_put_resv_regions,
6006         .apply_resv_region      = intel_iommu_apply_resv_region,
6007         .device_group           = intel_iommu_device_group,
6008         .dev_has_feat           = intel_iommu_dev_has_feat,
6009         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6010         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6011         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6012         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6013         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6014 };
6015
6016 static void quirk_iommu_igfx(struct pci_dev *dev)
6017 {
6018         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6019         dmar_map_gfx = 0;
6020 }
6021
6022 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6023 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6024 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6025 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6026 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6027 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6028 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6029 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6030
6031 /* Broadwell igfx malfunctions with dmar */
6032 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6033 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6034 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6036 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6037 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6038 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6039 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6040 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6041 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6042 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6043 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6044 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6045 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6046 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6047 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6048 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6049 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6050 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6051 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6052 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6053 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6054 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6055 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6056
6057 static void quirk_iommu_rwbf(struct pci_dev *dev)
6058 {
6059         /*
6060          * Mobile 4 Series Chipset neglects to set RWBF capability,
6061          * but needs it. Same seems to hold for the desktop versions.
6062          */
6063         pci_info(dev, "Forcing write-buffer flush capability\n");
6064         rwbf_quirk = 1;
6065 }
6066
6067 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6068 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6069 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6070 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6071 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6072 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6073 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6074
6075 #define GGC 0x52
6076 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6077 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6078 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6079 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6080 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6081 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6082 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6083 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6084
6085 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6086 {
6087         unsigned short ggc;
6088
6089         if (pci_read_config_word(dev, GGC, &ggc))
6090                 return;
6091
6092         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6093                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6094                 dmar_map_gfx = 0;
6095         } else if (dmar_map_gfx) {
6096                 /* we have to ensure the gfx device is idle before we flush */
6097                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6098                 intel_iommu_strict = 1;
6099        }
6100 }
6101 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6102 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6103 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6104 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6105
6106 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6107    ISOCH DMAR unit for the Azalia sound device, but not give it any
6108    TLB entries, which causes it to deadlock. Check for that.  We do
6109    this in a function called from init_dmars(), instead of in a PCI
6110    quirk, because we don't want to print the obnoxious "BIOS broken"
6111    message if VT-d is actually disabled.
6112 */
6113 static void __init check_tylersburg_isoch(void)
6114 {
6115         struct pci_dev *pdev;
6116         uint32_t vtisochctrl;
6117
6118         /* If there's no Azalia in the system anyway, forget it. */
6119         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6120         if (!pdev)
6121                 return;
6122         pci_dev_put(pdev);
6123
6124         /* System Management Registers. Might be hidden, in which case
6125            we can't do the sanity check. But that's OK, because the
6126            known-broken BIOSes _don't_ actually hide it, so far. */
6127         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6128         if (!pdev)
6129                 return;
6130
6131         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6132                 pci_dev_put(pdev);
6133                 return;
6134         }
6135
6136         pci_dev_put(pdev);
6137
6138         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6139         if (vtisochctrl & 1)
6140                 return;
6141
6142         /* Drop all bits other than the number of TLB entries */
6143         vtisochctrl &= 0x1c;
6144
6145         /* If we have the recommended number of TLB entries (16), fine. */
6146         if (vtisochctrl == 0x10)
6147                 return;
6148
6149         /* Zero TLB entries? You get to ride the short bus to school. */
6150         if (!vtisochctrl) {
6151                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6152                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6153                      dmi_get_system_info(DMI_BIOS_VENDOR),
6154                      dmi_get_system_info(DMI_BIOS_VERSION),
6155                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6156                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6157                 return;
6158         }
6159
6160         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6161                vtisochctrl);
6162 }