Merge branch 'for-5.10/cp2112' into for-linus
[linux-2.6-microblaze.git] / drivers / iommu / intel / iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware support
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
105
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline unsigned long level_mask(int level)
132 {
133         return -1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long level_size(int level)
137 {
138         return 1UL << level_to_offset_bits(level);
139 }
140
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic kernel if can't successfully enable VT-d
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
289
290 /*
291  * This domain is a statically identity mapping domain.
292  *      1. This domain creats a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu mapps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 #define for_each_domain_iommu(idx, domain)                      \
300         for (idx = 0; idx < g_num_of_iommus; idx++)             \
301                 if (domain->iommu_refcnt[idx])
302
303 struct dmar_rmrr_unit {
304         struct list_head list;          /* list of rmrr units   */
305         struct acpi_dmar_header *hdr;   /* ACPI header          */
306         u64     base_address;           /* reserved base address*/
307         u64     end_address;            /* reserved end address */
308         struct dmar_dev_scope *devices; /* target devices */
309         int     devices_cnt;            /* target device count */
310 };
311
312 struct dmar_atsr_unit {
313         struct list_head list;          /* list of ATSR units */
314         struct acpi_dmar_header *hdr;   /* ACPI header */
315         struct dmar_dev_scope *devices; /* target devices */
316         int devices_cnt;                /* target device count */
317         u8 include_all:1;               /* include all ports */
318 };
319
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322
323 #define for_each_rmrr_units(rmrr) \
324         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325
326 /* bitmap for indexing intel_iommus */
327 static int g_num_of_iommus;
328
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334                                      struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336                                             dma_addr_t iova);
337
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
360
361 #define IDENTMAP_GFX            2
362 #define IDENTMAP_AZALIA         4
363
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366
367 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
368 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
369 struct device_domain_info *get_domain_info(struct device *dev)
370 {
371         struct device_domain_info *info;
372
373         if (!dev)
374                 return NULL;
375
376         info = dev_iommu_priv_get(dev);
377         if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO ||
378                      info == DEFER_DEVICE_DOMAIN_INFO))
379                 return NULL;
380
381         return info;
382 }
383
384 DEFINE_SPINLOCK(device_domain_lock);
385 static LIST_HEAD(device_domain_list);
386
387 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
388                                 to_pci_dev(d)->untrusted)
389
390 /*
391  * Iterate over elements in device_domain_list and call the specified
392  * callback @fn against each element.
393  */
394 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
395                                      void *data), void *data)
396 {
397         int ret = 0;
398         unsigned long flags;
399         struct device_domain_info *info;
400
401         spin_lock_irqsave(&device_domain_lock, flags);
402         list_for_each_entry(info, &device_domain_list, global) {
403                 ret = fn(info, data);
404                 if (ret) {
405                         spin_unlock_irqrestore(&device_domain_lock, flags);
406                         return ret;
407                 }
408         }
409         spin_unlock_irqrestore(&device_domain_lock, flags);
410
411         return 0;
412 }
413
414 const struct iommu_ops intel_iommu_ops;
415
416 static bool translation_pre_enabled(struct intel_iommu *iommu)
417 {
418         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
419 }
420
421 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
422 {
423         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
424 }
425
426 static void init_translation_status(struct intel_iommu *iommu)
427 {
428         u32 gsts;
429
430         gsts = readl(iommu->reg + DMAR_GSTS_REG);
431         if (gsts & DMA_GSTS_TES)
432                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
433 }
434
435 static int __init intel_iommu_setup(char *str)
436 {
437         if (!str)
438                 return -EINVAL;
439         while (*str) {
440                 if (!strncmp(str, "on", 2)) {
441                         dmar_disabled = 0;
442                         pr_info("IOMMU enabled\n");
443                 } else if (!strncmp(str, "off", 3)) {
444                         dmar_disabled = 1;
445                         no_platform_optin = 1;
446                         pr_info("IOMMU disabled\n");
447                 } else if (!strncmp(str, "igfx_off", 8)) {
448                         dmar_map_gfx = 0;
449                         pr_info("Disable GFX device mapping\n");
450                 } else if (!strncmp(str, "forcedac", 8)) {
451                         pr_info("Forcing DAC for PCI devices\n");
452                         dmar_forcedac = 1;
453                 } else if (!strncmp(str, "strict", 6)) {
454                         pr_info("Disable batched IOTLB flush\n");
455                         intel_iommu_strict = 1;
456                 } else if (!strncmp(str, "sp_off", 6)) {
457                         pr_info("Disable supported super page\n");
458                         intel_iommu_superpage = 0;
459                 } else if (!strncmp(str, "sm_on", 5)) {
460                         pr_info("Intel-IOMMU: scalable mode supported\n");
461                         intel_iommu_sm = 1;
462                 } else if (!strncmp(str, "tboot_noforce", 13)) {
463                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
464                         intel_iommu_tboot_noforce = 1;
465                 } else if (!strncmp(str, "nobounce", 8)) {
466                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
467                         intel_no_bounce = 1;
468                 }
469
470                 str += strcspn(str, ",");
471                 while (*str == ',')
472                         str++;
473         }
474         return 0;
475 }
476 __setup("intel_iommu=", intel_iommu_setup);
477
478 static struct kmem_cache *iommu_domain_cache;
479 static struct kmem_cache *iommu_devinfo_cache;
480
481 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
482 {
483         struct dmar_domain **domains;
484         int idx = did >> 8;
485
486         domains = iommu->domains[idx];
487         if (!domains)
488                 return NULL;
489
490         return domains[did & 0xff];
491 }
492
493 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
494                              struct dmar_domain *domain)
495 {
496         struct dmar_domain **domains;
497         int idx = did >> 8;
498
499         if (!iommu->domains[idx]) {
500                 size_t size = 256 * sizeof(struct dmar_domain *);
501                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
502         }
503
504         domains = iommu->domains[idx];
505         if (WARN_ON(!domains))
506                 return;
507         else
508                 domains[did & 0xff] = domain;
509 }
510
511 void *alloc_pgtable_page(int node)
512 {
513         struct page *page;
514         void *vaddr = NULL;
515
516         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
517         if (page)
518                 vaddr = page_address(page);
519         return vaddr;
520 }
521
522 void free_pgtable_page(void *vaddr)
523 {
524         free_page((unsigned long)vaddr);
525 }
526
527 static inline void *alloc_domain_mem(void)
528 {
529         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
530 }
531
532 static void free_domain_mem(void *vaddr)
533 {
534         kmem_cache_free(iommu_domain_cache, vaddr);
535 }
536
537 static inline void * alloc_devinfo_mem(void)
538 {
539         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
540 }
541
542 static inline void free_devinfo_mem(void *vaddr)
543 {
544         kmem_cache_free(iommu_devinfo_cache, vaddr);
545 }
546
547 static inline int domain_type_is_si(struct dmar_domain *domain)
548 {
549         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
550 }
551
552 static inline bool domain_use_first_level(struct dmar_domain *domain)
553 {
554         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
555 }
556
557 static inline int domain_pfn_supported(struct dmar_domain *domain,
558                                        unsigned long pfn)
559 {
560         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
561
562         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
563 }
564
565 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
566 {
567         unsigned long sagaw;
568         int agaw = -1;
569
570         sagaw = cap_sagaw(iommu->cap);
571         for (agaw = width_to_agaw(max_gaw);
572              agaw >= 0; agaw--) {
573                 if (test_bit(agaw, &sagaw))
574                         break;
575         }
576
577         return agaw;
578 }
579
580 /*
581  * Calculate max SAGAW for each iommu.
582  */
583 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
584 {
585         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
586 }
587
588 /*
589  * calculate agaw for each iommu.
590  * "SAGAW" may be different across iommus, use a default agaw, and
591  * get a supported less agaw for iommus that don't support the default agaw.
592  */
593 int iommu_calculate_agaw(struct intel_iommu *iommu)
594 {
595         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
596 }
597
598 /* This functionin only returns single iommu in a domain */
599 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
600 {
601         int iommu_id;
602
603         /* si_domain and vm domain should not get here. */
604         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
605                 return NULL;
606
607         for_each_domain_iommu(iommu_id, domain)
608                 break;
609
610         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
611                 return NULL;
612
613         return g_iommus[iommu_id];
614 }
615
616 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
617 {
618         return sm_supported(iommu) ?
619                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
620 }
621
622 static void domain_update_iommu_coherency(struct dmar_domain *domain)
623 {
624         struct dmar_drhd_unit *drhd;
625         struct intel_iommu *iommu;
626         bool found = false;
627         int i;
628
629         domain->iommu_coherency = 1;
630
631         for_each_domain_iommu(i, domain) {
632                 found = true;
633                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
634                         domain->iommu_coherency = 0;
635                         break;
636                 }
637         }
638         if (found)
639                 return;
640
641         /* No hardware attached; use lowest common denominator */
642         rcu_read_lock();
643         for_each_active_iommu(iommu, drhd) {
644                 if (!iommu_paging_structure_coherency(iommu)) {
645                         domain->iommu_coherency = 0;
646                         break;
647                 }
648         }
649         rcu_read_unlock();
650 }
651
652 static int domain_update_iommu_snooping(struct intel_iommu *skip)
653 {
654         struct dmar_drhd_unit *drhd;
655         struct intel_iommu *iommu;
656         int ret = 1;
657
658         rcu_read_lock();
659         for_each_active_iommu(iommu, drhd) {
660                 if (iommu != skip) {
661                         if (!ecap_sc_support(iommu->ecap)) {
662                                 ret = 0;
663                                 break;
664                         }
665                 }
666         }
667         rcu_read_unlock();
668
669         return ret;
670 }
671
672 static int domain_update_iommu_superpage(struct dmar_domain *domain,
673                                          struct intel_iommu *skip)
674 {
675         struct dmar_drhd_unit *drhd;
676         struct intel_iommu *iommu;
677         int mask = 0x3;
678
679         if (!intel_iommu_superpage) {
680                 return 0;
681         }
682
683         /* set iommu_superpage to the smallest common denominator */
684         rcu_read_lock();
685         for_each_active_iommu(iommu, drhd) {
686                 if (iommu != skip) {
687                         if (domain && domain_use_first_level(domain)) {
688                                 if (!cap_fl1gp_support(iommu->cap))
689                                         mask = 0x1;
690                         } else {
691                                 mask &= cap_super_page_val(iommu->cap);
692                         }
693
694                         if (!mask)
695                                 break;
696                 }
697         }
698         rcu_read_unlock();
699
700         return fls(mask);
701 }
702
703 /* Some capabilities may be different across iommus */
704 static void domain_update_iommu_cap(struct dmar_domain *domain)
705 {
706         domain_update_iommu_coherency(domain);
707         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
708         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
709 }
710
711 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
712                                          u8 devfn, int alloc)
713 {
714         struct root_entry *root = &iommu->root_entry[bus];
715         struct context_entry *context;
716         u64 *entry;
717
718         entry = &root->lo;
719         if (sm_supported(iommu)) {
720                 if (devfn >= 0x80) {
721                         devfn -= 0x80;
722                         entry = &root->hi;
723                 }
724                 devfn *= 2;
725         }
726         if (*entry & 1)
727                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
728         else {
729                 unsigned long phy_addr;
730                 if (!alloc)
731                         return NULL;
732
733                 context = alloc_pgtable_page(iommu->node);
734                 if (!context)
735                         return NULL;
736
737                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
738                 phy_addr = virt_to_phys((void *)context);
739                 *entry = phy_addr | 1;
740                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
741         }
742         return &context[devfn];
743 }
744
745 static int iommu_dummy(struct device *dev)
746 {
747         return dev_iommu_priv_get(dev) == DUMMY_DEVICE_DOMAIN_INFO;
748 }
749
750 static bool attach_deferred(struct device *dev)
751 {
752         return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
753 }
754
755 /**
756  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
757  *                               sub-hierarchy of a candidate PCI-PCI bridge
758  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
759  * @bridge: the candidate PCI-PCI bridge
760  *
761  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
762  */
763 static bool
764 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
765 {
766         struct pci_dev *pdev, *pbridge;
767
768         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
769                 return false;
770
771         pdev = to_pci_dev(dev);
772         pbridge = to_pci_dev(bridge);
773
774         if (pbridge->subordinate &&
775             pbridge->subordinate->number <= pdev->bus->number &&
776             pbridge->subordinate->busn_res.end >= pdev->bus->number)
777                 return true;
778
779         return false;
780 }
781
782 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
783 {
784         struct dmar_drhd_unit *drhd = NULL;
785         struct pci_dev *pdev = NULL;
786         struct intel_iommu *iommu;
787         struct device *tmp;
788         u16 segment = 0;
789         int i;
790
791         if (!dev || iommu_dummy(dev))
792                 return NULL;
793
794         if (dev_is_pci(dev)) {
795                 struct pci_dev *pf_pdev;
796
797                 pdev = pci_real_dma_dev(to_pci_dev(dev));
798
799                 /* VFs aren't listed in scope tables; we need to look up
800                  * the PF instead to find the IOMMU. */
801                 pf_pdev = pci_physfn(pdev);
802                 dev = &pf_pdev->dev;
803                 segment = pci_domain_nr(pdev->bus);
804         } else if (has_acpi_companion(dev))
805                 dev = &ACPI_COMPANION(dev)->dev;
806
807         rcu_read_lock();
808         for_each_active_iommu(iommu, drhd) {
809                 if (pdev && segment != drhd->segment)
810                         continue;
811
812                 for_each_active_dev_scope(drhd->devices,
813                                           drhd->devices_cnt, i, tmp) {
814                         if (tmp == dev) {
815                                 /* For a VF use its original BDF# not that of the PF
816                                  * which we used for the IOMMU lookup. Strictly speaking
817                                  * we could do this for all PCI devices; we only need to
818                                  * get the BDF# from the scope table for ACPI matches. */
819                                 if (pdev && pdev->is_virtfn)
820                                         goto got_pdev;
821
822                                 if (bus && devfn) {
823                                         *bus = drhd->devices[i].bus;
824                                         *devfn = drhd->devices[i].devfn;
825                                 }
826                                 goto out;
827                         }
828
829                         if (is_downstream_to_pci_bridge(dev, tmp))
830                                 goto got_pdev;
831                 }
832
833                 if (pdev && drhd->include_all) {
834                 got_pdev:
835                         if (bus && devfn) {
836                                 *bus = pdev->bus->number;
837                                 *devfn = pdev->devfn;
838                         }
839                         goto out;
840                 }
841         }
842         iommu = NULL;
843  out:
844         rcu_read_unlock();
845
846         return iommu;
847 }
848
849 static void domain_flush_cache(struct dmar_domain *domain,
850                                void *addr, int size)
851 {
852         if (!domain->iommu_coherency)
853                 clflush_cache_range(addr, size);
854 }
855
856 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
857 {
858         struct context_entry *context;
859         int ret = 0;
860         unsigned long flags;
861
862         spin_lock_irqsave(&iommu->lock, flags);
863         context = iommu_context_addr(iommu, bus, devfn, 0);
864         if (context)
865                 ret = context_present(context);
866         spin_unlock_irqrestore(&iommu->lock, flags);
867         return ret;
868 }
869
870 static void free_context_table(struct intel_iommu *iommu)
871 {
872         int i;
873         unsigned long flags;
874         struct context_entry *context;
875
876         spin_lock_irqsave(&iommu->lock, flags);
877         if (!iommu->root_entry) {
878                 goto out;
879         }
880         for (i = 0; i < ROOT_ENTRY_NR; i++) {
881                 context = iommu_context_addr(iommu, i, 0, 0);
882                 if (context)
883                         free_pgtable_page(context);
884
885                 if (!sm_supported(iommu))
886                         continue;
887
888                 context = iommu_context_addr(iommu, i, 0x80, 0);
889                 if (context)
890                         free_pgtable_page(context);
891
892         }
893         free_pgtable_page(iommu->root_entry);
894         iommu->root_entry = NULL;
895 out:
896         spin_unlock_irqrestore(&iommu->lock, flags);
897 }
898
899 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
900                                       unsigned long pfn, int *target_level)
901 {
902         struct dma_pte *parent, *pte;
903         int level = agaw_to_level(domain->agaw);
904         int offset;
905
906         BUG_ON(!domain->pgd);
907
908         if (!domain_pfn_supported(domain, pfn))
909                 /* Address beyond IOMMU's addressing capabilities. */
910                 return NULL;
911
912         parent = domain->pgd;
913
914         while (1) {
915                 void *tmp_page;
916
917                 offset = pfn_level_offset(pfn, level);
918                 pte = &parent[offset];
919                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
920                         break;
921                 if (level == *target_level)
922                         break;
923
924                 if (!dma_pte_present(pte)) {
925                         uint64_t pteval;
926
927                         tmp_page = alloc_pgtable_page(domain->nid);
928
929                         if (!tmp_page)
930                                 return NULL;
931
932                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
933                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
934                         if (domain_use_first_level(domain))
935                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
936                         if (cmpxchg64(&pte->val, 0ULL, pteval))
937                                 /* Someone else set it while we were thinking; use theirs. */
938                                 free_pgtable_page(tmp_page);
939                         else
940                                 domain_flush_cache(domain, pte, sizeof(*pte));
941                 }
942                 if (level == 1)
943                         break;
944
945                 parent = phys_to_virt(dma_pte_addr(pte));
946                 level--;
947         }
948
949         if (!*target_level)
950                 *target_level = level;
951
952         return pte;
953 }
954
955 /* return address's pte at specific level */
956 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
957                                          unsigned long pfn,
958                                          int level, int *large_page)
959 {
960         struct dma_pte *parent, *pte;
961         int total = agaw_to_level(domain->agaw);
962         int offset;
963
964         parent = domain->pgd;
965         while (level <= total) {
966                 offset = pfn_level_offset(pfn, total);
967                 pte = &parent[offset];
968                 if (level == total)
969                         return pte;
970
971                 if (!dma_pte_present(pte)) {
972                         *large_page = total;
973                         break;
974                 }
975
976                 if (dma_pte_superpage(pte)) {
977                         *large_page = total;
978                         return pte;
979                 }
980
981                 parent = phys_to_virt(dma_pte_addr(pte));
982                 total--;
983         }
984         return NULL;
985 }
986
987 /* clear last level pte, a tlb flush should be followed */
988 static void dma_pte_clear_range(struct dmar_domain *domain,
989                                 unsigned long start_pfn,
990                                 unsigned long last_pfn)
991 {
992         unsigned int large_page;
993         struct dma_pte *first_pte, *pte;
994
995         BUG_ON(!domain_pfn_supported(domain, start_pfn));
996         BUG_ON(!domain_pfn_supported(domain, last_pfn));
997         BUG_ON(start_pfn > last_pfn);
998
999         /* we don't need lock here; nobody else touches the iova range */
1000         do {
1001                 large_page = 1;
1002                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1003                 if (!pte) {
1004                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1005                         continue;
1006                 }
1007                 do {
1008                         dma_clear_pte(pte);
1009                         start_pfn += lvl_to_nr_pages(large_page);
1010                         pte++;
1011                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1012
1013                 domain_flush_cache(domain, first_pte,
1014                                    (void *)pte - (void *)first_pte);
1015
1016         } while (start_pfn && start_pfn <= last_pfn);
1017 }
1018
1019 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1020                                int retain_level, struct dma_pte *pte,
1021                                unsigned long pfn, unsigned long start_pfn,
1022                                unsigned long last_pfn)
1023 {
1024         pfn = max(start_pfn, pfn);
1025         pte = &pte[pfn_level_offset(pfn, level)];
1026
1027         do {
1028                 unsigned long level_pfn;
1029                 struct dma_pte *level_pte;
1030
1031                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1032                         goto next;
1033
1034                 level_pfn = pfn & level_mask(level);
1035                 level_pte = phys_to_virt(dma_pte_addr(pte));
1036
1037                 if (level > 2) {
1038                         dma_pte_free_level(domain, level - 1, retain_level,
1039                                            level_pte, level_pfn, start_pfn,
1040                                            last_pfn);
1041                 }
1042
1043                 /*
1044                  * Free the page table if we're below the level we want to
1045                  * retain and the range covers the entire table.
1046                  */
1047                 if (level < retain_level && !(start_pfn > level_pfn ||
1048                       last_pfn < level_pfn + level_size(level) - 1)) {
1049                         dma_clear_pte(pte);
1050                         domain_flush_cache(domain, pte, sizeof(*pte));
1051                         free_pgtable_page(level_pte);
1052                 }
1053 next:
1054                 pfn += level_size(level);
1055         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1056 }
1057
1058 /*
1059  * clear last level (leaf) ptes and free page table pages below the
1060  * level we wish to keep intact.
1061  */
1062 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1063                                    unsigned long start_pfn,
1064                                    unsigned long last_pfn,
1065                                    int retain_level)
1066 {
1067         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1068         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1069         BUG_ON(start_pfn > last_pfn);
1070
1071         dma_pte_clear_range(domain, start_pfn, last_pfn);
1072
1073         /* We don't need lock here; nobody else touches the iova range */
1074         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1075                            domain->pgd, 0, start_pfn, last_pfn);
1076
1077         /* free pgd */
1078         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1079                 free_pgtable_page(domain->pgd);
1080                 domain->pgd = NULL;
1081         }
1082 }
1083
1084 /* When a page at a given level is being unlinked from its parent, we don't
1085    need to *modify* it at all. All we need to do is make a list of all the
1086    pages which can be freed just as soon as we've flushed the IOTLB and we
1087    know the hardware page-walk will no longer touch them.
1088    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1089    be freed. */
1090 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1091                                             int level, struct dma_pte *pte,
1092                                             struct page *freelist)
1093 {
1094         struct page *pg;
1095
1096         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1097         pg->freelist = freelist;
1098         freelist = pg;
1099
1100         if (level == 1)
1101                 return freelist;
1102
1103         pte = page_address(pg);
1104         do {
1105                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1106                         freelist = dma_pte_list_pagetables(domain, level - 1,
1107                                                            pte, freelist);
1108                 pte++;
1109         } while (!first_pte_in_page(pte));
1110
1111         return freelist;
1112 }
1113
1114 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1115                                         struct dma_pte *pte, unsigned long pfn,
1116                                         unsigned long start_pfn,
1117                                         unsigned long last_pfn,
1118                                         struct page *freelist)
1119 {
1120         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1121
1122         pfn = max(start_pfn, pfn);
1123         pte = &pte[pfn_level_offset(pfn, level)];
1124
1125         do {
1126                 unsigned long level_pfn;
1127
1128                 if (!dma_pte_present(pte))
1129                         goto next;
1130
1131                 level_pfn = pfn & level_mask(level);
1132
1133                 /* If range covers entire pagetable, free it */
1134                 if (start_pfn <= level_pfn &&
1135                     last_pfn >= level_pfn + level_size(level) - 1) {
1136                         /* These suborbinate page tables are going away entirely. Don't
1137                            bother to clear them; we're just going to *free* them. */
1138                         if (level > 1 && !dma_pte_superpage(pte))
1139                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1140
1141                         dma_clear_pte(pte);
1142                         if (!first_pte)
1143                                 first_pte = pte;
1144                         last_pte = pte;
1145                 } else if (level > 1) {
1146                         /* Recurse down into a level that isn't *entirely* obsolete */
1147                         freelist = dma_pte_clear_level(domain, level - 1,
1148                                                        phys_to_virt(dma_pte_addr(pte)),
1149                                                        level_pfn, start_pfn, last_pfn,
1150                                                        freelist);
1151                 }
1152 next:
1153                 pfn += level_size(level);
1154         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1155
1156         if (first_pte)
1157                 domain_flush_cache(domain, first_pte,
1158                                    (void *)++last_pte - (void *)first_pte);
1159
1160         return freelist;
1161 }
1162
1163 /* We can't just free the pages because the IOMMU may still be walking
1164    the page tables, and may have cached the intermediate levels. The
1165    pages can only be freed after the IOTLB flush has been done. */
1166 static struct page *domain_unmap(struct dmar_domain *domain,
1167                                  unsigned long start_pfn,
1168                                  unsigned long last_pfn)
1169 {
1170         struct page *freelist;
1171
1172         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1173         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1174         BUG_ON(start_pfn > last_pfn);
1175
1176         /* we don't need lock here; nobody else touches the iova range */
1177         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1178                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1179
1180         /* free pgd */
1181         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1182                 struct page *pgd_page = virt_to_page(domain->pgd);
1183                 pgd_page->freelist = freelist;
1184                 freelist = pgd_page;
1185
1186                 domain->pgd = NULL;
1187         }
1188
1189         return freelist;
1190 }
1191
1192 static void dma_free_pagelist(struct page *freelist)
1193 {
1194         struct page *pg;
1195
1196         while ((pg = freelist)) {
1197                 freelist = pg->freelist;
1198                 free_pgtable_page(page_address(pg));
1199         }
1200 }
1201
1202 static void iova_entry_free(unsigned long data)
1203 {
1204         struct page *freelist = (struct page *)data;
1205
1206         dma_free_pagelist(freelist);
1207 }
1208
1209 /* iommu handling */
1210 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1211 {
1212         struct root_entry *root;
1213         unsigned long flags;
1214
1215         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1216         if (!root) {
1217                 pr_err("Allocating root entry for %s failed\n",
1218                         iommu->name);
1219                 return -ENOMEM;
1220         }
1221
1222         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1223
1224         spin_lock_irqsave(&iommu->lock, flags);
1225         iommu->root_entry = root;
1226         spin_unlock_irqrestore(&iommu->lock, flags);
1227
1228         return 0;
1229 }
1230
1231 static void iommu_set_root_entry(struct intel_iommu *iommu)
1232 {
1233         u64 addr;
1234         u32 sts;
1235         unsigned long flag;
1236
1237         addr = virt_to_phys(iommu->root_entry);
1238         if (sm_supported(iommu))
1239                 addr |= DMA_RTADDR_SMT;
1240
1241         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1242         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1243
1244         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1245
1246         /* Make sure hardware complete it */
1247         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1248                       readl, (sts & DMA_GSTS_RTPS), sts);
1249
1250         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1251 }
1252
1253 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1254 {
1255         u32 val;
1256         unsigned long flag;
1257
1258         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1259                 return;
1260
1261         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1262         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1263
1264         /* Make sure hardware complete it */
1265         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1266                       readl, (!(val & DMA_GSTS_WBFS)), val);
1267
1268         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1269 }
1270
1271 /* return value determine if we need a write buffer flush */
1272 static void __iommu_flush_context(struct intel_iommu *iommu,
1273                                   u16 did, u16 source_id, u8 function_mask,
1274                                   u64 type)
1275 {
1276         u64 val = 0;
1277         unsigned long flag;
1278
1279         switch (type) {
1280         case DMA_CCMD_GLOBAL_INVL:
1281                 val = DMA_CCMD_GLOBAL_INVL;
1282                 break;
1283         case DMA_CCMD_DOMAIN_INVL:
1284                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1285                 break;
1286         case DMA_CCMD_DEVICE_INVL:
1287                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1288                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1289                 break;
1290         default:
1291                 BUG();
1292         }
1293         val |= DMA_CCMD_ICC;
1294
1295         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1296         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1297
1298         /* Make sure hardware complete it */
1299         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1300                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1301
1302         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1303 }
1304
1305 /* return value determine if we need a write buffer flush */
1306 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1307                                 u64 addr, unsigned int size_order, u64 type)
1308 {
1309         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1310         u64 val = 0, val_iva = 0;
1311         unsigned long flag;
1312
1313         switch (type) {
1314         case DMA_TLB_GLOBAL_FLUSH:
1315                 /* global flush doesn't need set IVA_REG */
1316                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1317                 break;
1318         case DMA_TLB_DSI_FLUSH:
1319                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1320                 break;
1321         case DMA_TLB_PSI_FLUSH:
1322                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1323                 /* IH bit is passed in as part of address */
1324                 val_iva = size_order | addr;
1325                 break;
1326         default:
1327                 BUG();
1328         }
1329         /* Note: set drain read/write */
1330 #if 0
1331         /*
1332          * This is probably to be super secure.. Looks like we can
1333          * ignore it without any impact.
1334          */
1335         if (cap_read_drain(iommu->cap))
1336                 val |= DMA_TLB_READ_DRAIN;
1337 #endif
1338         if (cap_write_drain(iommu->cap))
1339                 val |= DMA_TLB_WRITE_DRAIN;
1340
1341         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1342         /* Note: Only uses first TLB reg currently */
1343         if (val_iva)
1344                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1345         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1346
1347         /* Make sure hardware complete it */
1348         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1349                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1350
1351         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1352
1353         /* check IOTLB invalidation granularity */
1354         if (DMA_TLB_IAIG(val) == 0)
1355                 pr_err("Flush IOTLB failed\n");
1356         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1357                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1358                         (unsigned long long)DMA_TLB_IIRG(type),
1359                         (unsigned long long)DMA_TLB_IAIG(val));
1360 }
1361
1362 static struct device_domain_info *
1363 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1364                          u8 bus, u8 devfn)
1365 {
1366         struct device_domain_info *info;
1367
1368         assert_spin_locked(&device_domain_lock);
1369
1370         if (!iommu->qi)
1371                 return NULL;
1372
1373         list_for_each_entry(info, &domain->devices, link)
1374                 if (info->iommu == iommu && info->bus == bus &&
1375                     info->devfn == devfn) {
1376                         if (info->ats_supported && info->dev)
1377                                 return info;
1378                         break;
1379                 }
1380
1381         return NULL;
1382 }
1383
1384 static void domain_update_iotlb(struct dmar_domain *domain)
1385 {
1386         struct device_domain_info *info;
1387         bool has_iotlb_device = false;
1388
1389         assert_spin_locked(&device_domain_lock);
1390
1391         list_for_each_entry(info, &domain->devices, link) {
1392                 struct pci_dev *pdev;
1393
1394                 if (!info->dev || !dev_is_pci(info->dev))
1395                         continue;
1396
1397                 pdev = to_pci_dev(info->dev);
1398                 if (pdev->ats_enabled) {
1399                         has_iotlb_device = true;
1400                         break;
1401                 }
1402         }
1403
1404         domain->has_iotlb_device = has_iotlb_device;
1405 }
1406
1407 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1408 {
1409         struct pci_dev *pdev;
1410
1411         assert_spin_locked(&device_domain_lock);
1412
1413         if (!info || !dev_is_pci(info->dev))
1414                 return;
1415
1416         pdev = to_pci_dev(info->dev);
1417         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1418          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1419          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1420          * reserved, which should be set to 0.
1421          */
1422         if (!ecap_dit(info->iommu->ecap))
1423                 info->pfsid = 0;
1424         else {
1425                 struct pci_dev *pf_pdev;
1426
1427                 /* pdev will be returned if device is not a vf */
1428                 pf_pdev = pci_physfn(pdev);
1429                 info->pfsid = pci_dev_id(pf_pdev);
1430         }
1431
1432 #ifdef CONFIG_INTEL_IOMMU_SVM
1433         /* The PCIe spec, in its wisdom, declares that the behaviour of
1434            the device if you enable PASID support after ATS support is
1435            undefined. So always enable PASID support on devices which
1436            have it, even if we can't yet know if we're ever going to
1437            use it. */
1438         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1439                 info->pasid_enabled = 1;
1440
1441         if (info->pri_supported &&
1442             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1443             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1444                 info->pri_enabled = 1;
1445 #endif
1446         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1447             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1448                 info->ats_enabled = 1;
1449                 domain_update_iotlb(info->domain);
1450                 info->ats_qdep = pci_ats_queue_depth(pdev);
1451         }
1452 }
1453
1454 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1455 {
1456         struct pci_dev *pdev;
1457
1458         assert_spin_locked(&device_domain_lock);
1459
1460         if (!dev_is_pci(info->dev))
1461                 return;
1462
1463         pdev = to_pci_dev(info->dev);
1464
1465         if (info->ats_enabled) {
1466                 pci_disable_ats(pdev);
1467                 info->ats_enabled = 0;
1468                 domain_update_iotlb(info->domain);
1469         }
1470 #ifdef CONFIG_INTEL_IOMMU_SVM
1471         if (info->pri_enabled) {
1472                 pci_disable_pri(pdev);
1473                 info->pri_enabled = 0;
1474         }
1475         if (info->pasid_enabled) {
1476                 pci_disable_pasid(pdev);
1477                 info->pasid_enabled = 0;
1478         }
1479 #endif
1480 }
1481
1482 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1483                                   u64 addr, unsigned mask)
1484 {
1485         u16 sid, qdep;
1486         unsigned long flags;
1487         struct device_domain_info *info;
1488
1489         if (!domain->has_iotlb_device)
1490                 return;
1491
1492         spin_lock_irqsave(&device_domain_lock, flags);
1493         list_for_each_entry(info, &domain->devices, link) {
1494                 if (!info->ats_enabled)
1495                         continue;
1496
1497                 sid = info->bus << 8 | info->devfn;
1498                 qdep = info->ats_qdep;
1499                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1500                                 qdep, addr, mask);
1501         }
1502         spin_unlock_irqrestore(&device_domain_lock, flags);
1503 }
1504
1505 static void domain_flush_piotlb(struct intel_iommu *iommu,
1506                                 struct dmar_domain *domain,
1507                                 u64 addr, unsigned long npages, bool ih)
1508 {
1509         u16 did = domain->iommu_did[iommu->seq_id];
1510
1511         if (domain->default_pasid)
1512                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1513                                 addr, npages, ih);
1514
1515         if (!list_empty(&domain->devices))
1516                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1517 }
1518
1519 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1520                                   struct dmar_domain *domain,
1521                                   unsigned long pfn, unsigned int pages,
1522                                   int ih, int map)
1523 {
1524         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1525         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1526         u16 did = domain->iommu_did[iommu->seq_id];
1527
1528         BUG_ON(pages == 0);
1529
1530         if (ih)
1531                 ih = 1 << 6;
1532
1533         if (domain_use_first_level(domain)) {
1534                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1535         } else {
1536                 /*
1537                  * Fallback to domain selective flush if no PSI support or
1538                  * the size is too big. PSI requires page size to be 2 ^ x,
1539                  * and the base address is naturally aligned to the size.
1540                  */
1541                 if (!cap_pgsel_inv(iommu->cap) ||
1542                     mask > cap_max_amask_val(iommu->cap))
1543                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1544                                                         DMA_TLB_DSI_FLUSH);
1545                 else
1546                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1547                                                         DMA_TLB_PSI_FLUSH);
1548         }
1549
1550         /*
1551          * In caching mode, changes of pages from non-present to present require
1552          * flush. However, device IOTLB doesn't need to be flushed in this case.
1553          */
1554         if (!cap_caching_mode(iommu->cap) || !map)
1555                 iommu_flush_dev_iotlb(domain, addr, mask);
1556 }
1557
1558 /* Notification for newly created mappings */
1559 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1560                                         struct dmar_domain *domain,
1561                                         unsigned long pfn, unsigned int pages)
1562 {
1563         /*
1564          * It's a non-present to present mapping. Only flush if caching mode
1565          * and second level.
1566          */
1567         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1568                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1569         else
1570                 iommu_flush_write_buffer(iommu);
1571 }
1572
1573 static void iommu_flush_iova(struct iova_domain *iovad)
1574 {
1575         struct dmar_domain *domain;
1576         int idx;
1577
1578         domain = container_of(iovad, struct dmar_domain, iovad);
1579
1580         for_each_domain_iommu(idx, domain) {
1581                 struct intel_iommu *iommu = g_iommus[idx];
1582                 u16 did = domain->iommu_did[iommu->seq_id];
1583
1584                 if (domain_use_first_level(domain))
1585                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1586                 else
1587                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1588                                                  DMA_TLB_DSI_FLUSH);
1589
1590                 if (!cap_caching_mode(iommu->cap))
1591                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1592                                               0, MAX_AGAW_PFN_WIDTH);
1593         }
1594 }
1595
1596 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1597 {
1598         u32 pmen;
1599         unsigned long flags;
1600
1601         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1602                 return;
1603
1604         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1605         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1606         pmen &= ~DMA_PMEN_EPM;
1607         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1608
1609         /* wait for the protected region status bit to clear */
1610         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1611                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1612
1613         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1614 }
1615
1616 static void iommu_enable_translation(struct intel_iommu *iommu)
1617 {
1618         u32 sts;
1619         unsigned long flags;
1620
1621         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1622         iommu->gcmd |= DMA_GCMD_TE;
1623         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1624
1625         /* Make sure hardware complete it */
1626         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1627                       readl, (sts & DMA_GSTS_TES), sts);
1628
1629         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 }
1631
1632 static void iommu_disable_translation(struct intel_iommu *iommu)
1633 {
1634         u32 sts;
1635         unsigned long flag;
1636
1637         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1638             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1639                 return;
1640
1641         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1642         iommu->gcmd &= ~DMA_GCMD_TE;
1643         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1644
1645         /* Make sure hardware complete it */
1646         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1647                       readl, (!(sts & DMA_GSTS_TES)), sts);
1648
1649         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1650 }
1651
1652 static int iommu_init_domains(struct intel_iommu *iommu)
1653 {
1654         u32 ndomains, nlongs;
1655         size_t size;
1656
1657         ndomains = cap_ndoms(iommu->cap);
1658         pr_debug("%s: Number of Domains supported <%d>\n",
1659                  iommu->name, ndomains);
1660         nlongs = BITS_TO_LONGS(ndomains);
1661
1662         spin_lock_init(&iommu->lock);
1663
1664         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1665         if (!iommu->domain_ids) {
1666                 pr_err("%s: Allocating domain id array failed\n",
1667                        iommu->name);
1668                 return -ENOMEM;
1669         }
1670
1671         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1672         iommu->domains = kzalloc(size, GFP_KERNEL);
1673
1674         if (iommu->domains) {
1675                 size = 256 * sizeof(struct dmar_domain *);
1676                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1677         }
1678
1679         if (!iommu->domains || !iommu->domains[0]) {
1680                 pr_err("%s: Allocating domain array failed\n",
1681                        iommu->name);
1682                 kfree(iommu->domain_ids);
1683                 kfree(iommu->domains);
1684                 iommu->domain_ids = NULL;
1685                 iommu->domains    = NULL;
1686                 return -ENOMEM;
1687         }
1688
1689         /*
1690          * If Caching mode is set, then invalid translations are tagged
1691          * with domain-id 0, hence we need to pre-allocate it. We also
1692          * use domain-id 0 as a marker for non-allocated domain-id, so
1693          * make sure it is not used for a real domain.
1694          */
1695         set_bit(0, iommu->domain_ids);
1696
1697         /*
1698          * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1699          * entry for first-level or pass-through translation modes should
1700          * be programmed with a domain id different from those used for
1701          * second-level or nested translation. We reserve a domain id for
1702          * this purpose.
1703          */
1704         if (sm_supported(iommu))
1705                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1706
1707         return 0;
1708 }
1709
1710 static void disable_dmar_iommu(struct intel_iommu *iommu)
1711 {
1712         struct device_domain_info *info, *tmp;
1713         unsigned long flags;
1714
1715         if (!iommu->domains || !iommu->domain_ids)
1716                 return;
1717
1718         spin_lock_irqsave(&device_domain_lock, flags);
1719         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1720                 if (info->iommu != iommu)
1721                         continue;
1722
1723                 if (!info->dev || !info->domain)
1724                         continue;
1725
1726                 __dmar_remove_one_dev_info(info);
1727         }
1728         spin_unlock_irqrestore(&device_domain_lock, flags);
1729
1730         if (iommu->gcmd & DMA_GCMD_TE)
1731                 iommu_disable_translation(iommu);
1732 }
1733
1734 static void free_dmar_iommu(struct intel_iommu *iommu)
1735 {
1736         if ((iommu->domains) && (iommu->domain_ids)) {
1737                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1738                 int i;
1739
1740                 for (i = 0; i < elems; i++)
1741                         kfree(iommu->domains[i]);
1742                 kfree(iommu->domains);
1743                 kfree(iommu->domain_ids);
1744                 iommu->domains = NULL;
1745                 iommu->domain_ids = NULL;
1746         }
1747
1748         g_iommus[iommu->seq_id] = NULL;
1749
1750         /* free context mapping */
1751         free_context_table(iommu);
1752
1753 #ifdef CONFIG_INTEL_IOMMU_SVM
1754         if (pasid_supported(iommu)) {
1755                 if (ecap_prs(iommu->ecap))
1756                         intel_svm_finish_prq(iommu);
1757         }
1758         if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1759                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1760
1761 #endif
1762 }
1763
1764 /*
1765  * Check and return whether first level is used by default for
1766  * DMA translation.
1767  */
1768 static bool first_level_by_default(void)
1769 {
1770         struct dmar_drhd_unit *drhd;
1771         struct intel_iommu *iommu;
1772         static int first_level_support = -1;
1773
1774         if (likely(first_level_support != -1))
1775                 return first_level_support;
1776
1777         first_level_support = 1;
1778
1779         rcu_read_lock();
1780         for_each_active_iommu(iommu, drhd) {
1781                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1782                         first_level_support = 0;
1783                         break;
1784                 }
1785         }
1786         rcu_read_unlock();
1787
1788         return first_level_support;
1789 }
1790
1791 static struct dmar_domain *alloc_domain(int flags)
1792 {
1793         struct dmar_domain *domain;
1794
1795         domain = alloc_domain_mem();
1796         if (!domain)
1797                 return NULL;
1798
1799         memset(domain, 0, sizeof(*domain));
1800         domain->nid = NUMA_NO_NODE;
1801         domain->flags = flags;
1802         if (first_level_by_default())
1803                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1804         domain->has_iotlb_device = false;
1805         INIT_LIST_HEAD(&domain->devices);
1806
1807         return domain;
1808 }
1809
1810 /* Must be called with iommu->lock */
1811 static int domain_attach_iommu(struct dmar_domain *domain,
1812                                struct intel_iommu *iommu)
1813 {
1814         unsigned long ndomains;
1815         int num;
1816
1817         assert_spin_locked(&device_domain_lock);
1818         assert_spin_locked(&iommu->lock);
1819
1820         domain->iommu_refcnt[iommu->seq_id] += 1;
1821         domain->iommu_count += 1;
1822         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1823                 ndomains = cap_ndoms(iommu->cap);
1824                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1825
1826                 if (num >= ndomains) {
1827                         pr_err("%s: No free domain ids\n", iommu->name);
1828                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1829                         domain->iommu_count -= 1;
1830                         return -ENOSPC;
1831                 }
1832
1833                 set_bit(num, iommu->domain_ids);
1834                 set_iommu_domain(iommu, num, domain);
1835
1836                 domain->iommu_did[iommu->seq_id] = num;
1837                 domain->nid                      = iommu->node;
1838
1839                 domain_update_iommu_cap(domain);
1840         }
1841
1842         return 0;
1843 }
1844
1845 static int domain_detach_iommu(struct dmar_domain *domain,
1846                                struct intel_iommu *iommu)
1847 {
1848         int num, count;
1849
1850         assert_spin_locked(&device_domain_lock);
1851         assert_spin_locked(&iommu->lock);
1852
1853         domain->iommu_refcnt[iommu->seq_id] -= 1;
1854         count = --domain->iommu_count;
1855         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1856                 num = domain->iommu_did[iommu->seq_id];
1857                 clear_bit(num, iommu->domain_ids);
1858                 set_iommu_domain(iommu, num, NULL);
1859
1860                 domain_update_iommu_cap(domain);
1861                 domain->iommu_did[iommu->seq_id] = 0;
1862         }
1863
1864         return count;
1865 }
1866
1867 static struct iova_domain reserved_iova_list;
1868 static struct lock_class_key reserved_rbtree_key;
1869
1870 static int dmar_init_reserved_ranges(void)
1871 {
1872         struct pci_dev *pdev = NULL;
1873         struct iova *iova;
1874         int i;
1875
1876         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1877
1878         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1879                 &reserved_rbtree_key);
1880
1881         /* IOAPIC ranges shouldn't be accessed by DMA */
1882         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1883                 IOVA_PFN(IOAPIC_RANGE_END));
1884         if (!iova) {
1885                 pr_err("Reserve IOAPIC range failed\n");
1886                 return -ENODEV;
1887         }
1888
1889         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1890         for_each_pci_dev(pdev) {
1891                 struct resource *r;
1892
1893                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1894                         r = &pdev->resource[i];
1895                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1896                                 continue;
1897                         iova = reserve_iova(&reserved_iova_list,
1898                                             IOVA_PFN(r->start),
1899                                             IOVA_PFN(r->end));
1900                         if (!iova) {
1901                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1902                                 return -ENODEV;
1903                         }
1904                 }
1905         }
1906         return 0;
1907 }
1908
1909 static inline int guestwidth_to_adjustwidth(int gaw)
1910 {
1911         int agaw;
1912         int r = (gaw - 12) % 9;
1913
1914         if (r == 0)
1915                 agaw = gaw;
1916         else
1917                 agaw = gaw + 9 - r;
1918         if (agaw > 64)
1919                 agaw = 64;
1920         return agaw;
1921 }
1922
1923 static void domain_exit(struct dmar_domain *domain)
1924 {
1925
1926         /* Remove associated devices and clear attached or cached domains */
1927         domain_remove_dev_info(domain);
1928
1929         /* destroy iovas */
1930         if (domain->domain.type == IOMMU_DOMAIN_DMA)
1931                 put_iova_domain(&domain->iovad);
1932
1933         if (domain->pgd) {
1934                 struct page *freelist;
1935
1936                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1937                 dma_free_pagelist(freelist);
1938         }
1939
1940         free_domain_mem(domain);
1941 }
1942
1943 /*
1944  * Get the PASID directory size for scalable mode context entry.
1945  * Value of X in the PDTS field of a scalable mode context entry
1946  * indicates PASID directory with 2^(X + 7) entries.
1947  */
1948 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1949 {
1950         int pds, max_pde;
1951
1952         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1953         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1954         if (pds < 7)
1955                 return 0;
1956
1957         return pds - 7;
1958 }
1959
1960 /*
1961  * Set the RID_PASID field of a scalable mode context entry. The
1962  * IOMMU hardware will use the PASID value set in this field for
1963  * DMA translations of DMA requests without PASID.
1964  */
1965 static inline void
1966 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1967 {
1968         context->hi |= pasid & ((1 << 20) - 1);
1969 }
1970
1971 /*
1972  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1973  * entry.
1974  */
1975 static inline void context_set_sm_dte(struct context_entry *context)
1976 {
1977         context->lo |= (1 << 2);
1978 }
1979
1980 /*
1981  * Set the PRE(Page Request Enable) field of a scalable mode context
1982  * entry.
1983  */
1984 static inline void context_set_sm_pre(struct context_entry *context)
1985 {
1986         context->lo |= (1 << 4);
1987 }
1988
1989 /* Convert value to context PASID directory size field coding. */
1990 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1991
1992 static int domain_context_mapping_one(struct dmar_domain *domain,
1993                                       struct intel_iommu *iommu,
1994                                       struct pasid_table *table,
1995                                       u8 bus, u8 devfn)
1996 {
1997         u16 did = domain->iommu_did[iommu->seq_id];
1998         int translation = CONTEXT_TT_MULTI_LEVEL;
1999         struct device_domain_info *info = NULL;
2000         struct context_entry *context;
2001         unsigned long flags;
2002         int ret;
2003
2004         WARN_ON(did == 0);
2005
2006         if (hw_pass_through && domain_type_is_si(domain))
2007                 translation = CONTEXT_TT_PASS_THROUGH;
2008
2009         pr_debug("Set context mapping for %02x:%02x.%d\n",
2010                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2011
2012         BUG_ON(!domain->pgd);
2013
2014         spin_lock_irqsave(&device_domain_lock, flags);
2015         spin_lock(&iommu->lock);
2016
2017         ret = -ENOMEM;
2018         context = iommu_context_addr(iommu, bus, devfn, 1);
2019         if (!context)
2020                 goto out_unlock;
2021
2022         ret = 0;
2023         if (context_present(context))
2024                 goto out_unlock;
2025
2026         /*
2027          * For kdump cases, old valid entries may be cached due to the
2028          * in-flight DMA and copied pgtable, but there is no unmapping
2029          * behaviour for them, thus we need an explicit cache flush for
2030          * the newly-mapped device. For kdump, at this point, the device
2031          * is supposed to finish reset at its driver probe stage, so no
2032          * in-flight DMA will exist, and we don't need to worry anymore
2033          * hereafter.
2034          */
2035         if (context_copied(context)) {
2036                 u16 did_old = context_domain_id(context);
2037
2038                 if (did_old < cap_ndoms(iommu->cap)) {
2039                         iommu->flush.flush_context(iommu, did_old,
2040                                                    (((u16)bus) << 8) | devfn,
2041                                                    DMA_CCMD_MASK_NOBIT,
2042                                                    DMA_CCMD_DEVICE_INVL);
2043                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2044                                                  DMA_TLB_DSI_FLUSH);
2045                 }
2046         }
2047
2048         context_clear_entry(context);
2049
2050         if (sm_supported(iommu)) {
2051                 unsigned long pds;
2052
2053                 WARN_ON(!table);
2054
2055                 /* Setup the PASID DIR pointer: */
2056                 pds = context_get_sm_pds(table);
2057                 context->lo = (u64)virt_to_phys(table->table) |
2058                                 context_pdts(pds);
2059
2060                 /* Setup the RID_PASID field: */
2061                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2062
2063                 /*
2064                  * Setup the Device-TLB enable bit and Page request
2065                  * Enable bit:
2066                  */
2067                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2068                 if (info && info->ats_supported)
2069                         context_set_sm_dte(context);
2070                 if (info && info->pri_supported)
2071                         context_set_sm_pre(context);
2072         } else {
2073                 struct dma_pte *pgd = domain->pgd;
2074                 int agaw;
2075
2076                 context_set_domain_id(context, did);
2077
2078                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2079                         /*
2080                          * Skip top levels of page tables for iommu which has
2081                          * less agaw than default. Unnecessary for PT mode.
2082                          */
2083                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2084                                 ret = -ENOMEM;
2085                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2086                                 if (!dma_pte_present(pgd))
2087                                         goto out_unlock;
2088                         }
2089
2090                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2091                         if (info && info->ats_supported)
2092                                 translation = CONTEXT_TT_DEV_IOTLB;
2093                         else
2094                                 translation = CONTEXT_TT_MULTI_LEVEL;
2095
2096                         context_set_address_root(context, virt_to_phys(pgd));
2097                         context_set_address_width(context, agaw);
2098                 } else {
2099                         /*
2100                          * In pass through mode, AW must be programmed to
2101                          * indicate the largest AGAW value supported by
2102                          * hardware. And ASR is ignored by hardware.
2103                          */
2104                         context_set_address_width(context, iommu->msagaw);
2105                 }
2106
2107                 context_set_translation_type(context, translation);
2108         }
2109
2110         context_set_fault_enable(context);
2111         context_set_present(context);
2112         if (!ecap_coherent(iommu->ecap))
2113                 clflush_cache_range(context, sizeof(*context));
2114
2115         /*
2116          * It's a non-present to present mapping. If hardware doesn't cache
2117          * non-present entry we only need to flush the write-buffer. If the
2118          * _does_ cache non-present entries, then it does so in the special
2119          * domain #0, which we have to flush:
2120          */
2121         if (cap_caching_mode(iommu->cap)) {
2122                 iommu->flush.flush_context(iommu, 0,
2123                                            (((u16)bus) << 8) | devfn,
2124                                            DMA_CCMD_MASK_NOBIT,
2125                                            DMA_CCMD_DEVICE_INVL);
2126                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2127         } else {
2128                 iommu_flush_write_buffer(iommu);
2129         }
2130         iommu_enable_dev_iotlb(info);
2131
2132         ret = 0;
2133
2134 out_unlock:
2135         spin_unlock(&iommu->lock);
2136         spin_unlock_irqrestore(&device_domain_lock, flags);
2137
2138         return ret;
2139 }
2140
2141 struct domain_context_mapping_data {
2142         struct dmar_domain *domain;
2143         struct intel_iommu *iommu;
2144         struct pasid_table *table;
2145 };
2146
2147 static int domain_context_mapping_cb(struct pci_dev *pdev,
2148                                      u16 alias, void *opaque)
2149 {
2150         struct domain_context_mapping_data *data = opaque;
2151
2152         return domain_context_mapping_one(data->domain, data->iommu,
2153                                           data->table, PCI_BUS_NUM(alias),
2154                                           alias & 0xff);
2155 }
2156
2157 static int
2158 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2159 {
2160         struct domain_context_mapping_data data;
2161         struct pasid_table *table;
2162         struct intel_iommu *iommu;
2163         u8 bus, devfn;
2164
2165         iommu = device_to_iommu(dev, &bus, &devfn);
2166         if (!iommu)
2167                 return -ENODEV;
2168
2169         table = intel_pasid_get_table(dev);
2170
2171         if (!dev_is_pci(dev))
2172                 return domain_context_mapping_one(domain, iommu, table,
2173                                                   bus, devfn);
2174
2175         data.domain = domain;
2176         data.iommu = iommu;
2177         data.table = table;
2178
2179         return pci_for_each_dma_alias(to_pci_dev(dev),
2180                                       &domain_context_mapping_cb, &data);
2181 }
2182
2183 static int domain_context_mapped_cb(struct pci_dev *pdev,
2184                                     u16 alias, void *opaque)
2185 {
2186         struct intel_iommu *iommu = opaque;
2187
2188         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2189 }
2190
2191 static int domain_context_mapped(struct device *dev)
2192 {
2193         struct intel_iommu *iommu;
2194         u8 bus, devfn;
2195
2196         iommu = device_to_iommu(dev, &bus, &devfn);
2197         if (!iommu)
2198                 return -ENODEV;
2199
2200         if (!dev_is_pci(dev))
2201                 return device_context_mapped(iommu, bus, devfn);
2202
2203         return !pci_for_each_dma_alias(to_pci_dev(dev),
2204                                        domain_context_mapped_cb, iommu);
2205 }
2206
2207 /* Returns a number of VTD pages, but aligned to MM page size */
2208 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2209                                             size_t size)
2210 {
2211         host_addr &= ~PAGE_MASK;
2212         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2213 }
2214
2215 /* Return largest possible superpage level for a given mapping */
2216 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2217                                           unsigned long iov_pfn,
2218                                           unsigned long phy_pfn,
2219                                           unsigned long pages)
2220 {
2221         int support, level = 1;
2222         unsigned long pfnmerge;
2223
2224         support = domain->iommu_superpage;
2225
2226         /* To use a large page, the virtual *and* physical addresses
2227            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2228            of them will mean we have to use smaller pages. So just
2229            merge them and check both at once. */
2230         pfnmerge = iov_pfn | phy_pfn;
2231
2232         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2233                 pages >>= VTD_STRIDE_SHIFT;
2234                 if (!pages)
2235                         break;
2236                 pfnmerge >>= VTD_STRIDE_SHIFT;
2237                 level++;
2238                 support--;
2239         }
2240         return level;
2241 }
2242
2243 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2244                             struct scatterlist *sg, unsigned long phys_pfn,
2245                             unsigned long nr_pages, int prot)
2246 {
2247         struct dma_pte *first_pte = NULL, *pte = NULL;
2248         phys_addr_t pteval;
2249         unsigned long sg_res = 0;
2250         unsigned int largepage_lvl = 0;
2251         unsigned long lvl_pages = 0;
2252         u64 attr;
2253
2254         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2255
2256         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2257                 return -EINVAL;
2258
2259         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2260         if (domain_use_first_level(domain))
2261                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2262
2263         if (!sg) {
2264                 sg_res = nr_pages;
2265                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2266         }
2267
2268         while (nr_pages > 0) {
2269                 uint64_t tmp;
2270
2271                 if (!sg_res) {
2272                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2273
2274                         sg_res = aligned_nrpages(sg->offset, sg->length);
2275                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2276                         sg->dma_length = sg->length;
2277                         pteval = (sg_phys(sg) - pgoff) | attr;
2278                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2279                 }
2280
2281                 if (!pte) {
2282                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2283
2284                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2285                         if (!pte)
2286                                 return -ENOMEM;
2287                         /* It is large page*/
2288                         if (largepage_lvl > 1) {
2289                                 unsigned long nr_superpages, end_pfn;
2290
2291                                 pteval |= DMA_PTE_LARGE_PAGE;
2292                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2293
2294                                 nr_superpages = sg_res / lvl_pages;
2295                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2296
2297                                 /*
2298                                  * Ensure that old small page tables are
2299                                  * removed to make room for superpage(s).
2300                                  * We're adding new large pages, so make sure
2301                                  * we don't remove their parent tables.
2302                                  */
2303                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2304                                                        largepage_lvl + 1);
2305                         } else {
2306                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2307                         }
2308
2309                 }
2310                 /* We don't need lock here, nobody else
2311                  * touches the iova range
2312                  */
2313                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2314                 if (tmp) {
2315                         static int dumps = 5;
2316                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2317                                 iov_pfn, tmp, (unsigned long long)pteval);
2318                         if (dumps) {
2319                                 dumps--;
2320                                 debug_dma_dump_mappings(NULL);
2321                         }
2322                         WARN_ON(1);
2323                 }
2324
2325                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2326
2327                 BUG_ON(nr_pages < lvl_pages);
2328                 BUG_ON(sg_res < lvl_pages);
2329
2330                 nr_pages -= lvl_pages;
2331                 iov_pfn += lvl_pages;
2332                 phys_pfn += lvl_pages;
2333                 pteval += lvl_pages * VTD_PAGE_SIZE;
2334                 sg_res -= lvl_pages;
2335
2336                 /* If the next PTE would be the first in a new page, then we
2337                    need to flush the cache on the entries we've just written.
2338                    And then we'll need to recalculate 'pte', so clear it and
2339                    let it get set again in the if (!pte) block above.
2340
2341                    If we're done (!nr_pages) we need to flush the cache too.
2342
2343                    Also if we've been setting superpages, we may need to
2344                    recalculate 'pte' and switch back to smaller pages for the
2345                    end of the mapping, if the trailing size is not enough to
2346                    use another superpage (i.e. sg_res < lvl_pages). */
2347                 pte++;
2348                 if (!nr_pages || first_pte_in_page(pte) ||
2349                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2350                         domain_flush_cache(domain, first_pte,
2351                                            (void *)pte - (void *)first_pte);
2352                         pte = NULL;
2353                 }
2354
2355                 if (!sg_res && nr_pages)
2356                         sg = sg_next(sg);
2357         }
2358         return 0;
2359 }
2360
2361 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2362                           struct scatterlist *sg, unsigned long phys_pfn,
2363                           unsigned long nr_pages, int prot)
2364 {
2365         int iommu_id, ret;
2366         struct intel_iommu *iommu;
2367
2368         /* Do the real mapping first */
2369         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2370         if (ret)
2371                 return ret;
2372
2373         for_each_domain_iommu(iommu_id, domain) {
2374                 iommu = g_iommus[iommu_id];
2375                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2376         }
2377
2378         return 0;
2379 }
2380
2381 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2382                                     struct scatterlist *sg, unsigned long nr_pages,
2383                                     int prot)
2384 {
2385         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2386 }
2387
2388 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2389                                      unsigned long phys_pfn, unsigned long nr_pages,
2390                                      int prot)
2391 {
2392         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2393 }
2394
2395 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2396 {
2397         unsigned long flags;
2398         struct context_entry *context;
2399         u16 did_old;
2400
2401         if (!iommu)
2402                 return;
2403
2404         spin_lock_irqsave(&iommu->lock, flags);
2405         context = iommu_context_addr(iommu, bus, devfn, 0);
2406         if (!context) {
2407                 spin_unlock_irqrestore(&iommu->lock, flags);
2408                 return;
2409         }
2410         did_old = context_domain_id(context);
2411         context_clear_entry(context);
2412         __iommu_flush_cache(iommu, context, sizeof(*context));
2413         spin_unlock_irqrestore(&iommu->lock, flags);
2414         iommu->flush.flush_context(iommu,
2415                                    did_old,
2416                                    (((u16)bus) << 8) | devfn,
2417                                    DMA_CCMD_MASK_NOBIT,
2418                                    DMA_CCMD_DEVICE_INVL);
2419         iommu->flush.flush_iotlb(iommu,
2420                                  did_old,
2421                                  0,
2422                                  0,
2423                                  DMA_TLB_DSI_FLUSH);
2424 }
2425
2426 static inline void unlink_domain_info(struct device_domain_info *info)
2427 {
2428         assert_spin_locked(&device_domain_lock);
2429         list_del(&info->link);
2430         list_del(&info->global);
2431         if (info->dev)
2432                 dev_iommu_priv_set(info->dev, NULL);
2433 }
2434
2435 static void domain_remove_dev_info(struct dmar_domain *domain)
2436 {
2437         struct device_domain_info *info, *tmp;
2438         unsigned long flags;
2439
2440         spin_lock_irqsave(&device_domain_lock, flags);
2441         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2442                 __dmar_remove_one_dev_info(info);
2443         spin_unlock_irqrestore(&device_domain_lock, flags);
2444 }
2445
2446 struct dmar_domain *find_domain(struct device *dev)
2447 {
2448         struct device_domain_info *info;
2449
2450         if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2451                 return NULL;
2452
2453         /* No lock here, assumes no domain exit in normal case */
2454         info = get_domain_info(dev);
2455         if (likely(info))
2456                 return info->domain;
2457
2458         return NULL;
2459 }
2460
2461 static void do_deferred_attach(struct device *dev)
2462 {
2463         struct iommu_domain *domain;
2464
2465         dev_iommu_priv_set(dev, NULL);
2466         domain = iommu_get_domain_for_dev(dev);
2467         if (domain)
2468                 intel_iommu_attach_device(domain, dev);
2469 }
2470
2471 static inline struct device_domain_info *
2472 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2473 {
2474         struct device_domain_info *info;
2475
2476         list_for_each_entry(info, &device_domain_list, global)
2477                 if (info->segment == segment && info->bus == bus &&
2478                     info->devfn == devfn)
2479                         return info;
2480
2481         return NULL;
2482 }
2483
2484 static int domain_setup_first_level(struct intel_iommu *iommu,
2485                                     struct dmar_domain *domain,
2486                                     struct device *dev,
2487                                     int pasid)
2488 {
2489         int flags = PASID_FLAG_SUPERVISOR_MODE;
2490         struct dma_pte *pgd = domain->pgd;
2491         int agaw, level;
2492
2493         /*
2494          * Skip top levels of page tables for iommu which has
2495          * less agaw than default. Unnecessary for PT mode.
2496          */
2497         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2498                 pgd = phys_to_virt(dma_pte_addr(pgd));
2499                 if (!dma_pte_present(pgd))
2500                         return -ENOMEM;
2501         }
2502
2503         level = agaw_to_level(agaw);
2504         if (level != 4 && level != 5)
2505                 return -EINVAL;
2506
2507         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2508
2509         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2510                                              domain->iommu_did[iommu->seq_id],
2511                                              flags);
2512 }
2513
2514 static bool dev_is_real_dma_subdevice(struct device *dev)
2515 {
2516         return dev && dev_is_pci(dev) &&
2517                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2518 }
2519
2520 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2521                                                     int bus, int devfn,
2522                                                     struct device *dev,
2523                                                     struct dmar_domain *domain)
2524 {
2525         struct dmar_domain *found = NULL;
2526         struct device_domain_info *info;
2527         unsigned long flags;
2528         int ret;
2529
2530         info = alloc_devinfo_mem();
2531         if (!info)
2532                 return NULL;
2533
2534         if (!dev_is_real_dma_subdevice(dev)) {
2535                 info->bus = bus;
2536                 info->devfn = devfn;
2537                 info->segment = iommu->segment;
2538         } else {
2539                 struct pci_dev *pdev = to_pci_dev(dev);
2540
2541                 info->bus = pdev->bus->number;
2542                 info->devfn = pdev->devfn;
2543                 info->segment = pci_domain_nr(pdev->bus);
2544         }
2545
2546         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2547         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2548         info->ats_qdep = 0;
2549         info->dev = dev;
2550         info->domain = domain;
2551         info->iommu = iommu;
2552         info->pasid_table = NULL;
2553         info->auxd_enabled = 0;
2554         INIT_LIST_HEAD(&info->auxiliary_domains);
2555
2556         if (dev && dev_is_pci(dev)) {
2557                 struct pci_dev *pdev = to_pci_dev(info->dev);
2558
2559                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2560                     pci_ats_supported(pdev) &&
2561                     dmar_find_matched_atsr_unit(pdev))
2562                         info->ats_supported = 1;
2563
2564                 if (sm_supported(iommu)) {
2565                         if (pasid_supported(iommu)) {
2566                                 int features = pci_pasid_features(pdev);
2567                                 if (features >= 0)
2568                                         info->pasid_supported = features | 1;
2569                         }
2570
2571                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2572                             pci_pri_supported(pdev))
2573                                 info->pri_supported = 1;
2574                 }
2575         }
2576
2577         spin_lock_irqsave(&device_domain_lock, flags);
2578         if (dev)
2579                 found = find_domain(dev);
2580
2581         if (!found) {
2582                 struct device_domain_info *info2;
2583                 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2584                                                        info->devfn);
2585                 if (info2) {
2586                         found      = info2->domain;
2587                         info2->dev = dev;
2588                 }
2589         }
2590
2591         if (found) {
2592                 spin_unlock_irqrestore(&device_domain_lock, flags);
2593                 free_devinfo_mem(info);
2594                 /* Caller must free the original domain */
2595                 return found;
2596         }
2597
2598         spin_lock(&iommu->lock);
2599         ret = domain_attach_iommu(domain, iommu);
2600         spin_unlock(&iommu->lock);
2601
2602         if (ret) {
2603                 spin_unlock_irqrestore(&device_domain_lock, flags);
2604                 free_devinfo_mem(info);
2605                 return NULL;
2606         }
2607
2608         list_add(&info->link, &domain->devices);
2609         list_add(&info->global, &device_domain_list);
2610         if (dev)
2611                 dev_iommu_priv_set(dev, info);
2612         spin_unlock_irqrestore(&device_domain_lock, flags);
2613
2614         /* PASID table is mandatory for a PCI device in scalable mode. */
2615         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2616                 ret = intel_pasid_alloc_table(dev);
2617                 if (ret) {
2618                         dev_err(dev, "PASID table allocation failed\n");
2619                         dmar_remove_one_dev_info(dev);
2620                         return NULL;
2621                 }
2622
2623                 /* Setup the PASID entry for requests without PASID: */
2624                 spin_lock(&iommu->lock);
2625                 if (hw_pass_through && domain_type_is_si(domain))
2626                         ret = intel_pasid_setup_pass_through(iommu, domain,
2627                                         dev, PASID_RID2PASID);
2628                 else if (domain_use_first_level(domain))
2629                         ret = domain_setup_first_level(iommu, domain, dev,
2630                                         PASID_RID2PASID);
2631                 else
2632                         ret = intel_pasid_setup_second_level(iommu, domain,
2633                                         dev, PASID_RID2PASID);
2634                 spin_unlock(&iommu->lock);
2635                 if (ret) {
2636                         dev_err(dev, "Setup RID2PASID failed\n");
2637                         dmar_remove_one_dev_info(dev);
2638                         return NULL;
2639                 }
2640         }
2641
2642         if (dev && domain_context_mapping(domain, dev)) {
2643                 dev_err(dev, "Domain context map failed\n");
2644                 dmar_remove_one_dev_info(dev);
2645                 return NULL;
2646         }
2647
2648         return domain;
2649 }
2650
2651 static int iommu_domain_identity_map(struct dmar_domain *domain,
2652                                      unsigned long first_vpfn,
2653                                      unsigned long last_vpfn)
2654 {
2655         /*
2656          * RMRR range might have overlap with physical memory range,
2657          * clear it first
2658          */
2659         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2660
2661         return __domain_mapping(domain, first_vpfn, NULL,
2662                                 first_vpfn, last_vpfn - first_vpfn + 1,
2663                                 DMA_PTE_READ|DMA_PTE_WRITE);
2664 }
2665
2666 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2667
2668 static int __init si_domain_init(int hw)
2669 {
2670         struct dmar_rmrr_unit *rmrr;
2671         struct device *dev;
2672         int i, nid, ret;
2673
2674         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2675         if (!si_domain)
2676                 return -EFAULT;
2677
2678         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2679                 domain_exit(si_domain);
2680                 return -EFAULT;
2681         }
2682
2683         if (hw)
2684                 return 0;
2685
2686         for_each_online_node(nid) {
2687                 unsigned long start_pfn, end_pfn;
2688                 int i;
2689
2690                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2691                         ret = iommu_domain_identity_map(si_domain,
2692                                         mm_to_dma_pfn(start_pfn),
2693                                         mm_to_dma_pfn(end_pfn));
2694                         if (ret)
2695                                 return ret;
2696                 }
2697         }
2698
2699         /*
2700          * Identity map the RMRRs so that devices with RMRRs could also use
2701          * the si_domain.
2702          */
2703         for_each_rmrr_units(rmrr) {
2704                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2705                                           i, dev) {
2706                         unsigned long long start = rmrr->base_address;
2707                         unsigned long long end = rmrr->end_address;
2708
2709                         if (WARN_ON(end < start ||
2710                                     end >> agaw_to_width(si_domain->agaw)))
2711                                 continue;
2712
2713                         ret = iommu_domain_identity_map(si_domain,
2714                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2715                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2716                         if (ret)
2717                                 return ret;
2718                 }
2719         }
2720
2721         return 0;
2722 }
2723
2724 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2725 {
2726         struct dmar_domain *ndomain;
2727         struct intel_iommu *iommu;
2728         u8 bus, devfn;
2729
2730         iommu = device_to_iommu(dev, &bus, &devfn);
2731         if (!iommu)
2732                 return -ENODEV;
2733
2734         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2735         if (ndomain != domain)
2736                 return -EBUSY;
2737
2738         return 0;
2739 }
2740
2741 static bool device_has_rmrr(struct device *dev)
2742 {
2743         struct dmar_rmrr_unit *rmrr;
2744         struct device *tmp;
2745         int i;
2746
2747         rcu_read_lock();
2748         for_each_rmrr_units(rmrr) {
2749                 /*
2750                  * Return TRUE if this RMRR contains the device that
2751                  * is passed in.
2752                  */
2753                 for_each_active_dev_scope(rmrr->devices,
2754                                           rmrr->devices_cnt, i, tmp)
2755                         if (tmp == dev ||
2756                             is_downstream_to_pci_bridge(dev, tmp)) {
2757                                 rcu_read_unlock();
2758                                 return true;
2759                         }
2760         }
2761         rcu_read_unlock();
2762         return false;
2763 }
2764
2765 /**
2766  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2767  * is relaxable (ie. is allowed to be not enforced under some conditions)
2768  * @dev: device handle
2769  *
2770  * We assume that PCI USB devices with RMRRs have them largely
2771  * for historical reasons and that the RMRR space is not actively used post
2772  * boot.  This exclusion may change if vendors begin to abuse it.
2773  *
2774  * The same exception is made for graphics devices, with the requirement that
2775  * any use of the RMRR regions will be torn down before assigning the device
2776  * to a guest.
2777  *
2778  * Return: true if the RMRR is relaxable, false otherwise
2779  */
2780 static bool device_rmrr_is_relaxable(struct device *dev)
2781 {
2782         struct pci_dev *pdev;
2783
2784         if (!dev_is_pci(dev))
2785                 return false;
2786
2787         pdev = to_pci_dev(dev);
2788         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2789                 return true;
2790         else
2791                 return false;
2792 }
2793
2794 /*
2795  * There are a couple cases where we need to restrict the functionality of
2796  * devices associated with RMRRs.  The first is when evaluating a device for
2797  * identity mapping because problems exist when devices are moved in and out
2798  * of domains and their respective RMRR information is lost.  This means that
2799  * a device with associated RMRRs will never be in a "passthrough" domain.
2800  * The second is use of the device through the IOMMU API.  This interface
2801  * expects to have full control of the IOVA space for the device.  We cannot
2802  * satisfy both the requirement that RMRR access is maintained and have an
2803  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2804  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2805  * We therefore prevent devices associated with an RMRR from participating in
2806  * the IOMMU API, which eliminates them from device assignment.
2807  *
2808  * In both cases, devices which have relaxable RMRRs are not concerned by this
2809  * restriction. See device_rmrr_is_relaxable comment.
2810  */
2811 static bool device_is_rmrr_locked(struct device *dev)
2812 {
2813         if (!device_has_rmrr(dev))
2814                 return false;
2815
2816         if (device_rmrr_is_relaxable(dev))
2817                 return false;
2818
2819         return true;
2820 }
2821
2822 /*
2823  * Return the required default domain type for a specific device.
2824  *
2825  * @dev: the device in query
2826  * @startup: true if this is during early boot
2827  *
2828  * Returns:
2829  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2830  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2831  *  - 0: both identity and dynamic domains work for this device
2832  */
2833 static int device_def_domain_type(struct device *dev)
2834 {
2835         if (dev_is_pci(dev)) {
2836                 struct pci_dev *pdev = to_pci_dev(dev);
2837
2838                 /*
2839                  * Prevent any device marked as untrusted from getting
2840                  * placed into the statically identity mapping domain.
2841                  */
2842                 if (pdev->untrusted)
2843                         return IOMMU_DOMAIN_DMA;
2844
2845                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2846                         return IOMMU_DOMAIN_IDENTITY;
2847
2848                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2849                         return IOMMU_DOMAIN_IDENTITY;
2850         }
2851
2852         return 0;
2853 }
2854
2855 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2856 {
2857         /*
2858          * Start from the sane iommu hardware state.
2859          * If the queued invalidation is already initialized by us
2860          * (for example, while enabling interrupt-remapping) then
2861          * we got the things already rolling from a sane state.
2862          */
2863         if (!iommu->qi) {
2864                 /*
2865                  * Clear any previous faults.
2866                  */
2867                 dmar_fault(-1, iommu);
2868                 /*
2869                  * Disable queued invalidation if supported and already enabled
2870                  * before OS handover.
2871                  */
2872                 dmar_disable_qi(iommu);
2873         }
2874
2875         if (dmar_enable_qi(iommu)) {
2876                 /*
2877                  * Queued Invalidate not enabled, use Register Based Invalidate
2878                  */
2879                 iommu->flush.flush_context = __iommu_flush_context;
2880                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2881                 pr_info("%s: Using Register based invalidation\n",
2882                         iommu->name);
2883         } else {
2884                 iommu->flush.flush_context = qi_flush_context;
2885                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2886                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2887         }
2888 }
2889
2890 static int copy_context_table(struct intel_iommu *iommu,
2891                               struct root_entry *old_re,
2892                               struct context_entry **tbl,
2893                               int bus, bool ext)
2894 {
2895         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2896         struct context_entry *new_ce = NULL, ce;
2897         struct context_entry *old_ce = NULL;
2898         struct root_entry re;
2899         phys_addr_t old_ce_phys;
2900
2901         tbl_idx = ext ? bus * 2 : bus;
2902         memcpy(&re, old_re, sizeof(re));
2903
2904         for (devfn = 0; devfn < 256; devfn++) {
2905                 /* First calculate the correct index */
2906                 idx = (ext ? devfn * 2 : devfn) % 256;
2907
2908                 if (idx == 0) {
2909                         /* First save what we may have and clean up */
2910                         if (new_ce) {
2911                                 tbl[tbl_idx] = new_ce;
2912                                 __iommu_flush_cache(iommu, new_ce,
2913                                                     VTD_PAGE_SIZE);
2914                                 pos = 1;
2915                         }
2916
2917                         if (old_ce)
2918                                 memunmap(old_ce);
2919
2920                         ret = 0;
2921                         if (devfn < 0x80)
2922                                 old_ce_phys = root_entry_lctp(&re);
2923                         else
2924                                 old_ce_phys = root_entry_uctp(&re);
2925
2926                         if (!old_ce_phys) {
2927                                 if (ext && devfn == 0) {
2928                                         /* No LCTP, try UCTP */
2929                                         devfn = 0x7f;
2930                                         continue;
2931                                 } else {
2932                                         goto out;
2933                                 }
2934                         }
2935
2936                         ret = -ENOMEM;
2937                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2938                                         MEMREMAP_WB);
2939                         if (!old_ce)
2940                                 goto out;
2941
2942                         new_ce = alloc_pgtable_page(iommu->node);
2943                         if (!new_ce)
2944                                 goto out_unmap;
2945
2946                         ret = 0;
2947                 }
2948
2949                 /* Now copy the context entry */
2950                 memcpy(&ce, old_ce + idx, sizeof(ce));
2951
2952                 if (!__context_present(&ce))
2953                         continue;
2954
2955                 did = context_domain_id(&ce);
2956                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2957                         set_bit(did, iommu->domain_ids);
2958
2959                 /*
2960                  * We need a marker for copied context entries. This
2961                  * marker needs to work for the old format as well as
2962                  * for extended context entries.
2963                  *
2964                  * Bit 67 of the context entry is used. In the old
2965                  * format this bit is available to software, in the
2966                  * extended format it is the PGE bit, but PGE is ignored
2967                  * by HW if PASIDs are disabled (and thus still
2968                  * available).
2969                  *
2970                  * So disable PASIDs first and then mark the entry
2971                  * copied. This means that we don't copy PASID
2972                  * translations from the old kernel, but this is fine as
2973                  * faults there are not fatal.
2974                  */
2975                 context_clear_pasid_enable(&ce);
2976                 context_set_copied(&ce);
2977
2978                 new_ce[idx] = ce;
2979         }
2980
2981         tbl[tbl_idx + pos] = new_ce;
2982
2983         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2984
2985 out_unmap:
2986         memunmap(old_ce);
2987
2988 out:
2989         return ret;
2990 }
2991
2992 static int copy_translation_tables(struct intel_iommu *iommu)
2993 {
2994         struct context_entry **ctxt_tbls;
2995         struct root_entry *old_rt;
2996         phys_addr_t old_rt_phys;
2997         int ctxt_table_entries;
2998         unsigned long flags;
2999         u64 rtaddr_reg;
3000         int bus, ret;
3001         bool new_ext, ext;
3002
3003         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3004         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3005         new_ext    = !!ecap_ecs(iommu->ecap);
3006
3007         /*
3008          * The RTT bit can only be changed when translation is disabled,
3009          * but disabling translation means to open a window for data
3010          * corruption. So bail out and don't copy anything if we would
3011          * have to change the bit.
3012          */
3013         if (new_ext != ext)
3014                 return -EINVAL;
3015
3016         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3017         if (!old_rt_phys)
3018                 return -EINVAL;
3019
3020         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3021         if (!old_rt)
3022                 return -ENOMEM;
3023
3024         /* This is too big for the stack - allocate it from slab */
3025         ctxt_table_entries = ext ? 512 : 256;
3026         ret = -ENOMEM;
3027         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3028         if (!ctxt_tbls)
3029                 goto out_unmap;
3030
3031         for (bus = 0; bus < 256; bus++) {
3032                 ret = copy_context_table(iommu, &old_rt[bus],
3033                                          ctxt_tbls, bus, ext);
3034                 if (ret) {
3035                         pr_err("%s: Failed to copy context table for bus %d\n",
3036                                 iommu->name, bus);
3037                         continue;
3038                 }
3039         }
3040
3041         spin_lock_irqsave(&iommu->lock, flags);
3042
3043         /* Context tables are copied, now write them to the root_entry table */
3044         for (bus = 0; bus < 256; bus++) {
3045                 int idx = ext ? bus * 2 : bus;
3046                 u64 val;
3047
3048                 if (ctxt_tbls[idx]) {
3049                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3050                         iommu->root_entry[bus].lo = val;
3051                 }
3052
3053                 if (!ext || !ctxt_tbls[idx + 1])
3054                         continue;
3055
3056                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3057                 iommu->root_entry[bus].hi = val;
3058         }
3059
3060         spin_unlock_irqrestore(&iommu->lock, flags);
3061
3062         kfree(ctxt_tbls);
3063
3064         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3065
3066         ret = 0;
3067
3068 out_unmap:
3069         memunmap(old_rt);
3070
3071         return ret;
3072 }
3073
3074 #ifdef CONFIG_INTEL_IOMMU_SVM
3075 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3076 {
3077         struct intel_iommu *iommu = data;
3078         ioasid_t ioasid;
3079
3080         if (!iommu)
3081                 return INVALID_IOASID;
3082         /*
3083          * VT-d virtual command interface always uses the full 20 bit
3084          * PASID range. Host can partition guest PASID range based on
3085          * policies but it is out of guest's control.
3086          */
3087         if (min < PASID_MIN || max > intel_pasid_max_id)
3088                 return INVALID_IOASID;
3089
3090         if (vcmd_alloc_pasid(iommu, &ioasid))
3091                 return INVALID_IOASID;
3092
3093         return ioasid;
3094 }
3095
3096 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3097 {
3098         struct intel_iommu *iommu = data;
3099
3100         if (!iommu)
3101                 return;
3102         /*
3103          * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
3104          * We can only free the PASID when all the devices are unbound.
3105          */
3106         if (ioasid_find(NULL, ioasid, NULL)) {
3107                 pr_alert("Cannot free active IOASID %d\n", ioasid);
3108                 return;
3109         }
3110         vcmd_free_pasid(iommu, ioasid);
3111 }
3112
3113 static void register_pasid_allocator(struct intel_iommu *iommu)
3114 {
3115         /*
3116          * If we are running in the host, no need for custom allocator
3117          * in that PASIDs are allocated from the host system-wide.
3118          */
3119         if (!cap_caching_mode(iommu->cap))
3120                 return;
3121
3122         if (!sm_supported(iommu)) {
3123                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3124                 return;
3125         }
3126
3127         /*
3128          * Register a custom PASID allocator if we are running in a guest,
3129          * guest PASID must be obtained via virtual command interface.
3130          * There can be multiple vIOMMUs in each guest but only one allocator
3131          * is active. All vIOMMU allocators will eventually be calling the same
3132          * host allocator.
3133          */
3134         if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3135                 return;
3136
3137         pr_info("Register custom PASID allocator\n");
3138         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3139         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3140         iommu->pasid_allocator.pdata = (void *)iommu;
3141         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3142                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3143                 /*
3144                  * Disable scalable mode on this IOMMU if there
3145                  * is no custom allocator. Mixing SM capable vIOMMU
3146                  * and non-SM vIOMMU are not supported.
3147                  */
3148                 intel_iommu_sm = 0;
3149         }
3150 }
3151 #endif
3152
3153 static int __init init_dmars(void)
3154 {
3155         struct dmar_drhd_unit *drhd;
3156         struct intel_iommu *iommu;
3157         int ret;
3158
3159         /*
3160          * for each drhd
3161          *    allocate root
3162          *    initialize and program root entry to not present
3163          * endfor
3164          */
3165         for_each_drhd_unit(drhd) {
3166                 /*
3167                  * lock not needed as this is only incremented in the single
3168                  * threaded kernel __init code path all other access are read
3169                  * only
3170                  */
3171                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3172                         g_num_of_iommus++;
3173                         continue;
3174                 }
3175                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3176         }
3177
3178         /* Preallocate enough resources for IOMMU hot-addition */
3179         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3180                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3181
3182         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3183                         GFP_KERNEL);
3184         if (!g_iommus) {
3185                 pr_err("Allocating global iommu array failed\n");
3186                 ret = -ENOMEM;
3187                 goto error;
3188         }
3189
3190         for_each_iommu(iommu, drhd) {
3191                 if (drhd->ignored) {
3192                         iommu_disable_translation(iommu);
3193                         continue;
3194                 }
3195
3196                 /*
3197                  * Find the max pasid size of all IOMMU's in the system.
3198                  * We need to ensure the system pasid table is no bigger
3199                  * than the smallest supported.
3200                  */
3201                 if (pasid_supported(iommu)) {
3202                         u32 temp = 2 << ecap_pss(iommu->ecap);
3203
3204                         intel_pasid_max_id = min_t(u32, temp,
3205                                                    intel_pasid_max_id);
3206                 }
3207
3208                 g_iommus[iommu->seq_id] = iommu;
3209
3210                 intel_iommu_init_qi(iommu);
3211
3212                 ret = iommu_init_domains(iommu);
3213                 if (ret)
3214                         goto free_iommu;
3215
3216                 init_translation_status(iommu);
3217
3218                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3219                         iommu_disable_translation(iommu);
3220                         clear_translation_pre_enabled(iommu);
3221                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3222                                 iommu->name);
3223                 }
3224
3225                 /*
3226                  * TBD:
3227                  * we could share the same root & context tables
3228                  * among all IOMMU's. Need to Split it later.
3229                  */
3230                 ret = iommu_alloc_root_entry(iommu);
3231                 if (ret)
3232                         goto free_iommu;
3233
3234                 if (translation_pre_enabled(iommu)) {
3235                         pr_info("Translation already enabled - trying to copy translation structures\n");
3236
3237                         ret = copy_translation_tables(iommu);
3238                         if (ret) {
3239                                 /*
3240                                  * We found the IOMMU with translation
3241                                  * enabled - but failed to copy over the
3242                                  * old root-entry table. Try to proceed
3243                                  * by disabling translation now and
3244                                  * allocating a clean root-entry table.
3245                                  * This might cause DMAR faults, but
3246                                  * probably the dump will still succeed.
3247                                  */
3248                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3249                                        iommu->name);
3250                                 iommu_disable_translation(iommu);
3251                                 clear_translation_pre_enabled(iommu);
3252                         } else {
3253                                 pr_info("Copied translation tables from previous kernel for %s\n",
3254                                         iommu->name);
3255                         }
3256                 }
3257
3258                 if (!ecap_pass_through(iommu->ecap))
3259                         hw_pass_through = 0;
3260                 intel_svm_check(iommu);
3261         }
3262
3263         /*
3264          * Now that qi is enabled on all iommus, set the root entry and flush
3265          * caches. This is required on some Intel X58 chipsets, otherwise the
3266          * flush_context function will loop forever and the boot hangs.
3267          */
3268         for_each_active_iommu(iommu, drhd) {
3269                 iommu_flush_write_buffer(iommu);
3270 #ifdef CONFIG_INTEL_IOMMU_SVM
3271                 register_pasid_allocator(iommu);
3272 #endif
3273                 iommu_set_root_entry(iommu);
3274                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3275                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3276         }
3277
3278 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3279         dmar_map_gfx = 0;
3280 #endif
3281
3282         if (!dmar_map_gfx)
3283                 iommu_identity_mapping |= IDENTMAP_GFX;
3284
3285         check_tylersburg_isoch();
3286
3287         ret = si_domain_init(hw_pass_through);
3288         if (ret)
3289                 goto free_iommu;
3290
3291         /*
3292          * for each drhd
3293          *   enable fault log
3294          *   global invalidate context cache
3295          *   global invalidate iotlb
3296          *   enable translation
3297          */
3298         for_each_iommu(iommu, drhd) {
3299                 if (drhd->ignored) {
3300                         /*
3301                          * we always have to disable PMRs or DMA may fail on
3302                          * this device
3303                          */
3304                         if (force_on)
3305                                 iommu_disable_protect_mem_regions(iommu);
3306                         continue;
3307                 }
3308
3309                 iommu_flush_write_buffer(iommu);
3310
3311 #ifdef CONFIG_INTEL_IOMMU_SVM
3312                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3313                         /*
3314                          * Call dmar_alloc_hwirq() with dmar_global_lock held,
3315                          * could cause possible lock race condition.
3316                          */
3317                         up_write(&dmar_global_lock);
3318                         ret = intel_svm_enable_prq(iommu);
3319                         down_write(&dmar_global_lock);
3320                         if (ret)
3321                                 goto free_iommu;
3322                 }
3323 #endif
3324                 ret = dmar_set_interrupt(iommu);
3325                 if (ret)
3326                         goto free_iommu;
3327         }
3328
3329         return 0;
3330
3331 free_iommu:
3332         for_each_active_iommu(iommu, drhd) {
3333                 disable_dmar_iommu(iommu);
3334                 free_dmar_iommu(iommu);
3335         }
3336
3337         kfree(g_iommus);
3338
3339 error:
3340         return ret;
3341 }
3342
3343 /* This takes a number of _MM_ pages, not VTD pages */
3344 static unsigned long intel_alloc_iova(struct device *dev,
3345                                      struct dmar_domain *domain,
3346                                      unsigned long nrpages, uint64_t dma_mask)
3347 {
3348         unsigned long iova_pfn;
3349
3350         /*
3351          * Restrict dma_mask to the width that the iommu can handle.
3352          * First-level translation restricts the input-address to a
3353          * canonical address (i.e., address bits 63:N have the same
3354          * value as address bit [N-1], where N is 48-bits with 4-level
3355          * paging and 57-bits with 5-level paging). Hence, skip bit
3356          * [N-1].
3357          */
3358         if (domain_use_first_level(domain))
3359                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3360                                  dma_mask);
3361         else
3362                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3363                                  dma_mask);
3364
3365         /* Ensure we reserve the whole size-aligned region */
3366         nrpages = __roundup_pow_of_two(nrpages);
3367
3368         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3369                 /*
3370                  * First try to allocate an io virtual address in
3371                  * DMA_BIT_MASK(32) and if that fails then try allocating
3372                  * from higher range
3373                  */
3374                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3375                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3376                 if (iova_pfn)
3377                         return iova_pfn;
3378         }
3379         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3380                                    IOVA_PFN(dma_mask), true);
3381         if (unlikely(!iova_pfn)) {
3382                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3383                              nrpages);
3384                 return 0;
3385         }
3386
3387         return iova_pfn;
3388 }
3389
3390 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3391                                      size_t size, int dir, u64 dma_mask)
3392 {
3393         struct dmar_domain *domain;
3394         phys_addr_t start_paddr;
3395         unsigned long iova_pfn;
3396         int prot = 0;
3397         int ret;
3398         struct intel_iommu *iommu;
3399         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3400
3401         BUG_ON(dir == DMA_NONE);
3402
3403         if (unlikely(attach_deferred(dev)))
3404                 do_deferred_attach(dev);
3405
3406         domain = find_domain(dev);
3407         if (!domain)
3408                 return DMA_MAPPING_ERROR;
3409
3410         iommu = domain_get_iommu(domain);
3411         size = aligned_nrpages(paddr, size);
3412
3413         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3414         if (!iova_pfn)
3415                 goto error;
3416
3417         /*
3418          * Check if DMAR supports zero-length reads on write only
3419          * mappings..
3420          */
3421         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3422                         !cap_zlr(iommu->cap))
3423                 prot |= DMA_PTE_READ;
3424         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3425                 prot |= DMA_PTE_WRITE;
3426         /*
3427          * paddr - (paddr + size) might be partial page, we should map the whole
3428          * page.  Note: if two part of one page are separately mapped, we
3429          * might have two guest_addr mapping to the same host paddr, but this
3430          * is not a big problem
3431          */
3432         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3433                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3434         if (ret)
3435                 goto error;
3436
3437         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3438         start_paddr += paddr & ~PAGE_MASK;
3439
3440         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3441
3442         return start_paddr;
3443
3444 error:
3445         if (iova_pfn)
3446                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3447         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3448                 size, (unsigned long long)paddr, dir);
3449         return DMA_MAPPING_ERROR;
3450 }
3451
3452 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3453                                  unsigned long offset, size_t size,
3454                                  enum dma_data_direction dir,
3455                                  unsigned long attrs)
3456 {
3457         return __intel_map_single(dev, page_to_phys(page) + offset,
3458                                   size, dir, *dev->dma_mask);
3459 }
3460
3461 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3462                                      size_t size, enum dma_data_direction dir,
3463                                      unsigned long attrs)
3464 {
3465         return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3466 }
3467
3468 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3469 {
3470         struct dmar_domain *domain;
3471         unsigned long start_pfn, last_pfn;
3472         unsigned long nrpages;
3473         unsigned long iova_pfn;
3474         struct intel_iommu *iommu;
3475         struct page *freelist;
3476         struct pci_dev *pdev = NULL;
3477
3478         domain = find_domain(dev);
3479         BUG_ON(!domain);
3480
3481         iommu = domain_get_iommu(domain);
3482
3483         iova_pfn = IOVA_PFN(dev_addr);
3484
3485         nrpages = aligned_nrpages(dev_addr, size);
3486         start_pfn = mm_to_dma_pfn(iova_pfn);
3487         last_pfn = start_pfn + nrpages - 1;
3488
3489         if (dev_is_pci(dev))
3490                 pdev = to_pci_dev(dev);
3491
3492         freelist = domain_unmap(domain, start_pfn, last_pfn);
3493         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3494                         !has_iova_flush_queue(&domain->iovad)) {
3495                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3496                                       nrpages, !freelist, 0);
3497                 /* free iova */
3498                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3499                 dma_free_pagelist(freelist);
3500         } else {
3501                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3502                            (unsigned long)freelist);
3503                 /*
3504                  * queue up the release of the unmap to save the 1/6th of the
3505                  * cpu used up by the iotlb flush operation...
3506                  */
3507         }
3508
3509         trace_unmap_single(dev, dev_addr, size);
3510 }
3511
3512 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3513                              size_t size, enum dma_data_direction dir,
3514                              unsigned long attrs)
3515 {
3516         intel_unmap(dev, dev_addr, size);
3517 }
3518
3519 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3520                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3521 {
3522         intel_unmap(dev, dev_addr, size);
3523 }
3524
3525 static void *intel_alloc_coherent(struct device *dev, size_t size,
3526                                   dma_addr_t *dma_handle, gfp_t flags,
3527                                   unsigned long attrs)
3528 {
3529         struct page *page = NULL;
3530         int order;
3531
3532         if (unlikely(attach_deferred(dev)))
3533                 do_deferred_attach(dev);
3534
3535         size = PAGE_ALIGN(size);
3536         order = get_order(size);
3537
3538         if (gfpflags_allow_blocking(flags)) {
3539                 unsigned int count = size >> PAGE_SHIFT;
3540
3541                 page = dma_alloc_from_contiguous(dev, count, order,
3542                                                  flags & __GFP_NOWARN);
3543         }
3544
3545         if (!page)
3546                 page = alloc_pages(flags, order);
3547         if (!page)
3548                 return NULL;
3549         memset(page_address(page), 0, size);
3550
3551         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3552                                          DMA_BIDIRECTIONAL,
3553                                          dev->coherent_dma_mask);
3554         if (*dma_handle != DMA_MAPPING_ERROR)
3555                 return page_address(page);
3556         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3557                 __free_pages(page, order);
3558
3559         return NULL;
3560 }
3561
3562 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3563                                 dma_addr_t dma_handle, unsigned long attrs)
3564 {
3565         int order;
3566         struct page *page = virt_to_page(vaddr);
3567
3568         size = PAGE_ALIGN(size);
3569         order = get_order(size);
3570
3571         intel_unmap(dev, dma_handle, size);
3572         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3573                 __free_pages(page, order);
3574 }
3575
3576 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3577                            int nelems, enum dma_data_direction dir,
3578                            unsigned long attrs)
3579 {
3580         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3581         unsigned long nrpages = 0;
3582         struct scatterlist *sg;
3583         int i;
3584
3585         for_each_sg(sglist, sg, nelems, i) {
3586                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3587         }
3588
3589         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3590
3591         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3592 }
3593
3594 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3595                         enum dma_data_direction dir, unsigned long attrs)
3596 {
3597         int i;
3598         struct dmar_domain *domain;
3599         size_t size = 0;
3600         int prot = 0;
3601         unsigned long iova_pfn;
3602         int ret;
3603         struct scatterlist *sg;
3604         unsigned long start_vpfn;
3605         struct intel_iommu *iommu;
3606
3607         BUG_ON(dir == DMA_NONE);
3608
3609         if (unlikely(attach_deferred(dev)))
3610                 do_deferred_attach(dev);
3611
3612         domain = find_domain(dev);
3613         if (!domain)
3614                 return 0;
3615
3616         iommu = domain_get_iommu(domain);
3617
3618         for_each_sg(sglist, sg, nelems, i)
3619                 size += aligned_nrpages(sg->offset, sg->length);
3620
3621         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3622                                 *dev->dma_mask);
3623         if (!iova_pfn) {
3624                 sglist->dma_length = 0;
3625                 return 0;
3626         }
3627
3628         /*
3629          * Check if DMAR supports zero-length reads on write only
3630          * mappings..
3631          */
3632         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3633                         !cap_zlr(iommu->cap))
3634                 prot |= DMA_PTE_READ;
3635         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3636                 prot |= DMA_PTE_WRITE;
3637
3638         start_vpfn = mm_to_dma_pfn(iova_pfn);
3639
3640         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3641         if (unlikely(ret)) {
3642                 dma_pte_free_pagetable(domain, start_vpfn,
3643                                        start_vpfn + size - 1,
3644                                        agaw_to_level(domain->agaw) + 1);
3645                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3646                 return 0;
3647         }
3648
3649         for_each_sg(sglist, sg, nelems, i)
3650                 trace_map_sg(dev, i + 1, nelems, sg);
3651
3652         return nelems;
3653 }
3654
3655 static u64 intel_get_required_mask(struct device *dev)
3656 {
3657         return DMA_BIT_MASK(32);
3658 }
3659
3660 static const struct dma_map_ops intel_dma_ops = {
3661         .alloc = intel_alloc_coherent,
3662         .free = intel_free_coherent,
3663         .map_sg = intel_map_sg,
3664         .unmap_sg = intel_unmap_sg,
3665         .map_page = intel_map_page,
3666         .unmap_page = intel_unmap_page,
3667         .map_resource = intel_map_resource,
3668         .unmap_resource = intel_unmap_resource,
3669         .dma_supported = dma_direct_supported,
3670         .mmap = dma_common_mmap,
3671         .get_sgtable = dma_common_get_sgtable,
3672         .get_required_mask = intel_get_required_mask,
3673 };
3674
3675 static void
3676 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3677                    enum dma_data_direction dir, enum dma_sync_target target)
3678 {
3679         struct dmar_domain *domain;
3680         phys_addr_t tlb_addr;
3681
3682         domain = find_domain(dev);
3683         if (WARN_ON(!domain))
3684                 return;
3685
3686         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3687         if (is_swiotlb_buffer(tlb_addr))
3688                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3689 }
3690
3691 static dma_addr_t
3692 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3693                   enum dma_data_direction dir, unsigned long attrs,
3694                   u64 dma_mask)
3695 {
3696         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3697         struct dmar_domain *domain;
3698         struct intel_iommu *iommu;
3699         unsigned long iova_pfn;
3700         unsigned long nrpages;
3701         phys_addr_t tlb_addr;
3702         int prot = 0;
3703         int ret;
3704
3705         if (unlikely(attach_deferred(dev)))
3706                 do_deferred_attach(dev);
3707
3708         domain = find_domain(dev);
3709
3710         if (WARN_ON(dir == DMA_NONE || !domain))
3711                 return DMA_MAPPING_ERROR;
3712
3713         iommu = domain_get_iommu(domain);
3714         if (WARN_ON(!iommu))
3715                 return DMA_MAPPING_ERROR;
3716
3717         nrpages = aligned_nrpages(0, size);
3718         iova_pfn = intel_alloc_iova(dev, domain,
3719                                     dma_to_mm_pfn(nrpages), dma_mask);
3720         if (!iova_pfn)
3721                 return DMA_MAPPING_ERROR;
3722
3723         /*
3724          * Check if DMAR supports zero-length reads on write only
3725          * mappings..
3726          */
3727         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3728                         !cap_zlr(iommu->cap))
3729                 prot |= DMA_PTE_READ;
3730         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3731                 prot |= DMA_PTE_WRITE;
3732
3733         /*
3734          * If both the physical buffer start address and size are
3735          * page aligned, we don't need to use a bounce page.
3736          */
3737         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3738                 tlb_addr = swiotlb_tbl_map_single(dev,
3739                                 __phys_to_dma(dev, io_tlb_start),
3740                                 paddr, size, aligned_size, dir, attrs);
3741                 if (tlb_addr == DMA_MAPPING_ERROR) {
3742                         goto swiotlb_error;
3743                 } else {
3744                         /* Cleanup the padding area. */
3745                         void *padding_start = phys_to_virt(tlb_addr);
3746                         size_t padding_size = aligned_size;
3747
3748                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3749                             (dir == DMA_TO_DEVICE ||
3750                              dir == DMA_BIDIRECTIONAL)) {
3751                                 padding_start += size;
3752                                 padding_size -= size;
3753                         }
3754
3755                         memset(padding_start, 0, padding_size);
3756                 }
3757         } else {
3758                 tlb_addr = paddr;
3759         }
3760
3761         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3762                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3763         if (ret)
3764                 goto mapping_error;
3765
3766         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3767
3768         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3769
3770 mapping_error:
3771         if (is_swiotlb_buffer(tlb_addr))
3772                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3773                                          aligned_size, dir, attrs);
3774 swiotlb_error:
3775         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3776         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3777                 size, (unsigned long long)paddr, dir);
3778
3779         return DMA_MAPPING_ERROR;
3780 }
3781
3782 static void
3783 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3784                     enum dma_data_direction dir, unsigned long attrs)
3785 {
3786         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3787         struct dmar_domain *domain;
3788         phys_addr_t tlb_addr;
3789
3790         domain = find_domain(dev);
3791         if (WARN_ON(!domain))
3792                 return;
3793
3794         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3795         if (WARN_ON(!tlb_addr))
3796                 return;
3797
3798         intel_unmap(dev, dev_addr, size);
3799         if (is_swiotlb_buffer(tlb_addr))
3800                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3801                                          aligned_size, dir, attrs);
3802
3803         trace_bounce_unmap_single(dev, dev_addr, size);
3804 }
3805
3806 static dma_addr_t
3807 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3808                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3809 {
3810         return bounce_map_single(dev, page_to_phys(page) + offset,
3811                                  size, dir, attrs, *dev->dma_mask);
3812 }
3813
3814 static dma_addr_t
3815 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3816                     enum dma_data_direction dir, unsigned long attrs)
3817 {
3818         return bounce_map_single(dev, phys_addr, size,
3819                                  dir, attrs, *dev->dma_mask);
3820 }
3821
3822 static void
3823 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3824                   enum dma_data_direction dir, unsigned long attrs)
3825 {
3826         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3827 }
3828
3829 static void
3830 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3831                       enum dma_data_direction dir, unsigned long attrs)
3832 {
3833         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3834 }
3835
3836 static void
3837 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3838                 enum dma_data_direction dir, unsigned long attrs)
3839 {
3840         struct scatterlist *sg;
3841         int i;
3842
3843         for_each_sg(sglist, sg, nelems, i)
3844                 bounce_unmap_page(dev, sg->dma_address,
3845                                   sg_dma_len(sg), dir, attrs);
3846 }
3847
3848 static int
3849 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3850               enum dma_data_direction dir, unsigned long attrs)
3851 {
3852         int i;
3853         struct scatterlist *sg;
3854
3855         for_each_sg(sglist, sg, nelems, i) {
3856                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3857                                                   sg->offset, sg->length,
3858                                                   dir, attrs);
3859                 if (sg->dma_address == DMA_MAPPING_ERROR)
3860                         goto out_unmap;
3861                 sg_dma_len(sg) = sg->length;
3862         }
3863
3864         for_each_sg(sglist, sg, nelems, i)
3865                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3866
3867         return nelems;
3868
3869 out_unmap:
3870         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3871         return 0;
3872 }
3873
3874 static void
3875 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3876                            size_t size, enum dma_data_direction dir)
3877 {
3878         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3879 }
3880
3881 static void
3882 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3883                               size_t size, enum dma_data_direction dir)
3884 {
3885         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3886 }
3887
3888 static void
3889 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3890                        int nelems, enum dma_data_direction dir)
3891 {
3892         struct scatterlist *sg;
3893         int i;
3894
3895         for_each_sg(sglist, sg, nelems, i)
3896                 bounce_sync_single(dev, sg_dma_address(sg),
3897                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
3898 }
3899
3900 static void
3901 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3902                           int nelems, enum dma_data_direction dir)
3903 {
3904         struct scatterlist *sg;
3905         int i;
3906
3907         for_each_sg(sglist, sg, nelems, i)
3908                 bounce_sync_single(dev, sg_dma_address(sg),
3909                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3910 }
3911
3912 static const struct dma_map_ops bounce_dma_ops = {
3913         .alloc                  = intel_alloc_coherent,
3914         .free                   = intel_free_coherent,
3915         .map_sg                 = bounce_map_sg,
3916         .unmap_sg               = bounce_unmap_sg,
3917         .map_page               = bounce_map_page,
3918         .unmap_page             = bounce_unmap_page,
3919         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
3920         .sync_single_for_device = bounce_sync_single_for_device,
3921         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
3922         .sync_sg_for_device     = bounce_sync_sg_for_device,
3923         .map_resource           = bounce_map_resource,
3924         .unmap_resource         = bounce_unmap_resource,
3925         .dma_supported          = dma_direct_supported,
3926 };
3927
3928 static inline int iommu_domain_cache_init(void)
3929 {
3930         int ret = 0;
3931
3932         iommu_domain_cache = kmem_cache_create("iommu_domain",
3933                                          sizeof(struct dmar_domain),
3934                                          0,
3935                                          SLAB_HWCACHE_ALIGN,
3936
3937                                          NULL);
3938         if (!iommu_domain_cache) {
3939                 pr_err("Couldn't create iommu_domain cache\n");
3940                 ret = -ENOMEM;
3941         }
3942
3943         return ret;
3944 }
3945
3946 static inline int iommu_devinfo_cache_init(void)
3947 {
3948         int ret = 0;
3949
3950         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3951                                          sizeof(struct device_domain_info),
3952                                          0,
3953                                          SLAB_HWCACHE_ALIGN,
3954                                          NULL);
3955         if (!iommu_devinfo_cache) {
3956                 pr_err("Couldn't create devinfo cache\n");
3957                 ret = -ENOMEM;
3958         }
3959
3960         return ret;
3961 }
3962
3963 static int __init iommu_init_mempool(void)
3964 {
3965         int ret;
3966         ret = iova_cache_get();
3967         if (ret)
3968                 return ret;
3969
3970         ret = iommu_domain_cache_init();
3971         if (ret)
3972                 goto domain_error;
3973
3974         ret = iommu_devinfo_cache_init();
3975         if (!ret)
3976                 return ret;
3977
3978         kmem_cache_destroy(iommu_domain_cache);
3979 domain_error:
3980         iova_cache_put();
3981
3982         return -ENOMEM;
3983 }
3984
3985 static void __init iommu_exit_mempool(void)
3986 {
3987         kmem_cache_destroy(iommu_devinfo_cache);
3988         kmem_cache_destroy(iommu_domain_cache);
3989         iova_cache_put();
3990 }
3991
3992 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3993 {
3994         struct dmar_drhd_unit *drhd;
3995         u32 vtbar;
3996         int rc;
3997
3998         /* We know that this device on this chipset has its own IOMMU.
3999          * If we find it under a different IOMMU, then the BIOS is lying
4000          * to us. Hope that the IOMMU for this device is actually
4001          * disabled, and it needs no translation...
4002          */
4003         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4004         if (rc) {
4005                 /* "can't" happen */
4006                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4007                 return;
4008         }
4009         vtbar &= 0xffff0000;
4010
4011         /* we know that the this iommu should be at offset 0xa000 from vtbar */
4012         drhd = dmar_find_matched_drhd_unit(pdev);
4013         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4014                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4015                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4016                 dev_iommu_priv_set(&pdev->dev, DUMMY_DEVICE_DOMAIN_INFO);
4017         }
4018 }
4019 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4020
4021 static void __init init_no_remapping_devices(void)
4022 {
4023         struct dmar_drhd_unit *drhd;
4024         struct device *dev;
4025         int i;
4026
4027         for_each_drhd_unit(drhd) {
4028                 if (!drhd->include_all) {
4029                         for_each_active_dev_scope(drhd->devices,
4030                                                   drhd->devices_cnt, i, dev)
4031                                 break;
4032                         /* ignore DMAR unit if no devices exist */
4033                         if (i == drhd->devices_cnt)
4034                                 drhd->ignored = 1;
4035                 }
4036         }
4037
4038         for_each_active_drhd_unit(drhd) {
4039                 if (drhd->include_all)
4040                         continue;
4041
4042                 for_each_active_dev_scope(drhd->devices,
4043                                           drhd->devices_cnt, i, dev)
4044                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4045                                 break;
4046                 if (i < drhd->devices_cnt)
4047                         continue;
4048
4049                 /* This IOMMU has *only* gfx devices. Either bypass it or
4050                    set the gfx_mapped flag, as appropriate */
4051                 drhd->gfx_dedicated = 1;
4052                 if (!dmar_map_gfx) {
4053                         drhd->ignored = 1;
4054                         for_each_active_dev_scope(drhd->devices,
4055                                                   drhd->devices_cnt, i, dev)
4056                                 dev_iommu_priv_set(dev, DUMMY_DEVICE_DOMAIN_INFO);
4057                 }
4058         }
4059 }
4060
4061 #ifdef CONFIG_SUSPEND
4062 static int init_iommu_hw(void)
4063 {
4064         struct dmar_drhd_unit *drhd;
4065         struct intel_iommu *iommu = NULL;
4066
4067         for_each_active_iommu(iommu, drhd)
4068                 if (iommu->qi)
4069                         dmar_reenable_qi(iommu);
4070
4071         for_each_iommu(iommu, drhd) {
4072                 if (drhd->ignored) {
4073                         /*
4074                          * we always have to disable PMRs or DMA may fail on
4075                          * this device
4076                          */
4077                         if (force_on)
4078                                 iommu_disable_protect_mem_regions(iommu);
4079                         continue;
4080                 }
4081
4082                 iommu_flush_write_buffer(iommu);
4083
4084                 iommu_set_root_entry(iommu);
4085
4086                 iommu->flush.flush_context(iommu, 0, 0, 0,
4087                                            DMA_CCMD_GLOBAL_INVL);
4088                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4089                 iommu_enable_translation(iommu);
4090                 iommu_disable_protect_mem_regions(iommu);
4091         }
4092
4093         return 0;
4094 }
4095
4096 static void iommu_flush_all(void)
4097 {
4098         struct dmar_drhd_unit *drhd;
4099         struct intel_iommu *iommu;
4100
4101         for_each_active_iommu(iommu, drhd) {
4102                 iommu->flush.flush_context(iommu, 0, 0, 0,
4103                                            DMA_CCMD_GLOBAL_INVL);
4104                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4105                                          DMA_TLB_GLOBAL_FLUSH);
4106         }
4107 }
4108
4109 static int iommu_suspend(void)
4110 {
4111         struct dmar_drhd_unit *drhd;
4112         struct intel_iommu *iommu = NULL;
4113         unsigned long flag;
4114
4115         for_each_active_iommu(iommu, drhd) {
4116                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4117                                                  GFP_ATOMIC);
4118                 if (!iommu->iommu_state)
4119                         goto nomem;
4120         }
4121
4122         iommu_flush_all();
4123
4124         for_each_active_iommu(iommu, drhd) {
4125                 iommu_disable_translation(iommu);
4126
4127                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4128
4129                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4130                         readl(iommu->reg + DMAR_FECTL_REG);
4131                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4132                         readl(iommu->reg + DMAR_FEDATA_REG);
4133                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4134                         readl(iommu->reg + DMAR_FEADDR_REG);
4135                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4136                         readl(iommu->reg + DMAR_FEUADDR_REG);
4137
4138                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4139         }
4140         return 0;
4141
4142 nomem:
4143         for_each_active_iommu(iommu, drhd)
4144                 kfree(iommu->iommu_state);
4145
4146         return -ENOMEM;
4147 }
4148
4149 static void iommu_resume(void)
4150 {
4151         struct dmar_drhd_unit *drhd;
4152         struct intel_iommu *iommu = NULL;
4153         unsigned long flag;
4154
4155         if (init_iommu_hw()) {
4156                 if (force_on)
4157                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4158                 else
4159                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4160                 return;
4161         }
4162
4163         for_each_active_iommu(iommu, drhd) {
4164
4165                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4166
4167                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4168                         iommu->reg + DMAR_FECTL_REG);
4169                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4170                         iommu->reg + DMAR_FEDATA_REG);
4171                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4172                         iommu->reg + DMAR_FEADDR_REG);
4173                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4174                         iommu->reg + DMAR_FEUADDR_REG);
4175
4176                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4177         }
4178
4179         for_each_active_iommu(iommu, drhd)
4180                 kfree(iommu->iommu_state);
4181 }
4182
4183 static struct syscore_ops iommu_syscore_ops = {
4184         .resume         = iommu_resume,
4185         .suspend        = iommu_suspend,
4186 };
4187
4188 static void __init init_iommu_pm_ops(void)
4189 {
4190         register_syscore_ops(&iommu_syscore_ops);
4191 }
4192
4193 #else
4194 static inline void init_iommu_pm_ops(void) {}
4195 #endif  /* CONFIG_PM */
4196
4197 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4198 {
4199         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4200             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4201             rmrr->end_address <= rmrr->base_address ||
4202             arch_rmrr_sanity_check(rmrr))
4203                 return -EINVAL;
4204
4205         return 0;
4206 }
4207
4208 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4209 {
4210         struct acpi_dmar_reserved_memory *rmrr;
4211         struct dmar_rmrr_unit *rmrru;
4212
4213         rmrr = (struct acpi_dmar_reserved_memory *)header;
4214         if (rmrr_sanity_check(rmrr)) {
4215                 pr_warn(FW_BUG
4216                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4217                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4218                            rmrr->base_address, rmrr->end_address,
4219                            dmi_get_system_info(DMI_BIOS_VENDOR),
4220                            dmi_get_system_info(DMI_BIOS_VERSION),
4221                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4222                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4223         }
4224
4225         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4226         if (!rmrru)
4227                 goto out;
4228
4229         rmrru->hdr = header;
4230
4231         rmrru->base_address = rmrr->base_address;
4232         rmrru->end_address = rmrr->end_address;
4233
4234         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4235                                 ((void *)rmrr) + rmrr->header.length,
4236                                 &rmrru->devices_cnt);
4237         if (rmrru->devices_cnt && rmrru->devices == NULL)
4238                 goto free_rmrru;
4239
4240         list_add(&rmrru->list, &dmar_rmrr_units);
4241
4242         return 0;
4243 free_rmrru:
4244         kfree(rmrru);
4245 out:
4246         return -ENOMEM;
4247 }
4248
4249 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4250 {
4251         struct dmar_atsr_unit *atsru;
4252         struct acpi_dmar_atsr *tmp;
4253
4254         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4255                                 dmar_rcu_check()) {
4256                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4257                 if (atsr->segment != tmp->segment)
4258                         continue;
4259                 if (atsr->header.length != tmp->header.length)
4260                         continue;
4261                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4262                         return atsru;
4263         }
4264
4265         return NULL;
4266 }
4267
4268 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4269 {
4270         struct acpi_dmar_atsr *atsr;
4271         struct dmar_atsr_unit *atsru;
4272
4273         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4274                 return 0;
4275
4276         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4277         atsru = dmar_find_atsr(atsr);
4278         if (atsru)
4279                 return 0;
4280
4281         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4282         if (!atsru)
4283                 return -ENOMEM;
4284
4285         /*
4286          * If memory is allocated from slab by ACPI _DSM method, we need to
4287          * copy the memory content because the memory buffer will be freed
4288          * on return.
4289          */
4290         atsru->hdr = (void *)(atsru + 1);
4291         memcpy(atsru->hdr, hdr, hdr->length);
4292         atsru->include_all = atsr->flags & 0x1;
4293         if (!atsru->include_all) {
4294                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4295                                 (void *)atsr + atsr->header.length,
4296                                 &atsru->devices_cnt);
4297                 if (atsru->devices_cnt && atsru->devices == NULL) {
4298                         kfree(atsru);
4299                         return -ENOMEM;
4300                 }
4301         }
4302
4303         list_add_rcu(&atsru->list, &dmar_atsr_units);
4304
4305         return 0;
4306 }
4307
4308 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4309 {
4310         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4311         kfree(atsru);
4312 }
4313
4314 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4315 {
4316         struct acpi_dmar_atsr *atsr;
4317         struct dmar_atsr_unit *atsru;
4318
4319         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4320         atsru = dmar_find_atsr(atsr);
4321         if (atsru) {
4322                 list_del_rcu(&atsru->list);
4323                 synchronize_rcu();
4324                 intel_iommu_free_atsr(atsru);
4325         }
4326
4327         return 0;
4328 }
4329
4330 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4331 {
4332         int i;
4333         struct device *dev;
4334         struct acpi_dmar_atsr *atsr;
4335         struct dmar_atsr_unit *atsru;
4336
4337         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4338         atsru = dmar_find_atsr(atsr);
4339         if (!atsru)
4340                 return 0;
4341
4342         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4343                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4344                                           i, dev)
4345                         return -EBUSY;
4346         }
4347
4348         return 0;
4349 }
4350
4351 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4352 {
4353         int sp, ret;
4354         struct intel_iommu *iommu = dmaru->iommu;
4355
4356         if (g_iommus[iommu->seq_id])
4357                 return 0;
4358
4359         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4360                 pr_warn("%s: Doesn't support hardware pass through.\n",
4361                         iommu->name);
4362                 return -ENXIO;
4363         }
4364         if (!ecap_sc_support(iommu->ecap) &&
4365             domain_update_iommu_snooping(iommu)) {
4366                 pr_warn("%s: Doesn't support snooping.\n",
4367                         iommu->name);
4368                 return -ENXIO;
4369         }
4370         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4371         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4372                 pr_warn("%s: Doesn't support large page.\n",
4373                         iommu->name);
4374                 return -ENXIO;
4375         }
4376
4377         /*
4378          * Disable translation if already enabled prior to OS handover.
4379          */
4380         if (iommu->gcmd & DMA_GCMD_TE)
4381                 iommu_disable_translation(iommu);
4382
4383         g_iommus[iommu->seq_id] = iommu;
4384         ret = iommu_init_domains(iommu);
4385         if (ret == 0)
4386                 ret = iommu_alloc_root_entry(iommu);
4387         if (ret)
4388                 goto out;
4389
4390         intel_svm_check(iommu);
4391
4392         if (dmaru->ignored) {
4393                 /*
4394                  * we always have to disable PMRs or DMA may fail on this device
4395                  */
4396                 if (force_on)
4397                         iommu_disable_protect_mem_regions(iommu);
4398                 return 0;
4399         }
4400
4401         intel_iommu_init_qi(iommu);
4402         iommu_flush_write_buffer(iommu);
4403
4404 #ifdef CONFIG_INTEL_IOMMU_SVM
4405         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4406                 ret = intel_svm_enable_prq(iommu);
4407                 if (ret)
4408                         goto disable_iommu;
4409         }
4410 #endif
4411         ret = dmar_set_interrupt(iommu);
4412         if (ret)
4413                 goto disable_iommu;
4414
4415         iommu_set_root_entry(iommu);
4416         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4417         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4418         iommu_enable_translation(iommu);
4419
4420         iommu_disable_protect_mem_regions(iommu);
4421         return 0;
4422
4423 disable_iommu:
4424         disable_dmar_iommu(iommu);
4425 out:
4426         free_dmar_iommu(iommu);
4427         return ret;
4428 }
4429
4430 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4431 {
4432         int ret = 0;
4433         struct intel_iommu *iommu = dmaru->iommu;
4434
4435         if (!intel_iommu_enabled)
4436                 return 0;
4437         if (iommu == NULL)
4438                 return -EINVAL;
4439
4440         if (insert) {
4441                 ret = intel_iommu_add(dmaru);
4442         } else {
4443                 disable_dmar_iommu(iommu);
4444                 free_dmar_iommu(iommu);
4445         }
4446
4447         return ret;
4448 }
4449
4450 static void intel_iommu_free_dmars(void)
4451 {
4452         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4453         struct dmar_atsr_unit *atsru, *atsr_n;
4454
4455         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4456                 list_del(&rmrru->list);
4457                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4458                 kfree(rmrru);
4459         }
4460
4461         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4462                 list_del(&atsru->list);
4463                 intel_iommu_free_atsr(atsru);
4464         }
4465 }
4466
4467 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4468 {
4469         int i, ret = 1;
4470         struct pci_bus *bus;
4471         struct pci_dev *bridge = NULL;
4472         struct device *tmp;
4473         struct acpi_dmar_atsr *atsr;
4474         struct dmar_atsr_unit *atsru;
4475
4476         dev = pci_physfn(dev);
4477         for (bus = dev->bus; bus; bus = bus->parent) {
4478                 bridge = bus->self;
4479                 /* If it's an integrated device, allow ATS */
4480                 if (!bridge)
4481                         return 1;
4482                 /* Connected via non-PCIe: no ATS */
4483                 if (!pci_is_pcie(bridge) ||
4484                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4485                         return 0;
4486                 /* If we found the root port, look it up in the ATSR */
4487                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4488                         break;
4489         }
4490
4491         rcu_read_lock();
4492         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4493                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4494                 if (atsr->segment != pci_domain_nr(dev->bus))
4495                         continue;
4496
4497                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4498                         if (tmp == &bridge->dev)
4499                                 goto out;
4500
4501                 if (atsru->include_all)
4502                         goto out;
4503         }
4504         ret = 0;
4505 out:
4506         rcu_read_unlock();
4507
4508         return ret;
4509 }
4510
4511 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4512 {
4513         int ret;
4514         struct dmar_rmrr_unit *rmrru;
4515         struct dmar_atsr_unit *atsru;
4516         struct acpi_dmar_atsr *atsr;
4517         struct acpi_dmar_reserved_memory *rmrr;
4518
4519         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4520                 return 0;
4521
4522         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4523                 rmrr = container_of(rmrru->hdr,
4524                                     struct acpi_dmar_reserved_memory, header);
4525                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4526                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4527                                 ((void *)rmrr) + rmrr->header.length,
4528                                 rmrr->segment, rmrru->devices,
4529                                 rmrru->devices_cnt);
4530                         if (ret < 0)
4531                                 return ret;
4532                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4533                         dmar_remove_dev_scope(info, rmrr->segment,
4534                                 rmrru->devices, rmrru->devices_cnt);
4535                 }
4536         }
4537
4538         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4539                 if (atsru->include_all)
4540                         continue;
4541
4542                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4543                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4544                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4545                                         (void *)atsr + atsr->header.length,
4546                                         atsr->segment, atsru->devices,
4547                                         atsru->devices_cnt);
4548                         if (ret > 0)
4549                                 break;
4550                         else if (ret < 0)
4551                                 return ret;
4552                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4553                         if (dmar_remove_dev_scope(info, atsr->segment,
4554                                         atsru->devices, atsru->devices_cnt))
4555                                 break;
4556                 }
4557         }
4558
4559         return 0;
4560 }
4561
4562 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4563                                        unsigned long val, void *v)
4564 {
4565         struct memory_notify *mhp = v;
4566         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4567         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4568                         mhp->nr_pages - 1);
4569
4570         switch (val) {
4571         case MEM_GOING_ONLINE:
4572                 if (iommu_domain_identity_map(si_domain,
4573                                               start_vpfn, last_vpfn)) {
4574                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
4575                                 start_vpfn, last_vpfn);
4576                         return NOTIFY_BAD;
4577                 }
4578                 break;
4579
4580         case MEM_OFFLINE:
4581         case MEM_CANCEL_ONLINE:
4582                 {
4583                         struct dmar_drhd_unit *drhd;
4584                         struct intel_iommu *iommu;
4585                         struct page *freelist;
4586
4587                         freelist = domain_unmap(si_domain,
4588                                                 start_vpfn, last_vpfn);
4589
4590                         rcu_read_lock();
4591                         for_each_active_iommu(iommu, drhd)
4592                                 iommu_flush_iotlb_psi(iommu, si_domain,
4593                                         start_vpfn, mhp->nr_pages,
4594                                         !freelist, 0);
4595                         rcu_read_unlock();
4596                         dma_free_pagelist(freelist);
4597                 }
4598                 break;
4599         }
4600
4601         return NOTIFY_OK;
4602 }
4603
4604 static struct notifier_block intel_iommu_memory_nb = {
4605         .notifier_call = intel_iommu_memory_notifier,
4606         .priority = 0
4607 };
4608
4609 static void free_all_cpu_cached_iovas(unsigned int cpu)
4610 {
4611         int i;
4612
4613         for (i = 0; i < g_num_of_iommus; i++) {
4614                 struct intel_iommu *iommu = g_iommus[i];
4615                 struct dmar_domain *domain;
4616                 int did;
4617
4618                 if (!iommu)
4619                         continue;
4620
4621                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4622                         domain = get_iommu_domain(iommu, (u16)did);
4623
4624                         if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4625                                 continue;
4626
4627                         free_cpu_cached_iovas(cpu, &domain->iovad);
4628                 }
4629         }
4630 }
4631
4632 static int intel_iommu_cpu_dead(unsigned int cpu)
4633 {
4634         free_all_cpu_cached_iovas(cpu);
4635         return 0;
4636 }
4637
4638 static void intel_disable_iommus(void)
4639 {
4640         struct intel_iommu *iommu = NULL;
4641         struct dmar_drhd_unit *drhd;
4642
4643         for_each_iommu(iommu, drhd)
4644                 iommu_disable_translation(iommu);
4645 }
4646
4647 void intel_iommu_shutdown(void)
4648 {
4649         struct dmar_drhd_unit *drhd;
4650         struct intel_iommu *iommu = NULL;
4651
4652         if (no_iommu || dmar_disabled)
4653                 return;
4654
4655         down_write(&dmar_global_lock);
4656
4657         /* Disable PMRs explicitly here. */
4658         for_each_iommu(iommu, drhd)
4659                 iommu_disable_protect_mem_regions(iommu);
4660
4661         /* Make sure the IOMMUs are switched off */
4662         intel_disable_iommus();
4663
4664         up_write(&dmar_global_lock);
4665 }
4666
4667 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4668 {
4669         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4670
4671         return container_of(iommu_dev, struct intel_iommu, iommu);
4672 }
4673
4674 static ssize_t intel_iommu_show_version(struct device *dev,
4675                                         struct device_attribute *attr,
4676                                         char *buf)
4677 {
4678         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4679         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4680         return sprintf(buf, "%d:%d\n",
4681                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4682 }
4683 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4684
4685 static ssize_t intel_iommu_show_address(struct device *dev,
4686                                         struct device_attribute *attr,
4687                                         char *buf)
4688 {
4689         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4690         return sprintf(buf, "%llx\n", iommu->reg_phys);
4691 }
4692 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4693
4694 static ssize_t intel_iommu_show_cap(struct device *dev,
4695                                     struct device_attribute *attr,
4696                                     char *buf)
4697 {
4698         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4699         return sprintf(buf, "%llx\n", iommu->cap);
4700 }
4701 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4702
4703 static ssize_t intel_iommu_show_ecap(struct device *dev,
4704                                     struct device_attribute *attr,
4705                                     char *buf)
4706 {
4707         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4708         return sprintf(buf, "%llx\n", iommu->ecap);
4709 }
4710 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4711
4712 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4713                                       struct device_attribute *attr,
4714                                       char *buf)
4715 {
4716         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4717         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4718 }
4719 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4720
4721 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4722                                            struct device_attribute *attr,
4723                                            char *buf)
4724 {
4725         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4726         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4727                                                   cap_ndoms(iommu->cap)));
4728 }
4729 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4730
4731 static struct attribute *intel_iommu_attrs[] = {
4732         &dev_attr_version.attr,
4733         &dev_attr_address.attr,
4734         &dev_attr_cap.attr,
4735         &dev_attr_ecap.attr,
4736         &dev_attr_domains_supported.attr,
4737         &dev_attr_domains_used.attr,
4738         NULL,
4739 };
4740
4741 static struct attribute_group intel_iommu_group = {
4742         .name = "intel-iommu",
4743         .attrs = intel_iommu_attrs,
4744 };
4745
4746 const struct attribute_group *intel_iommu_groups[] = {
4747         &intel_iommu_group,
4748         NULL,
4749 };
4750
4751 static inline bool has_external_pci(void)
4752 {
4753         struct pci_dev *pdev = NULL;
4754
4755         for_each_pci_dev(pdev)
4756                 if (pdev->external_facing)
4757                         return true;
4758
4759         return false;
4760 }
4761
4762 static int __init platform_optin_force_iommu(void)
4763 {
4764         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4765                 return 0;
4766
4767         if (no_iommu || dmar_disabled)
4768                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4769
4770         /*
4771          * If Intel-IOMMU is disabled by default, we will apply identity
4772          * map for all devices except those marked as being untrusted.
4773          */
4774         if (dmar_disabled)
4775                 iommu_set_default_passthrough(false);
4776
4777         dmar_disabled = 0;
4778         no_iommu = 0;
4779
4780         return 1;
4781 }
4782
4783 static int __init probe_acpi_namespace_devices(void)
4784 {
4785         struct dmar_drhd_unit *drhd;
4786         /* To avoid a -Wunused-but-set-variable warning. */
4787         struct intel_iommu *iommu __maybe_unused;
4788         struct device *dev;
4789         int i, ret = 0;
4790
4791         for_each_active_iommu(iommu, drhd) {
4792                 for_each_active_dev_scope(drhd->devices,
4793                                           drhd->devices_cnt, i, dev) {
4794                         struct acpi_device_physical_node *pn;
4795                         struct iommu_group *group;
4796                         struct acpi_device *adev;
4797
4798                         if (dev->bus != &acpi_bus_type)
4799                                 continue;
4800
4801                         adev = to_acpi_device(dev);
4802                         mutex_lock(&adev->physical_node_lock);
4803                         list_for_each_entry(pn,
4804                                             &adev->physical_node_list, node) {
4805                                 group = iommu_group_get(pn->dev);
4806                                 if (group) {
4807                                         iommu_group_put(group);
4808                                         continue;
4809                                 }
4810
4811                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4812                                 ret = iommu_probe_device(pn->dev);
4813                                 if (ret)
4814                                         break;
4815                         }
4816                         mutex_unlock(&adev->physical_node_lock);
4817
4818                         if (ret)
4819                                 return ret;
4820                 }
4821         }
4822
4823         return 0;
4824 }
4825
4826 int __init intel_iommu_init(void)
4827 {
4828         int ret = -ENODEV;
4829         struct dmar_drhd_unit *drhd;
4830         struct intel_iommu *iommu;
4831
4832         /*
4833          * Intel IOMMU is required for a TXT/tboot launch or platform
4834          * opt in, so enforce that.
4835          */
4836         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4837
4838         if (iommu_init_mempool()) {
4839                 if (force_on)
4840                         panic("tboot: Failed to initialize iommu memory\n");
4841                 return -ENOMEM;
4842         }
4843
4844         down_write(&dmar_global_lock);
4845         if (dmar_table_init()) {
4846                 if (force_on)
4847                         panic("tboot: Failed to initialize DMAR table\n");
4848                 goto out_free_dmar;
4849         }
4850
4851         if (dmar_dev_scope_init() < 0) {
4852                 if (force_on)
4853                         panic("tboot: Failed to initialize DMAR device scope\n");
4854                 goto out_free_dmar;
4855         }
4856
4857         up_write(&dmar_global_lock);
4858
4859         /*
4860          * The bus notifier takes the dmar_global_lock, so lockdep will
4861          * complain later when we register it under the lock.
4862          */
4863         dmar_register_bus_notifier();
4864
4865         down_write(&dmar_global_lock);
4866
4867         if (!no_iommu)
4868                 intel_iommu_debugfs_init();
4869
4870         if (no_iommu || dmar_disabled) {
4871                 /*
4872                  * We exit the function here to ensure IOMMU's remapping and
4873                  * mempool aren't setup, which means that the IOMMU's PMRs
4874                  * won't be disabled via the call to init_dmars(). So disable
4875                  * it explicitly here. The PMRs were setup by tboot prior to
4876                  * calling SENTER, but the kernel is expected to reset/tear
4877                  * down the PMRs.
4878                  */
4879                 if (intel_iommu_tboot_noforce) {
4880                         for_each_iommu(iommu, drhd)
4881                                 iommu_disable_protect_mem_regions(iommu);
4882                 }
4883
4884                 /*
4885                  * Make sure the IOMMUs are switched off, even when we
4886                  * boot into a kexec kernel and the previous kernel left
4887                  * them enabled
4888                  */
4889                 intel_disable_iommus();
4890                 goto out_free_dmar;
4891         }
4892
4893         if (list_empty(&dmar_rmrr_units))
4894                 pr_info("No RMRR found\n");
4895
4896         if (list_empty(&dmar_atsr_units))
4897                 pr_info("No ATSR found\n");
4898
4899         if (dmar_init_reserved_ranges()) {
4900                 if (force_on)
4901                         panic("tboot: Failed to reserve iommu ranges\n");
4902                 goto out_free_reserved_range;
4903         }
4904
4905         if (dmar_map_gfx)
4906                 intel_iommu_gfx_mapped = 1;
4907
4908         init_no_remapping_devices();
4909
4910         ret = init_dmars();
4911         if (ret) {
4912                 if (force_on)
4913                         panic("tboot: Failed to initialize DMARs\n");
4914                 pr_err("Initialization failed\n");
4915                 goto out_free_reserved_range;
4916         }
4917         up_write(&dmar_global_lock);
4918
4919         init_iommu_pm_ops();
4920
4921         down_read(&dmar_global_lock);
4922         for_each_active_iommu(iommu, drhd) {
4923                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4924                                        intel_iommu_groups,
4925                                        "%s", iommu->name);
4926                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4927                 iommu_device_register(&iommu->iommu);
4928         }
4929         up_read(&dmar_global_lock);
4930
4931         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4932         if (si_domain && !hw_pass_through)
4933                 register_memory_notifier(&intel_iommu_memory_nb);
4934         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4935                           intel_iommu_cpu_dead);
4936
4937         down_read(&dmar_global_lock);
4938         if (probe_acpi_namespace_devices())
4939                 pr_warn("ACPI name space devices didn't probe correctly\n");
4940
4941         /* Finally, we enable the DMA remapping hardware. */
4942         for_each_iommu(iommu, drhd) {
4943                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4944                         iommu_enable_translation(iommu);
4945
4946                 iommu_disable_protect_mem_regions(iommu);
4947         }
4948         up_read(&dmar_global_lock);
4949
4950         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4951
4952         intel_iommu_enabled = 1;
4953
4954         return 0;
4955
4956 out_free_reserved_range:
4957         put_iova_domain(&reserved_iova_list);
4958 out_free_dmar:
4959         intel_iommu_free_dmars();
4960         up_write(&dmar_global_lock);
4961         iommu_exit_mempool();
4962         return ret;
4963 }
4964
4965 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4966 {
4967         struct intel_iommu *iommu = opaque;
4968
4969         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4970         return 0;
4971 }
4972
4973 /*
4974  * NB - intel-iommu lacks any sort of reference counting for the users of
4975  * dependent devices.  If multiple endpoints have intersecting dependent
4976  * devices, unbinding the driver from any one of them will possibly leave
4977  * the others unable to operate.
4978  */
4979 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4980 {
4981         if (!iommu || !dev || !dev_is_pci(dev))
4982                 return;
4983
4984         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4985 }
4986
4987 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4988 {
4989         struct dmar_domain *domain;
4990         struct intel_iommu *iommu;
4991         unsigned long flags;
4992
4993         assert_spin_locked(&device_domain_lock);
4994
4995         if (WARN_ON(!info))
4996                 return;
4997
4998         iommu = info->iommu;
4999         domain = info->domain;
5000
5001         if (info->dev) {
5002                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5003                         intel_pasid_tear_down_entry(iommu, info->dev,
5004                                         PASID_RID2PASID, false);
5005
5006                 iommu_disable_dev_iotlb(info);
5007                 if (!dev_is_real_dma_subdevice(info->dev))
5008                         domain_context_clear(iommu, info->dev);
5009                 intel_pasid_free_table(info->dev);
5010         }
5011
5012         unlink_domain_info(info);
5013
5014         spin_lock_irqsave(&iommu->lock, flags);
5015         domain_detach_iommu(domain, iommu);
5016         spin_unlock_irqrestore(&iommu->lock, flags);
5017
5018         free_devinfo_mem(info);
5019 }
5020
5021 static void dmar_remove_one_dev_info(struct device *dev)
5022 {
5023         struct device_domain_info *info;
5024         unsigned long flags;
5025
5026         spin_lock_irqsave(&device_domain_lock, flags);
5027         info = get_domain_info(dev);
5028         if (info)
5029                 __dmar_remove_one_dev_info(info);
5030         spin_unlock_irqrestore(&device_domain_lock, flags);
5031 }
5032
5033 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5034 {
5035         int adjust_width;
5036
5037         /* calculate AGAW */
5038         domain->gaw = guest_width;
5039         adjust_width = guestwidth_to_adjustwidth(guest_width);
5040         domain->agaw = width_to_agaw(adjust_width);
5041
5042         domain->iommu_coherency = 0;
5043         domain->iommu_snooping = 0;
5044         domain->iommu_superpage = 0;
5045         domain->max_addr = 0;
5046
5047         /* always allocate the top pgd */
5048         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5049         if (!domain->pgd)
5050                 return -ENOMEM;
5051         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5052         return 0;
5053 }
5054
5055 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5056 {
5057         init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5058         copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5059
5060         if (!intel_iommu_strict &&
5061             init_iova_flush_queue(&dmar_domain->iovad,
5062                                   iommu_flush_iova, iova_entry_free))
5063                 pr_info("iova flush queue initialization failed\n");
5064 }
5065
5066 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5067 {
5068         struct dmar_domain *dmar_domain;
5069         struct iommu_domain *domain;
5070
5071         switch (type) {
5072         case IOMMU_DOMAIN_DMA:
5073         case IOMMU_DOMAIN_UNMANAGED:
5074                 dmar_domain = alloc_domain(0);
5075                 if (!dmar_domain) {
5076                         pr_err("Can't allocate dmar_domain\n");
5077                         return NULL;
5078                 }
5079                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5080                         pr_err("Domain initialization failed\n");
5081                         domain_exit(dmar_domain);
5082                         return NULL;
5083                 }
5084
5085                 if (type == IOMMU_DOMAIN_DMA)
5086                         intel_init_iova_domain(dmar_domain);
5087
5088                 domain_update_iommu_cap(dmar_domain);
5089
5090                 domain = &dmar_domain->domain;
5091                 domain->geometry.aperture_start = 0;
5092                 domain->geometry.aperture_end   =
5093                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5094                 domain->geometry.force_aperture = true;
5095
5096                 return domain;
5097         case IOMMU_DOMAIN_IDENTITY:
5098                 return &si_domain->domain;
5099         default:
5100                 return NULL;
5101         }
5102
5103         return NULL;
5104 }
5105
5106 static void intel_iommu_domain_free(struct iommu_domain *domain)
5107 {
5108         if (domain != &si_domain->domain)
5109                 domain_exit(to_dmar_domain(domain));
5110 }
5111
5112 /*
5113  * Check whether a @domain could be attached to the @dev through the
5114  * aux-domain attach/detach APIs.
5115  */
5116 static inline bool
5117 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5118 {
5119         struct device_domain_info *info = get_domain_info(dev);
5120
5121         return info && info->auxd_enabled &&
5122                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5123 }
5124
5125 static void auxiliary_link_device(struct dmar_domain *domain,
5126                                   struct device *dev)
5127 {
5128         struct device_domain_info *info = get_domain_info(dev);
5129
5130         assert_spin_locked(&device_domain_lock);
5131         if (WARN_ON(!info))
5132                 return;
5133
5134         domain->auxd_refcnt++;
5135         list_add(&domain->auxd, &info->auxiliary_domains);
5136 }
5137
5138 static void auxiliary_unlink_device(struct dmar_domain *domain,
5139                                     struct device *dev)
5140 {
5141         struct device_domain_info *info = get_domain_info(dev);
5142
5143         assert_spin_locked(&device_domain_lock);
5144         if (WARN_ON(!info))
5145                 return;
5146
5147         list_del(&domain->auxd);
5148         domain->auxd_refcnt--;
5149
5150         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5151                 ioasid_free(domain->default_pasid);
5152 }
5153
5154 static int aux_domain_add_dev(struct dmar_domain *domain,
5155                               struct device *dev)
5156 {
5157         int ret;
5158         unsigned long flags;
5159         struct intel_iommu *iommu;
5160
5161         iommu = device_to_iommu(dev, NULL, NULL);
5162         if (!iommu)
5163                 return -ENODEV;
5164
5165         if (domain->default_pasid <= 0) {
5166                 int pasid;
5167
5168                 /* No private data needed for the default pasid */
5169                 pasid = ioasid_alloc(NULL, PASID_MIN,
5170                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5171                                      NULL);
5172                 if (pasid == INVALID_IOASID) {
5173                         pr_err("Can't allocate default pasid\n");
5174                         return -ENODEV;
5175                 }
5176                 domain->default_pasid = pasid;
5177         }
5178
5179         spin_lock_irqsave(&device_domain_lock, flags);
5180         /*
5181          * iommu->lock must be held to attach domain to iommu and setup the
5182          * pasid entry for second level translation.
5183          */
5184         spin_lock(&iommu->lock);
5185         ret = domain_attach_iommu(domain, iommu);
5186         if (ret)
5187                 goto attach_failed;
5188
5189         /* Setup the PASID entry for mediated devices: */
5190         if (domain_use_first_level(domain))
5191                 ret = domain_setup_first_level(iommu, domain, dev,
5192                                                domain->default_pasid);
5193         else
5194                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5195                                                      domain->default_pasid);
5196         if (ret)
5197                 goto table_failed;
5198         spin_unlock(&iommu->lock);
5199
5200         auxiliary_link_device(domain, dev);
5201
5202         spin_unlock_irqrestore(&device_domain_lock, flags);
5203
5204         return 0;
5205
5206 table_failed:
5207         domain_detach_iommu(domain, iommu);
5208 attach_failed:
5209         spin_unlock(&iommu->lock);
5210         spin_unlock_irqrestore(&device_domain_lock, flags);
5211         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5212                 ioasid_free(domain->default_pasid);
5213
5214         return ret;
5215 }
5216
5217 static void aux_domain_remove_dev(struct dmar_domain *domain,
5218                                   struct device *dev)
5219 {
5220         struct device_domain_info *info;
5221         struct intel_iommu *iommu;
5222         unsigned long flags;
5223
5224         if (!is_aux_domain(dev, &domain->domain))
5225                 return;
5226
5227         spin_lock_irqsave(&device_domain_lock, flags);
5228         info = get_domain_info(dev);
5229         iommu = info->iommu;
5230
5231         auxiliary_unlink_device(domain, dev);
5232
5233         spin_lock(&iommu->lock);
5234         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5235         domain_detach_iommu(domain, iommu);
5236         spin_unlock(&iommu->lock);
5237
5238         spin_unlock_irqrestore(&device_domain_lock, flags);
5239 }
5240
5241 static int prepare_domain_attach_device(struct iommu_domain *domain,
5242                                         struct device *dev)
5243 {
5244         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5245         struct intel_iommu *iommu;
5246         int addr_width;
5247
5248         iommu = device_to_iommu(dev, NULL, NULL);
5249         if (!iommu)
5250                 return -ENODEV;
5251
5252         /* check if this iommu agaw is sufficient for max mapped address */
5253         addr_width = agaw_to_width(iommu->agaw);
5254         if (addr_width > cap_mgaw(iommu->cap))
5255                 addr_width = cap_mgaw(iommu->cap);
5256
5257         if (dmar_domain->max_addr > (1LL << addr_width)) {
5258                 dev_err(dev, "%s: iommu width (%d) is not "
5259                         "sufficient for the mapped address (%llx)\n",
5260                         __func__, addr_width, dmar_domain->max_addr);
5261                 return -EFAULT;
5262         }
5263         dmar_domain->gaw = addr_width;
5264
5265         /*
5266          * Knock out extra levels of page tables if necessary
5267          */
5268         while (iommu->agaw < dmar_domain->agaw) {
5269                 struct dma_pte *pte;
5270
5271                 pte = dmar_domain->pgd;
5272                 if (dma_pte_present(pte)) {
5273                         dmar_domain->pgd = (struct dma_pte *)
5274                                 phys_to_virt(dma_pte_addr(pte));
5275                         free_pgtable_page(pte);
5276                 }
5277                 dmar_domain->agaw--;
5278         }
5279
5280         return 0;
5281 }
5282
5283 static int intel_iommu_attach_device(struct iommu_domain *domain,
5284                                      struct device *dev)
5285 {
5286         int ret;
5287
5288         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5289             device_is_rmrr_locked(dev)) {
5290                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5291                 return -EPERM;
5292         }
5293
5294         if (is_aux_domain(dev, domain))
5295                 return -EPERM;
5296
5297         /* normally dev is not mapped */
5298         if (unlikely(domain_context_mapped(dev))) {
5299                 struct dmar_domain *old_domain;
5300
5301                 old_domain = find_domain(dev);
5302                 if (old_domain)
5303                         dmar_remove_one_dev_info(dev);
5304         }
5305
5306         ret = prepare_domain_attach_device(domain, dev);
5307         if (ret)
5308                 return ret;
5309
5310         return domain_add_dev_info(to_dmar_domain(domain), dev);
5311 }
5312
5313 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5314                                          struct device *dev)
5315 {
5316         int ret;
5317
5318         if (!is_aux_domain(dev, domain))
5319                 return -EPERM;
5320
5321         ret = prepare_domain_attach_device(domain, dev);
5322         if (ret)
5323                 return ret;
5324
5325         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5326 }
5327
5328 static void intel_iommu_detach_device(struct iommu_domain *domain,
5329                                       struct device *dev)
5330 {
5331         dmar_remove_one_dev_info(dev);
5332 }
5333
5334 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5335                                           struct device *dev)
5336 {
5337         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5338 }
5339
5340 /*
5341  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5342  * VT-d granularity. Invalidation is typically included in the unmap operation
5343  * as a result of DMA or VFIO unmap. However, for assigned devices guest
5344  * owns the first level page tables. Invalidations of translation caches in the
5345  * guest are trapped and passed down to the host.
5346  *
5347  * vIOMMU in the guest will only expose first level page tables, therefore
5348  * we do not support IOTLB granularity for request without PASID (second level).
5349  *
5350  * For example, to find the VT-d granularity encoding for IOTLB
5351  * type and page selective granularity within PASID:
5352  * X: indexed by iommu cache type
5353  * Y: indexed by enum iommu_inv_granularity
5354  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5355  */
5356
5357 static const int
5358 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5359         /*
5360          * PASID based IOTLB invalidation: PASID selective (per PASID),
5361          * page selective (address granularity)
5362          */
5363         {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5364         /* PASID based dev TLBs */
5365         {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5366         /* PASID cache */
5367         {-EINVAL, -EINVAL, -EINVAL}
5368 };
5369
5370 static inline int to_vtd_granularity(int type, int granu)
5371 {
5372         return inv_type_granu_table[type][granu];
5373 }
5374
5375 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5376 {
5377         u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5378
5379         /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
5380          * IOMMU cache invalidate API passes granu_size in bytes, and number of
5381          * granu size in contiguous memory.
5382          */
5383         return order_base_2(nr_pages);
5384 }
5385
5386 #ifdef CONFIG_INTEL_IOMMU_SVM
5387 static int
5388 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5389                            struct iommu_cache_invalidate_info *inv_info)
5390 {
5391         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5392         struct device_domain_info *info;
5393         struct intel_iommu *iommu;
5394         unsigned long flags;
5395         int cache_type;
5396         u8 bus, devfn;
5397         u16 did, sid;
5398         int ret = 0;
5399         u64 size = 0;
5400
5401         if (!inv_info || !dmar_domain ||
5402             inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
5403                 return -EINVAL;
5404
5405         if (!dev || !dev_is_pci(dev))
5406                 return -ENODEV;
5407
5408         iommu = device_to_iommu(dev, &bus, &devfn);
5409         if (!iommu)
5410                 return -ENODEV;
5411
5412         if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5413                 return -EINVAL;
5414
5415         spin_lock_irqsave(&device_domain_lock, flags);
5416         spin_lock(&iommu->lock);
5417         info = get_domain_info(dev);
5418         if (!info) {
5419                 ret = -EINVAL;
5420                 goto out_unlock;
5421         }
5422         did = dmar_domain->iommu_did[iommu->seq_id];
5423         sid = PCI_DEVID(bus, devfn);
5424
5425         /* Size is only valid in address selective invalidation */
5426         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5427                 size = to_vtd_size(inv_info->addr_info.granule_size,
5428                                    inv_info->addr_info.nb_granules);
5429
5430         for_each_set_bit(cache_type,
5431                          (unsigned long *)&inv_info->cache,
5432                          IOMMU_CACHE_INV_TYPE_NR) {
5433                 int granu = 0;
5434                 u64 pasid = 0;
5435                 u64 addr = 0;
5436
5437                 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5438                 if (granu == -EINVAL) {
5439                         pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5440                                            cache_type, inv_info->granularity);
5441                         break;
5442                 }
5443
5444                 /*
5445                  * PASID is stored in different locations based on the
5446                  * granularity.
5447                  */
5448                 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5449                     (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5450                         pasid = inv_info->pasid_info.pasid;
5451                 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5452                          (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5453                         pasid = inv_info->addr_info.pasid;
5454
5455                 switch (BIT(cache_type)) {
5456                 case IOMMU_CACHE_INV_TYPE_IOTLB:
5457                         /* HW will ignore LSB bits based on address mask */
5458                         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5459                             size &&
5460                             (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5461                                 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5462                                                    inv_info->addr_info.addr, size);
5463                         }
5464
5465                         /*
5466                          * If granu is PASID-selective, address is ignored.
5467                          * We use npages = -1 to indicate that.
5468                          */
5469                         qi_flush_piotlb(iommu, did, pasid,
5470                                         mm_to_dma_pfn(inv_info->addr_info.addr),
5471                                         (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5472                                         inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5473
5474                         if (!info->ats_enabled)
5475                                 break;
5476                         /*
5477                          * Always flush device IOTLB if ATS is enabled. vIOMMU
5478                          * in the guest may assume IOTLB flush is inclusive,
5479                          * which is more efficient.
5480                          */
5481                         fallthrough;
5482                 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5483                         /*
5484                          * PASID based device TLB invalidation does not support
5485                          * IOMMU_INV_GRANU_PASID granularity but only supports
5486                          * IOMMU_INV_GRANU_ADDR.
5487                          * The equivalent of that is we set the size to be the
5488                          * entire range of 64 bit. User only provides PASID info
5489                          * without address info. So we set addr to 0.
5490                          */
5491                         if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5492                                 size = 64 - VTD_PAGE_SHIFT;
5493                                 addr = 0;
5494                         } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5495                                 addr = inv_info->addr_info.addr;
5496                         }
5497
5498                         if (info->ats_enabled)
5499                                 qi_flush_dev_iotlb_pasid(iommu, sid,
5500                                                 info->pfsid, pasid,
5501                                                 info->ats_qdep, addr,
5502                                                 size);
5503                         else
5504                                 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5505                         break;
5506                 default:
5507                         dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5508                                             cache_type);
5509                         ret = -EINVAL;
5510                 }
5511         }
5512 out_unlock:
5513         spin_unlock(&iommu->lock);
5514         spin_unlock_irqrestore(&device_domain_lock, flags);
5515
5516         return ret;
5517 }
5518 #endif
5519
5520 static int intel_iommu_map(struct iommu_domain *domain,
5521                            unsigned long iova, phys_addr_t hpa,
5522                            size_t size, int iommu_prot, gfp_t gfp)
5523 {
5524         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5525         u64 max_addr;
5526         int prot = 0;
5527         int ret;
5528
5529         if (iommu_prot & IOMMU_READ)
5530                 prot |= DMA_PTE_READ;
5531         if (iommu_prot & IOMMU_WRITE)
5532                 prot |= DMA_PTE_WRITE;
5533         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5534                 prot |= DMA_PTE_SNP;
5535
5536         max_addr = iova + size;
5537         if (dmar_domain->max_addr < max_addr) {
5538                 u64 end;
5539
5540                 /* check if minimum agaw is sufficient for mapped address */
5541                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5542                 if (end < max_addr) {
5543                         pr_err("%s: iommu width (%d) is not "
5544                                "sufficient for the mapped address (%llx)\n",
5545                                __func__, dmar_domain->gaw, max_addr);
5546                         return -EFAULT;
5547                 }
5548                 dmar_domain->max_addr = max_addr;
5549         }
5550         /* Round up size to next multiple of PAGE_SIZE, if it and
5551            the low bits of hpa would take us onto the next page */
5552         size = aligned_nrpages(hpa, size);
5553         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5554                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5555         return ret;
5556 }
5557
5558 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5559                                 unsigned long iova, size_t size,
5560                                 struct iommu_iotlb_gather *gather)
5561 {
5562         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5563         struct page *freelist = NULL;
5564         unsigned long start_pfn, last_pfn;
5565         unsigned int npages;
5566         int iommu_id, level = 0;
5567
5568         /* Cope with horrid API which requires us to unmap more than the
5569            size argument if it happens to be a large-page mapping. */
5570         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5571
5572         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5573                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5574
5575         start_pfn = iova >> VTD_PAGE_SHIFT;
5576         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5577
5578         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5579
5580         npages = last_pfn - start_pfn + 1;
5581
5582         for_each_domain_iommu(iommu_id, dmar_domain)
5583                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5584                                       start_pfn, npages, !freelist, 0);
5585
5586         dma_free_pagelist(freelist);
5587
5588         if (dmar_domain->max_addr == iova + size)
5589                 dmar_domain->max_addr = iova;
5590
5591         return size;
5592 }
5593
5594 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5595                                             dma_addr_t iova)
5596 {
5597         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5598         struct dma_pte *pte;
5599         int level = 0;
5600         u64 phys = 0;
5601
5602         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5603         if (pte && dma_pte_present(pte))
5604                 phys = dma_pte_addr(pte) +
5605                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5606                                                 VTD_PAGE_SHIFT) - 1));
5607
5608         return phys;
5609 }
5610
5611 static inline bool scalable_mode_support(void)
5612 {
5613         struct dmar_drhd_unit *drhd;
5614         struct intel_iommu *iommu;
5615         bool ret = true;
5616
5617         rcu_read_lock();
5618         for_each_active_iommu(iommu, drhd) {
5619                 if (!sm_supported(iommu)) {
5620                         ret = false;
5621                         break;
5622                 }
5623         }
5624         rcu_read_unlock();
5625
5626         return ret;
5627 }
5628
5629 static inline bool iommu_pasid_support(void)
5630 {
5631         struct dmar_drhd_unit *drhd;
5632         struct intel_iommu *iommu;
5633         bool ret = true;
5634
5635         rcu_read_lock();
5636         for_each_active_iommu(iommu, drhd) {
5637                 if (!pasid_supported(iommu)) {
5638                         ret = false;
5639                         break;
5640                 }
5641         }
5642         rcu_read_unlock();
5643
5644         return ret;
5645 }
5646
5647 static inline bool nested_mode_support(void)
5648 {
5649         struct dmar_drhd_unit *drhd;
5650         struct intel_iommu *iommu;
5651         bool ret = true;
5652
5653         rcu_read_lock();
5654         for_each_active_iommu(iommu, drhd) {
5655                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5656                         ret = false;
5657                         break;
5658                 }
5659         }
5660         rcu_read_unlock();
5661
5662         return ret;
5663 }
5664
5665 static bool intel_iommu_capable(enum iommu_cap cap)
5666 {
5667         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5668                 return domain_update_iommu_snooping(NULL) == 1;
5669         if (cap == IOMMU_CAP_INTR_REMAP)
5670                 return irq_remapping_enabled == 1;
5671
5672         return false;
5673 }
5674
5675 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5676 {
5677         struct intel_iommu *iommu;
5678
5679         iommu = device_to_iommu(dev, NULL, NULL);
5680         if (!iommu)
5681                 return ERR_PTR(-ENODEV);
5682
5683         if (translation_pre_enabled(iommu))
5684                 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5685
5686         return &iommu->iommu;
5687 }
5688
5689 static void intel_iommu_release_device(struct device *dev)
5690 {
5691         struct intel_iommu *iommu;
5692
5693         iommu = device_to_iommu(dev, NULL, NULL);
5694         if (!iommu)
5695                 return;
5696
5697         dmar_remove_one_dev_info(dev);
5698
5699         set_dma_ops(dev, NULL);
5700 }
5701
5702 static void intel_iommu_probe_finalize(struct device *dev)
5703 {
5704         struct iommu_domain *domain;
5705
5706         domain = iommu_get_domain_for_dev(dev);
5707         if (device_needs_bounce(dev))
5708                 set_dma_ops(dev, &bounce_dma_ops);
5709         else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5710                 set_dma_ops(dev, &intel_dma_ops);
5711         else
5712                 set_dma_ops(dev, NULL);
5713 }
5714
5715 static void intel_iommu_get_resv_regions(struct device *device,
5716                                          struct list_head *head)
5717 {
5718         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5719         struct iommu_resv_region *reg;
5720         struct dmar_rmrr_unit *rmrr;
5721         struct device *i_dev;
5722         int i;
5723
5724         down_read(&dmar_global_lock);
5725         for_each_rmrr_units(rmrr) {
5726                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5727                                           i, i_dev) {
5728                         struct iommu_resv_region *resv;
5729                         enum iommu_resv_type type;
5730                         size_t length;
5731
5732                         if (i_dev != device &&
5733                             !is_downstream_to_pci_bridge(device, i_dev))
5734                                 continue;
5735
5736                         length = rmrr->end_address - rmrr->base_address + 1;
5737
5738                         type = device_rmrr_is_relaxable(device) ?
5739                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5740
5741                         resv = iommu_alloc_resv_region(rmrr->base_address,
5742                                                        length, prot, type);
5743                         if (!resv)
5744                                 break;
5745
5746                         list_add_tail(&resv->list, head);
5747                 }
5748         }
5749         up_read(&dmar_global_lock);
5750
5751 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5752         if (dev_is_pci(device)) {
5753                 struct pci_dev *pdev = to_pci_dev(device);
5754
5755                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5756                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5757                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5758                         if (reg)
5759                                 list_add_tail(&reg->list, head);
5760                 }
5761         }
5762 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5763
5764         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5765                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5766                                       0, IOMMU_RESV_MSI);
5767         if (!reg)
5768                 return;
5769         list_add_tail(&reg->list, head);
5770 }
5771
5772 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5773 {
5774         struct device_domain_info *info;
5775         struct context_entry *context;
5776         struct dmar_domain *domain;
5777         unsigned long flags;
5778         u64 ctx_lo;
5779         int ret;
5780
5781         domain = find_domain(dev);
5782         if (!domain)
5783                 return -EINVAL;
5784
5785         spin_lock_irqsave(&device_domain_lock, flags);
5786         spin_lock(&iommu->lock);
5787
5788         ret = -EINVAL;
5789         info = get_domain_info(dev);
5790         if (!info || !info->pasid_supported)
5791                 goto out;
5792
5793         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5794         if (WARN_ON(!context))
5795                 goto out;
5796
5797         ctx_lo = context[0].lo;
5798
5799         if (!(ctx_lo & CONTEXT_PASIDE)) {
5800                 ctx_lo |= CONTEXT_PASIDE;
5801                 context[0].lo = ctx_lo;
5802                 wmb();
5803                 iommu->flush.flush_context(iommu,
5804                                            domain->iommu_did[iommu->seq_id],
5805                                            PCI_DEVID(info->bus, info->devfn),
5806                                            DMA_CCMD_MASK_NOBIT,
5807                                            DMA_CCMD_DEVICE_INVL);
5808         }
5809
5810         /* Enable PASID support in the device, if it wasn't already */
5811         if (!info->pasid_enabled)
5812                 iommu_enable_dev_iotlb(info);
5813
5814         ret = 0;
5815
5816  out:
5817         spin_unlock(&iommu->lock);
5818         spin_unlock_irqrestore(&device_domain_lock, flags);
5819
5820         return ret;
5821 }
5822
5823 static void intel_iommu_apply_resv_region(struct device *dev,
5824                                           struct iommu_domain *domain,
5825                                           struct iommu_resv_region *region)
5826 {
5827         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5828         unsigned long start, end;
5829
5830         start = IOVA_PFN(region->start);
5831         end   = IOVA_PFN(region->start + region->length - 1);
5832
5833         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5834 }
5835
5836 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5837 {
5838         if (dev_is_pci(dev))
5839                 return pci_device_group(dev);
5840         return generic_device_group(dev);
5841 }
5842
5843 static int intel_iommu_enable_auxd(struct device *dev)
5844 {
5845         struct device_domain_info *info;
5846         struct intel_iommu *iommu;
5847         unsigned long flags;
5848         int ret;
5849
5850         iommu = device_to_iommu(dev, NULL, NULL);
5851         if (!iommu || dmar_disabled)
5852                 return -EINVAL;
5853
5854         if (!sm_supported(iommu) || !pasid_supported(iommu))
5855                 return -EINVAL;
5856
5857         ret = intel_iommu_enable_pasid(iommu, dev);
5858         if (ret)
5859                 return -ENODEV;
5860
5861         spin_lock_irqsave(&device_domain_lock, flags);
5862         info = get_domain_info(dev);
5863         info->auxd_enabled = 1;
5864         spin_unlock_irqrestore(&device_domain_lock, flags);
5865
5866         return 0;
5867 }
5868
5869 static int intel_iommu_disable_auxd(struct device *dev)
5870 {
5871         struct device_domain_info *info;
5872         unsigned long flags;
5873
5874         spin_lock_irqsave(&device_domain_lock, flags);
5875         info = get_domain_info(dev);
5876         if (!WARN_ON(!info))
5877                 info->auxd_enabled = 0;
5878         spin_unlock_irqrestore(&device_domain_lock, flags);
5879
5880         return 0;
5881 }
5882
5883 /*
5884  * A PCI express designated vendor specific extended capability is defined
5885  * in the section 3.7 of Intel scalable I/O virtualization technical spec
5886  * for system software and tools to detect endpoint devices supporting the
5887  * Intel scalable IO virtualization without host driver dependency.
5888  *
5889  * Returns the address of the matching extended capability structure within
5890  * the device's PCI configuration space or 0 if the device does not support
5891  * it.
5892  */
5893 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5894 {
5895         int pos;
5896         u16 vendor, id;
5897
5898         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5899         while (pos) {
5900                 pci_read_config_word(pdev, pos + 4, &vendor);
5901                 pci_read_config_word(pdev, pos + 8, &id);
5902                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5903                         return pos;
5904
5905                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5906         }
5907
5908         return 0;
5909 }
5910
5911 static bool
5912 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5913 {
5914         if (feat == IOMMU_DEV_FEAT_AUX) {
5915                 int ret;
5916
5917                 if (!dev_is_pci(dev) || dmar_disabled ||
5918                     !scalable_mode_support() || !iommu_pasid_support())
5919                         return false;
5920
5921                 ret = pci_pasid_features(to_pci_dev(dev));
5922                 if (ret < 0)
5923                         return false;
5924
5925                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5926         }
5927
5928         if (feat == IOMMU_DEV_FEAT_SVA) {
5929                 struct device_domain_info *info = get_domain_info(dev);
5930
5931                 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5932                         info->pasid_supported && info->pri_supported &&
5933                         info->ats_supported;
5934         }
5935
5936         return false;
5937 }
5938
5939 static int
5940 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5941 {
5942         if (feat == IOMMU_DEV_FEAT_AUX)
5943                 return intel_iommu_enable_auxd(dev);
5944
5945         if (feat == IOMMU_DEV_FEAT_SVA) {
5946                 struct device_domain_info *info = get_domain_info(dev);
5947
5948                 if (!info)
5949                         return -EINVAL;
5950
5951                 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5952                         return 0;
5953         }
5954
5955         return -ENODEV;
5956 }
5957
5958 static int
5959 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5960 {
5961         if (feat == IOMMU_DEV_FEAT_AUX)
5962                 return intel_iommu_disable_auxd(dev);
5963
5964         return -ENODEV;
5965 }
5966
5967 static bool
5968 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5969 {
5970         struct device_domain_info *info = get_domain_info(dev);
5971
5972         if (feat == IOMMU_DEV_FEAT_AUX)
5973                 return scalable_mode_support() && info && info->auxd_enabled;
5974
5975         return false;
5976 }
5977
5978 static int
5979 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5980 {
5981         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5982
5983         return dmar_domain->default_pasid > 0 ?
5984                         dmar_domain->default_pasid : -EINVAL;
5985 }
5986
5987 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5988                                            struct device *dev)
5989 {
5990         return attach_deferred(dev);
5991 }
5992
5993 static int
5994 intel_iommu_domain_set_attr(struct iommu_domain *domain,
5995                             enum iommu_attr attr, void *data)
5996 {
5997         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5998         unsigned long flags;
5999         int ret = 0;
6000
6001         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6002                 return -EINVAL;
6003
6004         switch (attr) {
6005         case DOMAIN_ATTR_NESTING:
6006                 spin_lock_irqsave(&device_domain_lock, flags);
6007                 if (nested_mode_support() &&
6008                     list_empty(&dmar_domain->devices)) {
6009                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6010                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6011                 } else {
6012                         ret = -ENODEV;
6013                 }
6014                 spin_unlock_irqrestore(&device_domain_lock, flags);
6015                 break;
6016         default:
6017                 ret = -EINVAL;
6018                 break;
6019         }
6020
6021         return ret;
6022 }
6023
6024 /*
6025  * Check that the device does not live on an external facing PCI port that is
6026  * marked as untrusted. Such devices should not be able to apply quirks and
6027  * thus not be able to bypass the IOMMU restrictions.
6028  */
6029 static bool risky_device(struct pci_dev *pdev)
6030 {
6031         if (pdev->untrusted) {
6032                 pci_info(pdev,
6033                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6034                          pdev->vendor, pdev->device);
6035                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6036                 return true;
6037         }
6038         return false;
6039 }
6040
6041 const struct iommu_ops intel_iommu_ops = {
6042         .capable                = intel_iommu_capable,
6043         .domain_alloc           = intel_iommu_domain_alloc,
6044         .domain_free            = intel_iommu_domain_free,
6045         .domain_set_attr        = intel_iommu_domain_set_attr,
6046         .attach_dev             = intel_iommu_attach_device,
6047         .detach_dev             = intel_iommu_detach_device,
6048         .aux_attach_dev         = intel_iommu_aux_attach_device,
6049         .aux_detach_dev         = intel_iommu_aux_detach_device,
6050         .aux_get_pasid          = intel_iommu_aux_get_pasid,
6051         .map                    = intel_iommu_map,
6052         .unmap                  = intel_iommu_unmap,
6053         .iova_to_phys           = intel_iommu_iova_to_phys,
6054         .probe_device           = intel_iommu_probe_device,
6055         .probe_finalize         = intel_iommu_probe_finalize,
6056         .release_device         = intel_iommu_release_device,
6057         .get_resv_regions       = intel_iommu_get_resv_regions,
6058         .put_resv_regions       = generic_iommu_put_resv_regions,
6059         .apply_resv_region      = intel_iommu_apply_resv_region,
6060         .device_group           = intel_iommu_device_group,
6061         .dev_has_feat           = intel_iommu_dev_has_feat,
6062         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6063         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6064         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6065         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6066         .def_domain_type        = device_def_domain_type,
6067         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6068 #ifdef CONFIG_INTEL_IOMMU_SVM
6069         .cache_invalidate       = intel_iommu_sva_invalidate,
6070         .sva_bind_gpasid        = intel_svm_bind_gpasid,
6071         .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
6072         .sva_bind               = intel_svm_bind,
6073         .sva_unbind             = intel_svm_unbind,
6074         .sva_get_pasid          = intel_svm_get_pasid,
6075         .page_response          = intel_svm_page_response,
6076 #endif
6077 };
6078
6079 static void quirk_iommu_igfx(struct pci_dev *dev)
6080 {
6081         if (risky_device(dev))
6082                 return;
6083
6084         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6085         dmar_map_gfx = 0;
6086 }
6087
6088 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6089 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6090 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6091 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6092 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6093 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6094 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6095 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6096
6097 /* Broadwell igfx malfunctions with dmar */
6098 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6099 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6100 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6101 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6102 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6103 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6104 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6105 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6106 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6107 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6108 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6109 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6110 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6111 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6112 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6113 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6114 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6115 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6116 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6117 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6118 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6119 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6120 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6121 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6122
6123 static void quirk_iommu_rwbf(struct pci_dev *dev)
6124 {
6125         if (risky_device(dev))
6126                 return;
6127
6128         /*
6129          * Mobile 4 Series Chipset neglects to set RWBF capability,
6130          * but needs it. Same seems to hold for the desktop versions.
6131          */
6132         pci_info(dev, "Forcing write-buffer flush capability\n");
6133         rwbf_quirk = 1;
6134 }
6135
6136 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6137 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6138 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6139 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6140 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6141 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6142 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6143
6144 #define GGC 0x52
6145 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6146 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6147 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6148 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6149 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6150 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6151 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6152 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6153
6154 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6155 {
6156         unsigned short ggc;
6157
6158         if (risky_device(dev))
6159                 return;
6160
6161         if (pci_read_config_word(dev, GGC, &ggc))
6162                 return;
6163
6164         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6165                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6166                 dmar_map_gfx = 0;
6167         } else if (dmar_map_gfx) {
6168                 /* we have to ensure the gfx device is idle before we flush */
6169                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6170                 intel_iommu_strict = 1;
6171        }
6172 }
6173 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6174 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6175 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6176 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6177
6178 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6179 {
6180         unsigned short ver;
6181
6182         if (!IS_GFX_DEVICE(dev))
6183                 return;
6184
6185         ver = (dev->device >> 8) & 0xff;
6186         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6187             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6188             ver != 0x9a)
6189                 return;
6190
6191         if (risky_device(dev))
6192                 return;
6193
6194         pci_info(dev, "Skip IOMMU disabling for graphics\n");
6195         iommu_skip_te_disable = 1;
6196 }
6197 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6198
6199 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6200    ISOCH DMAR unit for the Azalia sound device, but not give it any
6201    TLB entries, which causes it to deadlock. Check for that.  We do
6202    this in a function called from init_dmars(), instead of in a PCI
6203    quirk, because we don't want to print the obnoxious "BIOS broken"
6204    message if VT-d is actually disabled.
6205 */
6206 static void __init check_tylersburg_isoch(void)
6207 {
6208         struct pci_dev *pdev;
6209         uint32_t vtisochctrl;
6210
6211         /* If there's no Azalia in the system anyway, forget it. */
6212         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6213         if (!pdev)
6214                 return;
6215
6216         if (risky_device(pdev)) {
6217                 pci_dev_put(pdev);
6218                 return;
6219         }
6220
6221         pci_dev_put(pdev);
6222
6223         /* System Management Registers. Might be hidden, in which case
6224            we can't do the sanity check. But that's OK, because the
6225            known-broken BIOSes _don't_ actually hide it, so far. */
6226         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6227         if (!pdev)
6228                 return;
6229
6230         if (risky_device(pdev)) {
6231                 pci_dev_put(pdev);
6232                 return;
6233         }
6234
6235         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6236                 pci_dev_put(pdev);
6237                 return;
6238         }
6239
6240         pci_dev_put(pdev);
6241
6242         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6243         if (vtisochctrl & 1)
6244                 return;
6245
6246         /* Drop all bits other than the number of TLB entries */
6247         vtisochctrl &= 0x1c;
6248
6249         /* If we have the recommended number of TLB entries (16), fine. */
6250         if (vtisochctrl == 0x10)
6251                 return;
6252
6253         /* Zero TLB entries? You get to ride the short bus to school. */
6254         if (!vtisochctrl) {
6255                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6256                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6257                      dmi_get_system_info(DMI_BIOS_VENDOR),
6258                      dmi_get_system_info(DMI_BIOS_VERSION),
6259                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6260                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6261                 return;
6262         }
6263
6264         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6265                vtisochctrl);
6266 }