1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
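/*
 * For illustration: with gaw == 48 (4-level paging), __DOMAIN_MAX_PFN(48)
 * is (1ULL << 36) - 1, i.e. the last 4KiB page frame below the 256TiB
 * boundary, and DOMAIN_MAX_ADDR(48) is that PFN shifted back up by
 * VTD_PAGE_SHIFT.
 */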
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is a power-of-two multiple of 4KiB and
96  * that the mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are a power-of-two multiple of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
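/*
 * In this bitmap, bit N set means a page size of (1UL << N) bytes is
 * supported. ~0xFFFUL sets every bit from 12 upwards, i.e. all
 * power-of-two sizes from 4KiB up, matching the comment above.
 */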
105
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
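/*
 * Rough mapping implied by the helpers above: agaw 1 -> 3-level paging
 * and a 39-bit address width, agaw 2 -> 4-level/48-bit, agaw 3 ->
 * 5-level/57-bit, with the width capped at MAX_AGAW_WIDTH.
 */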
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline u64 level_mask(int level)
132 {
133         return -1ULL << level_to_offset_bits(level);
134 }
135
136 static inline u64 level_size(int level)
137 {
138         return 1ULL << level_to_offset_bits(level);
139 }
140
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
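/*
 * With LEVEL_STRIDE == 9 and 4KiB VT-d pages, a level-1 PTE maps one
 * 4KiB page, a level-2 entry covers 512 pages (2MiB) and a level-3
 * entry covers 512 * 512 pages (1GiB); lvl_to_nr_pages() clamps the
 * shift to MAX_AGAW_PFN_WIDTH.
 */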
150
151 /* VT-d pages must never be larger than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
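/*
 * Note: when PAGE_SHIFT == VTD_PAGE_SHIFT (the common 4KiB-page case)
 * the two conversions above are identity operations; with a larger
 * kernel PAGE_SIZE one MM page spans several VT-d pages, which is why
 * VT-d pages may never be larger than MM pages.
 */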
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
179  * (used when the kernel is launched with TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
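/*
 * A root_entry is two u64s (16 bytes), so a 4KiB root table holds
 * ROOT_ENTRY_NR == 256 entries, one per PCI bus number.
 */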
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
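/*
 * Bit layout used by the context_entry helpers above: in the low
 * qword, bit 0 is Present, bit 1 disables fault processing, bits 2-3
 * select the translation type and bits 12-63 hold the page-table
 * root; bit 11 tracks PASID enable. In the high qword, bits 0-2
 * encode the address width, bit 3 is used here as a "copied from the
 * old kernel" marker and bits 8-23 hold the domain ID.
 */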
289
290 /*
291  * This domain is a static identity mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 #define for_each_domain_iommu(idx, domain)                      \
300         for (idx = 0; idx < g_num_of_iommus; idx++)             \
301                 if (domain->iommu_refcnt[idx])
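/*
 * for_each_domain_iommu() walks the indices of all IOMMUs that hold a
 * reference on @domain; a typical use is
 *      for_each_domain_iommu(i, domain)
 *              do_something(g_iommus[i]);
 */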
302
303 struct dmar_rmrr_unit {
304         struct list_head list;          /* list of rmrr units   */
305         struct acpi_dmar_header *hdr;   /* ACPI header          */
306         u64     base_address;           /* reserved base address*/
307         u64     end_address;            /* reserved end address */
308         struct dmar_dev_scope *devices; /* target devices */
309         int     devices_cnt;            /* target device count */
310 };
311
312 struct dmar_atsr_unit {
313         struct list_head list;          /* list of ATSR units */
314         struct acpi_dmar_header *hdr;   /* ACPI header */
315         struct dmar_dev_scope *devices; /* target devices */
316         int devices_cnt;                /* target device count */
317         u8 include_all:1;               /* include all ports */
318 };
319
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322
323 #define for_each_rmrr_units(rmrr) \
324         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325
326 /* number of IOMMUs in the system, used to size and index g_iommus */
327 static int g_num_of_iommus;
328
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334                                      struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336                                             dma_addr_t iova);
337
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
360
361 #define IDENTMAP_GFX            2
362 #define IDENTMAP_AZALIA         4
363
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
369 {
370         struct device_domain_info *info;
371
372         if (!dev)
373                 return NULL;
374
375         info = dev_iommu_priv_get(dev);
376         if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
377                 return NULL;
378
379         return info;
380 }
381
382 DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
384
385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
386                                 to_pci_dev(d)->untrusted)
387
388 /*
389  * Iterate over elements in device_domain_list and call the specified
390  * callback @fn against each element.
391  */
392 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393                                      void *data), void *data)
394 {
395         int ret = 0;
396         unsigned long flags;
397         struct device_domain_info *info;
398
399         spin_lock_irqsave(&device_domain_lock, flags);
400         list_for_each_entry(info, &device_domain_list, global) {
401                 ret = fn(info, data);
402                 if (ret) {
403                         spin_unlock_irqrestore(&device_domain_lock, flags);
404                         return ret;
405                 }
406         }
407         spin_unlock_irqrestore(&device_domain_lock, flags);
408
409         return 0;
410 }
411
412 const struct iommu_ops intel_iommu_ops;
413
414 static bool translation_pre_enabled(struct intel_iommu *iommu)
415 {
416         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 }
418
419 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
420 {
421         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 }
423
424 static void init_translation_status(struct intel_iommu *iommu)
425 {
426         u32 gsts;
427
428         gsts = readl(iommu->reg + DMAR_GSTS_REG);
429         if (gsts & DMA_GSTS_TES)
430                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 }
432
433 static int __init intel_iommu_setup(char *str)
434 {
435         if (!str)
436                 return -EINVAL;
437         while (*str) {
438                 if (!strncmp(str, "on", 2)) {
439                         dmar_disabled = 0;
440                         pr_info("IOMMU enabled\n");
441                 } else if (!strncmp(str, "off", 3)) {
442                         dmar_disabled = 1;
443                         no_platform_optin = 1;
444                         pr_info("IOMMU disabled\n");
445                 } else if (!strncmp(str, "igfx_off", 8)) {
446                         dmar_map_gfx = 0;
447                         pr_info("Disable GFX device mapping\n");
448                 } else if (!strncmp(str, "forcedac", 8)) {
449                         pr_info("Forcing DAC for PCI devices\n");
450                         dmar_forcedac = 1;
451                 } else if (!strncmp(str, "strict", 6)) {
452                         pr_info("Disable batched IOTLB flush\n");
453                         intel_iommu_strict = 1;
454                 } else if (!strncmp(str, "sp_off", 6)) {
455                         pr_info("Disable supported super page\n");
456                         intel_iommu_superpage = 0;
457                 } else if (!strncmp(str, "sm_on", 5)) {
458                         pr_info("Intel-IOMMU: scalable mode supported\n");
459                         intel_iommu_sm = 1;
460                 } else if (!strncmp(str, "tboot_noforce", 13)) {
461                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462                         intel_iommu_tboot_noforce = 1;
463                 } else if (!strncmp(str, "nobounce", 8)) {
464                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
465                         intel_no_bounce = 1;
466                 }
467
468                 str += strcspn(str, ",");
469                 while (*str == ',')
470                         str++;
471         }
472         return 0;
473 }
474 __setup("intel_iommu=", intel_iommu_setup);
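/*
 * Example (options are parsed as a comma-separated list, see above):
 *      intel_iommu=on,sm_on,strict
 * enables the IOMMU, turns on scalable mode and disables batched
 * IOTLB flushing.
 */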
475
476 static struct kmem_cache *iommu_domain_cache;
477 static struct kmem_cache *iommu_devinfo_cache;
478
479 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
480 {
481         struct dmar_domain **domains;
482         int idx = did >> 8;
483
484         domains = iommu->domains[idx];
485         if (!domains)
486                 return NULL;
487
488         return domains[did & 0xff];
489 }
490
491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492                              struct dmar_domain *domain)
493 {
494         struct dmar_domain **domains;
495         int idx = did >> 8;
496
497         if (!iommu->domains[idx]) {
498                 size_t size = 256 * sizeof(struct dmar_domain *);
499                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500         }
501
502         domains = iommu->domains[idx];
503         if (WARN_ON(!domains))
504                 return;
505         else
506                 domains[did & 0xff] = domain;
507 }
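/*
 * Domain IDs are looked up through a two-level table: iommu->domains
 * is an array of pointers to 256-entry pages, indexed by the high
 * byte of the DID; the low byte indexes within the page. Pages are
 * allocated lazily in set_iommu_domain() above.
 */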
508
509 void *alloc_pgtable_page(int node)
510 {
511         struct page *page;
512         void *vaddr = NULL;
513
514         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515         if (page)
516                 vaddr = page_address(page);
517         return vaddr;
518 }
519
520 void free_pgtable_page(void *vaddr)
521 {
522         free_page((unsigned long)vaddr);
523 }
524
525 static inline void *alloc_domain_mem(void)
526 {
527         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528 }
529
530 static void free_domain_mem(void *vaddr)
531 {
532         kmem_cache_free(iommu_domain_cache, vaddr);
533 }
534
535 static inline void *alloc_devinfo_mem(void)
536 {
537         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538 }
539
540 static inline void free_devinfo_mem(void *vaddr)
541 {
542         kmem_cache_free(iommu_devinfo_cache, vaddr);
543 }
544
545 static inline int domain_type_is_si(struct dmar_domain *domain)
546 {
547         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
548 }
549
550 static inline bool domain_use_first_level(struct dmar_domain *domain)
551 {
552         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
553 }
554
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556                                        unsigned long pfn)
557 {
558         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559
560         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564 {
565         unsigned long sagaw;
566         int agaw = -1;
567
568         sagaw = cap_sagaw(iommu->cap);
569         for (agaw = width_to_agaw(max_gaw);
570              agaw >= 0; agaw--) {
571                 if (test_bit(agaw, &sagaw))
572                         break;
573         }
574
575         return agaw;
576 }
577
578 /*
579  * Calculate max SAGAW for each iommu.
580  */
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582 {
583         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584 }
585
586 /*
587  * calculate agaw for each iommu.
588  * "SAGAW" may be different across iommus; use a default agaw and fall
589  * back to a smaller supported agaw for iommus that don't support the default.
590  */
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
592 {
593         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594 }
595
596 /* This function only returns a single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598 {
599         int iommu_id;
600
601         /* si_domain and vm domain should not get here. */
602         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603                 return NULL;
604
605         for_each_domain_iommu(iommu_id, domain)
606                 break;
607
608         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609                 return NULL;
610
611         return g_iommus[iommu_id];
612 }
613
614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
615 {
616         return sm_supported(iommu) ?
617                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
618 }
619
620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
621 {
622         struct dmar_drhd_unit *drhd;
623         struct intel_iommu *iommu;
624         bool found = false;
625         int i;
626
627         domain->iommu_coherency = 1;
628
629         for_each_domain_iommu(i, domain) {
630                 found = true;
631                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
632                         domain->iommu_coherency = 0;
633                         break;
634                 }
635         }
636         if (found)
637                 return;
638
639         /* No hardware attached; use lowest common denominator */
640         rcu_read_lock();
641         for_each_active_iommu(iommu, drhd) {
642                 if (!iommu_paging_structure_coherency(iommu)) {
643                         domain->iommu_coherency = 0;
644                         break;
645                 }
646         }
647         rcu_read_unlock();
648 }
649
650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
651 {
652         struct dmar_drhd_unit *drhd;
653         struct intel_iommu *iommu;
654         int ret = 1;
655
656         rcu_read_lock();
657         for_each_active_iommu(iommu, drhd) {
658                 if (iommu != skip) {
659                         if (!ecap_sc_support(iommu->ecap)) {
660                                 ret = 0;
661                                 break;
662                         }
663                 }
664         }
665         rcu_read_unlock();
666
667         return ret;
668 }
669
670 static int domain_update_iommu_superpage(struct dmar_domain *domain,
671                                          struct intel_iommu *skip)
672 {
673         struct dmar_drhd_unit *drhd;
674         struct intel_iommu *iommu;
675         int mask = 0x3;
676
677         if (!intel_iommu_superpage) {
678                 return 0;
679         }
680
681         /* set iommu_superpage to the smallest common denominator */
682         rcu_read_lock();
683         for_each_active_iommu(iommu, drhd) {
684                 if (iommu != skip) {
685                         if (domain && domain_use_first_level(domain)) {
686                                 if (!cap_fl1gp_support(iommu->cap))
687                                         mask = 0x1;
688                         } else {
689                                 mask &= cap_super_page_val(iommu->cap);
690                         }
691
692                         if (!mask)
693                                 break;
694                 }
695         }
696         rcu_read_unlock();
697
698         return fls(mask);
699 }
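/*
 * The value returned above is fls() of the common superpage mask: bit 0
 * of the mask corresponds to 2MiB (21-bit) pages and bit 1 to 1GiB
 * (30-bit) pages, so 0 means no superpage support, 1 means 2MiB only,
 * and 2 means superpages up to 1GiB.
 */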
700
701 /* Some capabilities may be different across iommus */
702 static void domain_update_iommu_cap(struct dmar_domain *domain)
703 {
704         domain_update_iommu_coherency(domain);
705         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
706         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
707 }
708
709 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
710                                          u8 devfn, int alloc)
711 {
712         struct root_entry *root = &iommu->root_entry[bus];
713         struct context_entry *context;
714         u64 *entry;
715
716         entry = &root->lo;
717         if (sm_supported(iommu)) {
718                 if (devfn >= 0x80) {
719                         devfn -= 0x80;
720                         entry = &root->hi;
721                 }
722                 devfn *= 2;
723         }
724         if (*entry & 1)
725                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
726         else {
727                 unsigned long phy_addr;
728                 if (!alloc)
729                         return NULL;
730
731                 context = alloc_pgtable_page(iommu->node);
732                 if (!context)
733                         return NULL;
734
735                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
736                 phy_addr = virt_to_phys((void *)context);
737                 *entry = phy_addr | 1;
738                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
739         }
740         return &context[devfn];
741 }
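/*
 * In scalable mode each root entry is split in two: the low half
 * covers devfn 0-127 and the high half covers devfn 128-255, and each
 * scalable-mode context entry occupies two legacy-sized slots, hence
 * the devfn *= 2 adjustment above.
 */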
742
743 static bool attach_deferred(struct device *dev)
744 {
745         return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
746 }
747
748 /**
749  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
750  *                               sub-hierarchy of a candidate PCI-PCI bridge
751  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
752  * @bridge: the candidate PCI-PCI bridge
753  *
754  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
755  */
756 static bool
757 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
758 {
759         struct pci_dev *pdev, *pbridge;
760
761         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
762                 return false;
763
764         pdev = to_pci_dev(dev);
765         pbridge = to_pci_dev(bridge);
766
767         if (pbridge->subordinate &&
768             pbridge->subordinate->number <= pdev->bus->number &&
769             pbridge->subordinate->busn_res.end >= pdev->bus->number)
770                 return true;
771
772         return false;
773 }
774
775 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
776 {
777         struct dmar_drhd_unit *drhd;
778         u32 vtbar;
779         int rc;
780
781         /* We know that this device on this chipset has its own IOMMU.
782          * If we find it under a different IOMMU, then the BIOS is lying
783          * to us. Hope that the IOMMU for this device is actually
784          * disabled, and it needs no translation...
785          */
786         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
787         if (rc) {
788                 /* "can't" happen */
789                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
790                 return false;
791         }
792         vtbar &= 0xffff0000;
793
794         /* we know that this iommu should be at offset 0xa000 from vtbar */
795         drhd = dmar_find_matched_drhd_unit(pdev);
796         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
797                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
798                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
799                 return true;
800         }
801
802         return false;
803 }
804
805 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
806 {
807         if (!iommu || iommu->drhd->ignored)
808                 return true;
809
810         if (dev_is_pci(dev)) {
811                 struct pci_dev *pdev = to_pci_dev(dev);
812
813                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
814                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
815                     quirk_ioat_snb_local_iommu(pdev))
816                         return true;
817         }
818
819         return false;
820 }
821
822 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
823 {
824         struct dmar_drhd_unit *drhd = NULL;
825         struct pci_dev *pdev = NULL;
826         struct intel_iommu *iommu;
827         struct device *tmp;
828         u16 segment = 0;
829         int i;
830
831         if (!dev)
832                 return NULL;
833
834         if (dev_is_pci(dev)) {
835                 struct pci_dev *pf_pdev;
836
837                 pdev = pci_real_dma_dev(to_pci_dev(dev));
838
839                 /* VFs aren't listed in scope tables; we need to look up
840                  * the PF instead to find the IOMMU. */
841                 pf_pdev = pci_physfn(pdev);
842                 dev = &pf_pdev->dev;
843                 segment = pci_domain_nr(pdev->bus);
844         } else if (has_acpi_companion(dev))
845                 dev = &ACPI_COMPANION(dev)->dev;
846
847         rcu_read_lock();
848         for_each_iommu(iommu, drhd) {
849                 if (pdev && segment != drhd->segment)
850                         continue;
851
852                 for_each_active_dev_scope(drhd->devices,
853                                           drhd->devices_cnt, i, tmp) {
854                         if (tmp == dev) {
855                                 /* For a VF use its original BDF# not that of the PF
856                                  * which we used for the IOMMU lookup. Strictly speaking
857                                  * we could do this for all PCI devices; we only need to
858                                  * get the BDF# from the scope table for ACPI matches. */
859                                 if (pdev && pdev->is_virtfn)
860                                         goto got_pdev;
861
862                                 if (bus && devfn) {
863                                         *bus = drhd->devices[i].bus;
864                                         *devfn = drhd->devices[i].devfn;
865                                 }
866                                 goto out;
867                         }
868
869                         if (is_downstream_to_pci_bridge(dev, tmp))
870                                 goto got_pdev;
871                 }
872
873                 if (pdev && drhd->include_all) {
874                 got_pdev:
875                         if (bus && devfn) {
876                                 *bus = pdev->bus->number;
877                                 *devfn = pdev->devfn;
878                         }
879                         goto out;
880                 }
881         }
882         iommu = NULL;
883  out:
884         if (iommu_is_dummy(iommu, dev))
885                 iommu = NULL;
886
887         rcu_read_unlock();
888
889         return iommu;
890 }
891
892 static void domain_flush_cache(struct dmar_domain *domain,
893                                void *addr, int size)
894 {
895         if (!domain->iommu_coherency)
896                 clflush_cache_range(addr, size);
897 }
898
899 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
900 {
901         struct context_entry *context;
902         int ret = 0;
903         unsigned long flags;
904
905         spin_lock_irqsave(&iommu->lock, flags);
906         context = iommu_context_addr(iommu, bus, devfn, 0);
907         if (context)
908                 ret = context_present(context);
909         spin_unlock_irqrestore(&iommu->lock, flags);
910         return ret;
911 }
912
913 static void free_context_table(struct intel_iommu *iommu)
914 {
915         int i;
916         unsigned long flags;
917         struct context_entry *context;
918
919         spin_lock_irqsave(&iommu->lock, flags);
920         if (!iommu->root_entry) {
921                 goto out;
922         }
923         for (i = 0; i < ROOT_ENTRY_NR; i++) {
924                 context = iommu_context_addr(iommu, i, 0, 0);
925                 if (context)
926                         free_pgtable_page(context);
927
928                 if (!sm_supported(iommu))
929                         continue;
930
931                 context = iommu_context_addr(iommu, i, 0x80, 0);
932                 if (context)
933                         free_pgtable_page(context);
934
935         }
936         free_pgtable_page(iommu->root_entry);
937         iommu->root_entry = NULL;
938 out:
939         spin_unlock_irqrestore(&iommu->lock, flags);
940 }
941
942 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
943                                       unsigned long pfn, int *target_level)
944 {
945         struct dma_pte *parent, *pte;
946         int level = agaw_to_level(domain->agaw);
947         int offset;
948
949         BUG_ON(!domain->pgd);
950
951         if (!domain_pfn_supported(domain, pfn))
952                 /* Address beyond IOMMU's addressing capabilities. */
953                 return NULL;
954
955         parent = domain->pgd;
956
957         while (1) {
958                 void *tmp_page;
959
960                 offset = pfn_level_offset(pfn, level);
961                 pte = &parent[offset];
962                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
963                         break;
964                 if (level == *target_level)
965                         break;
966
967                 if (!dma_pte_present(pte)) {
968                         uint64_t pteval;
969
970                         tmp_page = alloc_pgtable_page(domain->nid);
971
972                         if (!tmp_page)
973                                 return NULL;
974
975                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
976                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
977                         if (domain_use_first_level(domain))
978                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
979                         if (cmpxchg64(&pte->val, 0ULL, pteval))
980                                 /* Someone else set it while we were thinking; use theirs. */
981                                 free_pgtable_page(tmp_page);
982                         else
983                                 domain_flush_cache(domain, pte, sizeof(*pte));
984                 }
985                 if (level == 1)
986                         break;
987
988                 parent = phys_to_virt(dma_pte_addr(pte));
989                 level--;
990         }
991
992         if (!*target_level)
993                 *target_level = level;
994
995         return pte;
996 }
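/*
 * Summary of the *target_level contract above: a zero *target_level
 * means "walk to whatever leaf exists" (stopping at a superpage or a
 * non-present entry), while a non-zero value makes the walk build any
 * missing tables down to exactly that level; on return *target_level
 * holds the level of the returned PTE.
 */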
997
998 /* return the address's pte at a specific level */
999 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1000                                          unsigned long pfn,
1001                                          int level, int *large_page)
1002 {
1003         struct dma_pte *parent, *pte;
1004         int total = agaw_to_level(domain->agaw);
1005         int offset;
1006
1007         parent = domain->pgd;
1008         while (level <= total) {
1009                 offset = pfn_level_offset(pfn, total);
1010                 pte = &parent[offset];
1011                 if (level == total)
1012                         return pte;
1013
1014                 if (!dma_pte_present(pte)) {
1015                         *large_page = total;
1016                         break;
1017                 }
1018
1019                 if (dma_pte_superpage(pte)) {
1020                         *large_page = total;
1021                         return pte;
1022                 }
1023
1024                 parent = phys_to_virt(dma_pte_addr(pte));
1025                 total--;
1026         }
1027         return NULL;
1028 }
1029
1030 /* clear last level pte; a tlb flush should follow */
1031 static void dma_pte_clear_range(struct dmar_domain *domain,
1032                                 unsigned long start_pfn,
1033                                 unsigned long last_pfn)
1034 {
1035         unsigned int large_page;
1036         struct dma_pte *first_pte, *pte;
1037
1038         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1039         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1040         BUG_ON(start_pfn > last_pfn);
1041
1042         /* we don't need lock here; nobody else touches the iova range */
1043         do {
1044                 large_page = 1;
1045                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1046                 if (!pte) {
1047                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1048                         continue;
1049                 }
1050                 do {
1051                         dma_clear_pte(pte);
1052                         start_pfn += lvl_to_nr_pages(large_page);
1053                         pte++;
1054                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1055
1056                 domain_flush_cache(domain, first_pte,
1057                                    (void *)pte - (void *)first_pte);
1058
1059         } while (start_pfn && start_pfn <= last_pfn);
1060 }
1061
1062 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1063                                int retain_level, struct dma_pte *pte,
1064                                unsigned long pfn, unsigned long start_pfn,
1065                                unsigned long last_pfn)
1066 {
1067         pfn = max(start_pfn, pfn);
1068         pte = &pte[pfn_level_offset(pfn, level)];
1069
1070         do {
1071                 unsigned long level_pfn;
1072                 struct dma_pte *level_pte;
1073
1074                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1075                         goto next;
1076
1077                 level_pfn = pfn & level_mask(level);
1078                 level_pte = phys_to_virt(dma_pte_addr(pte));
1079
1080                 if (level > 2) {
1081                         dma_pte_free_level(domain, level - 1, retain_level,
1082                                            level_pte, level_pfn, start_pfn,
1083                                            last_pfn);
1084                 }
1085
1086                 /*
1087                  * Free the page table if we're below the level we want to
1088                  * retain and the range covers the entire table.
1089                  */
1090                 if (level < retain_level && !(start_pfn > level_pfn ||
1091                       last_pfn < level_pfn + level_size(level) - 1)) {
1092                         dma_clear_pte(pte);
1093                         domain_flush_cache(domain, pte, sizeof(*pte));
1094                         free_pgtable_page(level_pte);
1095                 }
1096 next:
1097                 pfn += level_size(level);
1098         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1099 }
1100
1101 /*
1102  * clear last level (leaf) ptes and free page table pages below the
1103  * level we wish to keep intact.
1104  */
1105 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1106                                    unsigned long start_pfn,
1107                                    unsigned long last_pfn,
1108                                    int retain_level)
1109 {
1110         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1111         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1112         BUG_ON(start_pfn > last_pfn);
1113
1114         dma_pte_clear_range(domain, start_pfn, last_pfn);
1115
1116         /* We don't need lock here; nobody else touches the iova range */
1117         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1118                            domain->pgd, 0, start_pfn, last_pfn);
1119
1120         /* free pgd */
1121         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1122                 free_pgtable_page(domain->pgd);
1123                 domain->pgd = NULL;
1124         }
1125 }
1126
1127 /* When a page at a given level is being unlinked from its parent, we don't
1128    need to *modify* it at all. All we need to do is make a list of all the
1129    pages which can be freed just as soon as we've flushed the IOTLB and we
1130    know the hardware page-walk will no longer touch them.
1131    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1132    be freed. */
1133 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1134                                             int level, struct dma_pte *pte,
1135                                             struct page *freelist)
1136 {
1137         struct page *pg;
1138
1139         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1140         pg->freelist = freelist;
1141         freelist = pg;
1142
1143         if (level == 1)
1144                 return freelist;
1145
1146         pte = page_address(pg);
1147         do {
1148                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1149                         freelist = dma_pte_list_pagetables(domain, level - 1,
1150                                                            pte, freelist);
1151                 pte++;
1152         } while (!first_pte_in_page(pte));
1153
1154         return freelist;
1155 }
1156
1157 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1158                                         struct dma_pte *pte, unsigned long pfn,
1159                                         unsigned long start_pfn,
1160                                         unsigned long last_pfn,
1161                                         struct page *freelist)
1162 {
1163         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1164
1165         pfn = max(start_pfn, pfn);
1166         pte = &pte[pfn_level_offset(pfn, level)];
1167
1168         do {
1169                 unsigned long level_pfn;
1170
1171                 if (!dma_pte_present(pte))
1172                         goto next;
1173
1174                 level_pfn = pfn & level_mask(level);
1175
1176                 /* If range covers entire pagetable, free it */
1177                 if (start_pfn <= level_pfn &&
1178                     last_pfn >= level_pfn + level_size(level) - 1) {
1179                         /* These subordinate page tables are going away entirely. Don't
1180                            bother to clear them; we're just going to *free* them. */
1181                         if (level > 1 && !dma_pte_superpage(pte))
1182                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1183
1184                         dma_clear_pte(pte);
1185                         if (!first_pte)
1186                                 first_pte = pte;
1187                         last_pte = pte;
1188                 } else if (level > 1) {
1189                         /* Recurse down into a level that isn't *entirely* obsolete */
1190                         freelist = dma_pte_clear_level(domain, level - 1,
1191                                                        phys_to_virt(dma_pte_addr(pte)),
1192                                                        level_pfn, start_pfn, last_pfn,
1193                                                        freelist);
1194                 }
1195 next:
1196                 pfn += level_size(level);
1197         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1198
1199         if (first_pte)
1200                 domain_flush_cache(domain, first_pte,
1201                                    (void *)++last_pte - (void *)first_pte);
1202
1203         return freelist;
1204 }
1205
1206 /* We can't just free the pages because the IOMMU may still be walking
1207    the page tables, and may have cached the intermediate levels. The
1208    pages can only be freed after the IOTLB flush has been done. */
1209 static struct page *domain_unmap(struct dmar_domain *domain,
1210                                  unsigned long start_pfn,
1211                                  unsigned long last_pfn)
1212 {
1213         struct page *freelist;
1214
1215         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1216         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1217         BUG_ON(start_pfn > last_pfn);
1218
1219         /* we don't need lock here; nobody else touches the iova range */
1220         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1221                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1222
1223         /* free pgd */
1224         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1225                 struct page *pgd_page = virt_to_page(domain->pgd);
1226                 pgd_page->freelist = freelist;
1227                 freelist = pgd_page;
1228
1229                 domain->pgd = NULL;
1230         }
1231
1232         return freelist;
1233 }
1234
1235 static void dma_free_pagelist(struct page *freelist)
1236 {
1237         struct page *pg;
1238
1239         while ((pg = freelist)) {
1240                 freelist = pg->freelist;
1241                 free_pgtable_page(page_address(pg));
1242         }
1243 }
1244
1245 static void iova_entry_free(unsigned long data)
1246 {
1247         struct page *freelist = (struct page *)data;
1248
1249         dma_free_pagelist(freelist);
1250 }
1251
1252 /* iommu handling */
1253 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1254 {
1255         struct root_entry *root;
1256         unsigned long flags;
1257
1258         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1259         if (!root) {
1260                 pr_err("Allocating root entry for %s failed\n",
1261                         iommu->name);
1262                 return -ENOMEM;
1263         }
1264
1265         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1266
1267         spin_lock_irqsave(&iommu->lock, flags);
1268         iommu->root_entry = root;
1269         spin_unlock_irqrestore(&iommu->lock, flags);
1270
1271         return 0;
1272 }
1273
1274 static void iommu_set_root_entry(struct intel_iommu *iommu)
1275 {
1276         u64 addr;
1277         u32 sts;
1278         unsigned long flag;
1279
1280         addr = virt_to_phys(iommu->root_entry);
1281         if (sm_supported(iommu))
1282                 addr |= DMA_RTADDR_SMT;
1283
1284         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1286
1287         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1288
1289         /* Make sure hardware completes it */
1290         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1291                       readl, (sts & DMA_GSTS_RTPS), sts);
1292
1293         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1294 }
1295
1296 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1297 {
1298         u32 val;
1299         unsigned long flag;
1300
1301         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1302                 return;
1303
1304         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1305         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1306
1307         /* Make sure hardware completes it */
1308         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1309                       readl, (!(val & DMA_GSTS_WBFS)), val);
1310
1311         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1312 }
1313
1314 /* return value determines if we need a write buffer flush */
1315 static void __iommu_flush_context(struct intel_iommu *iommu,
1316                                   u16 did, u16 source_id, u8 function_mask,
1317                                   u64 type)
1318 {
1319         u64 val = 0;
1320         unsigned long flag;
1321
1322         switch (type) {
1323         case DMA_CCMD_GLOBAL_INVL:
1324                 val = DMA_CCMD_GLOBAL_INVL;
1325                 break;
1326         case DMA_CCMD_DOMAIN_INVL:
1327                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1328                 break;
1329         case DMA_CCMD_DEVICE_INVL:
1330                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1331                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1332                 break;
1333         default:
1334                 BUG();
1335         }
1336         val |= DMA_CCMD_ICC;
1337
1338         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1339         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1340
1341         /* Make sure hardware completes it */
1342         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1343                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1344
1345         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1346 }
1347
1348 /* return value determines if we need a write buffer flush */
1349 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1350                                 u64 addr, unsigned int size_order, u64 type)
1351 {
1352         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1353         u64 val = 0, val_iva = 0;
1354         unsigned long flag;
1355
1356         switch (type) {
1357         case DMA_TLB_GLOBAL_FLUSH:
1358                 /* global flush doesn't need to set IVA_REG */
1359                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1360                 break;
1361         case DMA_TLB_DSI_FLUSH:
1362                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1363                 break;
1364         case DMA_TLB_PSI_FLUSH:
1365                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1366                 /* IH bit is passed in as part of address */
1367                 val_iva = size_order | addr;
1368                 break;
1369         default:
1370                 BUG();
1371         }
1372         /* Note: set drain read/write */
1373 #if 0
1374         /*
1375          * This is probably meant to be extra safe. It looks like we can
1376          * ignore it without any impact.
1377          */
1378         if (cap_read_drain(iommu->cap))
1379                 val |= DMA_TLB_READ_DRAIN;
1380 #endif
1381         if (cap_write_drain(iommu->cap))
1382                 val |= DMA_TLB_WRITE_DRAIN;
1383
1384         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1385         /* Note: Only uses first TLB reg currently */
1386         if (val_iva)
1387                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1388         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1389
1390         /* Make sure hardware completes it */
1391         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1392                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1393
1394         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1395
1396         /* check IOTLB invalidation granularity */
1397         if (DMA_TLB_IAIG(val) == 0)
1398                 pr_err("Flush IOTLB failed\n");
1399         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1400                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1401                         (unsigned long long)DMA_TLB_IIRG(type),
1402                         (unsigned long long)DMA_TLB_IAIG(val));
1403 }
1404
1405 static struct device_domain_info *
1406 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1407                          u8 bus, u8 devfn)
1408 {
1409         struct device_domain_info *info;
1410
1411         assert_spin_locked(&device_domain_lock);
1412
1413         if (!iommu->qi)
1414                 return NULL;
1415
1416         list_for_each_entry(info, &domain->devices, link)
1417                 if (info->iommu == iommu && info->bus == bus &&
1418                     info->devfn == devfn) {
1419                         if (info->ats_supported && info->dev)
1420                                 return info;
1421                         break;
1422                 }
1423
1424         return NULL;
1425 }
1426
1427 static void domain_update_iotlb(struct dmar_domain *domain)
1428 {
1429         struct device_domain_info *info;
1430         bool has_iotlb_device = false;
1431
1432         assert_spin_locked(&device_domain_lock);
1433
1434         list_for_each_entry(info, &domain->devices, link) {
1435                 struct pci_dev *pdev;
1436
1437                 if (!info->dev || !dev_is_pci(info->dev))
1438                         continue;
1439
1440                 pdev = to_pci_dev(info->dev);
1441                 if (pdev->ats_enabled) {
1442                         has_iotlb_device = true;
1443                         break;
1444                 }
1445         }
1446
1447         domain->has_iotlb_device = has_iotlb_device;
1448 }
1449
1450 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1451 {
1452         struct pci_dev *pdev;
1453
1454         assert_spin_locked(&device_domain_lock);
1455
1456         if (!info || !dev_is_pci(info->dev))
1457                 return;
1458
1459         pdev = to_pci_dev(info->dev);
1460         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1461          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1462          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1463          * reserved, which should be set to 0.
1464          */
1465         if (!ecap_dit(info->iommu->ecap))
1466                 info->pfsid = 0;
1467         else {
1468                 struct pci_dev *pf_pdev;
1469
1470                 /* pdev will be returned if device is not a vf */
1471                 pf_pdev = pci_physfn(pdev);
1472                 info->pfsid = pci_dev_id(pf_pdev);
1473         }
1474
1475 #ifdef CONFIG_INTEL_IOMMU_SVM
1476         /* The PCIe spec, in its wisdom, declares that the behaviour of
1477            the device if you enable PASID support after ATS support is
1478            undefined. So always enable PASID support on devices which
1479            have it, even if we can't yet know if we're ever going to
1480            use it. */
1481         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1482                 info->pasid_enabled = 1;
1483
1484         if (info->pri_supported &&
1485             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1486             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1487                 info->pri_enabled = 1;
1488 #endif
1489         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1490             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1491                 info->ats_enabled = 1;
1492                 domain_update_iotlb(info->domain);
1493                 info->ats_qdep = pci_ats_queue_depth(pdev);
1494         }
1495 }
1496
1497 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1498 {
1499         struct pci_dev *pdev;
1500
1501         assert_spin_locked(&device_domain_lock);
1502
1503         if (!dev_is_pci(info->dev))
1504                 return;
1505
1506         pdev = to_pci_dev(info->dev);
1507
1508         if (info->ats_enabled) {
1509                 pci_disable_ats(pdev);
1510                 info->ats_enabled = 0;
1511                 domain_update_iotlb(info->domain);
1512         }
1513 #ifdef CONFIG_INTEL_IOMMU_SVM
1514         if (info->pri_enabled) {
1515                 pci_disable_pri(pdev);
1516                 info->pri_enabled = 0;
1517         }
1518         if (info->pasid_enabled) {
1519                 pci_disable_pasid(pdev);
1520                 info->pasid_enabled = 0;
1521         }
1522 #endif
1523 }
1524
1525 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1526                                   u64 addr, unsigned mask)
1527 {
1528         u16 sid, qdep;
1529         unsigned long flags;
1530         struct device_domain_info *info;
1531
1532         if (!domain->has_iotlb_device)
1533                 return;
1534
1535         spin_lock_irqsave(&device_domain_lock, flags);
1536         list_for_each_entry(info, &domain->devices, link) {
1537                 if (!info->ats_enabled)
1538                         continue;
1539
1540                 sid = info->bus << 8 | info->devfn;
1541                 qdep = info->ats_qdep;
1542                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1543                                 qdep, addr, mask);
1544         }
1545         spin_unlock_irqrestore(&device_domain_lock, flags);
1546 }
1547
1548 static void domain_flush_piotlb(struct intel_iommu *iommu,
1549                                 struct dmar_domain *domain,
1550                                 u64 addr, unsigned long npages, bool ih)
1551 {
1552         u16 did = domain->iommu_did[iommu->seq_id];
1553
1554         if (domain->default_pasid)
1555                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1556                                 addr, npages, ih);
1557
1558         if (!list_empty(&domain->devices))
1559                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1560 }
1561
1562 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1563                                   struct dmar_domain *domain,
1564                                   unsigned long pfn, unsigned int pages,
1565                                   int ih, int map)
1566 {
1567         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1568         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1569         u16 did = domain->iommu_did[iommu->seq_id];
1570
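        /*
         * mask is the order of the power-of-two rounded page count, i.e. the
         * flush covers 2^mask 4KiB pages starting at addr. A requested
         * invalidation hint is folded into bit 6 of the address below, the
         * IH position of the IOTLB invalidate address field.
         */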
1571         BUG_ON(pages == 0);
1572
1573         if (ih)
1574                 ih = 1 << 6;
1575
1576         if (domain_use_first_level(domain)) {
1577                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1578         } else {
1579                 /*
1580                  * Fallback to domain selective flush if no PSI support or
1581                  * the size is too big. PSI requires page size to be 2 ^ x,
1582                  * and the base address is naturally aligned to the size.
1583                  */
1584                 if (!cap_pgsel_inv(iommu->cap) ||
1585                     mask > cap_max_amask_val(iommu->cap))
1586                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1587                                                         DMA_TLB_DSI_FLUSH);
1588                 else
1589                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1590                                                         DMA_TLB_PSI_FLUSH);
1591         }
1592
1593         /*
1594          * In caching mode, changes of pages from non-present to present require
1595          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1596          */
1597         if (!cap_caching_mode(iommu->cap) || !map)
1598                 iommu_flush_dev_iotlb(domain, addr, mask);
1599 }
1600
1601 /* Notification for newly created mappings */
1602 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1603                                         struct dmar_domain *domain,
1604                                         unsigned long pfn, unsigned int pages)
1605 {
1606         /*
1607          * It's a non-present to present mapping. Only flush if caching mode
1608          * and second level.
1609          */
1610         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1611                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1612         else
1613                 iommu_flush_write_buffer(iommu);
1614 }
1615
1616 static void iommu_flush_iova(struct iova_domain *iovad)
1617 {
1618         struct dmar_domain *domain;
1619         int idx;
1620
1621         domain = container_of(iovad, struct dmar_domain, iovad);
1622
1623         for_each_domain_iommu(idx, domain) {
1624                 struct intel_iommu *iommu = g_iommus[idx];
1625                 u16 did = domain->iommu_did[iommu->seq_id];
1626
1627                 if (domain_use_first_level(domain))
1628                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1629                 else
1630                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1631                                                  DMA_TLB_DSI_FLUSH);
1632
1633                 if (!cap_caching_mode(iommu->cap))
1634                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1635                                               0, MAX_AGAW_PFN_WIDTH);
1636         }
1637 }
1638
1639 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1640 {
1641         u32 pmen;
1642         unsigned long flags;
1643
1644         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1645                 return;
1646
1647         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1648         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1649         pmen &= ~DMA_PMEN_EPM;
1650         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1651
1652         /* wait for the protected region status bit to clear */
1653         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1654                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1655
1656         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1657 }
1658
1659 static void iommu_enable_translation(struct intel_iommu *iommu)
1660 {
1661         u32 sts;
1662         unsigned long flags;
1663
1664         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1665         iommu->gcmd |= DMA_GCMD_TE;
1666         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1667
1668         /* Make sure hardware completes it */
1669         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1670                       readl, (sts & DMA_GSTS_TES), sts);
1671
1672         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1673 }
1674
1675 static void iommu_disable_translation(struct intel_iommu *iommu)
1676 {
1677         u32 sts;
1678         unsigned long flag;
1679
1680         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1681             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1682                 return;
1683
1684         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1685         iommu->gcmd &= ~DMA_GCMD_TE;
1686         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1687
1688         /* Make sure hardware completes it */
1689         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1690                       readl, (!(sts & DMA_GSTS_TES)), sts);
1691
1692         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1693 }
1694
1695 static int iommu_init_domains(struct intel_iommu *iommu)
1696 {
1697         u32 ndomains, nlongs;
1698         size_t size;
1699
1700         ndomains = cap_ndoms(iommu->cap);
1701         pr_debug("%s: Number of Domains supported <%d>\n",
1702                  iommu->name, ndomains);
1703         nlongs = BITS_TO_LONGS(ndomains);
1704
1705         spin_lock_init(&iommu->lock);
1706
1707         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1708         if (!iommu->domain_ids) {
1709                 pr_err("%s: Allocating domain id array failed\n",
1710                        iommu->name);
1711                 return -ENOMEM;
1712         }
1713
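        /*
         * iommu->domains is a two-level array of domain pointers: one
         * 256-entry chunk per 256 domain-ids. Only the first chunk is
         * allocated eagerly here; the remaining chunks are presumably
         * allocated on demand when a domain-id in their range is first used.
         */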
1714         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1715         iommu->domains = kzalloc(size, GFP_KERNEL);
1716
1717         if (iommu->domains) {
1718                 size = 256 * sizeof(struct dmar_domain *);
1719                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1720         }
1721
1722         if (!iommu->domains || !iommu->domains[0]) {
1723                 pr_err("%s: Allocating domain array failed\n",
1724                        iommu->name);
1725                 kfree(iommu->domain_ids);
1726                 kfree(iommu->domains);
1727                 iommu->domain_ids = NULL;
1728                 iommu->domains    = NULL;
1729                 return -ENOMEM;
1730         }
1731
1732         /*
1733          * If Caching mode is set, then invalid translations are tagged
1734          * with domain-id 0, hence we need to pre-allocate it. We also
1735          * use domain-id 0 as a marker for non-allocated domain-id, so
1736          * make sure it is not used for a real domain.
1737          */
1738         set_bit(0, iommu->domain_ids);
1739
1740         /*
1741          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1742          * entry for first-level or pass-through translation modes should
1743          * be programmed with a domain id different from those used for
1744          * second-level or nested translation. We reserve a domain id for
1745          * this purpose.
1746          */
1747         if (sm_supported(iommu))
1748                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1749
1750         return 0;
1751 }
1752
1753 static void disable_dmar_iommu(struct intel_iommu *iommu)
1754 {
1755         struct device_domain_info *info, *tmp;
1756         unsigned long flags;
1757
1758         if (!iommu->domains || !iommu->domain_ids)
1759                 return;
1760
1761         spin_lock_irqsave(&device_domain_lock, flags);
1762         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1763                 if (info->iommu != iommu)
1764                         continue;
1765
1766                 if (!info->dev || !info->domain)
1767                         continue;
1768
1769                 __dmar_remove_one_dev_info(info);
1770         }
1771         spin_unlock_irqrestore(&device_domain_lock, flags);
1772
1773         if (iommu->gcmd & DMA_GCMD_TE)
1774                 iommu_disable_translation(iommu);
1775 }
1776
1777 static void free_dmar_iommu(struct intel_iommu *iommu)
1778 {
1779         if ((iommu->domains) && (iommu->domain_ids)) {
1780                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1781                 int i;
1782
1783                 for (i = 0; i < elems; i++)
1784                         kfree(iommu->domains[i]);
1785                 kfree(iommu->domains);
1786                 kfree(iommu->domain_ids);
1787                 iommu->domains = NULL;
1788                 iommu->domain_ids = NULL;
1789         }
1790
1791         g_iommus[iommu->seq_id] = NULL;
1792
1793         /* free context mapping */
1794         free_context_table(iommu);
1795
1796 #ifdef CONFIG_INTEL_IOMMU_SVM
1797         if (pasid_supported(iommu)) {
1798                 if (ecap_prs(iommu->ecap))
1799                         intel_svm_finish_prq(iommu);
1800         }
1801         if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1802                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1803
1804 #endif
1805 }
1806
1807 /*
1808  * Check and return whether first level is used by default for
1809  * DMA translation.
1810  */
1811 static bool first_level_by_default(void)
1812 {
1813         struct dmar_drhd_unit *drhd;
1814         struct intel_iommu *iommu;
1815         static int first_level_support = -1;
1816
1817         if (likely(first_level_support != -1))
1818                 return first_level_support;
1819
1820         first_level_support = 1;
1821
1822         rcu_read_lock();
1823         for_each_active_iommu(iommu, drhd) {
1824                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1825                         first_level_support = 0;
1826                         break;
1827                 }
1828         }
1829         rcu_read_unlock();
1830
1831         return first_level_support;
1832 }
1833
1834 static struct dmar_domain *alloc_domain(int flags)
1835 {
1836         struct dmar_domain *domain;
1837
1838         domain = alloc_domain_mem();
1839         if (!domain)
1840                 return NULL;
1841
1842         memset(domain, 0, sizeof(*domain));
1843         domain->nid = NUMA_NO_NODE;
1844         domain->flags = flags;
1845         if (first_level_by_default())
1846                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1847         domain->has_iotlb_device = false;
1848         INIT_LIST_HEAD(&domain->devices);
1849
1850         return domain;
1851 }
1852
1853 /* Must be called with iommu->lock */
1854 static int domain_attach_iommu(struct dmar_domain *domain,
1855                                struct intel_iommu *iommu)
1856 {
1857         unsigned long ndomains;
1858         int num;
1859
1860         assert_spin_locked(&device_domain_lock);
1861         assert_spin_locked(&iommu->lock);
1862
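        /*
         * The first attach of this domain to a given IOMMU allocates a
         * domain-id from that IOMMU's bitmap; subsequent attaches only bump
         * the per-IOMMU refcount.
         */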
1863         domain->iommu_refcnt[iommu->seq_id] += 1;
1864         domain->iommu_count += 1;
1865         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1866                 ndomains = cap_ndoms(iommu->cap);
1867                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1868
1869                 if (num >= ndomains) {
1870                         pr_err("%s: No free domain ids\n", iommu->name);
1871                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1872                         domain->iommu_count -= 1;
1873                         return -ENOSPC;
1874                 }
1875
1876                 set_bit(num, iommu->domain_ids);
1877                 set_iommu_domain(iommu, num, domain);
1878
1879                 domain->iommu_did[iommu->seq_id] = num;
1880                 domain->nid                      = iommu->node;
1881
1882                 domain_update_iommu_cap(domain);
1883         }
1884
1885         return 0;
1886 }
1887
1888 static int domain_detach_iommu(struct dmar_domain *domain,
1889                                struct intel_iommu *iommu)
1890 {
1891         int num, count;
1892
1893         assert_spin_locked(&device_domain_lock);
1894         assert_spin_locked(&iommu->lock);
1895
1896         domain->iommu_refcnt[iommu->seq_id] -= 1;
1897         count = --domain->iommu_count;
1898         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1899                 num = domain->iommu_did[iommu->seq_id];
1900                 clear_bit(num, iommu->domain_ids);
1901                 set_iommu_domain(iommu, num, NULL);
1902
1903                 domain_update_iommu_cap(domain);
1904                 domain->iommu_did[iommu->seq_id] = 0;
1905         }
1906
1907         return count;
1908 }
1909
1910 static struct iova_domain reserved_iova_list;
1911 static struct lock_class_key reserved_rbtree_key;
1912
1913 static int dmar_init_reserved_ranges(void)
1914 {
1915         struct pci_dev *pdev = NULL;
1916         struct iova *iova;
1917         int i;
1918
1919         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1920
1921         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1922                 &reserved_rbtree_key);
1923
1924         /* IOAPIC ranges shouldn't be accessed by DMA */
1925         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1926                 IOVA_PFN(IOAPIC_RANGE_END));
1927         if (!iova) {
1928                 pr_err("Reserve IOAPIC range failed\n");
1929                 return -ENODEV;
1930         }
1931
1932         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1933         for_each_pci_dev(pdev) {
1934                 struct resource *r;
1935
1936                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1937                         r = &pdev->resource[i];
1938                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1939                                 continue;
1940                         iova = reserve_iova(&reserved_iova_list,
1941                                             IOVA_PFN(r->start),
1942                                             IOVA_PFN(r->end));
1943                         if (!iova) {
1944                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1945                                 return -ENODEV;
1946                         }
1947                 }
1948         }
1949         return 0;
1950 }
1951
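/*
 * Round the guest address width up to the next adjusted guest address width
 * supported by the page-table layout (12 + 9 * n bits), capped at 64. For
 * example, gaw = 50 gives r = 2 and thus agaw = 50 + 9 - 2 = 57.
 */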
1952 static inline int guestwidth_to_adjustwidth(int gaw)
1953 {
1954         int agaw;
1955         int r = (gaw - 12) % 9;
1956
1957         if (r == 0)
1958                 agaw = gaw;
1959         else
1960                 agaw = gaw + 9 - r;
1961         if (agaw > 64)
1962                 agaw = 64;
1963         return agaw;
1964 }
1965
1966 static void domain_exit(struct dmar_domain *domain)
1967 {
1968
1969         /* Remove associated devices and clear attached or cached domains */
1970         domain_remove_dev_info(domain);
1971
1972         /* destroy iovas */
1973         if (domain->domain.type == IOMMU_DOMAIN_DMA)
1974                 put_iova_domain(&domain->iovad);
1975
1976         if (domain->pgd) {
1977                 struct page *freelist;
1978
1979                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1980                 dma_free_pagelist(freelist);
1981         }
1982
1983         free_domain_mem(domain);
1984 }
1985
1986 /*
1987  * Get the PASID directory size for scalable mode context entry.
1988  * Value of X in the PDTS field of a scalable mode context entry
1989  * indicates PASID directory with 2^(X + 7) entries.
1990  */
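/*
 * For example, a max_pde of 2^14 makes find_first_bit() return 14, so this
 * returns 7 and the PDTS field encodes 2^(7 + 7) = 16384 directory entries.
 */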
1991 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1992 {
1993         int pds, max_pde;
1994
1995         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1996         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1997         if (pds < 7)
1998                 return 0;
1999
2000         return pds - 7;
2001 }
2002
2003 /*
2004  * Set the RID_PASID field of a scalable mode context entry. The
2005  * IOMMU hardware will use the PASID value set in this field for
2006  * DMA translations of DMA requests without PASID.
2007  */
2008 static inline void
2009 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2010 {
2011         context->hi |= pasid & ((1 << 20) - 1);
2012 }
2013
2014 /*
2015  * Set the DTE (Device-TLB Enable) field of a scalable mode context
2016  * entry.
2017  */
2018 static inline void context_set_sm_dte(struct context_entry *context)
2019 {
2020         context->lo |= (1 << 2);
2021 }
2022
2023 /*
2024  * Set the PRE (Page Request Enable) field of a scalable mode context
2025  * entry.
2026  */
2027 static inline void context_set_sm_pre(struct context_entry *context)
2028 {
2029         context->lo |= (1 << 4);
2030 }
2031
2032 /* Convert value to context PASID directory size field coding. */
2033 #define context_pdts(pds)       (((pds) & 0x7) << 9)
2034
2035 static int domain_context_mapping_one(struct dmar_domain *domain,
2036                                       struct intel_iommu *iommu,
2037                                       struct pasid_table *table,
2038                                       u8 bus, u8 devfn)
2039 {
2040         u16 did = domain->iommu_did[iommu->seq_id];
2041         int translation = CONTEXT_TT_MULTI_LEVEL;
2042         struct device_domain_info *info = NULL;
2043         struct context_entry *context;
2044         unsigned long flags;
2045         int ret;
2046
2047         WARN_ON(did == 0);
2048
2049         if (hw_pass_through && domain_type_is_si(domain))
2050                 translation = CONTEXT_TT_PASS_THROUGH;
2051
2052         pr_debug("Set context mapping for %02x:%02x.%d\n",
2053                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2054
2055         BUG_ON(!domain->pgd);
2056
2057         spin_lock_irqsave(&device_domain_lock, flags);
2058         spin_lock(&iommu->lock);
2059
2060         ret = -ENOMEM;
2061         context = iommu_context_addr(iommu, bus, devfn, 1);
2062         if (!context)
2063                 goto out_unlock;
2064
2065         ret = 0;
2066         if (context_present(context))
2067                 goto out_unlock;
2068
2069         /*
2070          * For kdump cases, old valid entries may be cached due to in-flight
2071          * DMA and the copied pgtable, but there is no unmapping behaviour
2072          * for them, so we need an explicit cache flush for the newly-mapped
2073          * device. For kdump, at this point the device is expected to have
2074          * finished its reset during its driver's probe stage, so no
2075          * in-flight DMA will exist, and we don't need to worry about it
2076          * afterwards.
2077          */
2078         if (context_copied(context)) {
2079                 u16 did_old = context_domain_id(context);
2080
2081                 if (did_old < cap_ndoms(iommu->cap)) {
2082                         iommu->flush.flush_context(iommu, did_old,
2083                                                    (((u16)bus) << 8) | devfn,
2084                                                    DMA_CCMD_MASK_NOBIT,
2085                                                    DMA_CCMD_DEVICE_INVL);
2086                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2087                                                  DMA_TLB_DSI_FLUSH);
2088                 }
2089         }
2090
2091         context_clear_entry(context);
2092
2093         if (sm_supported(iommu)) {
2094                 unsigned long pds;
2095
2096                 WARN_ON(!table);
2097
2098                 /* Setup the PASID DIR pointer: */
2099                 pds = context_get_sm_pds(table);
2100                 context->lo = (u64)virt_to_phys(table->table) |
2101                                 context_pdts(pds);
2102
2103                 /* Setup the RID_PASID field: */
2104                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2105
2106                 /*
2107                  * Setup the Device-TLB enable bit and Page request
2108                  * Enable bit:
2109                  */
2110                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2111                 if (info && info->ats_supported)
2112                         context_set_sm_dte(context);
2113                 if (info && info->pri_supported)
2114                         context_set_sm_pre(context);
2115         } else {
2116                 struct dma_pte *pgd = domain->pgd;
2117                 int agaw;
2118
2119                 context_set_domain_id(context, did);
2120
2121                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2122                         /*
2123                          * Skip top levels of page tables for iommu which has
2124                          * less agaw than default. Unnecessary for PT mode.
2125                          */
2126                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2127                                 ret = -ENOMEM;
2128                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2129                                 if (!dma_pte_present(pgd))
2130                                         goto out_unlock;
2131                         }
2132
2133                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2134                         if (info && info->ats_supported)
2135                                 translation = CONTEXT_TT_DEV_IOTLB;
2136                         else
2137                                 translation = CONTEXT_TT_MULTI_LEVEL;
2138
2139                         context_set_address_root(context, virt_to_phys(pgd));
2140                         context_set_address_width(context, agaw);
2141                 } else {
2142                         /*
2143                          * In pass through mode, AW must be programmed to
2144                          * indicate the largest AGAW value supported by
2145                          * hardware. And ASR is ignored by hardware.
2146                          */
2147                         context_set_address_width(context, iommu->msagaw);
2148                 }
2149
2150                 context_set_translation_type(context, translation);
2151         }
2152
2153         context_set_fault_enable(context);
2154         context_set_present(context);
2155         if (!ecap_coherent(iommu->ecap))
2156                 clflush_cache_range(context, sizeof(*context));
2157
2158         /*
2159          * It's a non-present to present mapping. If hardware doesn't cache
2160          * non-present entries we only need to flush the write-buffer. If it
2161          * _does_ cache non-present entries, then it does so in the special
2162          * domain #0, which we have to flush:
2163          */
2164         if (cap_caching_mode(iommu->cap)) {
2165                 iommu->flush.flush_context(iommu, 0,
2166                                            (((u16)bus) << 8) | devfn,
2167                                            DMA_CCMD_MASK_NOBIT,
2168                                            DMA_CCMD_DEVICE_INVL);
2169                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2170         } else {
2171                 iommu_flush_write_buffer(iommu);
2172         }
2173         iommu_enable_dev_iotlb(info);
2174
2175         ret = 0;
2176
2177 out_unlock:
2178         spin_unlock(&iommu->lock);
2179         spin_unlock_irqrestore(&device_domain_lock, flags);
2180
2181         return ret;
2182 }
2183
2184 struct domain_context_mapping_data {
2185         struct dmar_domain *domain;
2186         struct intel_iommu *iommu;
2187         struct pasid_table *table;
2188 };
2189
2190 static int domain_context_mapping_cb(struct pci_dev *pdev,
2191                                      u16 alias, void *opaque)
2192 {
2193         struct domain_context_mapping_data *data = opaque;
2194
2195         return domain_context_mapping_one(data->domain, data->iommu,
2196                                           data->table, PCI_BUS_NUM(alias),
2197                                           alias & 0xff);
2198 }
2199
2200 static int
2201 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2202 {
2203         struct domain_context_mapping_data data;
2204         struct pasid_table *table;
2205         struct intel_iommu *iommu;
2206         u8 bus, devfn;
2207
2208         iommu = device_to_iommu(dev, &bus, &devfn);
2209         if (!iommu)
2210                 return -ENODEV;
2211
2212         table = intel_pasid_get_table(dev);
2213
2214         if (!dev_is_pci(dev))
2215                 return domain_context_mapping_one(domain, iommu, table,
2216                                                   bus, devfn);
2217
2218         data.domain = domain;
2219         data.iommu = iommu;
2220         data.table = table;
2221
2222         return pci_for_each_dma_alias(to_pci_dev(dev),
2223                                       &domain_context_mapping_cb, &data);
2224 }
2225
2226 static int domain_context_mapped_cb(struct pci_dev *pdev,
2227                                     u16 alias, void *opaque)
2228 {
2229         struct intel_iommu *iommu = opaque;
2230
2231         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2232 }
2233
2234 static int domain_context_mapped(struct device *dev)
2235 {
2236         struct intel_iommu *iommu;
2237         u8 bus, devfn;
2238
2239         iommu = device_to_iommu(dev, &bus, &devfn);
2240         if (!iommu)
2241                 return -ENODEV;
2242
2243         if (!dev_is_pci(dev))
2244                 return device_context_mapped(iommu, bus, devfn);
2245
2246         return !pci_for_each_dma_alias(to_pci_dev(dev),
2247                                        domain_context_mapped_cb, iommu);
2248 }
2249
2250 /* Returns the number of VT-d pages, rounded up to the MM page size */
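/*
 * For example, with 4KiB MM pages, a host_addr offset of 0x234 and a size of
 * 0x2000 round up to 0x3000 bytes, i.e. three 4KiB VT-d pages.
 */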
2251 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2252                                             size_t size)
2253 {
2254         host_addr &= ~PAGE_MASK;
2255         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2256 }
2257
2258 /* Return largest possible superpage level for a given mapping */
2259 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2260                                           unsigned long iov_pfn,
2261                                           unsigned long phy_pfn,
2262                                           unsigned long pages)
2263 {
2264         int support, level = 1;
2265         unsigned long pfnmerge;
2266
2267         support = domain->iommu_superpage;
2268
2269         /* To use a large page, the virtual *and* physical addresses
2270            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2271            of them will mean we have to use smaller pages. So just
2272            merge them and check both at once. */
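        /*
         * Each additional level adds another 9-bit stride: level 1 is 4KiB
         * pages, level 2 is 2MiB superpages, level 3 is 1GiB superpages
         * (as far as domain->iommu_superpage allows).
         */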
2273         pfnmerge = iov_pfn | phy_pfn;
2274
2275         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2276                 pages >>= VTD_STRIDE_SHIFT;
2277                 if (!pages)
2278                         break;
2279                 pfnmerge >>= VTD_STRIDE_SHIFT;
2280                 level++;
2281                 support--;
2282         }
2283         return level;
2284 }
2285
2286 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2287                             struct scatterlist *sg, unsigned long phys_pfn,
2288                             unsigned long nr_pages, int prot)
2289 {
2290         struct dma_pte *first_pte = NULL, *pte = NULL;
2291         phys_addr_t pteval;
2292         unsigned long sg_res = 0;
2293         unsigned int largepage_lvl = 0;
2294         unsigned long lvl_pages = 0;
2295         u64 attr;
2296
2297         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2298
2299         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2300                 return -EINVAL;
2301
2302         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2303         if (domain_use_first_level(domain))
2304                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2305
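        /*
         * Without a scatterlist the caller maps a single physically
         * contiguous range, so treat it as one chunk covering all nr_pages.
         */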
2306         if (!sg) {
2307                 sg_res = nr_pages;
2308                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2309         }
2310
2311         while (nr_pages > 0) {
2312                 uint64_t tmp;
2313
2314                 if (!sg_res) {
2315                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2316
2317                         sg_res = aligned_nrpages(sg->offset, sg->length);
2318                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2319                         sg->dma_length = sg->length;
2320                         pteval = (sg_phys(sg) - pgoff) | attr;
2321                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2322                 }
2323
2324                 if (!pte) {
2325                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2326
2327                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2328                         if (!pte)
2329                                 return -ENOMEM;
2330                         /* It is a large page */
2331                         if (largepage_lvl > 1) {
2332                                 unsigned long nr_superpages, end_pfn;
2333
2334                                 pteval |= DMA_PTE_LARGE_PAGE;
2335                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2336
2337                                 nr_superpages = sg_res / lvl_pages;
2338                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2339
2340                                 /*
2341                                  * Ensure that old small page tables are
2342                                  * removed to make room for superpage(s).
2343                                  * We're adding new large pages, so make sure
2344                                  * we don't remove their parent tables.
2345                                  */
2346                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2347                                                        largepage_lvl + 1);
2348                         } else {
2349                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2350                         }
2351
2352                 }
2353                 /* We don't need a lock here; nobody else
2354                  * touches the iova range.
2355                  */
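                /*
                 * cmpxchg64_local() installs the PTE only if it is still
                 * clear; a non-zero old value means this IOVA was already
                 * mapped, which is reported as a bug below.
                 */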
2356                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2357                 if (tmp) {
2358                         static int dumps = 5;
2359                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2360                                 iov_pfn, tmp, (unsigned long long)pteval);
2361                         if (dumps) {
2362                                 dumps--;
2363                                 debug_dma_dump_mappings(NULL);
2364                         }
2365                         WARN_ON(1);
2366                 }
2367
2368                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2369
2370                 BUG_ON(nr_pages < lvl_pages);
2371                 BUG_ON(sg_res < lvl_pages);
2372
2373                 nr_pages -= lvl_pages;
2374                 iov_pfn += lvl_pages;
2375                 phys_pfn += lvl_pages;
2376                 pteval += lvl_pages * VTD_PAGE_SIZE;
2377                 sg_res -= lvl_pages;
2378
2379                 /* If the next PTE would be the first in a new page, then we
2380                    need to flush the cache on the entries we've just written.
2381                    And then we'll need to recalculate 'pte', so clear it and
2382                    let it get set again in the if (!pte) block above.
2383
2384                    If we're done (!nr_pages) we need to flush the cache too.
2385
2386                    Also if we've been setting superpages, we may need to
2387                    recalculate 'pte' and switch back to smaller pages for the
2388                    end of the mapping, if the trailing size is not enough to
2389                    use another superpage (i.e. sg_res < lvl_pages). */
2390                 pte++;
2391                 if (!nr_pages || first_pte_in_page(pte) ||
2392                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2393                         domain_flush_cache(domain, first_pte,
2394                                            (void *)pte - (void *)first_pte);
2395                         pte = NULL;
2396                 }
2397
2398                 if (!sg_res && nr_pages)
2399                         sg = sg_next(sg);
2400         }
2401         return 0;
2402 }
2403
2404 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2405                           struct scatterlist *sg, unsigned long phys_pfn,
2406                           unsigned long nr_pages, int prot)
2407 {
2408         int iommu_id, ret;
2409         struct intel_iommu *iommu;
2410
2411         /* Do the real mapping first */
2412         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2413         if (ret)
2414                 return ret;
2415
2416         for_each_domain_iommu(iommu_id, domain) {
2417                 iommu = g_iommus[iommu_id];
2418                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2419         }
2420
2421         return 0;
2422 }
2423
2424 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2425                                     struct scatterlist *sg, unsigned long nr_pages,
2426                                     int prot)
2427 {
2428         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2429 }
2430
2431 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2432                                      unsigned long phys_pfn, unsigned long nr_pages,
2433                                      int prot)
2434 {
2435         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2436 }
2437
2438 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2439 {
2440         unsigned long flags;
2441         struct context_entry *context;
2442         u16 did_old;
2443
2444         if (!iommu)
2445                 return;
2446
2447         spin_lock_irqsave(&iommu->lock, flags);
2448         context = iommu_context_addr(iommu, bus, devfn, 0);
2449         if (!context) {
2450                 spin_unlock_irqrestore(&iommu->lock, flags);
2451                 return;
2452         }
2453         did_old = context_domain_id(context);
2454         context_clear_entry(context);
2455         __iommu_flush_cache(iommu, context, sizeof(*context));
2456         spin_unlock_irqrestore(&iommu->lock, flags);
2457         iommu->flush.flush_context(iommu,
2458                                    did_old,
2459                                    (((u16)bus) << 8) | devfn,
2460                                    DMA_CCMD_MASK_NOBIT,
2461                                    DMA_CCMD_DEVICE_INVL);
2462         iommu->flush.flush_iotlb(iommu,
2463                                  did_old,
2464                                  0,
2465                                  0,
2466                                  DMA_TLB_DSI_FLUSH);
2467 }
2468
2469 static inline void unlink_domain_info(struct device_domain_info *info)
2470 {
2471         assert_spin_locked(&device_domain_lock);
2472         list_del(&info->link);
2473         list_del(&info->global);
2474         if (info->dev)
2475                 dev_iommu_priv_set(info->dev, NULL);
2476 }
2477
2478 static void domain_remove_dev_info(struct dmar_domain *domain)
2479 {
2480         struct device_domain_info *info, *tmp;
2481         unsigned long flags;
2482
2483         spin_lock_irqsave(&device_domain_lock, flags);
2484         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2485                 __dmar_remove_one_dev_info(info);
2486         spin_unlock_irqrestore(&device_domain_lock, flags);
2487 }
2488
2489 struct dmar_domain *find_domain(struct device *dev)
2490 {
2491         struct device_domain_info *info;
2492
2493         if (unlikely(attach_deferred(dev)))
2494                 return NULL;
2495
2496         /* No lock here, assumes no domain exit in normal case */
2497         info = get_domain_info(dev);
2498         if (likely(info))
2499                 return info->domain;
2500
2501         return NULL;
2502 }
2503
2504 static void do_deferred_attach(struct device *dev)
2505 {
2506         struct iommu_domain *domain;
2507
2508         dev_iommu_priv_set(dev, NULL);
2509         domain = iommu_get_domain_for_dev(dev);
2510         if (domain)
2511                 intel_iommu_attach_device(domain, dev);
2512 }
2513
2514 static inline struct device_domain_info *
2515 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2516 {
2517         struct device_domain_info *info;
2518
2519         list_for_each_entry(info, &device_domain_list, global)
2520                 if (info->segment == segment && info->bus == bus &&
2521                     info->devfn == devfn)
2522                         return info;
2523
2524         return NULL;
2525 }
2526
2527 static int domain_setup_first_level(struct intel_iommu *iommu,
2528                                     struct dmar_domain *domain,
2529                                     struct device *dev,
2530                                     u32 pasid)
2531 {
2532         int flags = PASID_FLAG_SUPERVISOR_MODE;
2533         struct dma_pte *pgd = domain->pgd;
2534         int agaw, level;
2535
2536         /*
2537          * Skip top levels of page tables for iommu which has
2538          * less agaw than default. Unnecessary for PT mode.
2539          */
2540         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2541                 pgd = phys_to_virt(dma_pte_addr(pgd));
2542                 if (!dma_pte_present(pgd))
2543                         return -ENOMEM;
2544         }
2545
2546         level = agaw_to_level(agaw);
2547         if (level != 4 && level != 5)
2548                 return -EINVAL;
2549
2550         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2551
2552         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2553                                              domain->iommu_did[iommu->seq_id],
2554                                              flags);
2555 }
2556
2557 static bool dev_is_real_dma_subdevice(struct device *dev)
2558 {
2559         return dev && dev_is_pci(dev) &&
2560                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2561 }
2562
2563 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2564                                                     int bus, int devfn,
2565                                                     struct device *dev,
2566                                                     struct dmar_domain *domain)
2567 {
2568         struct dmar_domain *found = NULL;
2569         struct device_domain_info *info;
2570         unsigned long flags;
2571         int ret;
2572
2573         info = alloc_devinfo_mem();
2574         if (!info)
2575                 return NULL;
2576
2577         if (!dev_is_real_dma_subdevice(dev)) {
2578                 info->bus = bus;
2579                 info->devfn = devfn;
2580                 info->segment = iommu->segment;
2581         } else {
2582                 struct pci_dev *pdev = to_pci_dev(dev);
2583
2584                 info->bus = pdev->bus->number;
2585                 info->devfn = pdev->devfn;
2586                 info->segment = pci_domain_nr(pdev->bus);
2587         }
2588
2589         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2590         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2591         info->ats_qdep = 0;
2592         info->dev = dev;
2593         info->domain = domain;
2594         info->iommu = iommu;
2595         info->pasid_table = NULL;
2596         info->auxd_enabled = 0;
2597         INIT_LIST_HEAD(&info->auxiliary_domains);
2598
2599         if (dev && dev_is_pci(dev)) {
2600                 struct pci_dev *pdev = to_pci_dev(info->dev);
2601
2602                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2603                     pci_ats_supported(pdev) &&
2604                     dmar_find_matched_atsr_unit(pdev))
2605                         info->ats_supported = 1;
2606
2607                 if (sm_supported(iommu)) {
2608                         if (pasid_supported(iommu)) {
2609                                 int features = pci_pasid_features(pdev);
2610                                 if (features >= 0)
2611                                         info->pasid_supported = features | 1;
2612                         }
2613
2614                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2615                             pci_pri_supported(pdev))
2616                                 info->pri_supported = 1;
2617                 }
2618         }
2619
2620         spin_lock_irqsave(&device_domain_lock, flags);
2621         if (dev)
2622                 found = find_domain(dev);
2623
2624         if (!found) {
2625                 struct device_domain_info *info2;
2626                 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2627                                                        info->devfn);
2628                 if (info2) {
2629                         found      = info2->domain;
2630                         info2->dev = dev;
2631                 }
2632         }
2633
2634         if (found) {
2635                 spin_unlock_irqrestore(&device_domain_lock, flags);
2636                 free_devinfo_mem(info);
2637                 /* Caller must free the original domain */
2638                 return found;
2639         }
2640
2641         spin_lock(&iommu->lock);
2642         ret = domain_attach_iommu(domain, iommu);
2643         spin_unlock(&iommu->lock);
2644
2645         if (ret) {
2646                 spin_unlock_irqrestore(&device_domain_lock, flags);
2647                 free_devinfo_mem(info);
2648                 return NULL;
2649         }
2650
2651         list_add(&info->link, &domain->devices);
2652         list_add(&info->global, &device_domain_list);
2653         if (dev)
2654                 dev_iommu_priv_set(dev, info);
2655         spin_unlock_irqrestore(&device_domain_lock, flags);
2656
2657         /* PASID table is mandatory for a PCI device in scalable mode. */
2658         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2659                 ret = intel_pasid_alloc_table(dev);
2660                 if (ret) {
2661                         dev_err(dev, "PASID table allocation failed\n");
2662                         dmar_remove_one_dev_info(dev);
2663                         return NULL;
2664                 }
2665
2666                 /* Setup the PASID entry for requests without PASID: */
2667                 spin_lock_irqsave(&iommu->lock, flags);
2668                 if (hw_pass_through && domain_type_is_si(domain))
2669                         ret = intel_pasid_setup_pass_through(iommu, domain,
2670                                         dev, PASID_RID2PASID);
2671                 else if (domain_use_first_level(domain))
2672                         ret = domain_setup_first_level(iommu, domain, dev,
2673                                         PASID_RID2PASID);
2674                 else
2675                         ret = intel_pasid_setup_second_level(iommu, domain,
2676                                         dev, PASID_RID2PASID);
2677                 spin_unlock_irqrestore(&iommu->lock, flags);
2678                 if (ret) {
2679                         dev_err(dev, "Setup RID2PASID failed\n");
2680                         dmar_remove_one_dev_info(dev);
2681                         return NULL;
2682                 }
2683         }
2684
2685         if (dev && domain_context_mapping(domain, dev)) {
2686                 dev_err(dev, "Domain context map failed\n");
2687                 dmar_remove_one_dev_info(dev);
2688                 return NULL;
2689         }
2690
2691         return domain;
2692 }
2693
2694 static int iommu_domain_identity_map(struct dmar_domain *domain,
2695                                      unsigned long first_vpfn,
2696                                      unsigned long last_vpfn)
2697 {
2698         /*
2699          * The RMRR range might overlap with the physical memory range,
2700          * so clear it first.
2701          */
2702         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2703
2704         return __domain_mapping(domain, first_vpfn, NULL,
2705                                 first_vpfn, last_vpfn - first_vpfn + 1,
2706                                 DMA_PTE_READ|DMA_PTE_WRITE);
2707 }
2708
2709 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2710
2711 static int __init si_domain_init(int hw)
2712 {
2713         struct dmar_rmrr_unit *rmrr;
2714         struct device *dev;
2715         int i, nid, ret;
2716
2717         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2718         if (!si_domain)
2719                 return -EFAULT;
2720
2721         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2722                 domain_exit(si_domain);
2723                 return -EFAULT;
2724         }
2725
2726         if (hw)
2727                 return 0;
2728
2729         for_each_online_node(nid) {
2730                 unsigned long start_pfn, end_pfn;
2731                 int i;
2732
2733                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2734                         ret = iommu_domain_identity_map(si_domain,
2735                                         mm_to_dma_pfn(start_pfn),
2736                                         mm_to_dma_pfn(end_pfn));
2737                         if (ret)
2738                                 return ret;
2739                 }
2740         }
2741
2742         /*
2743          * Identity map the RMRRs so that devices with RMRRs could also use
2744          * the si_domain.
2745          */
2746         for_each_rmrr_units(rmrr) {
2747                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2748                                           i, dev) {
2749                         unsigned long long start = rmrr->base_address;
2750                         unsigned long long end = rmrr->end_address;
2751
2752                         if (WARN_ON(end < start ||
2753                                     end >> agaw_to_width(si_domain->agaw)))
2754                                 continue;
2755
2756                         ret = iommu_domain_identity_map(si_domain,
2757                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2758                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2759                         if (ret)
2760                                 return ret;
2761                 }
2762         }
2763
2764         return 0;
2765 }
2766
2767 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2768 {
2769         struct dmar_domain *ndomain;
2770         struct intel_iommu *iommu;
2771         u8 bus, devfn;
2772
2773         iommu = device_to_iommu(dev, &bus, &devfn);
2774         if (!iommu)
2775                 return -ENODEV;
2776
2777         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2778         if (ndomain != domain)
2779                 return -EBUSY;
2780
2781         return 0;
2782 }
2783
2784 static bool device_has_rmrr(struct device *dev)
2785 {
2786         struct dmar_rmrr_unit *rmrr;
2787         struct device *tmp;
2788         int i;
2789
2790         rcu_read_lock();
2791         for_each_rmrr_units(rmrr) {
2792                 /*
2793                  * Return TRUE if this RMRR contains the device that
2794                  * is passed in.
2795                  */
2796                 for_each_active_dev_scope(rmrr->devices,
2797                                           rmrr->devices_cnt, i, tmp)
2798                         if (tmp == dev ||
2799                             is_downstream_to_pci_bridge(dev, tmp)) {
2800                                 rcu_read_unlock();
2801                                 return true;
2802                         }
2803         }
2804         rcu_read_unlock();
2805         return false;
2806 }
2807
2808 /**
2809  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2810  * is relaxable (i.e. is allowed not to be enforced under some conditions)
2811  * @dev: device handle
2812  *
2813  * We assume that PCI USB devices with RMRRs have them largely
2814  * for historical reasons and that the RMRR space is not actively used post
2815  * boot.  This exclusion may change if vendors begin to abuse it.
2816  *
2817  * The same exception is made for graphics devices, with the requirement that
2818  * any use of the RMRR regions will be torn down before assigning the device
2819  * to a guest.
2820  *
2821  * Return: true if the RMRR is relaxable, false otherwise
2822  */
2823 static bool device_rmrr_is_relaxable(struct device *dev)
2824 {
2825         struct pci_dev *pdev;
2826
2827         if (!dev_is_pci(dev))
2828                 return false;
2829
2830         pdev = to_pci_dev(dev);
2831         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2832                 return true;
2833         else
2834                 return false;
2835 }
2836
2837 /*
2838  * There are a couple of cases where we need to restrict the functionality of
2839  * devices associated with RMRRs.  The first is when evaluating a device for
2840  * identity mapping because problems exist when devices are moved in and out
2841  * of domains and their respective RMRR information is lost.  This means that
2842  * a device with associated RMRRs will never be in a "passthrough" domain.
2843  * The second is use of the device through the IOMMU API.  This interface
2844  * expects to have full control of the IOVA space for the device.  We cannot
2845  * satisfy both the requirement that RMRR access is maintained and have an
2846  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2847  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2848  * We therefore prevent devices associated with an RMRR from participating in
2849  * the IOMMU API, which eliminates them from device assignment.
2850  *
2851  * In both cases, devices which have relaxable RMRRs are not concerned by this
2852  * restriction. See device_rmrr_is_relaxable comment.
2853  */
2854 static bool device_is_rmrr_locked(struct device *dev)
2855 {
2856         if (!device_has_rmrr(dev))
2857                 return false;
2858
2859         if (device_rmrr_is_relaxable(dev))
2860                 return false;
2861
2862         return true;
2863 }
2864
2865 /*
2866  * Return the required default domain type for a specific device.
2867  *
2868  * @dev: the device in query
2869  *
2870  *
2871  * Returns:
2872  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2873  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2874  *  - 0: both identity and dynamic domains work for this device
2875  */
2876 static int device_def_domain_type(struct device *dev)
2877 {
2878         if (dev_is_pci(dev)) {
2879                 struct pci_dev *pdev = to_pci_dev(dev);
2880
2881                 /*
2882                  * Prevent any device marked as untrusted from getting
2883                  * placed into the static identity mapping domain.
2884                  */
2885                 if (pdev->untrusted)
2886                         return IOMMU_DOMAIN_DMA;
2887
2888                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2889                         return IOMMU_DOMAIN_IDENTITY;
2890
2891                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2892                         return IOMMU_DOMAIN_IDENTITY;
2893         }
2894
2895         return 0;
2896 }
2897
2898 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2899 {
2900         /*
2901          * Start from a sane IOMMU hardware state.
2902          * If queued invalidation was already initialized by us
2903          * (for example, while enabling interrupt-remapping), then
2904          * things are already rolling from a sane state.
2905          */
2906         if (!iommu->qi) {
2907                 /*
2908                  * Clear any previous faults.
2909                  */
2910                 dmar_fault(-1, iommu);
2911                 /*
2912                  * Disable queued invalidation if supported and already enabled
2913                  * before OS handover.
2914                  */
2915                 dmar_disable_qi(iommu);
2916         }
2917
2918         if (dmar_enable_qi(iommu)) {
2919                 /*
2920                  * Queued invalidation is not enabled, so use register-based invalidation.
2921                  */
2922                 iommu->flush.flush_context = __iommu_flush_context;
2923                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2924                 pr_info("%s: Using Register based invalidation\n",
2925                         iommu->name);
2926         } else {
2927                 iommu->flush.flush_context = qi_flush_context;
2928                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2929                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2930         }
2931 }
2932
2933 static int copy_context_table(struct intel_iommu *iommu,
2934                               struct root_entry *old_re,
2935                               struct context_entry **tbl,
2936                               int bus, bool ext)
2937 {
2938         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2939         struct context_entry *new_ce = NULL, ce;
2940         struct context_entry *old_ce = NULL;
2941         struct root_entry re;
2942         phys_addr_t old_ce_phys;
2943
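        /*
         * With extended context entries each root entry holds two context
         * table pointers (a lower one for devfn 0-127 and an upper one for
         * devfn 128-255), so two table slots are reserved per bus.
         */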
2944         tbl_idx = ext ? bus * 2 : bus;
2945         memcpy(&re, old_re, sizeof(re));
2946
2947         for (devfn = 0; devfn < 256; devfn++) {
2948                 /* First calculate the correct index */
2949                 idx = (ext ? devfn * 2 : devfn) % 256;
2950
2951                 if (idx == 0) {
2952                         /* First save what we may have and clean up */
2953                         if (new_ce) {
2954                                 tbl[tbl_idx] = new_ce;
2955                                 __iommu_flush_cache(iommu, new_ce,
2956                                                     VTD_PAGE_SIZE);
2957                                 pos = 1;
2958                         }
2959
2960                         if (old_ce)
2961                                 memunmap(old_ce);
2962
2963                         ret = 0;
2964                         if (devfn < 0x80)
2965                                 old_ce_phys = root_entry_lctp(&re);
2966                         else
2967                                 old_ce_phys = root_entry_uctp(&re);
2968
2969                         if (!old_ce_phys) {
2970                                 if (ext && devfn == 0) {
2971                                         /* No LCTP, try UCTP */
2972                                         devfn = 0x7f;
2973                                         continue;
2974                                 } else {
2975                                         goto out;
2976                                 }
2977                         }
2978
2979                         ret = -ENOMEM;
2980                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2981                                         MEMREMAP_WB);
2982                         if (!old_ce)
2983                                 goto out;
2984
2985                         new_ce = alloc_pgtable_page(iommu->node);
2986                         if (!new_ce)
2987                                 goto out_unmap;
2988
2989                         ret = 0;
2990                 }
2991
2992                 /* Now copy the context entry */
2993                 memcpy(&ce, old_ce + idx, sizeof(ce));
2994
2995                 if (!__context_present(&ce))
2996                         continue;
2997
2998                 did = context_domain_id(&ce);
2999                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3000                         set_bit(did, iommu->domain_ids);
3001
3002                 /*
3003                  * We need a marker for copied context entries. This
3004                  * marker needs to work for the old format as well as
3005                  * for extended context entries.
3006                  *
3007                  * Bit 67 of the context entry is used. In the old
3008                  * format this bit is available to software, in the
3009                  * extended format it is the PGE bit, but PGE is ignored
3010                  * by HW if PASIDs are disabled (and thus still
3011                  * available).
3012                  *
3013                  * So disable PASIDs first and then mark the entry
3014                  * copied. This means that we don't copy PASID
3015                  * translations from the old kernel, but this is fine as
3016                  * faults there are not fatal.
3017                  */
3018                 context_clear_pasid_enable(&ce);
3019                 context_set_copied(&ce);
3020
3021                 new_ce[idx] = ce;
3022         }
3023
3024         tbl[tbl_idx + pos] = new_ce;
3025
3026         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3027
3028 out_unmap:
3029         memunmap(old_ce);
3030
3031 out:
3032         return ret;
3033 }
3034
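/*
 * Copy the DMA translation structures left behind by the previous kernel
 * (the kdump handover case): read the old root table address from
 * DMAR_RTADDR_REG, duplicate every per-bus context table into memory owned
 * by this kernel via copy_context_table(), and hook the copies into our own
 * root_entry table.  The old tables are only read through memremap() and
 * are never modified.  Bails out with -EINVAL if the old and new kernels
 * disagree on the extended root/context table format.
 */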
3035 static int copy_translation_tables(struct intel_iommu *iommu)
3036 {
3037         struct context_entry **ctxt_tbls;
3038         struct root_entry *old_rt;
3039         phys_addr_t old_rt_phys;
3040         int ctxt_table_entries;
3041         unsigned long flags;
3042         u64 rtaddr_reg;
3043         int bus, ret;
3044         bool new_ext, ext;
3045
3046         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3047         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3048         new_ext    = !!ecap_ecs(iommu->ecap);
3049
3050         /*
3051          * The RTT bit can only be changed when translation is disabled,
3052          * but disabling translation would open a window for data
3053          * corruption. So bail out and don't copy anything if we would
3054          * have to change the bit.
3055          */
3056         if (new_ext != ext)
3057                 return -EINVAL;
3058
3059         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3060         if (!old_rt_phys)
3061                 return -EINVAL;
3062
3063         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3064         if (!old_rt)
3065                 return -ENOMEM;
3066
3067         /* This is too big for the stack - allocate it from slab */
3068         ctxt_table_entries = ext ? 512 : 256;
3069         ret = -ENOMEM;
3070         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3071         if (!ctxt_tbls)
3072                 goto out_unmap;
3073
3074         for (bus = 0; bus < 256; bus++) {
3075                 ret = copy_context_table(iommu, &old_rt[bus],
3076                                          ctxt_tbls, bus, ext);
3077                 if (ret) {
3078                         pr_err("%s: Failed to copy context table for bus %d\n",
3079                                 iommu->name, bus);
3080                         continue;
3081                 }
3082         }
3083
3084         spin_lock_irqsave(&iommu->lock, flags);
3085
3086         /* Context tables are copied, now write them to the root_entry table */
3087         for (bus = 0; bus < 256; bus++) {
3088                 int idx = ext ? bus * 2 : bus;
3089                 u64 val;
3090
3091                 if (ctxt_tbls[idx]) {
3092                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3093                         iommu->root_entry[bus].lo = val;
3094                 }
3095
3096                 if (!ext || !ctxt_tbls[idx + 1])
3097                         continue;
3098
3099                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3100                 iommu->root_entry[bus].hi = val;
3101         }
3102
3103         spin_unlock_irqrestore(&iommu->lock, flags);
3104
3105         kfree(ctxt_tbls);
3106
3107         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3108
3109         ret = 0;
3110
3111 out_unmap:
3112         memunmap(old_rt);
3113
3114         return ret;
3115 }
3116
3117 #ifdef CONFIG_INTEL_IOMMU_SVM
3118 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3119 {
3120         struct intel_iommu *iommu = data;
3121         ioasid_t ioasid;
3122
3123         if (!iommu)
3124                 return INVALID_IOASID;
3125         /*
3126          * The VT-d virtual command interface always uses the full 20-bit
3127          * PASID range. The host can partition the guest PASID range based
3128          * on policies, but that is out of the guest's control.
3129          */
3130         if (min < PASID_MIN || max > intel_pasid_max_id)
3131                 return INVALID_IOASID;
3132
3133         if (vcmd_alloc_pasid(iommu, &ioasid))
3134                 return INVALID_IOASID;
3135
3136         return ioasid;
3137 }
3138
3139 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3140 {
3141         struct intel_iommu *iommu = data;
3142
3143         if (!iommu)
3144                 return;
3145         /*
3146          * Sanity checking of the ioasid owner is done at the upper layer,
3147          * e.g. VFIO. We can only free the PASID when all devices are unbound.
3148          */
3149         if (ioasid_find(NULL, ioasid, NULL)) {
3150                 pr_alert("Cannot free active IOASID %d\n", ioasid);
3151                 return;
3152         }
3153         vcmd_free_pasid(iommu, ioasid);
3154 }
3155
3156 static void register_pasid_allocator(struct intel_iommu *iommu)
3157 {
3158         /*
3159          * If we are running in the host, there is no need for a custom
3160          * allocator since PASIDs are allocated system-wide by the host.
3161          */
3162         if (!cap_caching_mode(iommu->cap))
3163                 return;
3164
3165         if (!sm_supported(iommu)) {
3166                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3167                 return;
3168         }
3169
3170         /*
3171          * Register a custom PASID allocator if we are running in a guest;
3172          * guest PASIDs must be obtained via the virtual command interface.
3173          * There can be multiple vIOMMUs in each guest but only one allocator
3174          * is active. All vIOMMU allocators eventually call the same
3175          * host allocator.
3176          */
3177         if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3178                 return;
3179
3180         pr_info("Register custom PASID allocator\n");
3181         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3182         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3183         iommu->pasid_allocator.pdata = (void *)iommu;
3184         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3185                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3186                 /*
3187                  * Disable scalable mode on this IOMMU if there
3188                  * is no custom allocator. Mixing SM-capable and
3189                  * non-SM vIOMMUs is not supported.
3190                  */
3191                 intel_iommu_sm = 0;
3192         }
3193 }
3194 #endif
3195
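/*
 * One-time boot initialization of all DMAR units: allocate the global
 * g_iommus[] array, set up per-IOMMU invalidation, domain ID bitmaps and
 * root entries, reuse translation tables from a previous kernel where
 * possible, create the static identity (si) domain, and finally enable
 * fault reporting (and, where supported, the page request queue) on every
 * unit.  Translation itself is not enabled here.
 */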
3196 static int __init init_dmars(void)
3197 {
3198         struct dmar_drhd_unit *drhd;
3199         struct intel_iommu *iommu;
3200         int ret;
3201
3202         /*
3203          * for each drhd
3204          *    allocate root
3205          *    initialize and program root entry to not present
3206          * endfor
3207          */
3208         for_each_drhd_unit(drhd) {
3209                 /*
3210                  * No lock is needed as this is only incremented in the
3211                  * single-threaded kernel __init code path; all other
3212                  * accesses are read-only.
3213                  */
3214                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3215                         g_num_of_iommus++;
3216                         continue;
3217                 }
3218                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3219         }
3220
3221         /* Preallocate enough resources for IOMMU hot-addition */
3222         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3223                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3224
3225         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3226                         GFP_KERNEL);
3227         if (!g_iommus) {
3228                 pr_err("Allocating global iommu array failed\n");
3229                 ret = -ENOMEM;
3230                 goto error;
3231         }
3232
3233         for_each_iommu(iommu, drhd) {
3234                 if (drhd->ignored) {
3235                         iommu_disable_translation(iommu);
3236                         continue;
3237                 }
3238
3239                 /*
3240                  * Find the max PASID size of all IOMMUs in the system.
3241                  * We need to ensure the system PASID table is no bigger
3242                  * than the smallest supported size.
3243                  */
3244                 if (pasid_supported(iommu)) {
3245                         u32 temp = 2 << ecap_pss(iommu->ecap);
3246
3247                         intel_pasid_max_id = min_t(u32, temp,
3248                                                    intel_pasid_max_id);
3249                 }
3250
3251                 g_iommus[iommu->seq_id] = iommu;
3252
3253                 intel_iommu_init_qi(iommu);
3254
3255                 ret = iommu_init_domains(iommu);
3256                 if (ret)
3257                         goto free_iommu;
3258
3259                 init_translation_status(iommu);
3260
3261                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3262                         iommu_disable_translation(iommu);
3263                         clear_translation_pre_enabled(iommu);
3264                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3265                                 iommu->name);
3266                 }
3267
3268                 /*
3269                  * TBD:
3270                  * we could share the same root & context tables
3271                  * among all IOMMUs; this needs to be split out later.
3272                  */
3273                 ret = iommu_alloc_root_entry(iommu);
3274                 if (ret)
3275                         goto free_iommu;
3276
3277                 if (translation_pre_enabled(iommu)) {
3278                         pr_info("Translation already enabled - trying to copy translation structures\n");
3279
3280                         ret = copy_translation_tables(iommu);
3281                         if (ret) {
3282                                 /*
3283                                  * We found the IOMMU with translation
3284                                  * enabled - but failed to copy over the
3285                                  * old root-entry table. Try to proceed
3286                                  * by disabling translation now and
3287                                  * allocating a clean root-entry table.
3288                                  * This might cause DMAR faults, but
3289                                  * probably the dump will still succeed.
3290                                  */
3291                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3292                                        iommu->name);
3293                                 iommu_disable_translation(iommu);
3294                                 clear_translation_pre_enabled(iommu);
3295                         } else {
3296                                 pr_info("Copied translation tables from previous kernel for %s\n",
3297                                         iommu->name);
3298                         }
3299                 }
3300
3301                 if (!ecap_pass_through(iommu->ecap))
3302                         hw_pass_through = 0;
3303                 intel_svm_check(iommu);
3304         }
3305
3306         /*
3307          * Now that qi is enabled on all iommus, set the root entry and flush
3308          * caches. This is required on some Intel X58 chipsets; otherwise the
3309          * flush_context function will loop forever and the boot hangs.
3310          */
3311         for_each_active_iommu(iommu, drhd) {
3312                 iommu_flush_write_buffer(iommu);
3313 #ifdef CONFIG_INTEL_IOMMU_SVM
3314                 register_pasid_allocator(iommu);
3315 #endif
3316                 iommu_set_root_entry(iommu);
3317                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3318                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3319         }
3320
3321 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3322         dmar_map_gfx = 0;
3323 #endif
3324
3325         if (!dmar_map_gfx)
3326                 iommu_identity_mapping |= IDENTMAP_GFX;
3327
3328         check_tylersburg_isoch();
3329
3330         ret = si_domain_init(hw_pass_through);
3331         if (ret)
3332                 goto free_iommu;
3333
3334         /*
3335          * for each drhd
3336          *   enable fault log
3337          *   global invalidate context cache
3338          *   global invalidate iotlb
3339          *   enable translation
3340          */
3341         for_each_iommu(iommu, drhd) {
3342                 if (drhd->ignored) {
3343                         /*
3344                          * we always have to disable PMRs or DMA may fail on
3345                          * this device
3346                          */
3347                         if (force_on)
3348                                 iommu_disable_protect_mem_regions(iommu);
3349                         continue;
3350                 }
3351
3352                 iommu_flush_write_buffer(iommu);
3353
3354 #ifdef CONFIG_INTEL_IOMMU_SVM
3355                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3356                         /*
3357                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3358                          * could cause a lock race, so temporarily drop the lock.
3359                          */
3360                         up_write(&dmar_global_lock);
3361                         ret = intel_svm_enable_prq(iommu);
3362                         down_write(&dmar_global_lock);
3363                         if (ret)
3364                                 goto free_iommu;
3365                 }
3366 #endif
3367                 ret = dmar_set_interrupt(iommu);
3368                 if (ret)
3369                         goto free_iommu;
3370         }
3371
3372         return 0;
3373
3374 free_iommu:
3375         for_each_active_iommu(iommu, drhd) {
3376                 disable_dmar_iommu(iommu);
3377                 free_dmar_iommu(iommu);
3378         }
3379
3380         kfree(g_iommus);
3381
3382 error:
3383         return ret;
3384 }
3385
3386 /* This takes a number of _MM_ pages, not VTD pages */
3387 static unsigned long intel_alloc_iova(struct device *dev,
3388                                      struct dmar_domain *domain,
3389                                      unsigned long nrpages, uint64_t dma_mask)
3390 {
3391         unsigned long iova_pfn;
3392
3393         /*
3394          * Restrict dma_mask to the width that the iommu can handle.
3395          * First-level translation restricts the input-address to a
3396          * canonical address (i.e., address bits 63:N have the same
3397          * value as address bit [N-1], where N is 48 with 4-level
3398          * paging and 57 with 5-level paging). Hence, skip bit
3399          * [N-1].
3400          */
3401         if (domain_use_first_level(domain))
3402                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3403                                  dma_mask);
3404         else
3405                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3406                                  dma_mask);
3407
3408         /* Ensure we reserve the whole size-aligned region */
3409         nrpages = __roundup_pow_of_two(nrpages);
3410
3411         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3412                 /*
3413                  * First try to allocate an I/O virtual address within
3414                  * DMA_BIT_MASK(32); if that fails, try allocating from
3415                  * the higher range.
3416                  */
3417                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3418                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3419                 if (iova_pfn)
3420                         return iova_pfn;
3421         }
3422         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3423                                    IOVA_PFN(dma_mask), true);
3424         if (unlikely(!iova_pfn)) {
3425                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3426                              nrpages);
3427                 return 0;
3428         }
3429
3430         return iova_pfn;
3431 }
3432
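/*
 * Core of the DMA-API map path: look up the device's domain, allocate an
 * IOVA range covering the page-aligned buffer, derive read/write
 * permissions from the DMA direction (honouring cap_zlr() for write-only
 * mappings), install the page-table entries and return the IOVA plus the
 * sub-page offset of @paddr.
 */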
3433 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3434                                      size_t size, int dir, u64 dma_mask)
3435 {
3436         struct dmar_domain *domain;
3437         phys_addr_t start_paddr;
3438         unsigned long iova_pfn;
3439         int prot = 0;
3440         int ret;
3441         struct intel_iommu *iommu;
3442         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3443
3444         BUG_ON(dir == DMA_NONE);
3445
3446         if (unlikely(attach_deferred(dev)))
3447                 do_deferred_attach(dev);
3448
3449         domain = find_domain(dev);
3450         if (!domain)
3451                 return DMA_MAPPING_ERROR;
3452
3453         iommu = domain_get_iommu(domain);
3454         size = aligned_nrpages(paddr, size);
3455
3456         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3457         if (!iova_pfn)
3458                 goto error;
3459
3460         /*
3461          * Check if DMAR supports zero-length reads on write-only
3462          * mappings.
3463          */
3464         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3465                         !cap_zlr(iommu->cap))
3466                 prot |= DMA_PTE_READ;
3467         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3468                 prot |= DMA_PTE_WRITE;
3469         /*
3470          * paddr - (paddr + size) might cover only part of a page, so map the
3471          * whole page.  Note: if two parts of one page are mapped separately,
3472          * we might end up with two guest_addr mappings to the same host
3473          * paddr, but this is not a big problem.
3474          */
3475         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3476                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3477         if (ret)
3478                 goto error;
3479
3480         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3481         start_paddr += paddr & ~PAGE_MASK;
3482
3483         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3484
3485         return start_paddr;
3486
3487 error:
3488         if (iova_pfn)
3489                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3490         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3491                 size, (unsigned long long)paddr, dir);
3492         return DMA_MAPPING_ERROR;
3493 }
3494
3495 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3496                                  unsigned long offset, size_t size,
3497                                  enum dma_data_direction dir,
3498                                  unsigned long attrs)
3499 {
3500         return __intel_map_single(dev, page_to_phys(page) + offset,
3501                                   size, dir, *dev->dma_mask);
3502 }
3503
3504 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3505                                      size_t size, enum dma_data_direction dir,
3506                                      unsigned long attrs)
3507 {
3508         return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3509 }
3510
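/*
 * Common unmap path for pages and resources.  In strict mode, for
 * untrusted PCI devices, or when no IOVA flush queue is available, the
 * IOTLB is invalidated and the IOVA freed synchronously; otherwise the
 * IOVA and the page-table freelist are queued and released lazily to
 * amortize the cost of the IOTLB flush.
 */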
3511 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3512 {
3513         struct dmar_domain *domain;
3514         unsigned long start_pfn, last_pfn;
3515         unsigned long nrpages;
3516         unsigned long iova_pfn;
3517         struct intel_iommu *iommu;
3518         struct page *freelist;
3519         struct pci_dev *pdev = NULL;
3520
3521         domain = find_domain(dev);
3522         BUG_ON(!domain);
3523
3524         iommu = domain_get_iommu(domain);
3525
3526         iova_pfn = IOVA_PFN(dev_addr);
3527
3528         nrpages = aligned_nrpages(dev_addr, size);
3529         start_pfn = mm_to_dma_pfn(iova_pfn);
3530         last_pfn = start_pfn + nrpages - 1;
3531
3532         if (dev_is_pci(dev))
3533                 pdev = to_pci_dev(dev);
3534
3535         freelist = domain_unmap(domain, start_pfn, last_pfn);
3536         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3537                         !has_iova_flush_queue(&domain->iovad)) {
3538                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3539                                       nrpages, !freelist, 0);
3540                 /* free iova */
3541                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3542                 dma_free_pagelist(freelist);
3543         } else {
3544                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3545                            (unsigned long)freelist);
3546                 /*
3547                  * Queue up the release of the unmap to save the roughly 1/6th
3548                  * of a CPU otherwise consumed by the iotlb flush operation...
3549                  */
3550         }
3551
3552         trace_unmap_single(dev, dev_addr, size);
3553 }
3554
3555 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3556                              size_t size, enum dma_data_direction dir,
3557                              unsigned long attrs)
3558 {
3559         intel_unmap(dev, dev_addr, size);
3560 }
3561
3562 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3563                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3564 {
3565         intel_unmap(dev, dev_addr, size);
3566 }
3567
3568 static void *intel_alloc_coherent(struct device *dev, size_t size,
3569                                   dma_addr_t *dma_handle, gfp_t flags,
3570                                   unsigned long attrs)
3571 {
3572         struct page *page = NULL;
3573         int order;
3574
3575         if (unlikely(attach_deferred(dev)))
3576                 do_deferred_attach(dev);
3577
3578         size = PAGE_ALIGN(size);
3579         order = get_order(size);
3580
3581         if (gfpflags_allow_blocking(flags)) {
3582                 unsigned int count = size >> PAGE_SHIFT;
3583
3584                 page = dma_alloc_from_contiguous(dev, count, order,
3585                                                  flags & __GFP_NOWARN);
3586         }
3587
3588         if (!page)
3589                 page = alloc_pages(flags, order);
3590         if (!page)
3591                 return NULL;
3592         memset(page_address(page), 0, size);
3593
3594         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3595                                          DMA_BIDIRECTIONAL,
3596                                          dev->coherent_dma_mask);
3597         if (*dma_handle != DMA_MAPPING_ERROR)
3598                 return page_address(page);
3599         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3600                 __free_pages(page, order);
3601
3602         return NULL;
3603 }
3604
3605 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3606                                 dma_addr_t dma_handle, unsigned long attrs)
3607 {
3608         int order;
3609         struct page *page = virt_to_page(vaddr);
3610
3611         size = PAGE_ALIGN(size);
3612         order = get_order(size);
3613
3614         intel_unmap(dev, dma_handle, size);
3615         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3616                 __free_pages(page, order);
3617 }
3618
3619 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3620                            int nelems, enum dma_data_direction dir,
3621                            unsigned long attrs)
3622 {
3623         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3624         unsigned long nrpages = 0;
3625         struct scatterlist *sg;
3626         int i;
3627
3628         for_each_sg(sglist, sg, nelems, i) {
3629                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3630         }
3631
3632         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3633
3634         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3635 }
3636
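/*
 * Scatter-gather counterpart of __intel_map_single(): a single IOVA range
 * covering the page-aligned lengths of all entries is allocated, and
 * domain_sg_mapping() lays the segments out back to back inside it.
 * Returns the number of mapped entries, or 0 on failure.
 */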
3637 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3638                         enum dma_data_direction dir, unsigned long attrs)
3639 {
3640         int i;
3641         struct dmar_domain *domain;
3642         size_t size = 0;
3643         int prot = 0;
3644         unsigned long iova_pfn;
3645         int ret;
3646         struct scatterlist *sg;
3647         unsigned long start_vpfn;
3648         struct intel_iommu *iommu;
3649
3650         BUG_ON(dir == DMA_NONE);
3651
3652         if (unlikely(attach_deferred(dev)))
3653                 do_deferred_attach(dev);
3654
3655         domain = find_domain(dev);
3656         if (!domain)
3657                 return 0;
3658
3659         iommu = domain_get_iommu(domain);
3660
3661         for_each_sg(sglist, sg, nelems, i)
3662                 size += aligned_nrpages(sg->offset, sg->length);
3663
3664         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3665                                 *dev->dma_mask);
3666         if (!iova_pfn) {
3667                 sglist->dma_length = 0;
3668                 return 0;
3669         }
3670
3671         /*
3672          * Check if DMAR supports zero-length reads on write-only
3673          * mappings.
3674          */
3675         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3676                         !cap_zlr(iommu->cap))
3677                 prot |= DMA_PTE_READ;
3678         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3679                 prot |= DMA_PTE_WRITE;
3680
3681         start_vpfn = mm_to_dma_pfn(iova_pfn);
3682
3683         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3684         if (unlikely(ret)) {
3685                 dma_pte_free_pagetable(domain, start_vpfn,
3686                                        start_vpfn + size - 1,
3687                                        agaw_to_level(domain->agaw) + 1);
3688                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3689                 return 0;
3690         }
3691
3692         for_each_sg(sglist, sg, nelems, i)
3693                 trace_map_sg(dev, i + 1, nelems, sg);
3694
3695         return nelems;
3696 }
3697
3698 static u64 intel_get_required_mask(struct device *dev)
3699 {
3700         return DMA_BIT_MASK(32);
3701 }
3702
3703 static const struct dma_map_ops intel_dma_ops = {
3704         .alloc = intel_alloc_coherent,
3705         .free = intel_free_coherent,
3706         .map_sg = intel_map_sg,
3707         .unmap_sg = intel_unmap_sg,
3708         .map_page = intel_map_page,
3709         .unmap_page = intel_unmap_page,
3710         .map_resource = intel_map_resource,
3711         .unmap_resource = intel_unmap_resource,
3712         .dma_supported = dma_direct_supported,
3713         .mmap = dma_common_mmap,
3714         .get_sgtable = dma_common_get_sgtable,
3715         .get_required_mask = intel_get_required_mask,
3716 };
3717
3718 static void
3719 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3720                    enum dma_data_direction dir, enum dma_sync_target target)
3721 {
3722         struct dmar_domain *domain;
3723         phys_addr_t tlb_addr;
3724
3725         domain = find_domain(dev);
3726         if (WARN_ON(!domain))
3727                 return;
3728
3729         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3730         if (is_swiotlb_buffer(tlb_addr))
3731                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3732 }
3733
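/*
 * Bounce-buffered variant of __intel_map_single().  Buffers whose start or
 * size is not aligned to VTD_PAGE_SIZE are redirected through a swiotlb
 * slot, with the padding around the data zeroed so that unrelated data
 * sharing the page is never exposed to the device; already page-aligned
 * buffers are mapped in place.
 */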
3734 static dma_addr_t
3735 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3736                   enum dma_data_direction dir, unsigned long attrs,
3737                   u64 dma_mask)
3738 {
3739         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3740         struct dmar_domain *domain;
3741         struct intel_iommu *iommu;
3742         unsigned long iova_pfn;
3743         unsigned long nrpages;
3744         phys_addr_t tlb_addr;
3745         int prot = 0;
3746         int ret;
3747
3748         if (unlikely(attach_deferred(dev)))
3749                 do_deferred_attach(dev);
3750
3751         domain = find_domain(dev);
3752
3753         if (WARN_ON(dir == DMA_NONE || !domain))
3754                 return DMA_MAPPING_ERROR;
3755
3756         iommu = domain_get_iommu(domain);
3757         if (WARN_ON(!iommu))
3758                 return DMA_MAPPING_ERROR;
3759
3760         nrpages = aligned_nrpages(0, size);
3761         iova_pfn = intel_alloc_iova(dev, domain,
3762                                     dma_to_mm_pfn(nrpages), dma_mask);
3763         if (!iova_pfn)
3764                 return DMA_MAPPING_ERROR;
3765
3766         /*
3767          * Check if DMAR supports zero-length reads on write-only
3768          * mappings.
3769          */
3770         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3771                         !cap_zlr(iommu->cap))
3772                 prot |= DMA_PTE_READ;
3773         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3774                 prot |= DMA_PTE_WRITE;
3775
3776         /*
3777          * If both the physical buffer start address and size are
3778          * page aligned, we don't need to use a bounce page.
3779          */
3780         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3781                 tlb_addr = swiotlb_tbl_map_single(dev,
3782                                 __phys_to_dma(dev, io_tlb_start),
3783                                 paddr, size, aligned_size, dir, attrs);
3784                 if (tlb_addr == DMA_MAPPING_ERROR) {
3785                         goto swiotlb_error;
3786                 } else {
3787                         /* Cleanup the padding area. */
3788                         void *padding_start = phys_to_virt(tlb_addr);
3789                         size_t padding_size = aligned_size;
3790
3791                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3792                             (dir == DMA_TO_DEVICE ||
3793                              dir == DMA_BIDIRECTIONAL)) {
3794                                 padding_start += size;
3795                                 padding_size -= size;
3796                         }
3797
3798                         memset(padding_start, 0, padding_size);
3799                 }
3800         } else {
3801                 tlb_addr = paddr;
3802         }
3803
3804         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3805                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3806         if (ret)
3807                 goto mapping_error;
3808
3809         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3810
3811         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3812
3813 mapping_error:
3814         if (is_swiotlb_buffer(tlb_addr))
3815                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3816                                          aligned_size, dir, attrs);
3817 swiotlb_error:
3818         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3819         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3820                 size, (unsigned long long)paddr, dir);
3821
3822         return DMA_MAPPING_ERROR;
3823 }
3824
3825 static void
3826 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3827                     enum dma_data_direction dir, unsigned long attrs)
3828 {
3829         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3830         struct dmar_domain *domain;
3831         phys_addr_t tlb_addr;
3832
3833         domain = find_domain(dev);
3834         if (WARN_ON(!domain))
3835                 return;
3836
3837         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3838         if (WARN_ON(!tlb_addr))
3839                 return;
3840
3841         intel_unmap(dev, dev_addr, size);
3842         if (is_swiotlb_buffer(tlb_addr))
3843                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3844                                          aligned_size, dir, attrs);
3845
3846         trace_bounce_unmap_single(dev, dev_addr, size);
3847 }
3848
3849 static dma_addr_t
3850 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3851                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3852 {
3853         return bounce_map_single(dev, page_to_phys(page) + offset,
3854                                  size, dir, attrs, *dev->dma_mask);
3855 }
3856
3857 static dma_addr_t
3858 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3859                     enum dma_data_direction dir, unsigned long attrs)
3860 {
3861         return bounce_map_single(dev, phys_addr, size,
3862                                  dir, attrs, *dev->dma_mask);
3863 }
3864
3865 static void
3866 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3867                   enum dma_data_direction dir, unsigned long attrs)
3868 {
3869         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3870 }
3871
3872 static void
3873 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3874                       enum dma_data_direction dir, unsigned long attrs)
3875 {
3876         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3877 }
3878
3879 static void
3880 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3881                 enum dma_data_direction dir, unsigned long attrs)
3882 {
3883         struct scatterlist *sg;
3884         int i;
3885
3886         for_each_sg(sglist, sg, nelems, i)
3887                 bounce_unmap_page(dev, sg->dma_address,
3888                                   sg_dma_len(sg), dir, attrs);
3889 }
3890
3891 static int
3892 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3893               enum dma_data_direction dir, unsigned long attrs)
3894 {
3895         int i;
3896         struct scatterlist *sg;
3897
3898         for_each_sg(sglist, sg, nelems, i) {
3899                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3900                                                   sg->offset, sg->length,
3901                                                   dir, attrs);
3902                 if (sg->dma_address == DMA_MAPPING_ERROR)
3903                         goto out_unmap;
3904                 sg_dma_len(sg) = sg->length;
3905         }
3906
3907         for_each_sg(sglist, sg, nelems, i)
3908                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3909
3910         return nelems;
3911
3912 out_unmap:
3913         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3914         return 0;
3915 }
3916
3917 static void
3918 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3919                            size_t size, enum dma_data_direction dir)
3920 {
3921         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3922 }
3923
3924 static void
3925 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3926                               size_t size, enum dma_data_direction dir)
3927 {
3928         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3929 }
3930
3931 static void
3932 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3933                        int nelems, enum dma_data_direction dir)
3934 {
3935         struct scatterlist *sg;
3936         int i;
3937
3938         for_each_sg(sglist, sg, nelems, i)
3939                 bounce_sync_single(dev, sg_dma_address(sg),
3940                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
3941 }
3942
3943 static void
3944 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3945                           int nelems, enum dma_data_direction dir)
3946 {
3947         struct scatterlist *sg;
3948         int i;
3949
3950         for_each_sg(sglist, sg, nelems, i)
3951                 bounce_sync_single(dev, sg_dma_address(sg),
3952                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3953 }
3954
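/*
 * DMA ops that route streaming mappings through the bounce_* helpers
 * above, i.e. through swiotlb whenever a buffer is not VTD-page aligned.
 * Coherent allocations are already page aligned and therefore share the
 * regular intel_alloc_coherent()/intel_free_coherent() paths.
 */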
3955 static const struct dma_map_ops bounce_dma_ops = {
3956         .alloc                  = intel_alloc_coherent,
3957         .free                   = intel_free_coherent,
3958         .map_sg                 = bounce_map_sg,
3959         .unmap_sg               = bounce_unmap_sg,
3960         .map_page               = bounce_map_page,
3961         .unmap_page             = bounce_unmap_page,
3962         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
3963         .sync_single_for_device = bounce_sync_single_for_device,
3964         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
3965         .sync_sg_for_device     = bounce_sync_sg_for_device,
3966         .map_resource           = bounce_map_resource,
3967         .unmap_resource         = bounce_unmap_resource,
3968         .dma_supported          = dma_direct_supported,
3969 };
3970
3971 static inline int iommu_domain_cache_init(void)
3972 {
3973         int ret = 0;
3974
3975         iommu_domain_cache = kmem_cache_create("iommu_domain",
3976                                          sizeof(struct dmar_domain),
3977                                          0,
3978                                          SLAB_HWCACHE_ALIGN,
3980                                          NULL);
3981         if (!iommu_domain_cache) {
3982                 pr_err("Couldn't create iommu_domain cache\n");
3983                 ret = -ENOMEM;
3984         }
3985
3986         return ret;
3987 }
3988
3989 static inline int iommu_devinfo_cache_init(void)
3990 {
3991         int ret = 0;
3992
3993         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3994                                          sizeof(struct device_domain_info),
3995                                          0,
3996                                          SLAB_HWCACHE_ALIGN,
3997                                          NULL);
3998         if (!iommu_devinfo_cache) {
3999                 pr_err("Couldn't create devinfo cache\n");
4000                 ret = -ENOMEM;
4001         }
4002
4003         return ret;
4004 }
4005
4006 static int __init iommu_init_mempool(void)
4007 {
4008         int ret;
4009         ret = iova_cache_get();
4010         if (ret)
4011                 return ret;
4012
4013         ret = iommu_domain_cache_init();
4014         if (ret)
4015                 goto domain_error;
4016
4017         ret = iommu_devinfo_cache_init();
4018         if (!ret)
4019                 return ret;
4020
4021         kmem_cache_destroy(iommu_domain_cache);
4022 domain_error:
4023         iova_cache_put();
4024
4025         return -ENOMEM;
4026 }
4027
4028 static void __init iommu_exit_mempool(void)
4029 {
4030         kmem_cache_destroy(iommu_devinfo_cache);
4031         kmem_cache_destroy(iommu_domain_cache);
4032         iova_cache_put();
4033 }
4034
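/*
 * Mark DRHD units that need no remapping: units whose device scope is
 * empty, and graphics-only units when gfx mapping is disabled via
 * dmar_map_gfx.  Such units are flagged ->ignored and left out of
 * translation setup.
 */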
4035 static void __init init_no_remapping_devices(void)
4036 {
4037         struct dmar_drhd_unit *drhd;
4038         struct device *dev;
4039         int i;
4040
4041         for_each_drhd_unit(drhd) {
4042                 if (!drhd->include_all) {
4043                         for_each_active_dev_scope(drhd->devices,
4044                                                   drhd->devices_cnt, i, dev)
4045                                 break;
4046                         /* ignore DMAR unit if no devices exist */
4047                         if (i == drhd->devices_cnt)
4048                                 drhd->ignored = 1;
4049                 }
4050         }
4051
4052         for_each_active_drhd_unit(drhd) {
4053                 if (drhd->include_all)
4054                         continue;
4055
4056                 for_each_active_dev_scope(drhd->devices,
4057                                           drhd->devices_cnt, i, dev)
4058                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4059                                 break;
4060                 if (i < drhd->devices_cnt)
4061                         continue;
4062
4063                 /* This IOMMU has *only* gfx devices. Mark it gfx-dedicated
4064                    and, if gfx mapping is disabled, bypass it entirely. */
4065                 drhd->gfx_dedicated = 1;
4066                 if (!dmar_map_gfx)
4067                         drhd->ignored = 1;
4068         }
4069 }
4070
4071 #ifdef CONFIG_SUSPEND
4072 static int init_iommu_hw(void)
4073 {
4074         struct dmar_drhd_unit *drhd;
4075         struct intel_iommu *iommu = NULL;
4076
4077         for_each_active_iommu(iommu, drhd)
4078                 if (iommu->qi)
4079                         dmar_reenable_qi(iommu);
4080
4081         for_each_iommu(iommu, drhd) {
4082                 if (drhd->ignored) {
4083                         /*
4084                          * we always have to disable PMRs or DMA may fail on
4085                          * this device
4086                          */
4087                         if (force_on)
4088                                 iommu_disable_protect_mem_regions(iommu);
4089                         continue;
4090                 }
4091
4092                 iommu_flush_write_buffer(iommu);
4093
4094                 iommu_set_root_entry(iommu);
4095
4096                 iommu->flush.flush_context(iommu, 0, 0, 0,
4097                                            DMA_CCMD_GLOBAL_INVL);
4098                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4099                 iommu_enable_translation(iommu);
4100                 iommu_disable_protect_mem_regions(iommu);
4101         }
4102
4103         return 0;
4104 }
4105
4106 static void iommu_flush_all(void)
4107 {
4108         struct dmar_drhd_unit *drhd;
4109         struct intel_iommu *iommu;
4110
4111         for_each_active_iommu(iommu, drhd) {
4112                 iommu->flush.flush_context(iommu, 0, 0, 0,
4113                                            DMA_CCMD_GLOBAL_INVL);
4114                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4115                                          DMA_TLB_GLOBAL_FLUSH);
4116         }
4117 }
4118
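/*
 * Suspend path: flush all context and IOTLB caches, disable translation,
 * and save the fault-event control/data/address registers of every active
 * IOMMU so that iommu_resume() can restore them once init_iommu_hw() has
 * re-initialized the hardware.
 */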
4119 static int iommu_suspend(void)
4120 {
4121         struct dmar_drhd_unit *drhd;
4122         struct intel_iommu *iommu = NULL;
4123         unsigned long flag;
4124
4125         for_each_active_iommu(iommu, drhd) {
4126                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4127                                                  GFP_ATOMIC);
4128                 if (!iommu->iommu_state)
4129                         goto nomem;
4130         }
4131
4132         iommu_flush_all();
4133
4134         for_each_active_iommu(iommu, drhd) {
4135                 iommu_disable_translation(iommu);
4136
4137                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4138
4139                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4140                         readl(iommu->reg + DMAR_FECTL_REG);
4141                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4142                         readl(iommu->reg + DMAR_FEDATA_REG);
4143                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4144                         readl(iommu->reg + DMAR_FEADDR_REG);
4145                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4146                         readl(iommu->reg + DMAR_FEUADDR_REG);
4147
4148                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4149         }
4150         return 0;
4151
4152 nomem:
4153         for_each_active_iommu(iommu, drhd)
4154                 kfree(iommu->iommu_state);
4155
4156         return -ENOMEM;
4157 }
4158
4159 static void iommu_resume(void)
4160 {
4161         struct dmar_drhd_unit *drhd;
4162         struct intel_iommu *iommu = NULL;
4163         unsigned long flag;
4164
4165         if (init_iommu_hw()) {
4166                 if (force_on)
4167                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4168                 else
4169                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4170                 return;
4171         }
4172
4173         for_each_active_iommu(iommu, drhd) {
4174
4175                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4176
4177                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4178                         iommu->reg + DMAR_FECTL_REG);
4179                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4180                         iommu->reg + DMAR_FEDATA_REG);
4181                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4182                         iommu->reg + DMAR_FEADDR_REG);
4183                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4184                         iommu->reg + DMAR_FEUADDR_REG);
4185
4186                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4187         }
4188
4189         for_each_active_iommu(iommu, drhd)
4190                 kfree(iommu->iommu_state);
4191 }
4192
4193 static struct syscore_ops iommu_syscore_ops = {
4194         .resume         = iommu_resume,
4195         .suspend        = iommu_suspend,
4196 };
4197
4198 static void __init init_iommu_pm_ops(void)
4199 {
4200         register_syscore_ops(&iommu_syscore_ops);
4201 }
4202
4203 #else
4204 static inline void init_iommu_pm_ops(void) {}
4205 #endif  /* CONFIG_SUSPEND */
4206
4207 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4208 {
4209         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4210             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4211             rmrr->end_address <= rmrr->base_address ||
4212             arch_rmrr_sanity_check(rmrr))
4213                 return -EINVAL;
4214
4215         return 0;
4216 }
4217
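/*
 * Parse one ACPI RMRR (Reserved Memory Region Reporting) structure and add
 * it to dmar_rmrr_units.  A malformed region is reported as a firmware bug
 * and taints the kernel, but parsing continues anyway.
 */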
4218 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4219 {
4220         struct acpi_dmar_reserved_memory *rmrr;
4221         struct dmar_rmrr_unit *rmrru;
4222
4223         rmrr = (struct acpi_dmar_reserved_memory *)header;
4224         if (rmrr_sanity_check(rmrr)) {
4225                 pr_warn(FW_BUG
4226                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4227                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4228                            rmrr->base_address, rmrr->end_address,
4229                            dmi_get_system_info(DMI_BIOS_VENDOR),
4230                            dmi_get_system_info(DMI_BIOS_VERSION),
4231                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4232                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4233         }
4234
4235         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4236         if (!rmrru)
4237                 goto out;
4238
4239         rmrru->hdr = header;
4240
4241         rmrru->base_address = rmrr->base_address;
4242         rmrru->end_address = rmrr->end_address;
4243
4244         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4245                                 ((void *)rmrr) + rmrr->header.length,
4246                                 &rmrru->devices_cnt);
4247         if (rmrru->devices_cnt && rmrru->devices == NULL)
4248                 goto free_rmrru;
4249
4250         list_add(&rmrru->list, &dmar_rmrr_units);
4251
4252         return 0;
4253 free_rmrru:
4254         kfree(rmrru);
4255 out:
4256         return -ENOMEM;
4257 }
4258
4259 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4260 {
4261         struct dmar_atsr_unit *atsru;
4262         struct acpi_dmar_atsr *tmp;
4263
4264         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4265                                 dmar_rcu_check()) {
4266                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4267                 if (atsr->segment != tmp->segment)
4268                         continue;
4269                 if (atsr->header.length != tmp->header.length)
4270                         continue;
4271                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4272                         return atsru;
4273         }
4274
4275         return NULL;
4276 }
4277
4278 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4279 {
4280         struct acpi_dmar_atsr *atsr;
4281         struct dmar_atsr_unit *atsru;
4282
4283         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4284                 return 0;
4285
4286         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4287         atsru = dmar_find_atsr(atsr);
4288         if (atsru)
4289                 return 0;
4290
4291         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4292         if (!atsru)
4293                 return -ENOMEM;
4294
4295         /*
4296          * If memory is allocated from slab by ACPI _DSM method, we need to
4297          * copy the memory content because the memory buffer will be freed
4298          * on return.
4299          */
4300         atsru->hdr = (void *)(atsru + 1);
4301         memcpy(atsru->hdr, hdr, hdr->length);
4302         atsru->include_all = atsr->flags & 0x1;
4303         if (!atsru->include_all) {
4304                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4305                                 (void *)atsr + atsr->header.length,
4306                                 &atsru->devices_cnt);
4307                 if (atsru->devices_cnt && atsru->devices == NULL) {
4308                         kfree(atsru);
4309                         return -ENOMEM;
4310                 }
4311         }
4312
4313         list_add_rcu(&atsru->list, &dmar_atsr_units);
4314
4315         return 0;
4316 }
4317
4318 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4319 {
4320         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4321         kfree(atsru);
4322 }
4323
4324 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4325 {
4326         struct acpi_dmar_atsr *atsr;
4327         struct dmar_atsr_unit *atsru;
4328
4329         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4330         atsru = dmar_find_atsr(atsr);
4331         if (atsru) {
4332                 list_del_rcu(&atsru->list);
4333                 synchronize_rcu();
4334                 intel_iommu_free_atsr(atsru);
4335         }
4336
4337         return 0;
4338 }
4339
4340 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4341 {
4342         int i;
4343         struct device *dev;
4344         struct acpi_dmar_atsr *atsr;
4345         struct dmar_atsr_unit *atsru;
4346
4347         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4348         atsru = dmar_find_atsr(atsr);
4349         if (!atsru)
4350                 return 0;
4351
4352         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4353                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4354                                           i, dev)
4355                         return -EBUSY;
4356         }
4357
4358         return 0;
4359 }
4360
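/*
 * Bring up a hot-added DMAR unit: check that it provides the capabilities
 * the running system already relies on (pass-through, snooping, super-page
 * sizes), then set up domains, the root entry, queued invalidation and the
 * fault interrupt, and finally enable translation.
 */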
4361 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4362 {
4363         int sp, ret;
4364         struct intel_iommu *iommu = dmaru->iommu;
4365
4366         if (g_iommus[iommu->seq_id])
4367                 return 0;
4368
4369         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4370                 pr_warn("%s: Doesn't support hardware pass through.\n",
4371                         iommu->name);
4372                 return -ENXIO;
4373         }
4374         if (!ecap_sc_support(iommu->ecap) &&
4375             domain_update_iommu_snooping(iommu)) {
4376                 pr_warn("%s: Doesn't support snooping.\n",
4377                         iommu->name);
4378                 return -ENXIO;
4379         }
4380         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4381         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4382                 pr_warn("%s: Doesn't support large page.\n",
4383                         iommu->name);
4384                 return -ENXIO;
4385         }
4386
4387         /*
4388          * Disable translation if already enabled prior to OS handover.
4389          */
4390         if (iommu->gcmd & DMA_GCMD_TE)
4391                 iommu_disable_translation(iommu);
4392
4393         g_iommus[iommu->seq_id] = iommu;
4394         ret = iommu_init_domains(iommu);
4395         if (ret == 0)
4396                 ret = iommu_alloc_root_entry(iommu);
4397         if (ret)
4398                 goto out;
4399
4400         intel_svm_check(iommu);
4401
4402         if (dmaru->ignored) {
4403                 /*
4404                  * we always have to disable PMRs or DMA may fail on this device
4405                  */
4406                 if (force_on)
4407                         iommu_disable_protect_mem_regions(iommu);
4408                 return 0;
4409         }
4410
4411         intel_iommu_init_qi(iommu);
4412         iommu_flush_write_buffer(iommu);
4413
4414 #ifdef CONFIG_INTEL_IOMMU_SVM
4415         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4416                 ret = intel_svm_enable_prq(iommu);
4417                 if (ret)
4418                         goto disable_iommu;
4419         }
4420 #endif
4421         ret = dmar_set_interrupt(iommu);
4422         if (ret)
4423                 goto disable_iommu;
4424
4425         iommu_set_root_entry(iommu);
4426         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4427         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4428         iommu_enable_translation(iommu);
4429
4430         iommu_disable_protect_mem_regions(iommu);
4431         return 0;
4432
4433 disable_iommu:
4434         disable_dmar_iommu(iommu);
4435 out:
4436         free_dmar_iommu(iommu);
4437         return ret;
4438 }
4439
4440 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4441 {
4442         int ret = 0;
4443         struct intel_iommu *iommu = dmaru->iommu;
4444
4445         if (!intel_iommu_enabled)
4446                 return 0;
4447         if (iommu == NULL)
4448                 return -EINVAL;
4449
4450         if (insert) {
4451                 ret = intel_iommu_add(dmaru);
4452         } else {
4453                 disable_dmar_iommu(iommu);
4454                 free_dmar_iommu(iommu);
4455         }
4456
4457         return ret;
4458 }
4459
4460 static void intel_iommu_free_dmars(void)
4461 {
4462         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4463         struct dmar_atsr_unit *atsru, *atsr_n;
4464
4465         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4466                 list_del(&rmrru->list);
4467                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4468                 kfree(rmrru);
4469         }
4470
4471         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4472                 list_del(&atsru->list);
4473                 intel_iommu_free_atsr(atsru);
4474         }
4475 }
4476
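/*
 * In short: return 1 when ATS may be used for @dev (integrated device
 * with no upstream bridge, root port listed in an ATSR for the same
 * segment, or an include-all ATSR present), and 0 otherwise (non-PCIe
 * bridge in the path, or no matching ATSR).
 */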
4477 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4478 {
4479         int i, ret = 1;
4480         struct pci_bus *bus;
4481         struct pci_dev *bridge = NULL;
4482         struct device *tmp;
4483         struct acpi_dmar_atsr *atsr;
4484         struct dmar_atsr_unit *atsru;
4485
4486         dev = pci_physfn(dev);
4487         for (bus = dev->bus; bus; bus = bus->parent) {
4488                 bridge = bus->self;
4489                 /* If it's an integrated device, allow ATS */
4490                 if (!bridge)
4491                         return 1;
4492                 /* Connected via non-PCIe: no ATS */
4493                 if (!pci_is_pcie(bridge) ||
4494                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4495                         return 0;
4496                 /* If we found the root port, look it up in the ATSR */
4497                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4498                         break;
4499         }
4500
4501         rcu_read_lock();
4502         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4503                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4504                 if (atsr->segment != pci_domain_nr(dev->bus))
4505                         continue;
4506
4507                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4508                         if (tmp == &bridge->dev)
4509                                 goto out;
4510
4511                 if (atsru->include_all)
4512                         goto out;
4513         }
4514         ret = 0;
4515 out:
4516         rcu_read_unlock();
4517
4518         return ret;
4519 }
4520
4521 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4522 {
4523         int ret;
4524         struct dmar_rmrr_unit *rmrru;
4525         struct dmar_atsr_unit *atsru;
4526         struct acpi_dmar_atsr *atsr;
4527         struct acpi_dmar_reserved_memory *rmrr;
4528
4529         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4530                 return 0;
4531
4532         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4533                 rmrr = container_of(rmrru->hdr,
4534                                     struct acpi_dmar_reserved_memory, header);
4535                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4536                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4537                                 ((void *)rmrr) + rmrr->header.length,
4538                                 rmrr->segment, rmrru->devices,
4539                                 rmrru->devices_cnt);
4540                         if (ret < 0)
4541                                 return ret;
4542                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4543                         dmar_remove_dev_scope(info, rmrr->segment,
4544                                 rmrru->devices, rmrru->devices_cnt);
4545                 }
4546         }
4547
4548         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4549                 if (atsru->include_all)
4550                         continue;
4551
4552                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4553                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4554                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4555                                         (void *)atsr + atsr->header.length,
4556                                         atsr->segment, atsru->devices,
4557                                         atsru->devices_cnt);
4558                         if (ret > 0)
4559                                 break;
4560                         else if (ret < 0)
4561                                 return ret;
4562                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4563                         if (dmar_remove_dev_scope(info, atsr->segment,
4564                                         atsru->devices, atsru->devices_cnt))
4565                                 break;
4566                 }
4567         }
4568
4569         return 0;
4570 }
4571
4572 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4573                                        unsigned long val, void *v)
4574 {
4575         struct memory_notify *mhp = v;
4576         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4577         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4578                         mhp->nr_pages - 1);
4579
4580         switch (val) {
4581         case MEM_GOING_ONLINE:
4582                 if (iommu_domain_identity_map(si_domain,
4583                                               start_vpfn, last_vpfn)) {
4584                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
4585                                 start_vpfn, last_vpfn);
4586                         return NOTIFY_BAD;
4587                 }
4588                 break;
4589
4590         case MEM_OFFLINE:
4591         case MEM_CANCEL_ONLINE:
4592                 {
4593                         struct dmar_drhd_unit *drhd;
4594                         struct intel_iommu *iommu;
4595                         struct page *freelist;
4596
4597                         freelist = domain_unmap(si_domain,
4598                                                 start_vpfn, last_vpfn);
4599
4600                         rcu_read_lock();
4601                         for_each_active_iommu(iommu, drhd)
4602                                 iommu_flush_iotlb_psi(iommu, si_domain,
4603                                         start_vpfn, mhp->nr_pages,
4604                                         !freelist, 0);
4605                         rcu_read_unlock();
4606                         dma_free_pagelist(freelist);
4607                 }
4608                 break;
4609         }
4610
4611         return NOTIFY_OK;
4612 }
4613
4614 static struct notifier_block intel_iommu_memory_nb = {
4615         .notifier_call = intel_iommu_memory_notifier,
4616         .priority = 0
4617 };
4618
4619 static void free_all_cpu_cached_iovas(unsigned int cpu)
4620 {
4621         int i;
4622
4623         for (i = 0; i < g_num_of_iommus; i++) {
4624                 struct intel_iommu *iommu = g_iommus[i];
4625                 struct dmar_domain *domain;
4626                 int did;
4627
4628                 if (!iommu)
4629                         continue;
4630
4631                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4632                         domain = get_iommu_domain(iommu, (u16)did);
4633
4634                         if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4635                                 continue;
4636
4637                         free_cpu_cached_iovas(cpu, &domain->iovad);
4638                 }
4639         }
4640 }
4641
4642 static int intel_iommu_cpu_dead(unsigned int cpu)
4643 {
4644         free_all_cpu_cached_iovas(cpu);
4645         return 0;
4646 }
4647
4648 static void intel_disable_iommus(void)
4649 {
4650         struct intel_iommu *iommu = NULL;
4651         struct dmar_drhd_unit *drhd;
4652
4653         for_each_iommu(iommu, drhd)
4654                 iommu_disable_translation(iommu);
4655 }
4656
4657 void intel_iommu_shutdown(void)
4658 {
4659         struct dmar_drhd_unit *drhd;
4660         struct intel_iommu *iommu = NULL;
4661
4662         if (no_iommu || dmar_disabled)
4663                 return;
4664
4665         down_write(&dmar_global_lock);
4666
4667         /* Disable PMRs explicitly here. */
4668         for_each_iommu(iommu, drhd)
4669                 iommu_disable_protect_mem_regions(iommu);
4670
4671         /* Make sure the IOMMUs are switched off */
4672         intel_disable_iommus();
4673
4674         up_write(&dmar_global_lock);
4675 }
4676
4677 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4678 {
4679         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4680
4681         return container_of(iommu_dev, struct intel_iommu, iommu);
4682 }
4683
4684 static ssize_t intel_iommu_show_version(struct device *dev,
4685                                         struct device_attribute *attr,
4686                                         char *buf)
4687 {
4688         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4689         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4690         return sprintf(buf, "%d:%d\n",
4691                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4692 }
4693 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4694
4695 static ssize_t intel_iommu_show_address(struct device *dev,
4696                                         struct device_attribute *attr,
4697                                         char *buf)
4698 {
4699         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4700         return sprintf(buf, "%llx\n", iommu->reg_phys);
4701 }
4702 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4703
4704 static ssize_t intel_iommu_show_cap(struct device *dev,
4705                                     struct device_attribute *attr,
4706                                     char *buf)
4707 {
4708         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4709         return sprintf(buf, "%llx\n", iommu->cap);
4710 }
4711 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4712
4713 static ssize_t intel_iommu_show_ecap(struct device *dev,
4714                                     struct device_attribute *attr,
4715                                     char *buf)
4716 {
4717         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4718         return sprintf(buf, "%llx\n", iommu->ecap);
4719 }
4720 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4721
4722 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4723                                       struct device_attribute *attr,
4724                                       char *buf)
4725 {
4726         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4727         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4728 }
4729 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4730
4731 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4732                                            struct device_attribute *attr,
4733                                            char *buf)
4734 {
4735         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4736         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4737                                                   cap_ndoms(iommu->cap)));
4738 }
4739 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4740
4741 static struct attribute *intel_iommu_attrs[] = {
4742         &dev_attr_version.attr,
4743         &dev_attr_address.attr,
4744         &dev_attr_cap.attr,
4745         &dev_attr_ecap.attr,
4746         &dev_attr_domains_supported.attr,
4747         &dev_attr_domains_used.attr,
4748         NULL,
4749 };
4750
4751 static struct attribute_group intel_iommu_group = {
4752         .name = "intel-iommu",
4753         .attrs = intel_iommu_attrs,
4754 };
4755
4756 const struct attribute_group *intel_iommu_groups[] = {
4757         &intel_iommu_group,
4758         NULL,
4759 };
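
/*
 * Each registered IOMMU exposes the group above through its sysfs node,
 * e.g. (illustrative path, assuming the usual /sys/class/iommu layout):
 *
 *   /sys/class/iommu/dmar0/intel-iommu/{version,address,cap,ecap,
 *                                       domains_supported,domains_used}
 */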
4760
4761 static inline bool has_external_pci(void)
4762 {
4763         struct pci_dev *pdev = NULL;
4764
4765         for_each_pci_dev(pdev)
4766                 if (pdev->external_facing)
4767                         return true;
4768
4769         return false;
4770 }
4771
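/*
 * Force the IOMMU on when the platform firmware opted in to DMA
 * protection and an external-facing PCI device is present, even if the
 * IOMMU was disabled on the command line.  Returns 1 if the IOMMU was
 * force enabled, 0 otherwise.
 */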
4772 static int __init platform_optin_force_iommu(void)
4773 {
4774         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4775                 return 0;
4776
4777         if (no_iommu || dmar_disabled)
4778                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4779
4780         /*
4781          * If Intel-IOMMU is disabled by default, we will apply identity
4782          * map for all devices except those marked as being untrusted.
4783          */
4784         if (dmar_disabled)
4785                 iommu_set_default_passthrough(false);
4786
4787         dmar_disabled = 0;
4788         no_iommu = 0;
4789
4790         return 1;
4791 }
4792
4793 static int __init probe_acpi_namespace_devices(void)
4794 {
4795         struct dmar_drhd_unit *drhd;
4796         /* To avoid a -Wunused-but-set-variable warning. */
4797         struct intel_iommu *iommu __maybe_unused;
4798         struct device *dev;
4799         int i, ret = 0;
4800
4801         for_each_active_iommu(iommu, drhd) {
4802                 for_each_active_dev_scope(drhd->devices,
4803                                           drhd->devices_cnt, i, dev) {
4804                         struct acpi_device_physical_node *pn;
4805                         struct iommu_group *group;
4806                         struct acpi_device *adev;
4807
4808                         if (dev->bus != &acpi_bus_type)
4809                                 continue;
4810
4811                         adev = to_acpi_device(dev);
4812                         mutex_lock(&adev->physical_node_lock);
4813                         list_for_each_entry(pn,
4814                                             &adev->physical_node_list, node) {
4815                                 group = iommu_group_get(pn->dev);
4816                                 if (group) {
4817                                         iommu_group_put(group);
4818                                         continue;
4819                                 }
4820
4821                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4822                                 ret = iommu_probe_device(pn->dev);
4823                                 if (ret)
4824                                         break;
4825                         }
4826                         mutex_unlock(&adev->physical_node_lock);
4827
4828                         if (ret)
4829                                 return ret;
4830                 }
4831         }
4832
4833         return 0;
4834 }
4835
4836 int __init intel_iommu_init(void)
4837 {
4838         int ret = -ENODEV;
4839         struct dmar_drhd_unit *drhd;
4840         struct intel_iommu *iommu;
4841
4842         /*
4843          * Intel IOMMU is required for a TXT/tboot launch or platform
4844          * opt in, so enforce that.
4845          */
4846         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4847
4848         if (iommu_init_mempool()) {
4849                 if (force_on)
4850                         panic("tboot: Failed to initialize iommu memory\n");
4851                 return -ENOMEM;
4852         }
4853
4854         down_write(&dmar_global_lock);
4855         if (dmar_table_init()) {
4856                 if (force_on)
4857                         panic("tboot: Failed to initialize DMAR table\n");
4858                 goto out_free_dmar;
4859         }
4860
4861         if (dmar_dev_scope_init() < 0) {
4862                 if (force_on)
4863                         panic("tboot: Failed to initialize DMAR device scope\n");
4864                 goto out_free_dmar;
4865         }
4866
4867         up_write(&dmar_global_lock);
4868
4869         /*
4870          * The bus notifier takes the dmar_global_lock, so lockdep would
4871          * complain later if we registered it while holding the lock.
4872          */

4873         dmar_register_bus_notifier();
4874
4875         down_write(&dmar_global_lock);
4876
4877         if (!no_iommu)
4878                 intel_iommu_debugfs_init();
4879
4880         if (no_iommu || dmar_disabled) {
4881                 /*
4882                  * We exit the function here to ensure the IOMMU's remapping and
4883                  * mempool aren't set up, which means that the IOMMU's PMRs
4884                  * won't be disabled via the call to init_dmars(). So disable
4885                  * them explicitly here. The PMRs were set up by tboot prior to
4886                  * calling SENTER, but the kernel is expected to reset/tear
4887                  * down the PMRs.
4888                  */
4889                 if (intel_iommu_tboot_noforce) {
4890                         for_each_iommu(iommu, drhd)
4891                                 iommu_disable_protect_mem_regions(iommu);
4892                 }
4893
4894                 /*
4895                  * Make sure the IOMMUs are switched off, even when we
4896                  * boot into a kexec kernel and the previous kernel left
4897                  * them enabled
4898                  */
4899                 intel_disable_iommus();
4900                 goto out_free_dmar;
4901         }
4902
4903         if (list_empty(&dmar_rmrr_units))
4904                 pr_info("No RMRR found\n");
4905
4906         if (list_empty(&dmar_atsr_units))
4907                 pr_info("No ATSR found\n");
4908
4909         if (dmar_init_reserved_ranges()) {
4910                 if (force_on)
4911                         panic("tboot: Failed to reserve iommu ranges\n");
4912                 goto out_free_reserved_range;
4913         }
4914
4915         if (dmar_map_gfx)
4916                 intel_iommu_gfx_mapped = 1;
4917
4918         init_no_remapping_devices();
4919
4920         ret = init_dmars();
4921         if (ret) {
4922                 if (force_on)
4923                         panic("tboot: Failed to initialize DMARs\n");
4924                 pr_err("Initialization failed\n");
4925                 goto out_free_reserved_range;
4926         }
4927         up_write(&dmar_global_lock);
4928
4929         init_iommu_pm_ops();
4930
4931         down_read(&dmar_global_lock);
4932         for_each_active_iommu(iommu, drhd) {
4933                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4934                                        intel_iommu_groups,
4935                                        "%s", iommu->name);
4936                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4937                 iommu_device_register(&iommu->iommu);
4938         }
4939         up_read(&dmar_global_lock);
4940
4941         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4942         if (si_domain && !hw_pass_through)
4943                 register_memory_notifier(&intel_iommu_memory_nb);
4944         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4945                           intel_iommu_cpu_dead);
4946
4947         down_read(&dmar_global_lock);
4948         if (probe_acpi_namespace_devices())
4949                 pr_warn("ACPI name space devices didn't probe correctly\n");
4950
4951         /* Finally, we enable the DMA remapping hardware. */
4952         for_each_iommu(iommu, drhd) {
4953                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4954                         iommu_enable_translation(iommu);
4955
4956                 iommu_disable_protect_mem_regions(iommu);
4957         }
4958         up_read(&dmar_global_lock);
4959
4960         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4961
4962         intel_iommu_enabled = 1;
4963
4964         return 0;
4965
4966 out_free_reserved_range:
4967         put_iova_domain(&reserved_iova_list);
4968 out_free_dmar:
4969         intel_iommu_free_dmars();
4970         up_write(&dmar_global_lock);
4971         iommu_exit_mempool();
4972         return ret;
4973 }
4974
4975 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4976 {
4977         struct intel_iommu *iommu = opaque;
4978
4979         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4980         return 0;
4981 }
4982
4983 /*
4984  * NB - intel-iommu lacks any sort of reference counting for the users of
4985  * dependent devices.  If multiple endpoints have intersecting dependent
4986  * devices, unbinding the driver from any one of them will possibly leave
4987  * the others unable to operate.
4988  */
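/*
 * Illustrative example of the problem: two endpoints behind the same
 * PCIe-to-PCI bridge share the bridge's requester-ID alias, so clearing
 * the context entries for one device's DMA aliases below also clears the
 * entry the other device still depends on.
 */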
4989 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4990 {
4991         if (!iommu || !dev || !dev_is_pci(dev))
4992                 return;
4993
4994         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4995 }
4996
4997 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4998 {
4999         struct dmar_domain *domain;
5000         struct intel_iommu *iommu;
5001         unsigned long flags;
5002
5003         assert_spin_locked(&device_domain_lock);
5004
5005         if (WARN_ON(!info))
5006                 return;
5007
5008         iommu = info->iommu;
5009         domain = info->domain;
5010
5011         if (info->dev) {
5012                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5013                         intel_pasid_tear_down_entry(iommu, info->dev,
5014                                         PASID_RID2PASID, false);
5015
5016                 iommu_disable_dev_iotlb(info);
5017                 if (!dev_is_real_dma_subdevice(info->dev))
5018                         domain_context_clear(iommu, info->dev);
5019                 intel_pasid_free_table(info->dev);
5020         }
5021
5022         unlink_domain_info(info);
5023
5024         spin_lock_irqsave(&iommu->lock, flags);
5025         domain_detach_iommu(domain, iommu);
5026         spin_unlock_irqrestore(&iommu->lock, flags);
5027
5028         free_devinfo_mem(info);
5029 }
5030
5031 static void dmar_remove_one_dev_info(struct device *dev)
5032 {
5033         struct device_domain_info *info;
5034         unsigned long flags;
5035
5036         spin_lock_irqsave(&device_domain_lock, flags);
5037         info = get_domain_info(dev);
5038         if (info)
5039                 __dmar_remove_one_dev_info(info);
5040         spin_unlock_irqrestore(&device_domain_lock, flags);
5041 }
5042
5043 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5044 {
5045         int adjust_width;
5046
5047         /* calculate AGAW */
5048         domain->gaw = guest_width;
5049         adjust_width = guestwidth_to_adjustwidth(guest_width);
5050         domain->agaw = width_to_agaw(adjust_width);
5051
5052         domain->iommu_coherency = 0;
5053         domain->iommu_snooping = 0;
5054         domain->iommu_superpage = 0;
5055         domain->max_addr = 0;
5056
5057         /* always allocate the top pgd */
5058         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5059         if (!domain->pgd)
5060                 return -ENOMEM;
5061         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5062         return 0;
5063 }
5064
5065 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5066 {
5067         init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5068         copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5069
5070         if (!intel_iommu_strict &&
5071             init_iova_flush_queue(&dmar_domain->iovad,
5072                                   iommu_flush_iova, iova_entry_free))
5073                 pr_info("iova flush queue initialization failed\n");
5074 }
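
/*
 * Note: with the flush queue in place (non-strict mode), freed IOVAs are
 * batched and the IOTLB is invalidated lazily; booting with
 * intel_iommu=strict keeps the immediate per-unmap invalidation instead.
 */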
5075
5076 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5077 {
5078         struct dmar_domain *dmar_domain;
5079         struct iommu_domain *domain;
5080
5081         switch (type) {
5082         case IOMMU_DOMAIN_DMA:
5083         case IOMMU_DOMAIN_UNMANAGED:
5084                 dmar_domain = alloc_domain(0);
5085                 if (!dmar_domain) {
5086                         pr_err("Can't allocate dmar_domain\n");
5087                         return NULL;
5088                 }
5089                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5090                         pr_err("Domain initialization failed\n");
5091                         domain_exit(dmar_domain);
5092                         return NULL;
5093                 }
5094
5095                 if (type == IOMMU_DOMAIN_DMA)
5096                         intel_init_iova_domain(dmar_domain);
5097
5098                 domain_update_iommu_cap(dmar_domain);
5099
5100                 domain = &dmar_domain->domain;
5101                 domain->geometry.aperture_start = 0;
5102                 domain->geometry.aperture_end   =
5103                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5104                 domain->geometry.force_aperture = true;
5105
5106                 return domain;
5107         case IOMMU_DOMAIN_IDENTITY:
5108                 return &si_domain->domain;
5109         default:
5110                 return NULL;
5111         }
5112
5113         return NULL;
5114 }
5115
5116 static void intel_iommu_domain_free(struct iommu_domain *domain)
5117 {
5118         if (domain != &si_domain->domain)
5119                 domain_exit(to_dmar_domain(domain));
5120 }
5121
5122 /*
5123  * Check whether a @domain could be attached to the @dev through the
5124  * aux-domain attach/detach APIs.
5125  */
5126 static inline bool
5127 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5128 {
5129         struct device_domain_info *info = get_domain_info(dev);
5130
5131         return info && info->auxd_enabled &&
5132                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5133 }
5134
5135 static void auxiliary_link_device(struct dmar_domain *domain,
5136                                   struct device *dev)
5137 {
5138         struct device_domain_info *info = get_domain_info(dev);
5139
5140         assert_spin_locked(&device_domain_lock);
5141         if (WARN_ON(!info))
5142                 return;
5143
5144         domain->auxd_refcnt++;
5145         list_add(&domain->auxd, &info->auxiliary_domains);
5146 }
5147
5148 static void auxiliary_unlink_device(struct dmar_domain *domain,
5149                                     struct device *dev)
5150 {
5151         struct device_domain_info *info = get_domain_info(dev);
5152
5153         assert_spin_locked(&device_domain_lock);
5154         if (WARN_ON(!info))
5155                 return;
5156
5157         list_del(&domain->auxd);
5158         domain->auxd_refcnt--;
5159
5160         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5161                 ioasid_free(domain->default_pasid);
5162 }
5163
5164 static int aux_domain_add_dev(struct dmar_domain *domain,
5165                               struct device *dev)
5166 {
5167         int ret;
5168         unsigned long flags;
5169         struct intel_iommu *iommu;
5170
5171         iommu = device_to_iommu(dev, NULL, NULL);
5172         if (!iommu)
5173                 return -ENODEV;
5174
5175         if (domain->default_pasid <= 0) {
5176                 u32 pasid;
5177
5178                 /* No private data needed for the default pasid */
5179                 pasid = ioasid_alloc(NULL, PASID_MIN,
5180                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5181                                      NULL);
5182                 if (pasid == INVALID_IOASID) {
5183                         pr_err("Can't allocate default pasid\n");
5184                         return -ENODEV;
5185                 }
5186                 domain->default_pasid = pasid;
5187         }
5188
5189         spin_lock_irqsave(&device_domain_lock, flags);
5190         /*
5191          * iommu->lock must be held to attach the domain to the iommu and to
5192          * set up the pasid entry for second-level translation.
5193          */
5194         spin_lock(&iommu->lock);
5195         ret = domain_attach_iommu(domain, iommu);
5196         if (ret)
5197                 goto attach_failed;
5198
5199         /* Set up the PASID entry for mediated devices: */
5200         if (domain_use_first_level(domain))
5201                 ret = domain_setup_first_level(iommu, domain, dev,
5202                                                domain->default_pasid);
5203         else
5204                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5205                                                      domain->default_pasid);
5206         if (ret)
5207                 goto table_failed;
5208         spin_unlock(&iommu->lock);
5209
5210         auxiliary_link_device(domain, dev);
5211
5212         spin_unlock_irqrestore(&device_domain_lock, flags);
5213
5214         return 0;
5215
5216 table_failed:
5217         domain_detach_iommu(domain, iommu);
5218 attach_failed:
5219         spin_unlock(&iommu->lock);
5220         spin_unlock_irqrestore(&device_domain_lock, flags);
5221         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5222                 ioasid_free(domain->default_pasid);
5223
5224         return ret;
5225 }
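
/*
 * Lock ordering in the aux-domain paths above and below is
 * device_domain_lock (outer, irqsave) -> iommu->lock (inner), matching
 * __dmar_remove_one_dev_info().
 */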
5226
5227 static void aux_domain_remove_dev(struct dmar_domain *domain,
5228                                   struct device *dev)
5229 {
5230         struct device_domain_info *info;
5231         struct intel_iommu *iommu;
5232         unsigned long flags;
5233
5234         if (!is_aux_domain(dev, &domain->domain))
5235                 return;
5236
5237         spin_lock_irqsave(&device_domain_lock, flags);
5238         info = get_domain_info(dev);
5239         iommu = info->iommu;
5240
5241         auxiliary_unlink_device(domain, dev);
5242
5243         spin_lock(&iommu->lock);
5244         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5245         domain_detach_iommu(domain, iommu);
5246         spin_unlock(&iommu->lock);
5247
5248         spin_unlock_irqrestore(&device_domain_lock, flags);
5249 }
5250
5251 static int prepare_domain_attach_device(struct iommu_domain *domain,
5252                                         struct device *dev)
5253 {
5254         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5255         struct intel_iommu *iommu;
5256         int addr_width;
5257
5258         iommu = device_to_iommu(dev, NULL, NULL);
5259         if (!iommu)
5260                 return -ENODEV;
5261
5262         /* check if this iommu agaw is sufficient for max mapped address */
5263         addr_width = agaw_to_width(iommu->agaw);
5264         if (addr_width > cap_mgaw(iommu->cap))
5265                 addr_width = cap_mgaw(iommu->cap);
5266
5267         if (dmar_domain->max_addr > (1LL << addr_width)) {
5268                 dev_err(dev, "%s: iommu width (%d) is not "
5269                         "sufficient for the mapped address (%llx)\n",
5270                         __func__, addr_width, dmar_domain->max_addr);
5271                 return -EFAULT;
5272         }
5273         dmar_domain->gaw = addr_width;
5274
5275         /*
5276          * Knock out extra levels of page tables if necessary
5277          */
5278         while (iommu->agaw < dmar_domain->agaw) {
5279                 struct dma_pte *pte;
5280
5281                 pte = dmar_domain->pgd;
5282                 if (dma_pte_present(pte)) {
5283                         dmar_domain->pgd = (struct dma_pte *)
5284                                 phys_to_virt(dma_pte_addr(pte));
5285                         free_pgtable_page(pte);
5286                 }
5287                 dmar_domain->agaw--;
5288         }
5289
5290         return 0;
5291 }
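
/*
 * Example of the level knock-down above (widths assume the usual VT-d
 * AGAW encoding): a domain built with 4-level tables (48-bit) attached
 * to an IOMMU that only supports 3 levels (39-bit) drops its top page
 * directory until iommu->agaw >= dmar_domain->agaw.
 */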
5292
5293 static int intel_iommu_attach_device(struct iommu_domain *domain,
5294                                      struct device *dev)
5295 {
5296         int ret;
5297
5298         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5299             device_is_rmrr_locked(dev)) {
5300                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5301                 return -EPERM;
5302         }
5303
5304         if (is_aux_domain(dev, domain))
5305                 return -EPERM;
5306
5307         /* normally dev is not mapped */
5308         if (unlikely(domain_context_mapped(dev))) {
5309                 struct dmar_domain *old_domain;
5310
5311                 old_domain = find_domain(dev);
5312                 if (old_domain)
5313                         dmar_remove_one_dev_info(dev);
5314         }
5315
5316         ret = prepare_domain_attach_device(domain, dev);
5317         if (ret)
5318                 return ret;
5319
5320         return domain_add_dev_info(to_dmar_domain(domain), dev);
5321 }
5322
5323 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5324                                          struct device *dev)
5325 {
5326         int ret;
5327
5328         if (!is_aux_domain(dev, domain))
5329                 return -EPERM;
5330
5331         ret = prepare_domain_attach_device(domain, dev);
5332         if (ret)
5333                 return ret;
5334
5335         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5336 }
5337
5338 static void intel_iommu_detach_device(struct iommu_domain *domain,
5339                                       struct device *dev)
5340 {
5341         dmar_remove_one_dev_info(dev);
5342 }
5343
5344 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5345                                           struct device *dev)
5346 {
5347         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5348 }
5349
5350 /*
5351  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5352  * VT-d granularity. Invalidation is typically included in the unmap operation
5353  * as a result of a DMA or VFIO unmap. However, for assigned devices the
5354  * guest owns the first-level page tables; invalidations of translation
5355  * caches in the guest are trapped and passed down to the host.
5356  *
5357  * The vIOMMU in the guest will only expose first-level page tables, so we
5358  * do not support IOTLB granularity for requests without a PASID (second level).
5359  *
5360  * For example, to find the VT-d granularity encoding for IOTLB
5361  * type and page selective granularity within PASID:
5362  * X: indexed by iommu cache type
5363  * Y: indexed by enum iommu_inv_granularity
5364  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5365  */
5366
5367 static const int
5368 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5369         /*
5370          * PASID based IOTLB invalidation: PASID selective (per PASID),
5371          * page selective (address granularity)
5372          */
5373         {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5374         /* PASID based dev TLBs */
5375         {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5376         /* PASID cache */
5377         {-EINVAL, -EINVAL, -EINVAL}
5378 };
5379
5380 static inline int to_vtd_granularity(int type, int granu)
5381 {
5382         return inv_type_granu_table[type][granu];
5383 }
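
/*
 * Illustration (assuming the usual uapi enum order DOMAIN, PASID, ADDR):
 * the IOTLB row combined with IOMMU_INV_GRANU_ADDR selects
 * QI_GRAN_PSI_PASID (page selective within PASID), while any -EINVAL
 * slot marks a combination that intel_iommu_sva_invalidate() treats as
 * unsupported.
 */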
5384
5385 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5386 {
5387         u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5388
5389         /* VT-d size is encoded as 2^size of 4K pages: 0 for 4KiB, 9 for 2MiB, etc.
5390          * The IOMMU cache invalidate API passes granu_size in bytes and the
5391          * number of granules that are contiguous in memory.
5392          */
5393         return order_base_2(nr_pages);
5394 }
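
/*
 * Worked example: granu_size = 4KiB and nr_granules = 512 gives
 * nr_pages = 512 and order_base_2(512) = 9, i.e. the 2MB encoding
 * mentioned above.
 */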
5395
5396 #ifdef CONFIG_INTEL_IOMMU_SVM
5397 static int
5398 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5399                            struct iommu_cache_invalidate_info *inv_info)
5400 {
5401         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5402         struct device_domain_info *info;
5403         struct intel_iommu *iommu;
5404         unsigned long flags;
5405         int cache_type;
5406         u8 bus, devfn;
5407         u16 did, sid;
5408         int ret = 0;
5409         u64 size = 0;
5410
5411         if (!inv_info || !dmar_domain ||
5412             inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
5413                 return -EINVAL;
5414
5415         if (!dev || !dev_is_pci(dev))
5416                 return -ENODEV;
5417
5418         iommu = device_to_iommu(dev, &bus, &devfn);
5419         if (!iommu)
5420                 return -ENODEV;
5421
5422         if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5423                 return -EINVAL;
5424
5425         spin_lock_irqsave(&device_domain_lock, flags);
5426         spin_lock(&iommu->lock);
5427         info = get_domain_info(dev);
5428         if (!info) {
5429                 ret = -EINVAL;
5430                 goto out_unlock;
5431         }
5432         did = dmar_domain->iommu_did[iommu->seq_id];
5433         sid = PCI_DEVID(bus, devfn);
5434
5435         /* Size is only valid in address selective invalidation */
5436         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5437                 size = to_vtd_size(inv_info->addr_info.granule_size,
5438                                    inv_info->addr_info.nb_granules);
5439
5440         for_each_set_bit(cache_type,
5441                          (unsigned long *)&inv_info->cache,
5442                          IOMMU_CACHE_INV_TYPE_NR) {
5443                 int granu = 0;
5444                 u64 pasid = 0;
5445                 u64 addr = 0;
5446
5447                 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5448                 if (granu == -EINVAL) {
5449                         pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5450                                            cache_type, inv_info->granularity);
5451                         break;
5452                 }
5453
5454                 /*
5455                  * PASID is stored in different locations based on the
5456                  * granularity.
5457                  */
5458                 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5459                     (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5460                         pasid = inv_info->pasid_info.pasid;
5461                 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5462                          (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5463                         pasid = inv_info->addr_info.pasid;
5464
5465                 switch (BIT(cache_type)) {
5466                 case IOMMU_CACHE_INV_TYPE_IOTLB:
5467                         /* HW will ignore LSB bits based on address mask */
5468                         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5469                             size &&
5470                             (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5471                                 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5472                                                    inv_info->addr_info.addr, size);
5473                         }
5474
5475                         /*
5476                          * If granu is PASID-selective, address is ignored.
5477                          * We use npages = -1 to indicate that.
5478                          */
5479                         qi_flush_piotlb(iommu, did, pasid,
5480                                         mm_to_dma_pfn(inv_info->addr_info.addr),
5481                                         (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5482                                         inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5483
5484                         if (!info->ats_enabled)
5485                                 break;
5486                         /*
5487                          * Always flush device IOTLB if ATS is enabled. vIOMMU
5488                          * in the guest may assume IOTLB flush is inclusive,
5489                          * which is more efficient.
5490                          */
5491                         fallthrough;
5492                 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5493                         /*
5494                          * PASID based device TLB invalidation does not support
5495                          * IOMMU_INV_GRANU_PASID granularity but only supports
5496                          * IOMMU_INV_GRANU_ADDR.
5497                          * We emulate that by setting the size to cover the
5498                          * entire 64-bit address range. The user provides only
5499                          * PASID info without address info, so we set addr to 0.
5500                          */
5501                         if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5502                                 size = 64 - VTD_PAGE_SHIFT;
5503                                 addr = 0;
5504                         } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5505                                 addr = inv_info->addr_info.addr;
5506                         }
5507
5508                         if (info->ats_enabled)
5509                                 qi_flush_dev_iotlb_pasid(iommu, sid,
5510                                                 info->pfsid, pasid,
5511                                                 info->ats_qdep, addr,
5512                                                 size);
5513                         else
5514                                 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5515                         break;
5516                 default:
5517                         dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5518                                             cache_type);
5519                         ret = -EINVAL;
5520                 }
5521         }
5522 out_unlock:
5523         spin_unlock(&iommu->lock);
5524         spin_unlock_irqrestore(&device_domain_lock, flags);
5525
5526         return ret;
5527 }
5528 #endif
5529
5530 static int intel_iommu_map(struct iommu_domain *domain,
5531                            unsigned long iova, phys_addr_t hpa,
5532                            size_t size, int iommu_prot, gfp_t gfp)
5533 {
5534         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5535         u64 max_addr;
5536         int prot = 0;
5537         int ret;
5538
5539         if (iommu_prot & IOMMU_READ)
5540                 prot |= DMA_PTE_READ;
5541         if (iommu_prot & IOMMU_WRITE)
5542                 prot |= DMA_PTE_WRITE;
5543         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5544                 prot |= DMA_PTE_SNP;
5545
5546         max_addr = iova + size;
5547         if (dmar_domain->max_addr < max_addr) {
5548                 u64 end;
5549
5550                 /* check if minimum agaw is sufficient for mapped address */
5551                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5552                 if (end < max_addr) {
5553                         pr_err("%s: iommu width (%d) is not "
5554                                "sufficient for the mapped address (%llx)\n",
5555                                __func__, dmar_domain->gaw, max_addr);
5556                         return -EFAULT;
5557                 }
5558                 dmar_domain->max_addr = max_addr;
5559         }
5560         /* Round up size to next multiple of PAGE_SIZE, if it and
5561            the low bits of hpa would take us onto the next page */
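        /*
         * For example, an 8KiB mapping whose hpa starts 0x234 bytes into
         * a page spills onto a third page, so three page frames are
         * mapped below.
         */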
5562         size = aligned_nrpages(hpa, size);
5563         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5564                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5565         return ret;
5566 }
5567
5568 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5569                                 unsigned long iova, size_t size,
5570                                 struct iommu_iotlb_gather *gather)
5571 {
5572         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5573         struct page *freelist = NULL;
5574         unsigned long start_pfn, last_pfn;
5575         unsigned int npages;
5576         int iommu_id, level = 0;
5577
5578         /* Cope with horrid API which requires us to unmap more than the
5579            size argument if it happens to be a large-page mapping. */
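        /*
         * E.g. a 4KiB unmap request whose IOVA falls inside a 2MiB
         * superpage is widened to the whole 2MiB before the page tables
         * and IOTLB are torn down.
         */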
5580         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5581
5582         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5583                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5584
5585         start_pfn = iova >> VTD_PAGE_SHIFT;
5586         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5587
5588         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5589
5590         npages = last_pfn - start_pfn + 1;
5591
5592         for_each_domain_iommu(iommu_id, dmar_domain)
5593                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5594                                       start_pfn, npages, !freelist, 0);
5595
5596         dma_free_pagelist(freelist);
5597
5598         if (dmar_domain->max_addr == iova + size)
5599                 dmar_domain->max_addr = iova;
5600
5601         return size;
5602 }
5603
5604 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5605                                             dma_addr_t iova)
5606 {
5607         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5608         struct dma_pte *pte;
5609         int level = 0;
5610         u64 phys = 0;
5611
5612         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5613         if (pte && dma_pte_present(pte))
5614                 phys = dma_pte_addr(pte) +
5615                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5616                                                 VTD_PAGE_SHIFT) - 1));
5617
5618         return phys;
5619 }
5620
5621 static inline bool scalable_mode_support(void)
5622 {
5623         struct dmar_drhd_unit *drhd;
5624         struct intel_iommu *iommu;
5625         bool ret = true;
5626
5627         rcu_read_lock();
5628         for_each_active_iommu(iommu, drhd) {
5629                 if (!sm_supported(iommu)) {
5630                         ret = false;
5631                         break;
5632                 }
5633         }
5634         rcu_read_unlock();
5635
5636         return ret;
5637 }
5638
5639 static inline bool iommu_pasid_support(void)
5640 {
5641         struct dmar_drhd_unit *drhd;
5642         struct intel_iommu *iommu;
5643         bool ret = true;
5644
5645         rcu_read_lock();
5646         for_each_active_iommu(iommu, drhd) {
5647                 if (!pasid_supported(iommu)) {
5648                         ret = false;
5649                         break;
5650                 }
5651         }
5652         rcu_read_unlock();
5653
5654         return ret;
5655 }
5656
5657 static inline bool nested_mode_support(void)
5658 {
5659         struct dmar_drhd_unit *drhd;
5660         struct intel_iommu *iommu;
5661         bool ret = true;
5662
5663         rcu_read_lock();
5664         for_each_active_iommu(iommu, drhd) {
5665                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5666                         ret = false;
5667                         break;
5668                 }
5669         }
5670         rcu_read_unlock();
5671
5672         return ret;
5673 }
5674
5675 static bool intel_iommu_capable(enum iommu_cap cap)
5676 {
5677         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5678                 return domain_update_iommu_snooping(NULL) == 1;
5679         if (cap == IOMMU_CAP_INTR_REMAP)
5680                 return irq_remapping_enabled == 1;
5681
5682         return false;
5683 }
5684
5685 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5686 {
5687         struct intel_iommu *iommu;
5688
5689         iommu = device_to_iommu(dev, NULL, NULL);
5690         if (!iommu)
5691                 return ERR_PTR(-ENODEV);
5692
5693         if (translation_pre_enabled(iommu))
5694                 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5695
5696         return &iommu->iommu;
5697 }
5698
5699 static void intel_iommu_release_device(struct device *dev)
5700 {
5701         struct intel_iommu *iommu;
5702
5703         iommu = device_to_iommu(dev, NULL, NULL);
5704         if (!iommu)
5705                 return;
5706
5707         dmar_remove_one_dev_info(dev);
5708
5709         set_dma_ops(dev, NULL);
5710 }
5711
5712 static void intel_iommu_probe_finalize(struct device *dev)
5713 {
5714         struct iommu_domain *domain;
5715
5716         domain = iommu_get_domain_for_dev(dev);
5717         if (device_needs_bounce(dev))
5718                 set_dma_ops(dev, &bounce_dma_ops);
5719         else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5720                 set_dma_ops(dev, &intel_dma_ops);
5721         else
5722                 set_dma_ops(dev, NULL);
5723 }
5724
5725 static void intel_iommu_get_resv_regions(struct device *device,
5726                                          struct list_head *head)
5727 {
5728         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5729         struct iommu_resv_region *reg;
5730         struct dmar_rmrr_unit *rmrr;
5731         struct device *i_dev;
5732         int i;
5733
5734         down_read(&dmar_global_lock);
5735         for_each_rmrr_units(rmrr) {
5736                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5737                                           i, i_dev) {
5738                         struct iommu_resv_region *resv;
5739                         enum iommu_resv_type type;
5740                         size_t length;
5741
5742                         if (i_dev != device &&
5743                             !is_downstream_to_pci_bridge(device, i_dev))
5744                                 continue;
5745
5746                         length = rmrr->end_address - rmrr->base_address + 1;
5747
5748                         type = device_rmrr_is_relaxable(device) ?
5749                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5750
5751                         resv = iommu_alloc_resv_region(rmrr->base_address,
5752                                                        length, prot, type);
5753                         if (!resv)
5754                                 break;
5755
5756                         list_add_tail(&resv->list, head);
5757                 }
5758         }
5759         up_read(&dmar_global_lock);
5760
5761 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5762         if (dev_is_pci(device)) {
5763                 struct pci_dev *pdev = to_pci_dev(device);
5764
5765                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5766                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5767                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5768                         if (reg)
5769                                 list_add_tail(&reg->list, head);
5770                 }
5771         }
5772 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5773
5774         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5775                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5776                                       0, IOMMU_RESV_MSI);
5777         if (!reg)
5778                 return;
5779         list_add_tail(&reg->list, head);
5780 }
5781
5782 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5783 {
5784         struct device_domain_info *info;
5785         struct context_entry *context;
5786         struct dmar_domain *domain;
5787         unsigned long flags;
5788         u64 ctx_lo;
5789         int ret;
5790
5791         domain = find_domain(dev);
5792         if (!domain)
5793                 return -EINVAL;
5794
5795         spin_lock_irqsave(&device_domain_lock, flags);
5796         spin_lock(&iommu->lock);
5797
5798         ret = -EINVAL;
5799         info = get_domain_info(dev);
5800         if (!info || !info->pasid_supported)
5801                 goto out;
5802
5803         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5804         if (WARN_ON(!context))
5805                 goto out;
5806
5807         ctx_lo = context[0].lo;
5808
5809         if (!(ctx_lo & CONTEXT_PASIDE)) {
5810                 ctx_lo |= CONTEXT_PASIDE;
5811                 context[0].lo = ctx_lo;
5812                 wmb();
5813                 iommu->flush.flush_context(iommu,
5814                                            domain->iommu_did[iommu->seq_id],
5815                                            PCI_DEVID(info->bus, info->devfn),
5816                                            DMA_CCMD_MASK_NOBIT,
5817                                            DMA_CCMD_DEVICE_INVL);
5818         }
5819
5820         /* Enable PASID support in the device, if it wasn't already */
5821         if (!info->pasid_enabled)
5822                 iommu_enable_dev_iotlb(info);
5823
5824         ret = 0;
5825
5826  out:
5827         spin_unlock(&iommu->lock);
5828         spin_unlock_irqrestore(&device_domain_lock, flags);
5829
5830         return ret;
5831 }
5832
5833 static void intel_iommu_apply_resv_region(struct device *dev,
5834                                           struct iommu_domain *domain,
5835                                           struct iommu_resv_region *region)
5836 {
5837         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5838         unsigned long start, end;
5839
5840         start = IOVA_PFN(region->start);
5841         end   = IOVA_PFN(region->start + region->length - 1);
5842
5843         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5844 }
5845
5846 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5847 {
5848         if (dev_is_pci(dev))
5849                 return pci_device_group(dev);
5850         return generic_device_group(dev);
5851 }
5852
5853 static int intel_iommu_enable_auxd(struct device *dev)
5854 {
5855         struct device_domain_info *info;
5856         struct intel_iommu *iommu;
5857         unsigned long flags;
5858         int ret;
5859
5860         iommu = device_to_iommu(dev, NULL, NULL);
5861         if (!iommu || dmar_disabled)
5862                 return -EINVAL;
5863
5864         if (!sm_supported(iommu) || !pasid_supported(iommu))
5865                 return -EINVAL;
5866
5867         ret = intel_iommu_enable_pasid(iommu, dev);
5868         if (ret)
5869                 return -ENODEV;
5870
5871         spin_lock_irqsave(&device_domain_lock, flags);
5872         info = get_domain_info(dev);
5873         info->auxd_enabled = 1;
5874         spin_unlock_irqrestore(&device_domain_lock, flags);
5875
5876         return 0;
5877 }
5878
5879 static int intel_iommu_disable_auxd(struct device *dev)
5880 {
5881         struct device_domain_info *info;
5882         unsigned long flags;
5883
5884         spin_lock_irqsave(&device_domain_lock, flags);
5885         info = get_domain_info(dev);
5886         if (!WARN_ON(!info))
5887                 info->auxd_enabled = 0;
5888         spin_unlock_irqrestore(&device_domain_lock, flags);
5889
5890         return 0;
5891 }
5892
5893 /*
5894  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC)
5895  * is defined in section 3.7 of the Intel Scalable I/O Virtualization
5896  * technical spec so that system software and tools can detect endpoint
5897  * devices supporting Intel Scalable I/O Virtualization without a host
5898  * driver dependency.
5899  *
5900  * Returns the configuration-space offset of the matching extended
5901  * capability structure, or 0 if the device does not support it.
5902  */
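/*
 * The magic numbers below follow the PCIe DVSEC layout: 0x23 is the DVSEC
 * extended capability ID, the word at offset +4 is the DVSEC vendor ID and
 * the word at +8 is the DVSEC ID; Intel uses DVSEC ID 5 for Scalable IOV
 * (per the spec referenced above).
 */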
5903 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5904 {
5905         int pos;
5906         u16 vendor, id;
5907
5908         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5909         while (pos) {
5910                 pci_read_config_word(pdev, pos + 4, &vendor);
5911                 pci_read_config_word(pdev, pos + 8, &id);
5912                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5913                         return pos;
5914
5915                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5916         }
5917
5918         return 0;
5919 }
5920
5921 static bool
5922 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5923 {
5924         if (feat == IOMMU_DEV_FEAT_AUX) {
5925                 int ret;
5926
5927                 if (!dev_is_pci(dev) || dmar_disabled ||
5928                     !scalable_mode_support() || !iommu_pasid_support())
5929                         return false;
5930
5931                 ret = pci_pasid_features(to_pci_dev(dev));
5932                 if (ret < 0)
5933                         return false;
5934
5935                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5936         }
5937
5938         if (feat == IOMMU_DEV_FEAT_SVA) {
5939                 struct device_domain_info *info = get_domain_info(dev);
5940
5941                 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5942                         info->pasid_supported && info->pri_supported &&
5943                         info->ats_supported;
5944         }
5945
5946         return false;
5947 }
5948
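/*
 * Enable an IOMMU feature for @dev.  AUX is handled by
 * intel_iommu_enable_auxd(); for SVA there is nothing to switch on here,
 * so the request only succeeds when the IOMMU is already SVM-capable
 * (the actual PASID binding happens later via intel_svm_bind()).
 */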
5949 static int
5950 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5951 {
5952         if (feat == IOMMU_DEV_FEAT_AUX)
5953                 return intel_iommu_enable_auxd(dev);
5954
5955         if (feat == IOMMU_DEV_FEAT_SVA) {
5956                 struct device_domain_info *info = get_domain_info(dev);
5957
5958                 if (!info)
5959                         return -EINVAL;
5960
5961                 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5962                         return 0;
5963         }
5964
5965         return -ENODEV;
5966 }
5967
5968 static int
5969 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5970 {
5971         if (feat == IOMMU_DEV_FEAT_AUX)
5972                 return intel_iommu_disable_auxd(dev);
5973
5974         return -ENODEV;
5975 }
5976
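/*
 * Report whether a feature has actually been enabled on @dev; only the
 * AUX feature is tracked here, via the per-device auxd_enabled flag.
 */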
5977 static bool
5978 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5979 {
5980         struct device_domain_info *info = get_domain_info(dev);
5981
5982         if (feat == IOMMU_DEV_FEAT_AUX)
5983                 return scalable_mode_support() && info && info->auxd_enabled;
5984
5985         return false;
5986 }
5987
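/*
 * Return the default PASID of an auxiliary domain, i.e. the PASID that was
 * allocated when the domain was first aux-attached to a device and that the
 * caller (typically an mdev parent driver) programs into the hardware.
 */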
5988 static int
5989 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5990 {
5991         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5992
5993         return dmar_domain->default_pasid > 0 ?
5994                         dmar_domain->default_pasid : -EINVAL;
5995 }
5996
5997 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5998                                            struct device *dev)
5999 {
6000         return attach_deferred(dev);
6001 }
6002
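/*
 * Set a domain attribute.  Only DOMAIN_ATTR_NESTING is handled: it may be
 * set on an UNMANAGED domain, provided the hardware supports nesting and no
 * device has been attached yet, and it marks the domain for nested
 * translation (host-managed second level under a guest-managed first level).
 *
 * Illustrative caller-side sketch using the generic IOMMU API (assumed for
 * documentation purposes, not code from this file):
 *
 *	int nesting = 1;
 *
 *	ret = iommu_domain_set_attr(domain, DOMAIN_ATTR_NESTING, &nesting);
 */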
6003 static int
6004 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6005                             enum iommu_attr attr, void *data)
6006 {
6007         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6008         unsigned long flags;
6009         int ret = 0;
6010
6011         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6012                 return -EINVAL;
6013
6014         switch (attr) {
6015         case DOMAIN_ATTR_NESTING:
6016                 spin_lock_irqsave(&device_domain_lock, flags);
6017                 if (nested_mode_support() &&
6018                     list_empty(&dmar_domain->devices)) {
6019                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6020                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6021                 } else {
6022                         ret = -ENODEV;
6023                 }
6024                 spin_unlock_irqrestore(&device_domain_lock, flags);
6025                 break;
6026         default:
6027                 ret = -EINVAL;
6028                 break;
6029         }
6030
6031         return ret;
6032 }
6033
6034 /*
6035  * Check that the device does not live on an external-facing PCI port that is
6036  * marked as untrusted.  Such devices should not be allowed to apply quirks,
6037  * since that could let them bypass the IOMMU restrictions.
6038  */
6039 static bool risky_device(struct pci_dev *pdev)
6040 {
6041         if (pdev->untrusted) {
6042                 pci_info(pdev,
6043                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6044                          pdev->vendor, pdev->device);
6045                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6046                 return true;
6047         }
6048         return false;
6049 }
6050
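/*
 * The iommu_ops table exported to the IOMMU core.  The SVM/PASID entry
 * points at the bottom are only available when CONFIG_INTEL_IOMMU_SVM is
 * enabled.
 */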
6051 const struct iommu_ops intel_iommu_ops = {
6052         .capable                = intel_iommu_capable,
6053         .domain_alloc           = intel_iommu_domain_alloc,
6054         .domain_free            = intel_iommu_domain_free,
6055         .domain_set_attr        = intel_iommu_domain_set_attr,
6056         .attach_dev             = intel_iommu_attach_device,
6057         .detach_dev             = intel_iommu_detach_device,
6058         .aux_attach_dev         = intel_iommu_aux_attach_device,
6059         .aux_detach_dev         = intel_iommu_aux_detach_device,
6060         .aux_get_pasid          = intel_iommu_aux_get_pasid,
6061         .map                    = intel_iommu_map,
6062         .unmap                  = intel_iommu_unmap,
6063         .iova_to_phys           = intel_iommu_iova_to_phys,
6064         .probe_device           = intel_iommu_probe_device,
6065         .probe_finalize         = intel_iommu_probe_finalize,
6066         .release_device         = intel_iommu_release_device,
6067         .get_resv_regions       = intel_iommu_get_resv_regions,
6068         .put_resv_regions       = generic_iommu_put_resv_regions,
6069         .apply_resv_region      = intel_iommu_apply_resv_region,
6070         .device_group           = intel_iommu_device_group,
6071         .dev_has_feat           = intel_iommu_dev_has_feat,
6072         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6073         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6074         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6075         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6076         .def_domain_type        = device_def_domain_type,
6077         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6078 #ifdef CONFIG_INTEL_IOMMU_SVM
6079         .cache_invalidate       = intel_iommu_sva_invalidate,
6080         .sva_bind_gpasid        = intel_svm_bind_gpasid,
6081         .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
6082         .sva_bind               = intel_svm_bind,
6083         .sva_unbind             = intel_svm_unbind,
6084         .sva_get_pasid          = intel_svm_get_pasid,
6085         .page_response          = intel_svm_page_response,
6086 #endif
6087 };
6088
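/*
 * PCI header fixup for integrated graphics devices whose DMAR support is
 * known to be broken: clear dmar_map_gfx so that DMA remapping is not used
 * for the integrated graphics device.
 */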
6089 static void quirk_iommu_igfx(struct pci_dev *dev)
6090 {
6091         if (risky_device(dev))
6092                 return;
6093
6094         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6095         dmar_map_gfx = 0;
6096 }
6097
6098 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6099 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6100 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6101 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6102 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6103 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6104 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6105 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6106
6107 /* Broadwell igfx malfunctions with dmar */
6108 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6109 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6110 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6111 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6112 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6113 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6114 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6115 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6116 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6117 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6118 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6119 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6120 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6121 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6122 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6123 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6124 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6125 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6126 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6127 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6128 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6129 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6130 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6131 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6132
6133 static void quirk_iommu_rwbf(struct pci_dev *dev)
6134 {
6135         if (risky_device(dev))
6136                 return;
6137
6138         /*
6139          * Mobile 4 Series Chipset neglects to set RWBF capability,
6140          * but needs it. Same seems to hold for the desktop versions.
6141          */
6142         pci_info(dev, "Forcing write-buffer flush capability\n");
6143         rwbf_quirk = 1;
6144 }
6145
6146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6150 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6151 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6152 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6153
6154 #define GGC 0x52
6155 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6156 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6157 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6158 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6159 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6160 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6161 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6162 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6163
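/*
 * The GGC register above is the graphics control word in the host bridge's
 * config space.  If the BIOS did not allocate any VT-enabled (shadow) GTT
 * space, translation cannot work for the integrated GPU, so disable the
 * IOMMU for graphics; otherwise fall back to strict (unbatched) IOTLB
 * flushing on these Ironlake parts.
 */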
6164 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6165 {
6166         unsigned short ggc;
6167
6168         if (risky_device(dev))
6169                 return;
6170
6171         if (pci_read_config_word(dev, GGC, &ggc))
6172                 return;
6173
6174         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6175                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6176                 dmar_map_gfx = 0;
6177         } else if (dmar_map_gfx) {
6178                 /* we have to ensure the gfx device is idle before we flush */
6179                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6180                 intel_iommu_strict = 1;
6181         }
6182 }
6183 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6184 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6185 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6186 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6187
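/*
 * For the integrated graphics generations matched below (bits 15:8 of the
 * PCI device ID), set iommu_skip_te_disable so that the translation-enable
 * (TE) bit is left alone on the graphics DMAR unit where the driver would
 * otherwise disable translation.
 */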
6188 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6189 {
6190         unsigned short ver;
6191
6192         if (!IS_GFX_DEVICE(dev))
6193                 return;
6194
6195         ver = (dev->device >> 8) & 0xff;
6196         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6197             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6198             ver != 0x9a)
6199                 return;
6200
6201         if (risky_device(dev))
6202                 return;
6203
6204         pci_info(dev, "Skip IOMMU disabling for graphics\n");
6205         iommu_skip_te_disable = 1;
6206 }
6207 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6208
6209 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6210    ISOCH DMAR unit for the Azalia sound device, but not give it any
6211    TLB entries, which causes it to deadlock. Check for that.  We do
6212    this in a function called from init_dmars(), instead of in a PCI
6213    quirk, because we don't want to print the obnoxious "BIOS broken"
6214    message if VT-d is actually disabled.
6215 */
6216 static void __init check_tylersburg_isoch(void)
6217 {
6218         struct pci_dev *pdev;
6219         uint32_t vtisochctrl;
6220
6221         /* If there's no Azalia in the system anyway, forget it. */
6222         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6223         if (!pdev)
6224                 return;
6225
6226         if (risky_device(pdev)) {
6227                 pci_dev_put(pdev);
6228                 return;
6229         }
6230
6231         pci_dev_put(pdev);
6232
6233         /* System Management Registers. Might be hidden, in which case
6234            we can't do the sanity check. But that's OK, because the
6235            known-broken BIOSes _don't_ actually hide it, so far. */
6236         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6237         if (!pdev)
6238                 return;
6239
6240         if (risky_device(pdev)) {
6241                 pci_dev_put(pdev);
6242                 return;
6243         }
6244
6245         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6246                 pci_dev_put(pdev);
6247                 return;
6248         }
6249
6250         pci_dev_put(pdev);
6251
6252         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6253         if (vtisochctrl & 1)
6254                 return;
6255
6256         /* Drop all bits other than the number of TLB entries */
6257         vtisochctrl &= 0x1c;
6258
6259         /* If we have the recommended number of TLB entries (16), fine. */
6260         if (vtisochctrl == 0x10)
6261                 return;
6262
6263         /* Zero TLB entries? The BIOS is badly broken; warn and identity-map Azalia. */
6264         if (!vtisochctrl) {
6265                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6266                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6267                      dmi_get_system_info(DMI_BIOS_VENDOR),
6268                      dmi_get_system_info(DMI_BIOS_VERSION),
6269                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6270                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6271                 return;
6272         }
6273
6274         pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6275                vtisochctrl);
6276 }