drivers/iommu/intel/iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34
35 #define ROOT_SIZE               VTD_PAGE_SIZE
36 #define CONTEXT_SIZE            VTD_PAGE_SIZE
37
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42
43 #define IOAPIC_RANGE_START      (0xfee00000)
44 #define IOAPIC_RANGE_END        (0xfeefffff)
45 #define IOVA_START_ADDR         (0x1000)
46
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48
49 #define MAX_AGAW_WIDTH 64
50 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
51
52 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
53 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
54
55 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
56    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
57 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
58                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
59 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
60
61 /* IO virtual address start page frame number */
62 #define IOVA_START_PFN          (1)
63
64 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
65
66 /* page table handling */
67 #define LEVEL_STRIDE            (9)
68 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
69
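/*
 * AGAW encodes the page-table depth: agaw_to_level() maps agaw 1/2/3 to
 * 3/4/5-level tables, and agaw_to_width() to 39/48/57-bit address widths
 * (30 + agaw * LEVEL_STRIDE bits, capped at MAX_AGAW_WIDTH).
 */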
70 static inline int agaw_to_level(int agaw)
71 {
72         return agaw + 2;
73 }
74
75 static inline int agaw_to_width(int agaw)
76 {
77         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
78 }
79
80 static inline int width_to_agaw(int width)
81 {
82         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
83 }
84
85 static inline unsigned int level_to_offset_bits(int level)
86 {
87         return (level - 1) * LEVEL_STRIDE;
88 }
89
90 static inline int pfn_level_offset(u64 pfn, int level)
91 {
92         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
93 }
94
95 static inline u64 level_mask(int level)
96 {
97         return -1ULL << level_to_offset_bits(level);
98 }
99
100 static inline u64 level_size(int level)
101 {
102         return 1ULL << level_to_offset_bits(level);
103 }
104
105 static inline u64 align_to_level(u64 pfn, int level)
106 {
107         return (pfn + level_size(level) - 1) & level_mask(level);
108 }
109
110 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 {
112         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
113 }
114
115 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
116    are never going to work. */
117 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
118 {
119         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120 }
121 static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
122 {
123         return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
124 }
125 static inline unsigned long page_to_dma_pfn(struct page *pg)
126 {
127         return mm_to_dma_pfn_start(page_to_pfn(pg));
128 }
129 static inline unsigned long virt_to_dma_pfn(void *p)
130 {
131         return page_to_dma_pfn(virt_to_page(p));
132 }
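/*
 * With 4KiB MM pages (PAGE_SHIFT == VTD_PAGE_SHIFT == 12) the conversion
 * above is a no-op: one MM pfn maps to exactly one VT-d pfn. With larger
 * MM pages each MM pfn spans 2^(PAGE_SHIFT - VTD_PAGE_SHIFT) VT-d pfns,
 * which is why there are separate _start and _end helpers.
 */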
133
134 static void __init check_tylersburg_isoch(void);
135 static int rwbf_quirk;
136
137 /*
138  * Set to 1 to panic the kernel if VT-d can't be enabled successfully
139  * (used when the kernel is launched with TXT).
140  */
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
144
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153         if (!(re->lo & 1))
154                 return 0;
155
156         return re->lo & VTD_PAGE_MASK;
157 }
158
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165         if (!(re->hi & 1))
166                 return 0;
167
168         return re->hi & VTD_PAGE_MASK;
169 }
170
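/*
 * Legacy-mode context entry layout, as manipulated by the helpers below:
 * lo bit 0 is Present, bit 1 is Fault Processing Disable, bits 3:2 the
 * Translation Type and bits 63:12 the page-table root address;
 * hi bits 2:0 hold the Address Width and bits 23:8 the Domain ID.
 */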
171 static inline void context_set_present(struct context_entry *context)
172 {
173         context->lo |= 1;
174 }
175
176 static inline void context_set_fault_enable(struct context_entry *context)
177 {
178         context->lo &= (((u64)-1) << 2) | 1;
179 }
180
181 static inline void context_set_translation_type(struct context_entry *context,
182                                                 unsigned long value)
183 {
184         context->lo &= (((u64)-1) << 4) | 3;
185         context->lo |= (value & 3) << 2;
186 }
187
188 static inline void context_set_address_root(struct context_entry *context,
189                                             unsigned long value)
190 {
191         context->lo &= ~VTD_PAGE_MASK;
192         context->lo |= value & VTD_PAGE_MASK;
193 }
194
195 static inline void context_set_address_width(struct context_entry *context,
196                                              unsigned long value)
197 {
198         context->hi |= value & 7;
199 }
200
201 static inline void context_set_domain_id(struct context_entry *context,
202                                          unsigned long value)
203 {
204         context->hi |= (value & ((1 << 16) - 1)) << 8;
205 }
206
207 static inline void context_set_pasid(struct context_entry *context)
208 {
209         context->lo |= CONTEXT_PASIDE;
210 }
211
212 static inline int context_domain_id(struct context_entry *c)
213 {
214         return (c->hi >> 8) & 0xffff;
215 }
216
217 static inline void context_clear_entry(struct context_entry *context)
218 {
219         context->lo = 0;
220         context->hi = 0;
221 }
222
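/*
 * When booting into a kdump kernel, context tables inherited from the
 * previous kernel are tracked in the iommu->copied_tables bitmap, indexed
 * by source-id ((bus << 8) | devfn). The helpers below query and update
 * that per-device "copied" state.
 */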
223 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
224 {
225         if (!iommu->copied_tables)
226                 return false;
227
228         return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
229 }
230
231 static inline void
232 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
233 {
234         set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
235 }
236
237 static inline void
238 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
239 {
240         clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
241 }
242
243 /*
244  * This domain is a static identity mapping domain.
245  *      1. This domain creates a static 1:1 mapping to all usable memory.
246  *      2. It maps to each iommu if successful.
247  *      3. Each iommu maps to this domain if successful.
248  */
249 static struct dmar_domain *si_domain;
250 static int hw_pass_through = 1;
251
252 struct dmar_rmrr_unit {
253         struct list_head list;          /* list of rmrr units   */
254         struct acpi_dmar_header *hdr;   /* ACPI header          */
255         u64     base_address;           /* reserved base address*/
256         u64     end_address;            /* reserved end address */
257         struct dmar_dev_scope *devices; /* target devices */
258         int     devices_cnt;            /* target device count */
259 };
260
261 struct dmar_atsr_unit {
262         struct list_head list;          /* list of ATSR units */
263         struct acpi_dmar_header *hdr;   /* ACPI header */
264         struct dmar_dev_scope *devices; /* target devices */
265         int devices_cnt;                /* target device count */
266         u8 include_all:1;               /* include all ports */
267 };
268
269 struct dmar_satc_unit {
270         struct list_head list;          /* list of SATC units */
271         struct acpi_dmar_header *hdr;   /* ACPI header */
272         struct dmar_dev_scope *devices; /* target devices */
273         struct intel_iommu *iommu;      /* the corresponding iommu */
274         int devices_cnt;                /* target device count */
275         u8 atc_required:1;              /* ATS is required */
276 };
277
278 static LIST_HEAD(dmar_atsr_units);
279 static LIST_HEAD(dmar_rmrr_units);
280 static LIST_HEAD(dmar_satc_units);
281
282 #define for_each_rmrr_units(rmrr) \
283         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
284
285 static void intel_iommu_domain_free(struct iommu_domain *domain);
286
287 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
288 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
289
290 int intel_iommu_enabled = 0;
291 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
292
293 static int dmar_map_gfx = 1;
294 static int intel_iommu_superpage = 1;
295 static int iommu_identity_mapping;
296 static int iommu_skip_te_disable;
297
298 #define IDENTMAP_GFX            2
299 #define IDENTMAP_AZALIA         4
300
301 const struct iommu_ops intel_iommu_ops;
302 static const struct iommu_dirty_ops intel_dirty_ops;
303
304 static bool translation_pre_enabled(struct intel_iommu *iommu)
305 {
306         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
307 }
308
309 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
310 {
311         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
312 }
313
314 static void init_translation_status(struct intel_iommu *iommu)
315 {
316         u32 gsts;
317
318         gsts = readl(iommu->reg + DMAR_GSTS_REG);
319         if (gsts & DMA_GSTS_TES)
320                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
321 }
322
323 static int __init intel_iommu_setup(char *str)
324 {
325         if (!str)
326                 return -EINVAL;
327
328         while (*str) {
329                 if (!strncmp(str, "on", 2)) {
330                         dmar_disabled = 0;
331                         pr_info("IOMMU enabled\n");
332                 } else if (!strncmp(str, "off", 3)) {
333                         dmar_disabled = 1;
334                         no_platform_optin = 1;
335                         pr_info("IOMMU disabled\n");
336                 } else if (!strncmp(str, "igfx_off", 8)) {
337                         dmar_map_gfx = 0;
338                         pr_info("Disable GFX device mapping\n");
339                 } else if (!strncmp(str, "forcedac", 8)) {
340                         pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
341                         iommu_dma_forcedac = true;
342                 } else if (!strncmp(str, "strict", 6)) {
343                         pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
344                         iommu_set_dma_strict();
345                 } else if (!strncmp(str, "sp_off", 6)) {
346                         pr_info("Disable supported super page\n");
347                         intel_iommu_superpage = 0;
348                 } else if (!strncmp(str, "sm_on", 5)) {
349                         pr_info("Enable scalable mode if hardware supports\n");
350                         intel_iommu_sm = 1;
351                 } else if (!strncmp(str, "sm_off", 6)) {
352                         pr_info("Scalable mode is disallowed\n");
353                         intel_iommu_sm = 0;
354                 } else if (!strncmp(str, "tboot_noforce", 13)) {
355                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
356                         intel_iommu_tboot_noforce = 1;
357                 } else {
358                         pr_notice("Unknown option - '%s'\n", str);
359                 }
360
361                 str += strcspn(str, ",");
362                 while (*str == ',')
363                         str++;
364         }
365
366         return 1;
367 }
368 __setup("intel_iommu=", intel_iommu_setup);
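/*
 * Multiple options can be combined on the kernel command line, separated
 * by commas, e.g. "intel_iommu=on,sm_on".
 */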
369
370 void *alloc_pgtable_page(int node, gfp_t gfp)
371 {
372         struct page *page;
373         void *vaddr = NULL;
374
375         page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
376         if (page)
377                 vaddr = page_address(page);
378         return vaddr;
379 }
380
381 void free_pgtable_page(void *vaddr)
382 {
383         free_page((unsigned long)vaddr);
384 }
385
386 static inline int domain_type_is_si(struct dmar_domain *domain)
387 {
388         return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
389 }
390
391 static inline int domain_pfn_supported(struct dmar_domain *domain,
392                                        unsigned long pfn)
393 {
394         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
395
396         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
397 }
398
399 /*
400  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
401  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
402  * the returned SAGAW.
403  */
404 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
405 {
406         unsigned long fl_sagaw, sl_sagaw;
407
408         fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
409         sl_sagaw = cap_sagaw(iommu->cap);
410
411         /* Second level only. */
412         if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
413                 return sl_sagaw;
414
415         /* First level only. */
416         if (!ecap_slts(iommu->ecap))
417                 return fl_sagaw;
418
419         return fl_sagaw & sl_sagaw;
420 }
421
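/*
 * Pick the widest AGAW that fits within max_gaw and is advertised in the
 * combined SAGAW bitmap; returns -1 if none is supported.
 */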
422 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
423 {
424         unsigned long sagaw;
425         int agaw;
426
427         sagaw = __iommu_calculate_sagaw(iommu);
428         for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
429                 if (test_bit(agaw, &sagaw))
430                         break;
431         }
432
433         return agaw;
434 }
435
436 /*
437  * Calculate max SAGAW for each iommu.
438  */
439 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
440 {
441         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
442 }
443
444 /*
445  * Calculate the agaw for each iommu.
446  * "SAGAW" may differ across iommus, so use a default agaw and fall back
447  * to a smaller supported agaw for iommus that don't support the default.
448  */
449 int iommu_calculate_agaw(struct intel_iommu *iommu)
450 {
451         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
452 }
453
454 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
455 {
456         return sm_supported(iommu) ?
457                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
458 }
459
460 static void domain_update_iommu_coherency(struct dmar_domain *domain)
461 {
462         struct iommu_domain_info *info;
463         struct dmar_drhd_unit *drhd;
464         struct intel_iommu *iommu;
465         bool found = false;
466         unsigned long i;
467
468         domain->iommu_coherency = true;
469         xa_for_each(&domain->iommu_array, i, info) {
470                 found = true;
471                 if (!iommu_paging_structure_coherency(info->iommu)) {
472                         domain->iommu_coherency = false;
473                         break;
474                 }
475         }
476         if (found)
477                 return;
478
479         /* No hardware attached; use lowest common denominator */
480         rcu_read_lock();
481         for_each_active_iommu(iommu, drhd) {
482                 if (!iommu_paging_structure_coherency(iommu)) {
483                         domain->iommu_coherency = false;
484                         break;
485                 }
486         }
487         rcu_read_unlock();
488 }
489
490 static int domain_update_iommu_superpage(struct dmar_domain *domain,
491                                          struct intel_iommu *skip)
492 {
493         struct dmar_drhd_unit *drhd;
494         struct intel_iommu *iommu;
495         int mask = 0x3;
496
497         if (!intel_iommu_superpage)
498                 return 0;
499
500         /* set iommu_superpage to the smallest common denominator */
501         rcu_read_lock();
502         for_each_active_iommu(iommu, drhd) {
503                 if (iommu != skip) {
504                         if (domain && domain->use_first_level) {
505                                 if (!cap_fl1gp_support(iommu->cap))
506                                         mask = 0x1;
507                         } else {
508                                 mask &= cap_super_page_val(iommu->cap);
509                         }
510
511                         if (!mask)
512                                 break;
513                 }
514         }
515         rcu_read_unlock();
516
517         return fls(mask);
518 }
519
520 static int domain_update_device_node(struct dmar_domain *domain)
521 {
522         struct device_domain_info *info;
523         int nid = NUMA_NO_NODE;
524         unsigned long flags;
525
526         spin_lock_irqsave(&domain->lock, flags);
527         list_for_each_entry(info, &domain->devices, link) {
528                 /*
529                  * There could be multiple device NUMA nodes, as devices within
530                  * the same domain may sit behind different IOMMUs. There is no
531                  * perfect answer in such a situation, so use a first come,
532                  * first served policy.
533                  */
534                 nid = dev_to_node(info->dev);
535                 if (nid != NUMA_NO_NODE)
536                         break;
537         }
538         spin_unlock_irqrestore(&domain->lock, flags);
539
540         return nid;
541 }
542
543 static void domain_update_iotlb(struct dmar_domain *domain);
544
545 /* Return the super pagesize bitmap if supported. */
546 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
547 {
548         unsigned long bitmap = 0;
549
550         /*
551          * 1-level super page supports page size of 2MiB, 2-level super page
552          * supports page size of both 2MiB and 1GiB.
553          */
554         if (domain->iommu_superpage == 1)
555                 bitmap |= SZ_2M;
556         else if (domain->iommu_superpage == 2)
557                 bitmap |= SZ_2M | SZ_1G;
558
559         return bitmap;
560 }
561
562 /* Some capabilities may be different across iommus */
563 void domain_update_iommu_cap(struct dmar_domain *domain)
564 {
565         domain_update_iommu_coherency(domain);
566         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
567
568         /*
569          * If RHSA is missing, we should default to the device numa domain
570          * as fall back.
571          */
572         if (domain->nid == NUMA_NO_NODE)
573                 domain->nid = domain_update_device_node(domain);
574
575         /*
576          * First-level translation restricts the input-address to a
577          * canonical address (i.e., address bits 63:N have the same
578          * value as address bit [N-1], where N is 48-bits with 4-level
579          * paging and 57-bits with 5-level paging). Hence, skip bit
580          * [N-1].
581          */
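        /* e.g. first-level with gaw == 48: aperture_end = (1ULL << 47) - 1. */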
582         if (domain->use_first_level)
583                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
584         else
585                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
586
587         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
588         domain_update_iotlb(domain);
589 }
590
591 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
592                                          u8 devfn, int alloc)
593 {
594         struct root_entry *root = &iommu->root_entry[bus];
595         struct context_entry *context;
596         u64 *entry;
597
598         /*
599          * Unless the caller explicitly requested a new entry to be
600          * allocated, returning a copied context entry makes no sense.
601          */
602         if (!alloc && context_copied(iommu, bus, devfn))
603                 return NULL;
604
605         entry = &root->lo;
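        /*
         * In scalable mode each root entry holds two context-table pointers:
         * the lower half covers devfn 0x00-0x7f and the upper half covers
         * devfn 0x80-0xff, with context entries twice the legacy size
         * (hence the devfn * 2 index below).
         */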
606         if (sm_supported(iommu)) {
607                 if (devfn >= 0x80) {
608                         devfn -= 0x80;
609                         entry = &root->hi;
610                 }
611                 devfn *= 2;
612         }
613         if (*entry & 1)
614                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
615         else {
616                 unsigned long phy_addr;
617                 if (!alloc)
618                         return NULL;
619
620                 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
621                 if (!context)
622                         return NULL;
623
624                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
625                 phy_addr = virt_to_phys((void *)context);
626                 *entry = phy_addr | 1;
627                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
628         }
629         return &context[devfn];
630 }
631
632 /**
633  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
634  *                               sub-hierarchy of a candidate PCI-PCI bridge
635  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
636  * @bridge: the candidate PCI-PCI bridge
637  *
638  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
639  */
640 static bool
641 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
642 {
643         struct pci_dev *pdev, *pbridge;
644
645         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
646                 return false;
647
648         pdev = to_pci_dev(dev);
649         pbridge = to_pci_dev(bridge);
650
651         if (pbridge->subordinate &&
652             pbridge->subordinate->number <= pdev->bus->number &&
653             pbridge->subordinate->busn_res.end >= pdev->bus->number)
654                 return true;
655
656         return false;
657 }
658
659 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
660 {
661         struct dmar_drhd_unit *drhd;
662         u32 vtbar;
663         int rc;
664
665         /* We know that this device on this chipset has its own IOMMU.
666          * If we find it under a different IOMMU, then the BIOS is lying
667          * to us. Hope that the IOMMU for this device is actually
668          * disabled, and it needs no translation...
669          */
670         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
671         if (rc) {
672                 /* "can't" happen */
673                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
674                 return false;
675         }
676         vtbar &= 0xffff0000;
677
678         /* we know that this iommu should be at offset 0xa000 from vtbar */
679         drhd = dmar_find_matched_drhd_unit(pdev);
680         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
681                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
682                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
683                 return true;
684         }
685
686         return false;
687 }
688
689 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
690 {
691         if (!iommu || iommu->drhd->ignored)
692                 return true;
693
694         if (dev_is_pci(dev)) {
695                 struct pci_dev *pdev = to_pci_dev(dev);
696
697                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
698                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
699                     quirk_ioat_snb_local_iommu(pdev))
700                         return true;
701         }
702
703         return false;
704 }
705
706 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
707 {
708         struct dmar_drhd_unit *drhd = NULL;
709         struct pci_dev *pdev = NULL;
710         struct intel_iommu *iommu;
711         struct device *tmp;
712         u16 segment = 0;
713         int i;
714
715         if (!dev)
716                 return NULL;
717
718         if (dev_is_pci(dev)) {
719                 struct pci_dev *pf_pdev;
720
721                 pdev = pci_real_dma_dev(to_pci_dev(dev));
722
723                 /* VFs aren't listed in scope tables; we need to look up
724                  * the PF instead to find the IOMMU. */
725                 pf_pdev = pci_physfn(pdev);
726                 dev = &pf_pdev->dev;
727                 segment = pci_domain_nr(pdev->bus);
728         } else if (has_acpi_companion(dev))
729                 dev = &ACPI_COMPANION(dev)->dev;
730
731         rcu_read_lock();
732         for_each_iommu(iommu, drhd) {
733                 if (pdev && segment != drhd->segment)
734                         continue;
735
736                 for_each_active_dev_scope(drhd->devices,
737                                           drhd->devices_cnt, i, tmp) {
738                         if (tmp == dev) {
739                                 /* For a VF use its original BDF# not that of the PF
740                                  * which we used for the IOMMU lookup. Strictly speaking
741                                  * we could do this for all PCI devices; we only need to
742                                  * get the BDF# from the scope table for ACPI matches. */
743                                 if (pdev && pdev->is_virtfn)
744                                         goto got_pdev;
745
746                                 if (bus && devfn) {
747                                         *bus = drhd->devices[i].bus;
748                                         *devfn = drhd->devices[i].devfn;
749                                 }
750                                 goto out;
751                         }
752
753                         if (is_downstream_to_pci_bridge(dev, tmp))
754                                 goto got_pdev;
755                 }
756
757                 if (pdev && drhd->include_all) {
758 got_pdev:
759                         if (bus && devfn) {
760                                 *bus = pdev->bus->number;
761                                 *devfn = pdev->devfn;
762                         }
763                         goto out;
764                 }
765         }
766         iommu = NULL;
767 out:
768         if (iommu_is_dummy(iommu, dev))
769                 iommu = NULL;
770
771         rcu_read_unlock();
772
773         return iommu;
774 }
775
776 static void domain_flush_cache(struct dmar_domain *domain,
777                                void *addr, int size)
778 {
779         if (!domain->iommu_coherency)
780                 clflush_cache_range(addr, size);
781 }
782
783 static void free_context_table(struct intel_iommu *iommu)
784 {
785         struct context_entry *context;
786         int i;
787
788         if (!iommu->root_entry)
789                 return;
790
791         for (i = 0; i < ROOT_ENTRY_NR; i++) {
792                 context = iommu_context_addr(iommu, i, 0, 0);
793                 if (context)
794                         free_pgtable_page(context);
795
796                 if (!sm_supported(iommu))
797                         continue;
798
799                 context = iommu_context_addr(iommu, i, 0x80, 0);
800                 if (context)
801                         free_pgtable_page(context);
802         }
803
804         free_pgtable_page(iommu->root_entry);
805         iommu->root_entry = NULL;
806 }
807
808 #ifdef CONFIG_DMAR_DEBUG
809 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
810                          u8 bus, u8 devfn, struct dma_pte *parent, int level)
811 {
812         struct dma_pte *pte;
813         int offset;
814
815         while (1) {
816                 offset = pfn_level_offset(pfn, level);
817                 pte = &parent[offset];
818                 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
819                         pr_info("PTE not present at level %d\n", level);
820                         break;
821                 }
822
823                 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
824
825                 if (level == 1)
826                         break;
827
828                 parent = phys_to_virt(dma_pte_addr(pte));
829                 level--;
830         }
831 }
832
833 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
834                           unsigned long long addr, u32 pasid)
835 {
836         struct pasid_dir_entry *dir, *pde;
837         struct pasid_entry *entries, *pte;
838         struct context_entry *ctx_entry;
839         struct root_entry *rt_entry;
840         int i, dir_index, index, level;
841         u8 devfn = source_id & 0xff;
842         u8 bus = source_id >> 8;
843         struct dma_pte *pgtable;
844
845         pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
846
847         /* root entry dump */
848         rt_entry = &iommu->root_entry[bus];
849         if (!rt_entry) {
850                 pr_info("root table entry is not present\n");
851                 return;
852         }
853
854         if (sm_supported(iommu))
855                 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
856                         rt_entry->hi, rt_entry->lo);
857         else
858                 pr_info("root entry: 0x%016llx", rt_entry->lo);
859
860         /* context entry dump */
861         ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
862         if (!ctx_entry) {
863                 pr_info("context table entry is not present\n");
864                 return;
865         }
866
867         pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
868                 ctx_entry->hi, ctx_entry->lo);
869
870         /* legacy mode does not require PASID entries */
871         if (!sm_supported(iommu)) {
872                 level = agaw_to_level(ctx_entry->hi & 7);
873                 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874                 goto pgtable_walk;
875         }
876
877         /* get the pointer to pasid directory entry */
878         dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
879         if (!dir) {
880                 pr_info("pasid directory entry is not present\n");
881                 return;
882         }
883         /* For request-without-pasid, get the pasid from context entry */
884         if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
885                 pasid = IOMMU_NO_PASID;
886
887         dir_index = pasid >> PASID_PDE_SHIFT;
888         pde = &dir[dir_index];
889         pr_info("pasid dir entry: 0x%016llx\n", pde->val);
890
891         /* get the pointer to the pasid table entry */
892         entries = get_pasid_table_from_pde(pde);
893         if (!entries) {
894                 pr_info("pasid table entry is not present\n");
895                 return;
896         }
897         index = pasid & PASID_PTE_MASK;
898         pte = &entries[index];
899         for (i = 0; i < ARRAY_SIZE(pte->val); i++)
900                 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
901
902         if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
903                 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
904                 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
905         } else {
906                 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
907                 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
908         }
909
910 pgtable_walk:
911         pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
912 }
913 #endif
914
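/*
 * Walk the page table down to the PTE covering 'pfn', allocating missing
 * intermediate levels as needed. A *target_level of 0 means "find the
 * leaf"; on return it is set to the level at which the walk stopped.
 */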
915 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
916                                       unsigned long pfn, int *target_level,
917                                       gfp_t gfp)
918 {
919         struct dma_pte *parent, *pte;
920         int level = agaw_to_level(domain->agaw);
921         int offset;
922
923         if (!domain_pfn_supported(domain, pfn))
924                 /* Address beyond IOMMU's addressing capabilities. */
925                 return NULL;
926
927         parent = domain->pgd;
928
929         while (1) {
930                 void *tmp_page;
931
932                 offset = pfn_level_offset(pfn, level);
933                 pte = &parent[offset];
934                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
935                         break;
936                 if (level == *target_level)
937                         break;
938
939                 if (!dma_pte_present(pte)) {
940                         uint64_t pteval;
941
942                         tmp_page = alloc_pgtable_page(domain->nid, gfp);
943
944                         if (!tmp_page)
945                                 return NULL;
946
947                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
948                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
949                         if (domain->use_first_level)
950                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
951
952                         if (cmpxchg64(&pte->val, 0ULL, pteval))
953                                 /* Someone else set it while we were thinking; use theirs. */
954                                 free_pgtable_page(tmp_page);
955                         else
956                                 domain_flush_cache(domain, pte, sizeof(*pte));
957                 }
958                 if (level == 1)
959                         break;
960
961                 parent = phys_to_virt(dma_pte_addr(pte));
962                 level--;
963         }
964
965         if (!*target_level)
966                 *target_level = level;
967
968         return pte;
969 }
970
971 /* Return the address's PTE at a specific level. */
972 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
973                                          unsigned long pfn,
974                                          int level, int *large_page)
975 {
976         struct dma_pte *parent, *pte;
977         int total = agaw_to_level(domain->agaw);
978         int offset;
979
980         parent = domain->pgd;
981         while (level <= total) {
982                 offset = pfn_level_offset(pfn, total);
983                 pte = &parent[offset];
984                 if (level == total)
985                         return pte;
986
987                 if (!dma_pte_present(pte)) {
988                         *large_page = total;
989                         break;
990                 }
991
992                 if (dma_pte_superpage(pte)) {
993                         *large_page = total;
994                         return pte;
995                 }
996
997                 parent = phys_to_virt(dma_pte_addr(pte));
998                 total--;
999         }
1000         return NULL;
1001 }
1002
1003 /* Clear last-level PTEs; a TLB flush must follow. */
1004 static void dma_pte_clear_range(struct dmar_domain *domain,
1005                                 unsigned long start_pfn,
1006                                 unsigned long last_pfn)
1007 {
1008         unsigned int large_page;
1009         struct dma_pte *first_pte, *pte;
1010
1011         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1012             WARN_ON(start_pfn > last_pfn))
1013                 return;
1014
1015         /* we don't need a lock here; nobody else touches the iova range */
1016         do {
1017                 large_page = 1;
1018                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1019                 if (!pte) {
1020                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1021                         continue;
1022                 }
1023                 do {
1024                         dma_clear_pte(pte);
1025                         start_pfn += lvl_to_nr_pages(large_page);
1026                         pte++;
1027                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1028
1029                 domain_flush_cache(domain, first_pte,
1030                                    (void *)pte - (void *)first_pte);
1031
1032         } while (start_pfn && start_pfn <= last_pfn);
1033 }
1034
1035 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1036                                int retain_level, struct dma_pte *pte,
1037                                unsigned long pfn, unsigned long start_pfn,
1038                                unsigned long last_pfn)
1039 {
1040         pfn = max(start_pfn, pfn);
1041         pte = &pte[pfn_level_offset(pfn, level)];
1042
1043         do {
1044                 unsigned long level_pfn;
1045                 struct dma_pte *level_pte;
1046
1047                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1048                         goto next;
1049
1050                 level_pfn = pfn & level_mask(level);
1051                 level_pte = phys_to_virt(dma_pte_addr(pte));
1052
1053                 if (level > 2) {
1054                         dma_pte_free_level(domain, level - 1, retain_level,
1055                                            level_pte, level_pfn, start_pfn,
1056                                            last_pfn);
1057                 }
1058
1059                 /*
1060                  * Free the page table if we're below the level we want to
1061                  * retain and the range covers the entire table.
1062                  */
1063                 if (level < retain_level && !(start_pfn > level_pfn ||
1064                       last_pfn < level_pfn + level_size(level) - 1)) {
1065                         dma_clear_pte(pte);
1066                         domain_flush_cache(domain, pte, sizeof(*pte));
1067                         free_pgtable_page(level_pte);
1068                 }
1069 next:
1070                 pfn += level_size(level);
1071         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1072 }
1073
1074 /*
1075  * clear last level (leaf) ptes and free page table pages below the
1076  * level we wish to keep intact.
1077  */
1078 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1079                                    unsigned long start_pfn,
1080                                    unsigned long last_pfn,
1081                                    int retain_level)
1082 {
1083         dma_pte_clear_range(domain, start_pfn, last_pfn);
1084
1085         /* We don't need a lock here; nobody else touches the iova range */
1086         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1087                            domain->pgd, 0, start_pfn, last_pfn);
1088
1089         /* free pgd */
1090         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1091                 free_pgtable_page(domain->pgd);
1092                 domain->pgd = NULL;
1093         }
1094 }
1095
1096 /* When a page at a given level is being unlinked from its parent, we don't
1097    need to *modify* it at all. All we need to do is make a list of all the
1098    pages which can be freed just as soon as we've flushed the IOTLB and we
1099    know the hardware page-walk will no longer touch them.
1100    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1101    be freed. */
1102 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1103                                     int level, struct dma_pte *pte,
1104                                     struct list_head *freelist)
1105 {
1106         struct page *pg;
1107
1108         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1109         list_add_tail(&pg->lru, freelist);
1110
1111         if (level == 1)
1112                 return;
1113
1114         pte = page_address(pg);
1115         do {
1116                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117                         dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1118                 pte++;
1119         } while (!first_pte_in_page(pte));
1120 }
1121
1122 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1123                                 struct dma_pte *pte, unsigned long pfn,
1124                                 unsigned long start_pfn, unsigned long last_pfn,
1125                                 struct list_head *freelist)
1126 {
1127         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1128
1129         pfn = max(start_pfn, pfn);
1130         pte = &pte[pfn_level_offset(pfn, level)];
1131
1132         do {
1133                 unsigned long level_pfn = pfn & level_mask(level);
1134
1135                 if (!dma_pte_present(pte))
1136                         goto next;
1137
1138                 /* If range covers entire pagetable, free it */
1139                 if (start_pfn <= level_pfn &&
1140                     last_pfn >= level_pfn + level_size(level) - 1) {
1141                         /* These subordinate page tables are going away entirely. Don't
1142                            bother to clear them; we're just going to *free* them. */
1143                         if (level > 1 && !dma_pte_superpage(pte))
1144                                 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1145
1146                         dma_clear_pte(pte);
1147                         if (!first_pte)
1148                                 first_pte = pte;
1149                         last_pte = pte;
1150                 } else if (level > 1) {
1151                         /* Recurse down into a level that isn't *entirely* obsolete */
1152                         dma_pte_clear_level(domain, level - 1,
1153                                             phys_to_virt(dma_pte_addr(pte)),
1154                                             level_pfn, start_pfn, last_pfn,
1155                                             freelist);
1156                 }
1157 next:
1158                 pfn = level_pfn + level_size(level);
1159         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1160
1161         if (first_pte)
1162                 domain_flush_cache(domain, first_pte,
1163                                    (void *)++last_pte - (void *)first_pte);
1164 }
1165
1166 /* We can't just free the pages because the IOMMU may still be walking
1167    the page tables, and may have cached the intermediate levels. The
1168    pages can only be freed after the IOTLB flush has been done. */
1169 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1170                          unsigned long last_pfn, struct list_head *freelist)
1171 {
1172         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1173             WARN_ON(start_pfn > last_pfn))
1174                 return;
1175
1176         /* we don't need a lock here; nobody else touches the iova range */
1177         dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1178                             domain->pgd, 0, start_pfn, last_pfn, freelist);
1179
1180         /* free pgd */
1181         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1182                 struct page *pgd_page = virt_to_page(domain->pgd);
1183                 list_add_tail(&pgd_page->lru, freelist);
1184                 domain->pgd = NULL;
1185         }
1186 }
1187
1188 /* iommu handling */
1189 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1190 {
1191         struct root_entry *root;
1192
1193         root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1194         if (!root) {
1195                 pr_err("Allocating root entry for %s failed\n",
1196                         iommu->name);
1197                 return -ENOMEM;
1198         }
1199
1200         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1201         iommu->root_entry = root;
1202
1203         return 0;
1204 }
1205
1206 static void iommu_set_root_entry(struct intel_iommu *iommu)
1207 {
1208         u64 addr;
1209         u32 sts;
1210         unsigned long flag;
1211
1212         addr = virt_to_phys(iommu->root_entry);
1213         if (sm_supported(iommu))
1214                 addr |= DMA_RTADDR_SMT;
1215
1216         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1218
1219         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1220
1221         /* Make sure the hardware completes it */
1222         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223                       readl, (sts & DMA_GSTS_RTPS), sts);
1224
1225         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1226
1227         /*
1228          * Hardware invalidates all DMA remapping hardware translation
1229          * caches as part of SRTP flow.
1230          */
1231         if (cap_esrtps(iommu->cap))
1232                 return;
1233
1234         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1235         if (sm_supported(iommu))
1236                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1237         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1238 }
1239
1240 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1241 {
1242         u32 val;
1243         unsigned long flag;
1244
1245         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1246                 return;
1247
1248         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1249         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1250
1251         /* Make sure the hardware completes it */
1252         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1253                       readl, (!(val & DMA_GSTS_WBFS)), val);
1254
1255         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1256 }
1257
1258 /* Issue a context-cache invalidation request and wait for hardware to complete it */
1259 static void __iommu_flush_context(struct intel_iommu *iommu,
1260                                   u16 did, u16 source_id, u8 function_mask,
1261                                   u64 type)
1262 {
1263         u64 val = 0;
1264         unsigned long flag;
1265
1266         switch (type) {
1267         case DMA_CCMD_GLOBAL_INVL:
1268                 val = DMA_CCMD_GLOBAL_INVL;
1269                 break;
1270         case DMA_CCMD_DOMAIN_INVL:
1271                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1272                 break;
1273         case DMA_CCMD_DEVICE_INVL:
1274                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1275                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1276                 break;
1277         default:
1278                 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1279                         iommu->name, type);
1280                 return;
1281         }
1282         val |= DMA_CCMD_ICC;
1283
1284         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1286
1287         /* Make sure the hardware completes it */
1288         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1289                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1290
1291         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292 }
1293
1294 /* Issue an IOTLB invalidation request and wait for hardware to complete it */
1295 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1296                                 u64 addr, unsigned int size_order, u64 type)
1297 {
1298         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1299         u64 val = 0, val_iva = 0;
1300         unsigned long flag;
1301
1302         switch (type) {
1303         case DMA_TLB_GLOBAL_FLUSH:
1304                 /* global flush doesn't need to set IVA_REG */
1305                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1306                 break;
1307         case DMA_TLB_DSI_FLUSH:
1308                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309                 break;
1310         case DMA_TLB_PSI_FLUSH:
1311                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1312                 /* IH bit is passed in as part of address */
1313                 val_iva = size_order | addr;
1314                 break;
1315         default:
1316                 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1317                         iommu->name, type);
1318                 return;
1319         }
1320
1321         if (cap_write_drain(iommu->cap))
1322                 val |= DMA_TLB_WRITE_DRAIN;
1323
1324         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325         /* Note: Only uses first TLB reg currently */
1326         if (val_iva)
1327                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329
1330         /* Make sure the hardware completes it */
1331         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333
1334         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335
1336         /* check IOTLB invalidation granularity */
1337         if (DMA_TLB_IAIG(val) == 0)
1338                 pr_err("Flush IOTLB failed\n");
1339         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1341                         (unsigned long long)DMA_TLB_IIRG(type),
1342                         (unsigned long long)DMA_TLB_IAIG(val));
1343 }
1344
1345 static struct device_domain_info *
1346 domain_lookup_dev_info(struct dmar_domain *domain,
1347                        struct intel_iommu *iommu, u8 bus, u8 devfn)
1348 {
1349         struct device_domain_info *info;
1350         unsigned long flags;
1351
1352         spin_lock_irqsave(&domain->lock, flags);
1353         list_for_each_entry(info, &domain->devices, link) {
1354                 if (info->iommu == iommu && info->bus == bus &&
1355                     info->devfn == devfn) {
1356                         spin_unlock_irqrestore(&domain->lock, flags);
1357                         return info;
1358                 }
1359         }
1360         spin_unlock_irqrestore(&domain->lock, flags);
1361
1362         return NULL;
1363 }
1364
1365 static void domain_update_iotlb(struct dmar_domain *domain)
1366 {
1367         struct dev_pasid_info *dev_pasid;
1368         struct device_domain_info *info;
1369         bool has_iotlb_device = false;
1370         unsigned long flags;
1371
1372         spin_lock_irqsave(&domain->lock, flags);
1373         list_for_each_entry(info, &domain->devices, link) {
1374                 if (info->ats_enabled) {
1375                         has_iotlb_device = true;
1376                         break;
1377                 }
1378         }
1379
1380         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1381                 info = dev_iommu_priv_get(dev_pasid->dev);
1382                 if (info->ats_enabled) {
1383                         has_iotlb_device = true;
1384                         break;
1385                 }
1386         }
1387         domain->has_iotlb_device = has_iotlb_device;
1388         spin_unlock_irqrestore(&domain->lock, flags);
1389 }
1390
1391 /*
1392  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1393  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1394  * check because it applies only to the built-in QAT devices and it doesn't
1395  * grant additional privileges.
1396  */
1397 #define BUGGY_QAT_DEVID_MASK 0x4940
1398 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1399 {
1400         if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1401                 return false;
1402
1403         if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1404                 return false;
1405
1406         return true;
1407 }
1408
1409 static void iommu_enable_pci_caps(struct device_domain_info *info)
1410 {
1411         struct pci_dev *pdev;
1412
1413         if (!dev_is_pci(info->dev))
1414                 return;
1415
1416         pdev = to_pci_dev(info->dev);
1417
1418         /* The PCIe spec, in its wisdom, declares that the behaviour of
1419            the device if you enable PASID support after ATS support is
1420            undefined. So always enable PASID support on devices which
1421            have it, even if we can't yet know if we're ever going to
1422            use it. */
1423         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1424                 info->pasid_enabled = 1;
1425
1426         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1427             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1428                 info->ats_enabled = 1;
1429                 domain_update_iotlb(info->domain);
1430         }
1431 }
1432
1433 static void iommu_disable_pci_caps(struct device_domain_info *info)
1434 {
1435         struct pci_dev *pdev;
1436
1437         if (!dev_is_pci(info->dev))
1438                 return;
1439
1440         pdev = to_pci_dev(info->dev);
1441
1442         if (info->ats_enabled) {
1443                 pci_disable_ats(pdev);
1444                 info->ats_enabled = 0;
1445                 domain_update_iotlb(info->domain);
1446         }
1447
1448         if (info->pasid_enabled) {
1449                 pci_disable_pasid(pdev);
1450                 info->pasid_enabled = 0;
1451         }
1452 }
1453
1454 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1455                                     u64 addr, unsigned int mask)
1456 {
1457         u16 sid, qdep;
1458
1459         if (!info || !info->ats_enabled)
1460                 return;
1461
1462         sid = info->bus << 8 | info->devfn;
1463         qdep = info->ats_qdep;
1464         qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1465                            qdep, addr, mask);
1466         quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1467 }
1468
1469 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470                                   u64 addr, unsigned mask)
1471 {
1472         struct dev_pasid_info *dev_pasid;
1473         struct device_domain_info *info;
1474         unsigned long flags;
1475
1476         if (!domain->has_iotlb_device)
1477                 return;
1478
1479         spin_lock_irqsave(&domain->lock, flags);
1480         list_for_each_entry(info, &domain->devices, link)
1481                 __iommu_flush_dev_iotlb(info, addr, mask);
1482
1483         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1484                 info = dev_iommu_priv_get(dev_pasid->dev);
1485
1486                 if (!info->ats_enabled)
1487                         continue;
1488
1489                 qi_flush_dev_iotlb_pasid(info->iommu,
1490                                          PCI_DEVID(info->bus, info->devfn),
1491                                          info->pfsid, dev_pasid->pasid,
1492                                          info->ats_qdep, addr,
1493                                          mask);
1494         }
1495         spin_unlock_irqrestore(&domain->lock, flags);
1496 }
1497
1498 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1499                                      struct dmar_domain *domain, u64 addr,
1500                                      unsigned long npages, bool ih)
1501 {
1502         u16 did = domain_id_iommu(domain, iommu);
1503         struct dev_pasid_info *dev_pasid;
1504         unsigned long flags;
1505
1506         spin_lock_irqsave(&domain->lock, flags);
1507         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1508                 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1509
1510         if (!list_empty(&domain->devices))
1511                 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1512         spin_unlock_irqrestore(&domain->lock, flags);
1513 }
1514
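/*
 * Page-selective-within-domain invalidation (PSI) for @pages pages starting
 * at @pfn, falling back to a domain-selective flush when PSI is unsupported
 * or the range is too large.  @ih is the invalidation hint (only leaf PTEs
 * changed); @map indicates a non-present to present change, for which the
 * device IOTLB need not be flushed under caching mode.
 */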
1515 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1516                                   struct dmar_domain *domain,
1517                                   unsigned long pfn, unsigned int pages,
1518                                   int ih, int map)
1519 {
1520         unsigned int aligned_pages = __roundup_pow_of_two(pages);
1521         unsigned int mask = ilog2(aligned_pages);
1522         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1523         u16 did = domain_id_iommu(domain, iommu);
1524
1525         if (WARN_ON(!pages))
1526                 return;
1527
1528         if (ih)
1529                 ih = 1 << 6;
1530
1531         if (domain->use_first_level) {
1532                 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1533         } else {
1534                 unsigned long bitmask = aligned_pages - 1;
1535
1536                 /*
1537                  * PSI masks the low order bits of the base address. If the
1538                  * address isn't aligned to the mask, then compute a mask value
1539                  * needed to ensure the target range is flushed.
1540                  */
1541                 if (unlikely(bitmask & pfn)) {
1542                         unsigned long end_pfn = pfn + pages - 1, shared_bits;
1543
1544                         /*
1545                          * Since end_pfn <= pfn + bitmask, the only way bits
1546                          * higher than bitmask can differ in pfn and end_pfn is
1547                          * by carrying. This means after masking out bitmask,
1548                          * high bits starting with the first set bit in
1549                          * shared_bits are all equal in both pfn and end_pfn.
1550                          */
1551                         shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1552                         mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
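                        /*
                         * Worked example: pfn = 0x1009 and pages = 8 give
                         * bitmask = 0x7 and end_pfn = 0x1010.  pfn ^ end_pfn
                         * is 0x19, so shared_bits ends in ...ffe0 and __ffs()
                         * returns 5: the flush covers the aligned 32-page
                         * region 0x1000-0x101f, which contains the whole
                         * target range.
                         */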
1553                 }
1554
1555                 /*
1556                  * Fallback to domain selective flush if no PSI support or
1557                  * the size is too big.
1558                  */
1559                 if (!cap_pgsel_inv(iommu->cap) ||
1560                     mask > cap_max_amask_val(iommu->cap))
1561                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1562                                                         DMA_TLB_DSI_FLUSH);
1563                 else
1564                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1565                                                         DMA_TLB_PSI_FLUSH);
1566         }
1567
1568         /*
1569          * In caching mode, changes of pages from non-present to present require
1570          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1571          */
1572         if (!cap_caching_mode(iommu->cap) || !map)
1573                 iommu_flush_dev_iotlb(domain, addr, mask);
1574 }
1575
1576 /* Notification for newly created mappings */
1577 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1578                                         struct dmar_domain *domain,
1579                                         unsigned long pfn, unsigned int pages)
1580 {
1581         /*
1582          * It's a non-present to present mapping. Only flush if caching mode
1583          * is enabled and the domain uses second-level translation.
1584          */
1585         if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1586                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1587         else
1588                 iommu_flush_write_buffer(iommu);
1589 }
1590
1591 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1592 {
1593         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1594         struct iommu_domain_info *info;
1595         unsigned long idx;
1596
1597         xa_for_each(&dmar_domain->iommu_array, idx, info) {
1598                 struct intel_iommu *iommu = info->iommu;
1599                 u16 did = domain_id_iommu(dmar_domain, iommu);
1600
1601                 if (dmar_domain->use_first_level)
1602                         domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1603                 else
1604                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1605                                                  DMA_TLB_DSI_FLUSH);
1606
1607                 if (!cap_caching_mode(iommu->cap))
1608                         iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1609         }
1610 }
1611
1612 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1613 {
1614         u32 pmen;
1615         unsigned long flags;
1616
1617         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1618                 return;
1619
1620         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1621         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1622         pmen &= ~DMA_PMEN_EPM;
1623         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1624
1625         /* wait for the protected region status bit to clear */
1626         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1627                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1628
1629         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 }
1631
1632 static void iommu_enable_translation(struct intel_iommu *iommu)
1633 {
1634         u32 sts;
1635         unsigned long flags;
1636
1637         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1638         iommu->gcmd |= DMA_GCMD_TE;
1639         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1640
1641         /* Make sure hardware completes it */
1642         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1643                       readl, (sts & DMA_GSTS_TES), sts);
1644
1645         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1646 }
1647
1648 static void iommu_disable_translation(struct intel_iommu *iommu)
1649 {
1650         u32 sts;
1651         unsigned long flag;
1652
1653         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1654             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1655                 return;
1656
1657         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1658         iommu->gcmd &= ~DMA_GCMD_TE;
1659         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1660
1661         /* Make sure hardware completes it */
1662         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1663                       readl, (!(sts & DMA_GSTS_TES)), sts);
1664
1665         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1666 }
1667
1668 static int iommu_init_domains(struct intel_iommu *iommu)
1669 {
1670         u32 ndomains;
1671
1672         ndomains = cap_ndoms(iommu->cap);
1673         pr_debug("%s: Number of Domains supported <%d>\n",
1674                  iommu->name, ndomains);
1675
1676         spin_lock_init(&iommu->lock);
1677
1678         iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1679         if (!iommu->domain_ids)
1680                 return -ENOMEM;
1681
1682         /*
1683          * If Caching mode is set, then invalid translations are tagged
1684          * with domain-id 0, hence we need to pre-allocate it. We also
1685          * use domain-id 0 as a marker for non-allocated domain-id, so
1686          * make sure it is not used for a real domain.
1687          */
1688         set_bit(0, iommu->domain_ids);
1689
1690         /*
1691          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1692          * entry for first-level or pass-through translation modes should
1693          * be programmed with a domain id different from those used for
1694          * second-level or nested translation. We reserve a domain id for
1695          * this purpose.
1696          */
1697         if (sm_supported(iommu))
1698                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1699
1700         return 0;
1701 }
1702
1703 static void disable_dmar_iommu(struct intel_iommu *iommu)
1704 {
1705         if (!iommu->domain_ids)
1706                 return;
1707
1708         /*
1709          * All iommu domains must have been detached from the devices,
1710          * hence there should be no domain IDs in use.
1711          */
1712         if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1713                     > NUM_RESERVED_DID))
1714                 return;
1715
1716         if (iommu->gcmd & DMA_GCMD_TE)
1717                 iommu_disable_translation(iommu);
1718 }
1719
1720 static void free_dmar_iommu(struct intel_iommu *iommu)
1721 {
1722         if (iommu->domain_ids) {
1723                 bitmap_free(iommu->domain_ids);
1724                 iommu->domain_ids = NULL;
1725         }
1726
1727         if (iommu->copied_tables) {
1728                 bitmap_free(iommu->copied_tables);
1729                 iommu->copied_tables = NULL;
1730         }
1731
1732         /* free context mapping */
1733         free_context_table(iommu);
1734
1735 #ifdef CONFIG_INTEL_IOMMU_SVM
1736         if (pasid_supported(iommu)) {
1737                 if (ecap_prs(iommu->ecap))
1738                         intel_svm_finish_prq(iommu);
1739         }
1740 #endif
1741 }
1742
1743 /*
1744  * Check and return whether first level is used by default for
1745  * DMA translation.
1746  */
1747 static bool first_level_by_default(unsigned int type)
1748 {
1749         /* Only SL is available in legacy mode */
1750         if (!scalable_mode_support())
1751                 return false;
1752
1753         /* Only one level (either FL or SL) is available, just use it */
1754         if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1755                 return intel_cap_flts_sanity();
1756
1757         /* Both levels are available, decide it based on domain type */
1758         return type != IOMMU_DOMAIN_UNMANAGED;
1759 }
1760
1761 static struct dmar_domain *alloc_domain(unsigned int type)
1762 {
1763         struct dmar_domain *domain;
1764
1765         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1766         if (!domain)
1767                 return NULL;
1768
1769         domain->nid = NUMA_NO_NODE;
1770         if (first_level_by_default(type))
1771                 domain->use_first_level = true;
1772         domain->has_iotlb_device = false;
1773         INIT_LIST_HEAD(&domain->devices);
1774         INIT_LIST_HEAD(&domain->dev_pasids);
1775         spin_lock_init(&domain->lock);
1776         xa_init(&domain->iommu_array);
1777
1778         return domain;
1779 }
1780
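/*
 * Attach @domain to @iommu: allocate a domain ID on the first attach and
 * take an additional reference for each subsequent attach from the same
 * IOMMU unit.
 */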
1781 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1782 {
1783         struct iommu_domain_info *info, *curr;
1784         unsigned long ndomains;
1785         int num, ret = -ENOSPC;
1786
1787         info = kzalloc(sizeof(*info), GFP_KERNEL);
1788         if (!info)
1789                 return -ENOMEM;
1790
1791         spin_lock(&iommu->lock);
1792         curr = xa_load(&domain->iommu_array, iommu->seq_id);
1793         if (curr) {
1794                 curr->refcnt++;
1795                 spin_unlock(&iommu->lock);
1796                 kfree(info);
1797                 return 0;
1798         }
1799
1800         ndomains = cap_ndoms(iommu->cap);
1801         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1802         if (num >= ndomains) {
1803                 pr_err("%s: No free domain ids\n", iommu->name);
1804                 goto err_unlock;
1805         }
1806
1807         set_bit(num, iommu->domain_ids);
1808         info->refcnt    = 1;
1809         info->did       = num;
1810         info->iommu     = iommu;
1811         curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1812                           NULL, info, GFP_ATOMIC);
1813         if (curr) {
1814                 ret = xa_err(curr) ? : -EBUSY;
1815                 goto err_clear;
1816         }
1817         domain_update_iommu_cap(domain);
1818
1819         spin_unlock(&iommu->lock);
1820         return 0;
1821
1822 err_clear:
1823         clear_bit(info->did, iommu->domain_ids);
1824 err_unlock:
1825         spin_unlock(&iommu->lock);
1826         kfree(info);
1827         return ret;
1828 }
1829
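/*
 * Drop the domain's reference on @iommu and release its domain ID once the
 * last device behind this IOMMU detaches.
 */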
1830 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1831 {
1832         struct iommu_domain_info *info;
1833
1834         spin_lock(&iommu->lock);
1835         info = xa_load(&domain->iommu_array, iommu->seq_id);
1836         if (--info->refcnt == 0) {
1837                 clear_bit(info->did, iommu->domain_ids);
1838                 xa_erase(&domain->iommu_array, iommu->seq_id);
1839                 domain->nid = NUMA_NO_NODE;
1840                 domain_update_iommu_cap(domain);
1841                 kfree(info);
1842         }
1843         spin_unlock(&iommu->lock);
1844 }
1845
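/*
 * Round a guest address width up to the next adjusted guest address width
 * (12 + 9 * n bits), capped at 64.  For example, gaw = 40 gives
 * r = (40 - 12) % 9 = 1 and agaw = 40 + 9 - 1 = 48.
 */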
1846 static inline int guestwidth_to_adjustwidth(int gaw)
1847 {
1848         int agaw;
1849         int r = (gaw - 12) % 9;
1850
1851         if (r == 0)
1852                 agaw = gaw;
1853         else
1854                 agaw = gaw + 9 - r;
1855         if (agaw > 64)
1856                 agaw = 64;
1857         return agaw;
1858 }
1859
1860 static void domain_exit(struct dmar_domain *domain)
1861 {
1862         if (domain->pgd) {
1863                 LIST_HEAD(freelist);
1864
1865                 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1866                 put_pages_list(&freelist);
1867         }
1868
1869         if (WARN_ON(!list_empty(&domain->devices)))
1870                 return;
1871
1872         kfree(domain);
1873 }
1874
1875 /*
1876  * Get the PASID directory size for a scalable mode context entry.
1877  * A value of X in the PDTS field of a scalable mode context entry
1878  * indicates a PASID directory with 2^(X + 7) entries.
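 *
 * For example, with table->max_pasid = 1 << 20 and PASID_PDE_SHIFT == 6,
 * max_pde = 1 << 14, find_first_bit() returns 14 and pds = 7, which encodes
 * a PASID directory with 2^(7 + 7) = 16384 entries.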
1879  */
1880 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1881 {
1882         unsigned long pds, max_pde;
1883
1884         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1885         pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1886         if (pds < 7)
1887                 return 0;
1888
1889         return pds - 7;
1890 }
1891
1892 /*
1893  * Set the RID_PASID field of a scalable mode context entry. The
1894  * IOMMU hardware will use the PASID value set in this field for
1895  * DMA translations of DMA requests without PASID.
1896  */
1897 static inline void
1898 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1899 {
1900         context->hi |= pasid & ((1 << 20) - 1);
1901 }
1902
1903 /*
1904  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1905  * entry.
1906  */
1907 static inline void context_set_sm_dte(struct context_entry *context)
1908 {
1909         context->lo |= BIT_ULL(2);
1910 }
1911
1912 /*
1913  * Set the PRE(Page Request Enable) field of a scalable mode context
1914  * entry.
1915  */
1916 static inline void context_set_sm_pre(struct context_entry *context)
1917 {
1918         context->lo |= BIT_ULL(4);
1919 }
1920
1921 /* Convert value to context PASID directory size field coding. */
1922 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1923
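/*
 * Install the context entry for (@bus, @devfn) on @iommu.  In scalable mode
 * the entry points to the PASID directory; in legacy mode it points to the
 * second-level page table or selects pass-through translation.
 */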
1924 static int domain_context_mapping_one(struct dmar_domain *domain,
1925                                       struct intel_iommu *iommu,
1926                                       struct pasid_table *table,
1927                                       u8 bus, u8 devfn)
1928 {
1929         struct device_domain_info *info =
1930                         domain_lookup_dev_info(domain, iommu, bus, devfn);
1931         u16 did = domain_id_iommu(domain, iommu);
1932         int translation = CONTEXT_TT_MULTI_LEVEL;
1933         struct context_entry *context;
1934         int ret;
1935
1936         if (hw_pass_through && domain_type_is_si(domain))
1937                 translation = CONTEXT_TT_PASS_THROUGH;
1938
1939         pr_debug("Set context mapping for %02x:%02x.%d\n",
1940                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1941
1942         spin_lock(&iommu->lock);
1943         ret = -ENOMEM;
1944         context = iommu_context_addr(iommu, bus, devfn, 1);
1945         if (!context)
1946                 goto out_unlock;
1947
1948         ret = 0;
1949         if (context_present(context) && !context_copied(iommu, bus, devfn))
1950                 goto out_unlock;
1951
1952         /*
1953          * For kdump cases, old valid entries may be cached due to the
1954          * in-flight DMA and copied pgtable, but there is no unmapping
1955          * behaviour for them, thus we need an explicit cache flush for
1956          * the newly-mapped device. For kdump, at this point, the device
1957          * is supposed to have finished reset at its driver probe stage,
1958          * so no in-flight DMA will exist and we don't need to worry
1959          * about it afterwards.
1960          */
1961         if (context_copied(iommu, bus, devfn)) {
1962                 u16 did_old = context_domain_id(context);
1963
1964                 if (did_old < cap_ndoms(iommu->cap)) {
1965                         iommu->flush.flush_context(iommu, did_old,
1966                                                    (((u16)bus) << 8) | devfn,
1967                                                    DMA_CCMD_MASK_NOBIT,
1968                                                    DMA_CCMD_DEVICE_INVL);
1969                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1970                                                  DMA_TLB_DSI_FLUSH);
1971                 }
1972
1973                 clear_context_copied(iommu, bus, devfn);
1974         }
1975
1976         context_clear_entry(context);
1977
1978         if (sm_supported(iommu)) {
1979                 unsigned long pds;
1980
1981                 /* Setup the PASID DIR pointer: */
1982                 pds = context_get_sm_pds(table);
1983                 context->lo = (u64)virt_to_phys(table->table) |
1984                                 context_pdts(pds);
1985
1986                 /* Setup the RID_PASID field: */
1987                 context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1988
1989                 /*
1990                  * Setup the Device-TLB enable bit and Page request
1991                  * Enable bit:
1992                  */
1993                 if (info && info->ats_supported)
1994                         context_set_sm_dte(context);
1995                 if (info && info->pri_supported)
1996                         context_set_sm_pre(context);
1997                 if (info && info->pasid_supported)
1998                         context_set_pasid(context);
1999         } else {
2000                 struct dma_pte *pgd = domain->pgd;
2001                 int agaw;
2002
2003                 context_set_domain_id(context, did);
2004
2005                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2006                         /*
2007                          * Skip top levels of page tables for iommu which has
2008                          * less agaw than default. Unnecessary for PT mode.
2009                          */
2010                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2011                                 ret = -ENOMEM;
2012                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2013                                 if (!dma_pte_present(pgd))
2014                                         goto out_unlock;
2015                         }
2016
2017                         if (info && info->ats_supported)
2018                                 translation = CONTEXT_TT_DEV_IOTLB;
2019                         else
2020                                 translation = CONTEXT_TT_MULTI_LEVEL;
2021
2022                         context_set_address_root(context, virt_to_phys(pgd));
2023                         context_set_address_width(context, agaw);
2024                 } else {
2025                         /*
2026                          * In pass through mode, AW must be programmed to
2027                          * indicate the largest AGAW value supported by
2028                          * hardware. And ASR is ignored by hardware.
2029                          */
2030                         context_set_address_width(context, iommu->msagaw);
2031                 }
2032
2033                 context_set_translation_type(context, translation);
2034         }
2035
2036         context_set_fault_enable(context);
2037         context_set_present(context);
2038         if (!ecap_coherent(iommu->ecap))
2039                 clflush_cache_range(context, sizeof(*context));
2040
2041         /*
2042          * It's a non-present to present mapping. If hardware doesn't cache
2043          * non-present entries we only need to flush the write-buffer. If it
2044          * _does_ cache non-present entries, then it does so in the special
2045          * domain #0, which we have to flush:
2046          */
2047         if (cap_caching_mode(iommu->cap)) {
2048                 iommu->flush.flush_context(iommu, 0,
2049                                            (((u16)bus) << 8) | devfn,
2050                                            DMA_CCMD_MASK_NOBIT,
2051                                            DMA_CCMD_DEVICE_INVL);
2052                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2053         } else {
2054                 iommu_flush_write_buffer(iommu);
2055         }
2056
2057         ret = 0;
2058
2059 out_unlock:
2060         spin_unlock(&iommu->lock);
2061
2062         return ret;
2063 }
2064
2065 struct domain_context_mapping_data {
2066         struct dmar_domain *domain;
2067         struct intel_iommu *iommu;
2068         struct pasid_table *table;
2069 };
2070
2071 static int domain_context_mapping_cb(struct pci_dev *pdev,
2072                                      u16 alias, void *opaque)
2073 {
2074         struct domain_context_mapping_data *data = opaque;
2075
2076         return domain_context_mapping_one(data->domain, data->iommu,
2077                                           data->table, PCI_BUS_NUM(alias),
2078                                           alias & 0xff);
2079 }
2080
2081 static int
2082 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2083 {
2084         struct domain_context_mapping_data data;
2085         struct pasid_table *table;
2086         struct intel_iommu *iommu;
2087         u8 bus, devfn;
2088
2089         iommu = device_to_iommu(dev, &bus, &devfn);
2090         if (!iommu)
2091                 return -ENODEV;
2092
2093         table = intel_pasid_get_table(dev);
2094
2095         if (!dev_is_pci(dev))
2096                 return domain_context_mapping_one(domain, iommu, table,
2097                                                   bus, devfn);
2098
2099         data.domain = domain;
2100         data.iommu = iommu;
2101         data.table = table;
2102
2103         return pci_for_each_dma_alias(to_pci_dev(dev),
2104                                       &domain_context_mapping_cb, &data);
2105 }
2106
2107 /* Return the number of VT-d pages needed, rounded up to the MM page size */
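/* E.g. with 4KiB pages, host_addr = 0x1234 and size = 0x2000 give
   PAGE_ALIGN(0x234 + 0x2000) = 0x3000, i.e. 3 VT-d pages. */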
2108 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2109                                             size_t size)
2110 {
2111         host_addr &= ~PAGE_MASK;
2112         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2113 }
2114
2115 /* Return largest possible superpage level for a given mapping */
2116 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2117                                           unsigned long iov_pfn,
2118                                           unsigned long phy_pfn,
2119                                           unsigned long pages)
2120 {
2121         int support, level = 1;
2122         unsigned long pfnmerge;
2123
2124         support = domain->iommu_superpage;
2125
2126         /* To use a large page, the virtual *and* physical addresses
2127            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2128            of them will mean we have to use smaller pages. So just
2129            merge them and check both at once. */
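        /* Example: iov_pfn = 0x200, phy_pfn = 0x1400 and pages = 0x400: the
           low 9 bits of pfnmerge = 0x1600 are clear, so (given superpage
           support) a level 2 (2MiB) page is usable; after shifting, 0xb has
           low bits set, so the loop stops before level 3 (1GiB). */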
2130         pfnmerge = iov_pfn | phy_pfn;
2131
2132         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2133                 pages >>= VTD_STRIDE_SHIFT;
2134                 if (!pages)
2135                         break;
2136                 pfnmerge >>= VTD_STRIDE_SHIFT;
2137                 level++;
2138                 support--;
2139         }
2140         return level;
2141 }
2142
2143 /*
2144  * Ensure that old small page tables are removed to make room for superpage(s).
2145  * We're going to add new large pages, so make sure we don't remove their parent
2146  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2147  */
2148 static void switch_to_super_page(struct dmar_domain *domain,
2149                                  unsigned long start_pfn,
2150                                  unsigned long end_pfn, int level)
2151 {
2152         unsigned long lvl_pages = lvl_to_nr_pages(level);
2153         struct iommu_domain_info *info;
2154         struct dma_pte *pte = NULL;
2155         unsigned long i;
2156
2157         while (start_pfn <= end_pfn) {
2158                 if (!pte)
2159                         pte = pfn_to_dma_pte(domain, start_pfn, &level,
2160                                              GFP_ATOMIC);
2161
2162                 if (dma_pte_present(pte)) {
2163                         dma_pte_free_pagetable(domain, start_pfn,
2164                                                start_pfn + lvl_pages - 1,
2165                                                level + 1);
2166
2167                         xa_for_each(&domain->iommu_array, i, info)
2168                                 iommu_flush_iotlb_psi(info->iommu, domain,
2169                                                       start_pfn, lvl_pages,
2170                                                       0, 0);
2171                 }
2172
2173                 pte++;
2174                 start_pfn += lvl_pages;
2175                 if (first_pte_in_page(pte))
2176                         pte = NULL;
2177         }
2178 }
2179
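/*
 * Map @nr_pages pages starting at physical pfn @phys_pfn into @domain at
 * IOVA pfn @iov_pfn, using superpages whenever alignment and the remaining
 * size allow.
 */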
2180 static int
2181 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2182                  unsigned long phys_pfn, unsigned long nr_pages, int prot,
2183                  gfp_t gfp)
2184 {
2185         struct dma_pte *first_pte = NULL, *pte = NULL;
2186         unsigned int largepage_lvl = 0;
2187         unsigned long lvl_pages = 0;
2188         phys_addr_t pteval;
2189         u64 attr;
2190
2191         if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2192                 return -EINVAL;
2193
2194         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2195                 return -EINVAL;
2196
2197         if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2198                 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2199                 return -EINVAL;
2200         }
2201
2202         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2203         attr |= DMA_FL_PTE_PRESENT;
2204         if (domain->use_first_level) {
2205                 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2206                 if (prot & DMA_PTE_WRITE)
2207                         attr |= DMA_FL_PTE_DIRTY;
2208         }
2209
2210         domain->has_mappings = true;
2211
2212         pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2213
2214         while (nr_pages > 0) {
2215                 uint64_t tmp;
2216
2217                 if (!pte) {
2218                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2219                                         phys_pfn, nr_pages);
2220
2221                         pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2222                                              gfp);
2223                         if (!pte)
2224                                 return -ENOMEM;
2225                         first_pte = pte;
2226
2227                         lvl_pages = lvl_to_nr_pages(largepage_lvl);
2228
2229                         /* It is a large page */
2230                         if (largepage_lvl > 1) {
2231                                 unsigned long end_pfn;
2232                                 unsigned long pages_to_remove;
2233
2234                                 pteval |= DMA_PTE_LARGE_PAGE;
2235                                 pages_to_remove = min_t(unsigned long, nr_pages,
2236                                                         nr_pte_to_next_page(pte) * lvl_pages);
2237                                 end_pfn = iov_pfn + pages_to_remove - 1;
2238                                 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2239                         } else {
2240                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2241                         }
2242
2243                 }
2244                 /* We don't need a lock here; nobody else
2245                  * touches this IOVA range.
2246                  */
2247                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2248                 if (tmp) {
2249                         static int dumps = 5;
2250                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2251                                 iov_pfn, tmp, (unsigned long long)pteval);
2252                         if (dumps) {
2253                                 dumps--;
2254                                 debug_dma_dump_mappings(NULL);
2255                         }
2256                         WARN_ON(1);
2257                 }
2258
2259                 nr_pages -= lvl_pages;
2260                 iov_pfn += lvl_pages;
2261                 phys_pfn += lvl_pages;
2262                 pteval += lvl_pages * VTD_PAGE_SIZE;
2263
2264                 /* If the next PTE would be the first in a new page, then we
2265                  * need to flush the cache on the entries we've just written.
2266                  * And then we'll need to recalculate 'pte', so clear it and
2267                  * let it get set again in the if (!pte) block above.
2268                  *
2269                  * If we're done (!nr_pages) we need to flush the cache too.
2270                  *
2271                  * Also if we've been setting superpages, we may need to
2272                  * recalculate 'pte' and switch back to smaller pages for the
2273                  * end of the mapping, if the trailing size is not enough to
2274                  * use another superpage (i.e. nr_pages < lvl_pages).
2275                  */
2276                 pte++;
2277                 if (!nr_pages || first_pte_in_page(pte) ||
2278                     (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2279                         domain_flush_cache(domain, first_pte,
2280                                            (void *)pte - (void *)first_pte);
2281                         pte = NULL;
2282                 }
2283         }
2284
2285         return 0;
2286 }
2287
2288 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2289 {
2290         struct intel_iommu *iommu = info->iommu;
2291         struct context_entry *context;
2292         u16 did_old;
2293
2294         if (!iommu)
2295                 return;
2296
2297         spin_lock(&iommu->lock);
2298         context = iommu_context_addr(iommu, bus, devfn, 0);
2299         if (!context) {
2300                 spin_unlock(&iommu->lock);
2301                 return;
2302         }
2303
2304         if (sm_supported(iommu)) {
2305                 if (hw_pass_through && domain_type_is_si(info->domain))
2306                         did_old = FLPT_DEFAULT_DID;
2307                 else
2308                         did_old = domain_id_iommu(info->domain, iommu);
2309         } else {
2310                 did_old = context_domain_id(context);
2311         }
2312
2313         context_clear_entry(context);
2314         __iommu_flush_cache(iommu, context, sizeof(*context));
2315         spin_unlock(&iommu->lock);
2316         iommu->flush.flush_context(iommu,
2317                                    did_old,
2318                                    (((u16)bus) << 8) | devfn,
2319                                    DMA_CCMD_MASK_NOBIT,
2320                                    DMA_CCMD_DEVICE_INVL);
2321
2322         if (sm_supported(iommu))
2323                 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2324
2325         iommu->flush.flush_iotlb(iommu,
2326                                  did_old,
2327                                  0,
2328                                  0,
2329                                  DMA_TLB_DSI_FLUSH);
2330
2331         __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2332 }
2333
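/*
 * Set up a first-level translation PASID entry for @dev in @domain, skipping
 * page-table levels the IOMMU cannot walk and selecting 4- or 5-level paging
 * based on the resulting AGAW.
 */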
2334 static int domain_setup_first_level(struct intel_iommu *iommu,
2335                                     struct dmar_domain *domain,
2336                                     struct device *dev,
2337                                     u32 pasid)
2338 {
2339         struct dma_pte *pgd = domain->pgd;
2340         int agaw, level;
2341         int flags = 0;
2342
2343         /*
2344          * Skip top levels of page tables for iommu which has
2345          * less agaw than default. Unnecessary for PT mode.
2346          */
2347         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2348                 pgd = phys_to_virt(dma_pte_addr(pgd));
2349                 if (!dma_pte_present(pgd))
2350                         return -ENOMEM;
2351         }
2352
2353         level = agaw_to_level(agaw);
2354         if (level != 4 && level != 5)
2355                 return -EINVAL;
2356
2357         if (level == 5)
2358                 flags |= PASID_FLAG_FL5LP;
2359
2360         if (domain->force_snooping)
2361                 flags |= PASID_FLAG_PAGE_SNOOP;
2362
2363         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2364                                              domain_id_iommu(domain, iommu),
2365                                              flags);
2366 }
2367
2368 static bool dev_is_real_dma_subdevice(struct device *dev)
2369 {
2370         return dev && dev_is_pci(dev) &&
2371                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2372 }
2373
2374 static int iommu_domain_identity_map(struct dmar_domain *domain,
2375                                      unsigned long first_vpfn,
2376                                      unsigned long last_vpfn)
2377 {
2378         /*
2379          * The RMRR range might overlap with the physical memory range,
2380          * so clear it first.
2381          */
2382         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2383
2384         return __domain_mapping(domain, first_vpfn,
2385                                 first_vpfn, last_vpfn - first_vpfn + 1,
2386                                 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2387 }
2388
2389 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2390
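/*
 * Build the static identity (si) domain: identity map all usable physical
 * memory and all RMRR regions.  With hardware pass-through (@hw) no identity
 * mappings are created, so only the domain itself is set up.
 */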
2391 static int __init si_domain_init(int hw)
2392 {
2393         struct dmar_rmrr_unit *rmrr;
2394         struct device *dev;
2395         int i, nid, ret;
2396
2397         si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2398         if (!si_domain)
2399                 return -EFAULT;
2400
2401         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2402                 domain_exit(si_domain);
2403                 si_domain = NULL;
2404                 return -EFAULT;
2405         }
2406
2407         if (hw)
2408                 return 0;
2409
2410         for_each_online_node(nid) {
2411                 unsigned long start_pfn, end_pfn;
2412                 int i;
2413
2414                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2415                         ret = iommu_domain_identity_map(si_domain,
2416                                         mm_to_dma_pfn_start(start_pfn),
2417                                         mm_to_dma_pfn_end(end_pfn));
2418                         if (ret)
2419                                 return ret;
2420                 }
2421         }
2422
2423         /*
2424          * Identity map the RMRRs so that devices with RMRRs can also use
2425          * the si_domain.
2426          */
2427         for_each_rmrr_units(rmrr) {
2428                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2429                                           i, dev) {
2430                         unsigned long long start = rmrr->base_address;
2431                         unsigned long long end = rmrr->end_address;
2432
2433                         if (WARN_ON(end < start ||
2434                                     end >> agaw_to_width(si_domain->agaw)))
2435                                 continue;
2436
2437                         ret = iommu_domain_identity_map(si_domain,
2438                                         mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2439                                         mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2440                         if (ret)
2441                                 return ret;
2442                 }
2443         }
2444
2445         return 0;
2446 }
2447
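/*
 * Attach @dev to @domain: attach the domain to the device's IOMMU, program
 * the PASID entry for requests without PASID in scalable mode, install the
 * context entry and finally enable the device's ATS/PASID capabilities.
 */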
2448 static int dmar_domain_attach_device(struct dmar_domain *domain,
2449                                      struct device *dev)
2450 {
2451         struct device_domain_info *info = dev_iommu_priv_get(dev);
2452         struct intel_iommu *iommu;
2453         unsigned long flags;
2454         u8 bus, devfn;
2455         int ret;
2456
2457         iommu = device_to_iommu(dev, &bus, &devfn);
2458         if (!iommu)
2459                 return -ENODEV;
2460
2461         ret = domain_attach_iommu(domain, iommu);
2462         if (ret)
2463                 return ret;
2464         info->domain = domain;
2465         spin_lock_irqsave(&domain->lock, flags);
2466         list_add(&info->link, &domain->devices);
2467         spin_unlock_irqrestore(&domain->lock, flags);
2468
2469         /* PASID table is mandatory for a PCI device in scalable mode. */
2470         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2471                 /* Setup the PASID entry for requests without PASID: */
2472                 if (hw_pass_through && domain_type_is_si(domain))
2473                         ret = intel_pasid_setup_pass_through(iommu, domain,
2474                                         dev, IOMMU_NO_PASID);
2475                 else if (domain->use_first_level)
2476                         ret = domain_setup_first_level(iommu, domain, dev,
2477                                         IOMMU_NO_PASID);
2478                 else
2479                         ret = intel_pasid_setup_second_level(iommu, domain,
2480                                         dev, IOMMU_NO_PASID);
2481                 if (ret) {
2482                         dev_err(dev, "Setup RID2PASID failed\n");
2483                         device_block_translation(dev);
2484                         return ret;
2485                 }
2486         }
2487
2488         ret = domain_context_mapping(domain, dev);
2489         if (ret) {
2490                 dev_err(dev, "Domain context map failed\n");
2491                 device_block_translation(dev);
2492                 return ret;
2493         }
2494
2495         if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2496                 iommu_enable_pci_caps(info);
2497
2498         return 0;
2499 }
2500
2501 /**
2502  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2503  * is relaxable (i.e. it is allowed to be left unenforced under some conditions)
2504  * @dev: device handle
2505  *
2506  * We assume that PCI USB devices with RMRRs have them largely
2507  * for historical reasons and that the RMRR space is not actively used post
2508  * boot.  This exclusion may change if vendors begin to abuse it.
2509  *
2510  * The same exception is made for graphics devices, with the requirement that
2511  * any use of the RMRR regions will be torn down before assigning the device
2512  * to a guest.
2513  *
2514  * Return: true if the RMRR is relaxable, false otherwise
2515  */
2516 static bool device_rmrr_is_relaxable(struct device *dev)
2517 {
2518         struct pci_dev *pdev;
2519
2520         if (!dev_is_pci(dev))
2521                 return false;
2522
2523         pdev = to_pci_dev(dev);
2524         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2525                 return true;
2526         else
2527                 return false;
2528 }
2529
2530 /*
2531  * Return the required default domain type for a specific device.
2532  *
2533  * @dev: the device in query
2535  *
2536  * Returns:
2537  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2538  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2539  *  - 0: both identity and dynamic domains work for this device
2540  */
2541 static int device_def_domain_type(struct device *dev)
2542 {
2543         if (dev_is_pci(dev)) {
2544                 struct pci_dev *pdev = to_pci_dev(dev);
2545
2546                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2547                         return IOMMU_DOMAIN_IDENTITY;
2548
2549                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2550                         return IOMMU_DOMAIN_IDENTITY;
2551         }
2552
2553         return 0;
2554 }
2555
2556 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2557 {
2558         /*
2559          * Start from a sane IOMMU hardware state.
2560          * If queued invalidation was already initialized by us
2561          * (for example, while enabling interrupt remapping) then
2562          * things are already rolling from a sane state.
2563          */
2564         if (!iommu->qi) {
2565                 /*
2566                  * Clear any previous faults.
2567                  */
2568                 dmar_fault(-1, iommu);
2569                 /*
2570                  * Disable queued invalidation if supported and already enabled
2571                  * before OS handover.
2572                  */
2573                 dmar_disable_qi(iommu);
2574         }
2575
2576         if (dmar_enable_qi(iommu)) {
2577                 /*
2578                  * Queued Invalidate not enabled, use Register Based Invalidate
2579                  */
2580                 iommu->flush.flush_context = __iommu_flush_context;
2581                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2582                 pr_info("%s: Using Register based invalidation\n",
2583                         iommu->name);
2584         } else {
2585                 iommu->flush.flush_context = qi_flush_context;
2586                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2587                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2588         }
2589 }
2590
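/*
 * Copy the old context table(s) for @bus from the previous kernel.  In
 * extended/scalable mode each bus uses two context tables (devfn 0x00-0x7f
 * via the lower context-table pointer, 0x80-0xff via the upper one), hence
 * the bus * 2 indexing into @tbl.
 */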
2591 static int copy_context_table(struct intel_iommu *iommu,
2592                               struct root_entry *old_re,
2593                               struct context_entry **tbl,
2594                               int bus, bool ext)
2595 {
2596         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2597         struct context_entry *new_ce = NULL, ce;
2598         struct context_entry *old_ce = NULL;
2599         struct root_entry re;
2600         phys_addr_t old_ce_phys;
2601
2602         tbl_idx = ext ? bus * 2 : bus;
2603         memcpy(&re, old_re, sizeof(re));
2604
2605         for (devfn = 0; devfn < 256; devfn++) {
2606                 /* First calculate the correct index */
2607                 idx = (ext ? devfn * 2 : devfn) % 256;
2608
2609                 if (idx == 0) {
2610                         /* First save what we may have and clean up */
2611                         if (new_ce) {
2612                                 tbl[tbl_idx] = new_ce;
2613                                 __iommu_flush_cache(iommu, new_ce,
2614                                                     VTD_PAGE_SIZE);
2615                                 pos = 1;
2616                         }
2617
2618                         if (old_ce)
2619                                 memunmap(old_ce);
2620
2621                         ret = 0;
2622                         if (devfn < 0x80)
2623                                 old_ce_phys = root_entry_lctp(&re);
2624                         else
2625                                 old_ce_phys = root_entry_uctp(&re);
2626
2627                         if (!old_ce_phys) {
2628                                 if (ext && devfn == 0) {
2629                                         /* No LCTP, try UCTP */
2630                                         devfn = 0x7f;
2631                                         continue;
2632                                 } else {
2633                                         goto out;
2634                                 }
2635                         }
2636
2637                         ret = -ENOMEM;
2638                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2639                                         MEMREMAP_WB);
2640                         if (!old_ce)
2641                                 goto out;
2642
2643                         new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2644                         if (!new_ce)
2645                                 goto out_unmap;
2646
2647                         ret = 0;
2648                 }
2649
2650                 /* Now copy the context entry */
2651                 memcpy(&ce, old_ce + idx, sizeof(ce));
2652
2653                 if (!context_present(&ce))
2654                         continue;
2655
2656                 did = context_domain_id(&ce);
2657                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2658                         set_bit(did, iommu->domain_ids);
2659
2660                 set_context_copied(iommu, bus, devfn);
2661                 new_ce[idx] = ce;
2662         }
2663
2664         tbl[tbl_idx + pos] = new_ce;
2665
2666         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2667
2668 out_unmap:
2669         memunmap(old_ce);
2670
2671 out:
2672         return ret;
2673 }
2674
2675 static int copy_translation_tables(struct intel_iommu *iommu)
2676 {
2677         struct context_entry **ctxt_tbls;
2678         struct root_entry *old_rt;
2679         phys_addr_t old_rt_phys;
2680         int ctxt_table_entries;
2681         u64 rtaddr_reg;
2682         int bus, ret;
2683         bool new_ext, ext;
2684
2685         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2686         ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2687         new_ext    = !!sm_supported(iommu);
2688
2689         /*
2690          * The RTT bit can only be changed when translation is disabled,
2691          * but disabling translation would open a window for data
2692          * corruption. So bail out and don't copy anything if we would
2693          * have to change the bit.
2694          */
2695         if (new_ext != ext)
2696                 return -EINVAL;
2697
2698         iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2699         if (!iommu->copied_tables)
2700                 return -ENOMEM;
2701
2702         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2703         if (!old_rt_phys)
2704                 return -EINVAL;
2705
2706         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2707         if (!old_rt)
2708                 return -ENOMEM;
2709
2710         /* This is too big for the stack - allocate it from slab */
2711         ctxt_table_entries = ext ? 512 : 256;
2712         ret = -ENOMEM;
2713         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2714         if (!ctxt_tbls)
2715                 goto out_unmap;
2716
2717         for (bus = 0; bus < 256; bus++) {
2718                 ret = copy_context_table(iommu, &old_rt[bus],
2719                                          ctxt_tbls, bus, ext);
2720                 if (ret) {
2721                         pr_err("%s: Failed to copy context table for bus %d\n",
2722                                 iommu->name, bus);
2723                         continue;
2724                 }
2725         }
2726
2727         spin_lock(&iommu->lock);
2728
2729         /* Context tables are copied, now write them to the root_entry table */
2730         for (bus = 0; bus < 256; bus++) {
2731                 int idx = ext ? bus * 2 : bus;
2732                 u64 val;
2733
2734                 if (ctxt_tbls[idx]) {
2735                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2736                         iommu->root_entry[bus].lo = val;
2737                 }
2738
2739                 if (!ext || !ctxt_tbls[idx + 1])
2740                         continue;
2741
2742                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2743                 iommu->root_entry[bus].hi = val;
2744         }
2745
2746         spin_unlock(&iommu->lock);
2747
2748         kfree(ctxt_tbls);
2749
2750         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2751
2752         ret = 0;
2753
2754 out_unmap:
2755         memunmap(old_rt);
2756
2757         return ret;
2758 }
2759
2760 static int __init init_dmars(void)
2761 {
2762         struct dmar_drhd_unit *drhd;
2763         struct intel_iommu *iommu;
2764         int ret;
2765
2766         ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2767         if (ret)
2768                 goto free_iommu;
2769
2770         for_each_iommu(iommu, drhd) {
2771                 if (drhd->ignored) {
2772                         iommu_disable_translation(iommu);
2773                         continue;
2774                 }
2775
2776                 /*
2777                  * Find the max PASID size of all IOMMUs in the system.
2778                  * We need to ensure the system PASID table is no bigger
2779                  * than the smallest supported size.
2780                  */
2781                 if (pasid_supported(iommu)) {
2782                         u32 temp = 2 << ecap_pss(iommu->ecap);
2783
2784                         intel_pasid_max_id = min_t(u32, temp,
2785                                                    intel_pasid_max_id);
2786                 }
2787
2788                 intel_iommu_init_qi(iommu);
2789
2790                 ret = iommu_init_domains(iommu);
2791                 if (ret)
2792                         goto free_iommu;
2793
2794                 init_translation_status(iommu);
2795
2796                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2797                         iommu_disable_translation(iommu);
2798                         clear_translation_pre_enabled(iommu);
2799                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2800                                 iommu->name);
2801                 }
2802
2803                 /*
2804                  * TBD:
2805                  * we could share the same root & context tables
2806                  * among all IOMMUs. Need to split this later.
2807                  */
2808                 ret = iommu_alloc_root_entry(iommu);
2809                 if (ret)
2810                         goto free_iommu;
2811
2812                 if (translation_pre_enabled(iommu)) {
2813                         pr_info("Translation already enabled - trying to copy translation structures\n");
2814
2815                         ret = copy_translation_tables(iommu);
2816                         if (ret) {
2817                                 /*
2818                                  * We found the IOMMU with translation
2819                                  * enabled - but failed to copy over the
2820                                  * old root-entry table. Try to proceed
2821                                  * by disabling translation now and
2822                                  * allocating a clean root-entry table.
2823                                  * This might cause DMAR faults, but
2824                                  * probably the dump will still succeed.
2825                                  */
2826                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2827                                        iommu->name);
2828                                 iommu_disable_translation(iommu);
2829                                 clear_translation_pre_enabled(iommu);
2830                         } else {
2831                                 pr_info("Copied translation tables from previous kernel for %s\n",
2832                                         iommu->name);
2833                         }
2834                 }
2835
2836                 if (!ecap_pass_through(iommu->ecap))
2837                         hw_pass_through = 0;
2838                 intel_svm_check(iommu);
2839         }
2840
2841         /*
2842          * Now that QI is enabled on all IOMMUs, set the root entry and flush
2843          * caches. This is required on some Intel X58 chipsets, otherwise the
2844          * flush_context function will loop forever and the boot hangs.
2845          */
2846         for_each_active_iommu(iommu, drhd) {
2847                 iommu_flush_write_buffer(iommu);
2848                 iommu_set_root_entry(iommu);
2849         }
2850
2851 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2852         dmar_map_gfx = 0;
2853 #endif
2854
2855         if (!dmar_map_gfx)
2856                 iommu_identity_mapping |= IDENTMAP_GFX;
2857
2858         check_tylersburg_isoch();
2859
2860         ret = si_domain_init(hw_pass_through);
2861         if (ret)
2862                 goto free_iommu;
2863
2864         /*
2865          * for each drhd
2866          *   enable fault log
2867          *   global invalidate context cache
2868          *   global invalidate iotlb
2869          *   enable translation
2870          */
2871         for_each_iommu(iommu, drhd) {
2872                 if (drhd->ignored) {
2873                         /*
2874                          * we always have to disable PMRs or DMA may fail on
2875                          * this device
2876                          */
2877                         if (force_on)
2878                                 iommu_disable_protect_mem_regions(iommu);
2879                         continue;
2880                 }
2881
2882                 iommu_flush_write_buffer(iommu);
2883
2884 #ifdef CONFIG_INTEL_IOMMU_SVM
2885                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2886                         /*
2887                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
2888                          * could cause a lock race condition.
2889                          */
2890                         up_write(&dmar_global_lock);
2891                         ret = intel_svm_enable_prq(iommu);
2892                         down_write(&dmar_global_lock);
2893                         if (ret)
2894                                 goto free_iommu;
2895                 }
2896 #endif
2897                 ret = dmar_set_interrupt(iommu);
2898                 if (ret)
2899                         goto free_iommu;
2900         }
2901
2902         return 0;
2903
2904 free_iommu:
2905         for_each_active_iommu(iommu, drhd) {
2906                 disable_dmar_iommu(iommu);
2907                 free_dmar_iommu(iommu);
2908         }
2909         if (si_domain) {
2910                 domain_exit(si_domain);
2911                 si_domain = NULL;
2912         }
2913
2914         return ret;
2915 }
2916
2917 static void __init init_no_remapping_devices(void)
2918 {
2919         struct dmar_drhd_unit *drhd;
2920         struct device *dev;
2921         int i;
2922
2923         for_each_drhd_unit(drhd) {
2924                 if (!drhd->include_all) {
2925                         for_each_active_dev_scope(drhd->devices,
2926                                                   drhd->devices_cnt, i, dev)
2927                                 break;
2928                         /* ignore DMAR unit if no devices exist */
2929                         if (i == drhd->devices_cnt)
2930                                 drhd->ignored = 1;
2931                 }
2932         }
2933
2934         for_each_active_drhd_unit(drhd) {
2935                 if (drhd->include_all)
2936                         continue;
2937
2938                 for_each_active_dev_scope(drhd->devices,
2939                                           drhd->devices_cnt, i, dev)
2940                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2941                                 break;
2942                 if (i < drhd->devices_cnt)
2943                         continue;
2944
                /*
                 * This IOMMU has *only* gfx devices. Either bypass it or
                 * set the gfx_mapped flag, as appropriate.
                 */
2947                 drhd->gfx_dedicated = 1;
2948                 if (!dmar_map_gfx)
2949                         drhd->ignored = 1;
2950         }
2951 }
2952
2953 #ifdef CONFIG_SUSPEND
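/*
 * Re-initialize the IOMMU hardware on resume: re-enable queued
 * invalidation where it was in use, reprogram the root entry and
 * re-enable translation on each unit that is not ignored, and disable
 * the protected memory regions again.
 */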
2954 static int init_iommu_hw(void)
2955 {
2956         struct dmar_drhd_unit *drhd;
2957         struct intel_iommu *iommu = NULL;
2958         int ret;
2959
2960         for_each_active_iommu(iommu, drhd) {
2961                 if (iommu->qi) {
2962                         ret = dmar_reenable_qi(iommu);
2963                         if (ret)
2964                                 return ret;
2965                 }
2966         }
2967
2968         for_each_iommu(iommu, drhd) {
2969                 if (drhd->ignored) {
2970                         /*
2971                          * we always have to disable PMRs or DMA may fail on
2972                          * this device
2973                          */
2974                         if (force_on)
2975                                 iommu_disable_protect_mem_regions(iommu);
2976                         continue;
2977                 }
2978
2979                 iommu_flush_write_buffer(iommu);
2980                 iommu_set_root_entry(iommu);
2981                 iommu_enable_translation(iommu);
2982                 iommu_disable_protect_mem_regions(iommu);
2983         }
2984
2985         return 0;
2986 }
2987
2988 static void iommu_flush_all(void)
2989 {
2990         struct dmar_drhd_unit *drhd;
2991         struct intel_iommu *iommu;
2992
2993         for_each_active_iommu(iommu, drhd) {
2994                 iommu->flush.flush_context(iommu, 0, 0, 0,
2995                                            DMA_CCMD_GLOBAL_INVL);
2996                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2997                                          DMA_TLB_GLOBAL_FLUSH);
2998         }
2999 }
3000
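/*
 * Quiesce the IOMMUs for system suspend: flush all context and IOTLB
 * caches, disable translation, and save the fault event registers so
 * iommu_resume() can restore them.
 */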
3001 static int iommu_suspend(void)
3002 {
3003         struct dmar_drhd_unit *drhd;
3004         struct intel_iommu *iommu = NULL;
3005         unsigned long flag;
3006
3007         iommu_flush_all();
3008
3009         for_each_active_iommu(iommu, drhd) {
3010                 iommu_disable_translation(iommu);
3011
3012                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3013
3014                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3015                         readl(iommu->reg + DMAR_FECTL_REG);
3016                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3017                         readl(iommu->reg + DMAR_FEDATA_REG);
3018                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3019                         readl(iommu->reg + DMAR_FEADDR_REG);
3020                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3021                         readl(iommu->reg + DMAR_FEUADDR_REG);
3022
3023                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3024         }
3025         return 0;
3026 }
3027
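/*
 * Undo iommu_suspend(): re-initialize the hardware via init_iommu_hw()
 * and restore the saved fault event registers on each active unit.
 */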
3028 static void iommu_resume(void)
3029 {
3030         struct dmar_drhd_unit *drhd;
3031         struct intel_iommu *iommu = NULL;
3032         unsigned long flag;
3033
3034         if (init_iommu_hw()) {
3035                 if (force_on)
3036                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3037                 else
3038                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3039                 return;
3040         }
3041
3042         for_each_active_iommu(iommu, drhd) {
3043
3044                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3045
3046                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3047                         iommu->reg + DMAR_FECTL_REG);
3048                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3049                         iommu->reg + DMAR_FEDATA_REG);
3050                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3051                         iommu->reg + DMAR_FEADDR_REG);
3052                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3053                         iommu->reg + DMAR_FEUADDR_REG);
3054
3055                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3056         }
3057 }
3058
3059 static struct syscore_ops iommu_syscore_ops = {
3060         .resume         = iommu_resume,
3061         .suspend        = iommu_suspend,
3062 };
3063
3064 static void __init init_iommu_pm_ops(void)
3065 {
3066         register_syscore_ops(&iommu_syscore_ops);
3067 }
3068
3069 #else
3070 static inline void init_iommu_pm_ops(void) {}
3071 #endif  /* CONFIG_SUSPEND */
3072
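/*
 * An RMRR reported by firmware must describe a non-empty, page-aligned
 * range; arch_rmrr_sanity_check() applies any architecture specific
 * restrictions on top of that.
 */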
3073 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3074 {
3075         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3076             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3077             rmrr->end_address <= rmrr->base_address ||
3078             arch_rmrr_sanity_check(rmrr))
3079                 return -EINVAL;
3080
3081         return 0;
3082 }
3083
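/*
 * Parse one ACPI RMRR structure: warn (and taint) if the firmware table
 * fails the sanity check, then record the reserved range and its device
 * scope on dmar_rmrr_units.
 */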
3084 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3085 {
3086         struct acpi_dmar_reserved_memory *rmrr;
3087         struct dmar_rmrr_unit *rmrru;
3088
3089         rmrr = (struct acpi_dmar_reserved_memory *)header;
3090         if (rmrr_sanity_check(rmrr)) {
3091                 pr_warn(FW_BUG
3092                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3093                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3094                            rmrr->base_address, rmrr->end_address,
3095                            dmi_get_system_info(DMI_BIOS_VENDOR),
3096                            dmi_get_system_info(DMI_BIOS_VERSION),
3097                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3098                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3099         }
3100
3101         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3102         if (!rmrru)
3103                 goto out;
3104
3105         rmrru->hdr = header;
3106
3107         rmrru->base_address = rmrr->base_address;
3108         rmrru->end_address = rmrr->end_address;
3109
3110         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3111                                 ((void *)rmrr) + rmrr->header.length,
3112                                 &rmrru->devices_cnt);
3113         if (rmrru->devices_cnt && rmrru->devices == NULL)
3114                 goto free_rmrru;
3115
3116         list_add(&rmrru->list, &dmar_rmrr_units);
3117
3118         return 0;
3119 free_rmrru:
3120         kfree(rmrru);
3121 out:
3122         return -ENOMEM;
3123 }
3124
3125 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3126 {
3127         struct dmar_atsr_unit *atsru;
3128         struct acpi_dmar_atsr *tmp;
3129
3130         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3131                                 dmar_rcu_check()) {
3132                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3133                 if (atsr->segment != tmp->segment)
3134                         continue;
3135                 if (atsr->header.length != tmp->header.length)
3136                         continue;
3137                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3138                         return atsru;
3139         }
3140
3141         return NULL;
3142 }
3143
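/*
 * Parse one ACPI ATSR structure. The header is copied because the ACPI
 * buffer may be freed after this call returns; duplicates are skipped,
 * and the device scope is only recorded when the entry does not apply
 * to all root ports (include_all).
 */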
3144 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3145 {
3146         struct acpi_dmar_atsr *atsr;
3147         struct dmar_atsr_unit *atsru;
3148
3149         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3150                 return 0;
3151
3152         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3153         atsru = dmar_find_atsr(atsr);
3154         if (atsru)
3155                 return 0;
3156
3157         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3158         if (!atsru)
3159                 return -ENOMEM;
3160
3161         /*
3162          * If memory is allocated from slab by ACPI _DSM method, we need to
3163          * copy the memory content because the memory buffer will be freed
3164          * on return.
3165          */
3166         atsru->hdr = (void *)(atsru + 1);
3167         memcpy(atsru->hdr, hdr, hdr->length);
3168         atsru->include_all = atsr->flags & 0x1;
3169         if (!atsru->include_all) {
3170                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3171                                 (void *)atsr + atsr->header.length,
3172                                 &atsru->devices_cnt);
3173                 if (atsru->devices_cnt && atsru->devices == NULL) {
3174                         kfree(atsru);
3175                         return -ENOMEM;
3176                 }
3177         }
3178
3179         list_add_rcu(&atsru->list, &dmar_atsr_units);
3180
3181         return 0;
3182 }
3183
3184 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3185 {
3186         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3187         kfree(atsru);
3188 }
3189
3190 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3191 {
3192         struct acpi_dmar_atsr *atsr;
3193         struct dmar_atsr_unit *atsru;
3194
3195         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3196         atsru = dmar_find_atsr(atsr);
3197         if (atsru) {
3198                 list_del_rcu(&atsru->list);
3199                 synchronize_rcu();
3200                 intel_iommu_free_atsr(atsru);
3201         }
3202
3203         return 0;
3204 }
3205
3206 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3207 {
3208         int i;
3209         struct device *dev;
3210         struct acpi_dmar_atsr *atsr;
3211         struct dmar_atsr_unit *atsru;
3212
3213         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3214         atsru = dmar_find_atsr(atsr);
3215         if (!atsru)
3216                 return 0;
3217
3218         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3219                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3220                                           i, dev)
3221                         return -EBUSY;
3222         }
3223
3224         return 0;
3225 }
3226
3227 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3228 {
3229         struct dmar_satc_unit *satcu;
3230         struct acpi_dmar_satc *tmp;
3231
3232         list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3233                                 dmar_rcu_check()) {
3234                 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3235                 if (satc->segment != tmp->segment)
3236                         continue;
3237                 if (satc->header.length != tmp->header.length)
3238                         continue;
3239                 if (memcmp(satc, tmp, satc->header.length) == 0)
3240                         return satcu;
3241         }
3242
3243         return NULL;
3244 }
3245
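/*
 * Parse one ACPI SATC structure. As with ATSR, the header is copied and
 * the device scope is recorded; satcu->atc_required notes whether ATC
 * is mandatory for the listed devices.
 */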
3246 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3247 {
3248         struct acpi_dmar_satc *satc;
3249         struct dmar_satc_unit *satcu;
3250
3251         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3252                 return 0;
3253
3254         satc = container_of(hdr, struct acpi_dmar_satc, header);
3255         satcu = dmar_find_satc(satc);
3256         if (satcu)
3257                 return 0;
3258
3259         satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3260         if (!satcu)
3261                 return -ENOMEM;
3262
3263         satcu->hdr = (void *)(satcu + 1);
3264         memcpy(satcu->hdr, hdr, hdr->length);
3265         satcu->atc_required = satc->flags & 0x1;
3266         satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3267                                               (void *)satc + satc->header.length,
3268                                               &satcu->devices_cnt);
3269         if (satcu->devices_cnt && !satcu->devices) {
3270                 kfree(satcu);
3271                 return -ENOMEM;
3272         }
3273         list_add_rcu(&satcu->list, &dmar_satc_units);
3274
3275         return 0;
3276 }
3277
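/*
 * Bring up a hot-added DMAR unit: audit its capabilities against the
 * running configuration (hardware pass-through, superpage support),
 * allocate domain IDs and a root entry, and, unless the unit is
 * ignored, enable queued invalidation, the page request queue where
 * supported, the fault interrupt and finally translation.
 */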
3278 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3279 {
3280         int sp, ret;
3281         struct intel_iommu *iommu = dmaru->iommu;
3282
3283         ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3284         if (ret)
3285                 goto out;
3286
3287         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3288                 pr_warn("%s: Doesn't support hardware pass through.\n",
3289                         iommu->name);
3290                 return -ENXIO;
3291         }
3292
3293         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3294         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3295                 pr_warn("%s: Doesn't support large page.\n",
3296                         iommu->name);
3297                 return -ENXIO;
3298         }
3299
3300         /*
3301          * Disable translation if already enabled prior to OS handover.
3302          */
3303         if (iommu->gcmd & DMA_GCMD_TE)
3304                 iommu_disable_translation(iommu);
3305
3306         ret = iommu_init_domains(iommu);
3307         if (ret == 0)
3308                 ret = iommu_alloc_root_entry(iommu);
3309         if (ret)
3310                 goto out;
3311
3312         intel_svm_check(iommu);
3313
3314         if (dmaru->ignored) {
3315                 /*
3316                  * we always have to disable PMRs or DMA may fail on this device
3317                  */
3318                 if (force_on)
3319                         iommu_disable_protect_mem_regions(iommu);
3320                 return 0;
3321         }
3322
3323         intel_iommu_init_qi(iommu);
3324         iommu_flush_write_buffer(iommu);
3325
3326 #ifdef CONFIG_INTEL_IOMMU_SVM
3327         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3328                 ret = intel_svm_enable_prq(iommu);
3329                 if (ret)
3330                         goto disable_iommu;
3331         }
3332 #endif
3333         ret = dmar_set_interrupt(iommu);
3334         if (ret)
3335                 goto disable_iommu;
3336
3337         iommu_set_root_entry(iommu);
3338         iommu_enable_translation(iommu);
3339
3340         iommu_disable_protect_mem_regions(iommu);
3341         return 0;
3342
3343 disable_iommu:
3344         disable_dmar_iommu(iommu);
3345 out:
3346         free_dmar_iommu(iommu);
3347         return ret;
3348 }
3349
3350 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3351 {
3352         int ret = 0;
3353         struct intel_iommu *iommu = dmaru->iommu;
3354
3355         if (!intel_iommu_enabled)
3356                 return 0;
3357         if (iommu == NULL)
3358                 return -EINVAL;
3359
3360         if (insert) {
3361                 ret = intel_iommu_add(dmaru);
3362         } else {
3363                 disable_dmar_iommu(iommu);
3364                 free_dmar_iommu(iommu);
3365         }
3366
3367         return ret;
3368 }
3369
3370 static void intel_iommu_free_dmars(void)
3371 {
3372         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3373         struct dmar_atsr_unit *atsru, *atsr_n;
3374         struct dmar_satc_unit *satcu, *satc_n;
3375
3376         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3377                 list_del(&rmrru->list);
3378                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3379                 kfree(rmrru);
3380         }
3381
3382         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3383                 list_del(&atsru->list);
3384                 intel_iommu_free_atsr(atsru);
3385         }
3386         list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3387                 list_del(&satcu->list);
3388                 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3389                 kfree(satcu);
3390         }
3391 }
3392
3393 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3394 {
3395         struct dmar_satc_unit *satcu;
3396         struct acpi_dmar_satc *satc;
3397         struct device *tmp;
3398         int i;
3399
3400         dev = pci_physfn(dev);
3401         rcu_read_lock();
3402
3403         list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3404                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3405                 if (satc->segment != pci_domain_nr(dev->bus))
3406                         continue;
3407                 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3408                         if (to_pci_dev(tmp) == dev)
3409                                 goto out;
3410         }
3411         satcu = NULL;
3412 out:
3413         rcu_read_unlock();
3414         return satcu;
3415 }
3416
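/*
 * Decide whether the OS may enable ATS for @dev on @iommu. A SATC match
 * takes precedence; otherwise walk up to the root port and check it
 * against the ATSR device scopes for the device's segment.
 */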
3417 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3418 {
3419         int i, ret = 1;
3420         struct pci_bus *bus;
3421         struct pci_dev *bridge = NULL;
3422         struct device *tmp;
3423         struct acpi_dmar_atsr *atsr;
3424         struct dmar_atsr_unit *atsru;
3425         struct dmar_satc_unit *satcu;
3426
3427         dev = pci_physfn(dev);
3428         satcu = dmar_find_matched_satc_unit(dev);
3429         if (satcu)
3430                 /*
3431                  * This device supports ATS because it is listed in the
3432                  * SATC table.  When the IOMMU is in legacy mode, the HW
3433                  * enables ATS automatically for devices that require it,
3434                  * so the OS should not enable ATS on such a device to
3435                  * avoid duplicated TLB invalidations.
3436                  */
3437                 return !(satcu->atc_required && !sm_supported(iommu));
3438
3439         for (bus = dev->bus; bus; bus = bus->parent) {
3440                 bridge = bus->self;
3441                 /* If it's an integrated device, allow ATS */
3442                 if (!bridge)
3443                         return 1;
3444                 /* Connected via non-PCIe: no ATS */
3445                 if (!pci_is_pcie(bridge) ||
3446                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3447                         return 0;
3448                 /* If we found the root port, look it up in the ATSR */
3449                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3450                         break;
3451         }
3452
3453         rcu_read_lock();
3454         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3455                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3456                 if (atsr->segment != pci_domain_nr(dev->bus))
3457                         continue;
3458
3459                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3460                         if (tmp == &bridge->dev)
3461                                 goto out;
3462
3463                 if (atsru->include_all)
3464                         goto out;
3465         }
3466         ret = 0;
3467 out:
3468         rcu_read_unlock();
3469
3470         return ret;
3471 }
3472
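/*
 * PCI bus notifier callback: keep the cached RMRR, ATSR and SATC device
 * scopes in sync when devices are added to or removed from the system.
 */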
3473 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3474 {
3475         int ret;
3476         struct dmar_rmrr_unit *rmrru;
3477         struct dmar_atsr_unit *atsru;
3478         struct dmar_satc_unit *satcu;
3479         struct acpi_dmar_atsr *atsr;
3480         struct acpi_dmar_reserved_memory *rmrr;
3481         struct acpi_dmar_satc *satc;
3482
3483         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3484                 return 0;
3485
3486         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3487                 rmrr = container_of(rmrru->hdr,
3488                                     struct acpi_dmar_reserved_memory, header);
3489                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3490                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3491                                 ((void *)rmrr) + rmrr->header.length,
3492                                 rmrr->segment, rmrru->devices,
3493                                 rmrru->devices_cnt);
3494                         if (ret < 0)
3495                                 return ret;
3496                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3497                         dmar_remove_dev_scope(info, rmrr->segment,
3498                                 rmrru->devices, rmrru->devices_cnt);
3499                 }
3500         }
3501
3502         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3503                 if (atsru->include_all)
3504                         continue;
3505
3506                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3507                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3508                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3509                                         (void *)atsr + atsr->header.length,
3510                                         atsr->segment, atsru->devices,
3511                                         atsru->devices_cnt);
3512                         if (ret > 0)
3513                                 break;
3514                         else if (ret < 0)
3515                                 return ret;
3516                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3517                         if (dmar_remove_dev_scope(info, atsr->segment,
3518                                         atsru->devices, atsru->devices_cnt))
3519                                 break;
3520                 }
3521         }
3522         list_for_each_entry(satcu, &dmar_satc_units, list) {
3523                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3524                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3525                         ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3526                                         (void *)satc + satc->header.length,
3527                                         satc->segment, satcu->devices,
3528                                         satcu->devices_cnt);
3529                         if (ret > 0)
3530                                 break;
3531                         else if (ret < 0)
3532                                 return ret;
3533                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3534                         if (dmar_remove_dev_scope(info, satc->segment,
3535                                         satcu->devices, satcu->devices_cnt))
3536                                 break;
3537                 }
3538         }
3539
3540         return 0;
3541 }
3542
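/*
 * Memory hotplug notifier: extend the identity (si) domain when a range
 * goes online, and unmap it again (flushing the IOTLB on every active
 * IOMMU) when the range goes offline or onlining is cancelled.
 */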
3543 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3544                                        unsigned long val, void *v)
3545 {
3546         struct memory_notify *mhp = v;
3547         unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3548         unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3549                         mhp->nr_pages - 1);
3550
3551         switch (val) {
3552         case MEM_GOING_ONLINE:
3553                 if (iommu_domain_identity_map(si_domain,
3554                                               start_vpfn, last_vpfn)) {
3555                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
3556                                 start_vpfn, last_vpfn);
3557                         return NOTIFY_BAD;
3558                 }
3559                 break;
3560
3561         case MEM_OFFLINE:
3562         case MEM_CANCEL_ONLINE:
3563                 {
3564                         struct dmar_drhd_unit *drhd;
3565                         struct intel_iommu *iommu;
3566                         LIST_HEAD(freelist);
3567
3568                         domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3569
3570                         rcu_read_lock();
3571                         for_each_active_iommu(iommu, drhd)
3572                                 iommu_flush_iotlb_psi(iommu, si_domain,
3573                                         start_vpfn, mhp->nr_pages,
3574                                         list_empty(&freelist), 0);
3575                         rcu_read_unlock();
3576                         put_pages_list(&freelist);
3577                 }
3578                 break;
3579         }
3580
3581         return NOTIFY_OK;
3582 }
3583
3584 static struct notifier_block intel_iommu_memory_nb = {
3585         .notifier_call = intel_iommu_memory_notifier,
3586         .priority = 0
3587 };
3588
3589 static void intel_disable_iommus(void)
3590 {
3591         struct intel_iommu *iommu = NULL;
3592         struct dmar_drhd_unit *drhd;
3593
3594         for_each_iommu(iommu, drhd)
3595                 iommu_disable_translation(iommu);
3596 }
3597
3598 void intel_iommu_shutdown(void)
3599 {
3600         struct dmar_drhd_unit *drhd;
3601         struct intel_iommu *iommu = NULL;
3602
3603         if (no_iommu || dmar_disabled)
3604                 return;
3605
3606         down_write(&dmar_global_lock);
3607
3608         /* Disable PMRs explicitly here. */
3609         for_each_iommu(iommu, drhd)
3610                 iommu_disable_protect_mem_regions(iommu);
3611
3612         /* Make sure the IOMMUs are switched off */
3613         intel_disable_iommus();
3614
3615         up_write(&dmar_global_lock);
3616 }
3617
3618 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3619 {
3620         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3621
3622         return container_of(iommu_dev, struct intel_iommu, iommu);
3623 }
3624
3625 static ssize_t version_show(struct device *dev,
3626                             struct device_attribute *attr, char *buf)
3627 {
3628         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3629         u32 ver = readl(iommu->reg + DMAR_VER_REG);
3630         return sysfs_emit(buf, "%d:%d\n",
3631                           DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3632 }
3633 static DEVICE_ATTR_RO(version);
3634
3635 static ssize_t address_show(struct device *dev,
3636                             struct device_attribute *attr, char *buf)
3637 {
3638         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3639         return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3640 }
3641 static DEVICE_ATTR_RO(address);
3642
3643 static ssize_t cap_show(struct device *dev,
3644                         struct device_attribute *attr, char *buf)
3645 {
3646         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3647         return sysfs_emit(buf, "%llx\n", iommu->cap);
3648 }
3649 static DEVICE_ATTR_RO(cap);
3650
3651 static ssize_t ecap_show(struct device *dev,
3652                          struct device_attribute *attr, char *buf)
3653 {
3654         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3655         return sysfs_emit(buf, "%llx\n", iommu->ecap);
3656 }
3657 static DEVICE_ATTR_RO(ecap);
3658
3659 static ssize_t domains_supported_show(struct device *dev,
3660                                       struct device_attribute *attr, char *buf)
3661 {
3662         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3663         return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3664 }
3665 static DEVICE_ATTR_RO(domains_supported);
3666
3667 static ssize_t domains_used_show(struct device *dev,
3668                                  struct device_attribute *attr, char *buf)
3669 {
3670         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3671         return sysfs_emit(buf, "%d\n",
3672                           bitmap_weight(iommu->domain_ids,
3673                                         cap_ndoms(iommu->cap)));
3674 }
3675 static DEVICE_ATTR_RO(domains_used);
3676
3677 static struct attribute *intel_iommu_attrs[] = {
3678         &dev_attr_version.attr,
3679         &dev_attr_address.attr,
3680         &dev_attr_cap.attr,
3681         &dev_attr_ecap.attr,
3682         &dev_attr_domains_supported.attr,
3683         &dev_attr_domains_used.attr,
3684         NULL,
3685 };
3686
3687 static struct attribute_group intel_iommu_group = {
3688         .name = "intel-iommu",
3689         .attrs = intel_iommu_attrs,
3690 };
3691
3692 const struct attribute_group *intel_iommu_groups[] = {
3693         &intel_iommu_group,
3694         NULL,
3695 };
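/*
 * These attributes are registered via iommu_device_sysfs_add() and show
 * up under each unit's sysfs directory, e.g. (assuming the usual dmar%d
 * naming) /sys/class/iommu/dmar0/intel-iommu/version.
 */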
3696
3697 static inline bool has_external_pci(void)
3698 {
3699         struct pci_dev *pdev = NULL;
3700
3701         for_each_pci_dev(pdev)
3702                 if (pdev->external_facing) {
3703                         pci_dev_put(pdev);
3704                         return true;
3705                 }
3706
3707         return false;
3708 }
3709
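/*
 * If the platform opted in to DMA protection (DMAR DMA_CTRL_PLATFORM_OPT_IN)
 * and an external-facing PCI device is present, force the IOMMU on even if
 * it was disabled on the command line, falling back to the passthrough
 * default domain in that case.
 */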
3710 static int __init platform_optin_force_iommu(void)
3711 {
3712         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3713                 return 0;
3714
3715         if (no_iommu || dmar_disabled)
3716                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3717
3718         /*
3719          * If Intel-IOMMU is disabled by default, we will apply identity
3720          * map for all devices except those marked as being untrusted.
3721          */
3722         if (dmar_disabled)
3723                 iommu_set_default_passthrough(false);
3724
3725         dmar_disabled = 0;
3726         no_iommu = 0;
3727
3728         return 1;
3729 }
3730
3731 static int __init probe_acpi_namespace_devices(void)
3732 {
3733         struct dmar_drhd_unit *drhd;
3734         /* To avoid a -Wunused-but-set-variable warning. */
3735         struct intel_iommu *iommu __maybe_unused;
3736         struct device *dev;
3737         int i, ret = 0;
3738
3739         for_each_active_iommu(iommu, drhd) {
3740                 for_each_active_dev_scope(drhd->devices,
3741                                           drhd->devices_cnt, i, dev) {
3742                         struct acpi_device_physical_node *pn;
3743                         struct acpi_device *adev;
3744
3745                         if (dev->bus != &acpi_bus_type)
3746                                 continue;
3747
3748                         adev = to_acpi_device(dev);
3749                         mutex_lock(&adev->physical_node_lock);
3750                         list_for_each_entry(pn,
3751                                             &adev->physical_node_list, node) {
3752                                 ret = iommu_probe_device(pn->dev);
3753                                 if (ret)
3754                                         break;
3755                         }
3756                         mutex_unlock(&adev->physical_node_lock);
3757
3758                         if (ret)
3759                                 return ret;
3760                 }
3761         }
3762
3763         return 0;
3764 }
3765
3766 static __init int tboot_force_iommu(void)
3767 {
3768         if (!tboot_enabled())
3769                 return 0;
3770
3771         if (no_iommu || dmar_disabled)
3772                 pr_warn("Forcing Intel-IOMMU to enabled\n");
3773
3774         dmar_disabled = 0;
3775         no_iommu = 0;
3776
3777         return 1;
3778 }
3779
3780 int __init intel_iommu_init(void)
3781 {
3782         int ret = -ENODEV;
3783         struct dmar_drhd_unit *drhd;
3784         struct intel_iommu *iommu;
3785
3786         /*
3787          * Intel IOMMU is required for a TXT/tboot launch or platform
3788          * opt in, so enforce that.
3789          */
3790         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3791                     platform_optin_force_iommu();
3792
3793         down_write(&dmar_global_lock);
3794         if (dmar_table_init()) {
3795                 if (force_on)
3796                         panic("tboot: Failed to initialize DMAR table\n");
3797                 goto out_free_dmar;
3798         }
3799
3800         if (dmar_dev_scope_init() < 0) {
3801                 if (force_on)
3802                         panic("tboot: Failed to initialize DMAR device scope\n");
3803                 goto out_free_dmar;
3804         }
3805
3806         up_write(&dmar_global_lock);
3807
3808         /*
3809          * The bus notifier takes the dmar_global_lock, so lockdep will
3810          * complain later when we register it under the lock.
3811          */
3812         dmar_register_bus_notifier();
3813
3814         down_write(&dmar_global_lock);
3815
3816         if (!no_iommu)
3817                 intel_iommu_debugfs_init();
3818
3819         if (no_iommu || dmar_disabled) {
3820                 /*
3821                  * We exit the function here to ensure the IOMMU's remapping
3822                  * and mempool aren't set up, which means the IOMMU's PMRs
3823                  * won't be disabled via the call to init_dmars(). So disable
3824                  * them explicitly here. The PMRs were set up by tboot prior
3825                  * to calling SENTER, but the kernel is expected to reset/tear
3826                  * down the PMRs.
3827                  */
3828                 if (intel_iommu_tboot_noforce) {
3829                         for_each_iommu(iommu, drhd)
3830                                 iommu_disable_protect_mem_regions(iommu);
3831                 }
3832
3833                 /*
3834                  * Make sure the IOMMUs are switched off, even when we
3835                  * boot into a kexec kernel and the previous kernel left
3836                  * them enabled
3837                  */
3838                 intel_disable_iommus();
3839                 goto out_free_dmar;
3840         }
3841
3842         if (list_empty(&dmar_rmrr_units))
3843                 pr_info("No RMRR found\n");
3844
3845         if (list_empty(&dmar_atsr_units))
3846                 pr_info("No ATSR found\n");
3847
3848         if (list_empty(&dmar_satc_units))
3849                 pr_info("No SATC found\n");
3850
3851         init_no_remapping_devices();
3852
3853         ret = init_dmars();
3854         if (ret) {
3855                 if (force_on)
3856                         panic("tboot: Failed to initialize DMARs\n");
3857                 pr_err("Initialization failed\n");
3858                 goto out_free_dmar;
3859         }
3860         up_write(&dmar_global_lock);
3861
3862         init_iommu_pm_ops();
3863
3864         down_read(&dmar_global_lock);
3865         for_each_active_iommu(iommu, drhd) {
3866                 /*
3867                  * The flush queue implementation does not perform
3868                  * page-selective invalidations that are required for efficient
3869                  * TLB flushes in virtual environments.  The benefit of batching
3870                  * is likely to be much lower than the overhead of synchronizing
3871                  * the virtual and physical IOMMU page-tables.
3872                  */
3873                 if (cap_caching_mode(iommu->cap) &&
3874                     !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3875                         pr_info_once("IOMMU batching disallowed due to virtualization\n");
3876                         iommu_set_dma_strict();
3877                 }
3878                 iommu_device_sysfs_add(&iommu->iommu, NULL,
3879                                        intel_iommu_groups,
3880                                        "%s", iommu->name);
3881                 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3882
3883                 iommu_pmu_register(iommu);
3884         }
3885         up_read(&dmar_global_lock);
3886
3887         if (si_domain && !hw_pass_through)
3888                 register_memory_notifier(&intel_iommu_memory_nb);
3889
3890         down_read(&dmar_global_lock);
3891         if (probe_acpi_namespace_devices())
3892                 pr_warn("ACPI name space devices didn't probe correctly\n");
3893
3894         /* Finally, we enable the DMA remapping hardware. */
3895         for_each_iommu(iommu, drhd) {
3896                 if (!drhd->ignored && !translation_pre_enabled(iommu))
3897                         iommu_enable_translation(iommu);
3898
3899                 iommu_disable_protect_mem_regions(iommu);
3900         }
3901         up_read(&dmar_global_lock);
3902
3903         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3904
3905         intel_iommu_enabled = 1;
3906
3907         return 0;
3908
3909 out_free_dmar:
3910         intel_iommu_free_dmars();
3911         up_write(&dmar_global_lock);
3912         return ret;
3913 }
3914
3915 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3916 {
3917         struct device_domain_info *info = opaque;
3918
3919         domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3920         return 0;
3921 }
3922
3923 /*
3924  * NB - intel-iommu lacks any sort of reference counting for the users of
3925  * dependent devices.  If multiple endpoints have intersecting dependent
3926  * devices, unbinding the driver from any one of them will possibly leave
3927  * the others unable to operate.
3928  */
3929 static void domain_context_clear(struct device_domain_info *info)
3930 {
        if (!dev_is_pci(info->dev)) {
                domain_context_clear_one(info, info->bus, info->devfn);
                return;
        }

3934         pci_for_each_dma_alias(to_pci_dev(info->dev),
3935                                &domain_context_clear_one_cb, info);
3936 }
3937
3938 static void dmar_remove_one_dev_info(struct device *dev)
3939 {
3940         struct device_domain_info *info = dev_iommu_priv_get(dev);
3941         struct dmar_domain *domain = info->domain;
3942         struct intel_iommu *iommu = info->iommu;
3943         unsigned long flags;
3944
3945         if (!dev_is_real_dma_subdevice(info->dev)) {
3946                 if (dev_is_pci(info->dev) && sm_supported(iommu))
3947                         intel_pasid_tear_down_entry(iommu, info->dev,
3948                                         IOMMU_NO_PASID, false);
3949
3950                 iommu_disable_pci_caps(info);
3951                 domain_context_clear(info);
3952         }
3953
3954         spin_lock_irqsave(&domain->lock, flags);
3955         list_del(&info->link);
3956         spin_unlock_irqrestore(&domain->lock, flags);
3957
3958         domain_detach_iommu(domain, iommu);
3959         info->domain = NULL;
3960 }
3961
3962 /*
3963  * Clear the page table pointer in context or pasid table entries so that
3964  * all DMA requests without PASID from the device are blocked. If the page
3965  * table has been set, clean up the data structures.
3966  */
3967 void device_block_translation(struct device *dev)
3968 {
3969         struct device_domain_info *info = dev_iommu_priv_get(dev);
3970         struct intel_iommu *iommu = info->iommu;
3971         unsigned long flags;
3972
3973         iommu_disable_pci_caps(info);
3974         if (!dev_is_real_dma_subdevice(dev)) {
3975                 if (sm_supported(iommu))
3976                         intel_pasid_tear_down_entry(iommu, dev,
3977                                                     IOMMU_NO_PASID, false);
3978                 else
3979                         domain_context_clear(info);
3980         }
3981
3982         if (!info->domain)
3983                 return;
3984
3985         spin_lock_irqsave(&info->domain->lock, flags);
3986         list_del(&info->link);
3987         spin_unlock_irqrestore(&info->domain->lock, flags);
3988
3989         domain_detach_iommu(info->domain, iommu);
3990         info->domain = NULL;
3991 }
3992
3993 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3994 {
3995         int adjust_width;
3996
3997         /* calculate AGAW */
3998         domain->gaw = guest_width;
3999         adjust_width = guestwidth_to_adjustwidth(guest_width);
4000         domain->agaw = width_to_agaw(adjust_width);
4001
4002         domain->iommu_coherency = false;
4003         domain->iommu_superpage = 0;
4004         domain->max_addr = 0;
4005
4006         /* always allocate the top pgd */
4007         domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4008         if (!domain->pgd)
4009                 return -ENOMEM;
4010         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4011         return 0;
4012 }
4013
4014 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4015                                       struct device *dev)
4016 {
4017         device_block_translation(dev);
4018         return 0;
4019 }
4020
4021 static struct iommu_domain blocking_domain = {
4022         .type = IOMMU_DOMAIN_BLOCKED,
4023         .ops = &(const struct iommu_domain_ops) {
4024                 .attach_dev     = blocking_domain_attach_dev,
4025         }
4026 };
4027
4028 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4029 {
4030         struct dmar_domain *dmar_domain;
4031         struct iommu_domain *domain;
4032
4033         switch (type) {
4034         case IOMMU_DOMAIN_DMA:
4035         case IOMMU_DOMAIN_UNMANAGED:
4036                 dmar_domain = alloc_domain(type);
4037                 if (!dmar_domain) {
4038                         pr_err("Can't allocate dmar_domain\n");
4039                         return NULL;
4040                 }
4041                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4042                         pr_err("Domain initialization failed\n");
4043                         domain_exit(dmar_domain);
4044                         return NULL;
4045                 }
4046
4047                 domain = &dmar_domain->domain;
4048                 domain->geometry.aperture_start = 0;
4049                 domain->geometry.aperture_end   =
4050                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4051                 domain->geometry.force_aperture = true;
4052
4053                 return domain;
4054         case IOMMU_DOMAIN_IDENTITY:
4055                 return &si_domain->domain;
4056         case IOMMU_DOMAIN_SVA:
4057                 return intel_svm_domain_alloc();
4058         default:
4059                 return NULL;
4060         }
4061
4062         return NULL;
4063 }
4064
4065 static struct iommu_domain *
4066 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
4067                               struct iommu_domain *parent,
4068                               const struct iommu_user_data *user_data)
4069 {
4070         struct device_domain_info *info = dev_iommu_priv_get(dev);
4071         bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
4072         bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
4073         struct intel_iommu *iommu = info->iommu;
4074         struct iommu_domain *domain;
4075
4076         /* Must be NESTING domain */
4077         if (parent) {
4078                 if (!nested_supported(iommu) || flags)
4079                         return ERR_PTR(-EOPNOTSUPP);
4080                 return intel_nested_domain_alloc(parent, user_data);
4081         }
4082
4083         if (flags &
4084             (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
4085                 return ERR_PTR(-EOPNOTSUPP);
4086         if (nested_parent && !nested_supported(iommu))
4087                 return ERR_PTR(-EOPNOTSUPP);
4088         if (user_data || (dirty_tracking && !ssads_supported(iommu)))
4089                 return ERR_PTR(-EOPNOTSUPP);
4090
4091         /*
4092          * The domain_alloc_user op must fully initialize a domain before
4093          * returning, so use iommu_domain_alloc() here for simplicity.
4094          */
4095         domain = iommu_domain_alloc(dev->bus);
4096         if (!domain)
4097                 return ERR_PTR(-ENOMEM);
4098
4099         if (nested_parent)
4100                 to_dmar_domain(domain)->nested_parent = true;
4101
4102         if (dirty_tracking) {
4103                 if (to_dmar_domain(domain)->use_first_level) {
4104                         iommu_domain_free(domain);
4105                         return ERR_PTR(-EOPNOTSUPP);
4106                 }
4107                 domain->dirty_ops = &intel_dirty_ops;
4108         }
4109
4110         return domain;
4111 }
4112
4113 static void intel_iommu_domain_free(struct iommu_domain *domain)
4114 {
4115         if (domain != &si_domain->domain)
4116                 domain_exit(to_dmar_domain(domain));
4117 }
4118
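/*
 * Check that @dev's IOMMU can host @domain (snooping enforcement, dirty
 * tracking, address width) and trim the domain's page-table depth if
 * the IOMMU supports fewer levels than the domain was built with.
 */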
4119 int prepare_domain_attach_device(struct iommu_domain *domain,
4120                                  struct device *dev)
4121 {
4122         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4123         struct intel_iommu *iommu;
4124         int addr_width;
4125
4126         iommu = device_to_iommu(dev, NULL, NULL);
4127         if (!iommu)
4128                 return -ENODEV;
4129
4130         if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4131                 return -EINVAL;
4132
4133         if (domain->dirty_ops && !ssads_supported(iommu))
4134                 return -EINVAL;
4135
4136         /* check if this iommu agaw is sufficient for max mapped address */
4137         addr_width = agaw_to_width(iommu->agaw);
4138         if (addr_width > cap_mgaw(iommu->cap))
4139                 addr_width = cap_mgaw(iommu->cap);
4140
4141         if (dmar_domain->max_addr > (1LL << addr_width))
4142                 return -EINVAL;
4143         dmar_domain->gaw = addr_width;
4144
4145         /*
4146          * Knock out extra levels of page tables if necessary
4147          */
4148         while (iommu->agaw < dmar_domain->agaw) {
4149                 struct dma_pte *pte;
4150
4151                 pte = dmar_domain->pgd;
4152                 if (dma_pte_present(pte)) {
4153                         dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4154                         free_pgtable_page(pte);
4155                 }
4156                 dmar_domain->agaw--;
4157         }
4158
4159         return 0;
4160 }
4161
4162 static int intel_iommu_attach_device(struct iommu_domain *domain,
4163                                      struct device *dev)
4164 {
4165         struct device_domain_info *info = dev_iommu_priv_get(dev);
4166         int ret;
4167
4168         if (info->domain)
4169                 device_block_translation(dev);
4170
4171         ret = prepare_domain_attach_device(domain, dev);
4172         if (ret)
4173                 return ret;
4174
4175         return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4176 }
4177
4178 static int intel_iommu_map(struct iommu_domain *domain,
4179                            unsigned long iova, phys_addr_t hpa,
4180                            size_t size, int iommu_prot, gfp_t gfp)
4181 {
4182         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4183         u64 max_addr;
4184         int prot = 0;
4185
4186         if (iommu_prot & IOMMU_READ)
4187                 prot |= DMA_PTE_READ;
4188         if (iommu_prot & IOMMU_WRITE)
4189                 prot |= DMA_PTE_WRITE;
4190         if (dmar_domain->set_pte_snp)
4191                 prot |= DMA_PTE_SNP;
4192
4193         max_addr = iova + size;
4194         if (dmar_domain->max_addr < max_addr) {
4195                 u64 end;
4196
4197                 /* check if minimum agaw is sufficient for mapped address */
4198                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4199                 if (end < max_addr) {
                        pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
                               __func__, dmar_domain->gaw, max_addr);
4203                         return -EFAULT;
4204                 }
4205                 dmar_domain->max_addr = max_addr;
4206         }
        /*
         * Round up size to the next multiple of PAGE_SIZE if it and the
         * low bits of hpa would take us onto the next page.
         */
4209         size = aligned_nrpages(hpa, size);
4210         return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4211                                 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4212 }
4213
4214 static int intel_iommu_map_pages(struct iommu_domain *domain,
4215                                  unsigned long iova, phys_addr_t paddr,
4216                                  size_t pgsize, size_t pgcount,
4217                                  int prot, gfp_t gfp, size_t *mapped)
4218 {
4219         unsigned long pgshift = __ffs(pgsize);
4220         size_t size = pgcount << pgshift;
4221         int ret;
4222
4223         if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4224                 return -EINVAL;
4225
4226         if (!IS_ALIGNED(iova | paddr, pgsize))
4227                 return -EINVAL;
4228
4229         ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4230         if (!ret && mapped)
4231                 *mapped = size;
4232
4233         return ret;
4234 }
4235
4236 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4237                                 unsigned long iova, size_t size,
4238                                 struct iommu_iotlb_gather *gather)
4239 {
4240         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4241         unsigned long start_pfn, last_pfn;
4242         int level = 0;
4243
        /*
         * Cope with the horrid API, which requires us to unmap more than
         * the size argument if it happens to be a large-page mapping.
         */
4246         if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4247                                      &level, GFP_ATOMIC)))
4248                 return 0;
4249
4250         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4251                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4252
4253         start_pfn = iova >> VTD_PAGE_SHIFT;
4254         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4255
4256         domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4257
4258         if (dmar_domain->max_addr == iova + size)
4259                 dmar_domain->max_addr = iova;
4260
4261         /*
4262          * We do not use page-selective IOTLB invalidation in the flush
4263          * queue, so there is no need to track pages and sync the IOTLB.
4264          */
4265         if (!iommu_iotlb_gather_queued(gather))
4266                 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4267
4268         return size;
4269 }
4270
4271 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4272                                       unsigned long iova,
4273                                       size_t pgsize, size_t pgcount,
4274                                       struct iommu_iotlb_gather *gather)
4275 {
4276         unsigned long pgshift = __ffs(pgsize);
4277         size_t size = pgcount << pgshift;
4278
4279         return intel_iommu_unmap(domain, iova, size, gather);
4280 }
4281
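/*
 * Flush the IOTLB for the range collected in @gather on every IOMMU the
 * domain is attached to, then release the page-table pages queued on
 * the gather freelist.
 */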
4282 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4283                                  struct iommu_iotlb_gather *gather)
4284 {
4285         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4286         unsigned long iova_pfn = IOVA_PFN(gather->start);
4287         size_t size = gather->end - gather->start;
4288         struct iommu_domain_info *info;
4289         unsigned long start_pfn;
4290         unsigned long nrpages;
4291         unsigned long i;
4292
4293         nrpages = aligned_nrpages(gather->start, size);
4294         start_pfn = mm_to_dma_pfn_start(iova_pfn);
4295
4296         xa_for_each(&dmar_domain->iommu_array, i, info)
4297                 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4298                                       start_pfn, nrpages,
4299                                       list_empty(&gather->freelist), 0);
4300
4301         put_pages_list(&gather->freelist);
4302 }
4303
4304 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4305                                             dma_addr_t iova)
4306 {
4307         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4308         struct dma_pte *pte;
4309         int level = 0;
4310         u64 phys = 0;
4311
4312         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4313                              GFP_ATOMIC);
4314         if (pte && dma_pte_present(pte))
4315                 phys = dma_pte_addr(pte) +
4316                         (iova & (BIT_MASK(level_to_offset_bits(level) +
4317                                                 VTD_PAGE_SHIFT) - 1));
4318
4319         return phys;
4320 }
4321
4322 static bool domain_support_force_snooping(struct dmar_domain *domain)
4323 {
4324         struct device_domain_info *info;
4325         bool support = true;
4326
4327         assert_spin_locked(&domain->lock);
4328         list_for_each_entry(info, &domain->devices, link) {
4329                 if (!ecap_sc_support(info->iommu->ecap)) {
4330                         support = false;
4331                         break;
4332                 }
4333         }
4334
4335         return support;
4336 }
4337
4338 static void domain_set_force_snooping(struct dmar_domain *domain)
4339 {
4340         struct device_domain_info *info;
4341
4342         assert_spin_locked(&domain->lock);
4343         /*
4344          * Second-level page tables support per-PTE snoop control. The
4345          * iommu_map() interface will handle this by setting the SNP bit.
4346          */
4347         if (!domain->use_first_level) {
4348                 domain->set_pte_snp = true;
4349                 return;
4350         }
4351
4352         list_for_each_entry(info, &domain->devices, link)
4353                 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4354                                                      IOMMU_NO_PASID);
4355 }
4356
4357 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4358 {
4359         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4360         unsigned long flags;
4361
4362         if (dmar_domain->force_snooping)
4363                 return true;
4364
4365         spin_lock_irqsave(&dmar_domain->lock, flags);
4366         if (!domain_support_force_snooping(dmar_domain) ||
4367             (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4368                 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4369                 return false;
4370         }
4371
4372         domain_set_force_snooping(dmar_domain);
4373         dmar_domain->force_snooping = true;
4374         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4375
4376         return true;
4377 }
4378
4379 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4380 {
4381         struct device_domain_info *info = dev_iommu_priv_get(dev);
4382
4383         switch (cap) {
4384         case IOMMU_CAP_CACHE_COHERENCY:
4385         case IOMMU_CAP_DEFERRED_FLUSH:
4386                 return true;
4387         case IOMMU_CAP_PRE_BOOT_PROTECTION:
4388                 return dmar_platform_optin();
4389         case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4390                 return ecap_sc_support(info->iommu->ecap);
4391         case IOMMU_CAP_DIRTY_TRACKING:
4392                 return ssads_supported(info->iommu);
4393         default:
4394                 return false;
4395         }
4396 }
4397
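/*
 * Per-device setup at probe time: allocate the device_domain_info,
 * probe ATS/PASID/PRI capabilities from the PCI device and the IOMMU
 * ecap bits, and allocate a PASID table when scalable mode is enabled.
 */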
4398 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4399 {
4400         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4401         struct device_domain_info *info;
4402         struct intel_iommu *iommu;
4403         u8 bus, devfn;
4404         int ret;
4405
4406         iommu = device_to_iommu(dev, &bus, &devfn);
4407         if (!iommu || !iommu->iommu.ops)
4408                 return ERR_PTR(-ENODEV);
4409
4410         info = kzalloc(sizeof(*info), GFP_KERNEL);
4411         if (!info)
4412                 return ERR_PTR(-ENOMEM);
4413
4414         if (dev_is_real_dma_subdevice(dev)) {
4415                 info->bus = pdev->bus->number;
4416                 info->devfn = pdev->devfn;
4417                 info->segment = pci_domain_nr(pdev->bus);
4418         } else {
4419                 info->bus = bus;
4420                 info->devfn = devfn;
4421                 info->segment = iommu->segment;
4422         }
4423
4424         info->dev = dev;
4425         info->iommu = iommu;
4426         if (dev_is_pci(dev)) {
4427                 if (ecap_dev_iotlb_support(iommu->ecap) &&
4428                     pci_ats_supported(pdev) &&
4429                     dmar_ats_supported(pdev, iommu)) {
4430                         info->ats_supported = 1;
4431                         info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4432
4433                         /*
4434                          * For an IOMMU that supports device IOTLB throttling
4435                          * (DIT), we assign the PFSID to the invalidation desc
4436                          * of a VF so that IOMMU HW can gauge the queue depth
4437                          * at the PF level. If DIT is not supported, the PFSID
4438                          * field is treated as reserved and must be set to 0.
4439                          */
4440                         if (ecap_dit(iommu->ecap))
4441                                 info->pfsid = pci_dev_id(pci_physfn(pdev));
4442                         info->ats_qdep = pci_ats_queue_depth(pdev);
4443                 }
4444                 if (sm_supported(iommu)) {
4445                         if (pasid_supported(iommu)) {
4446                                 int features = pci_pasid_features(pdev);
4447
4448                                 if (features >= 0)
4449                                         info->pasid_supported = features | 1;
4450                         }
4451
4452                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
4453                             pci_pri_supported(pdev))
4454                                 info->pri_supported = 1;
4455                 }
4456         }
4457
4458         dev_iommu_priv_set(dev, info);
4459
4460         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4461                 ret = intel_pasid_alloc_table(dev);
4462                 if (ret) {
4463                         dev_err(dev, "PASID table allocation failed\n");
4464                         dev_iommu_priv_set(dev, NULL);
4465                         kfree(info);
4466                         return ERR_PTR(ret);
4467                 }
4468         }
4469
4470         intel_iommu_debugfs_create_dev(info);
4471
4472         return &iommu->iommu;
4473 }
4474
4475 static void intel_iommu_release_device(struct device *dev)
4476 {
4477         struct device_domain_info *info = dev_iommu_priv_get(dev);
4478
4479         dmar_remove_one_dev_info(dev);
4480         intel_pasid_free_table(dev);
4481         intel_iommu_debugfs_remove_dev(info);
4482         dev_iommu_priv_set(dev, NULL);
4483         kfree(info);
4484         set_dma_ops(dev, NULL);
4485 }
4486
4487 static void intel_iommu_probe_finalize(struct device *dev)
4488 {
4489         set_dma_ops(dev, NULL);
4490         iommu_setup_dma_ops(dev, 0, U64_MAX);
4491 }
4492
4493 static void intel_iommu_get_resv_regions(struct device *device,
4494                                          struct list_head *head)
4495 {
4496         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4497         struct iommu_resv_region *reg;
4498         struct dmar_rmrr_unit *rmrr;
4499         struct device *i_dev;
4500         int i;
4501
4502         rcu_read_lock();
4503         for_each_rmrr_units(rmrr) {
4504                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4505                                           i, i_dev) {
4506                         struct iommu_resv_region *resv;
4507                         enum iommu_resv_type type;
4508                         size_t length;
4509
4510                         if (i_dev != device &&
4511                             !is_downstream_to_pci_bridge(device, i_dev))
4512                                 continue;
4513
4514                         length = rmrr->end_address - rmrr->base_address + 1;
4515
4516                         type = device_rmrr_is_relaxable(device) ?
4517                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4518
4519                         resv = iommu_alloc_resv_region(rmrr->base_address,
4520                                                        length, prot, type,
4521                                                        GFP_ATOMIC);
4522                         if (!resv)
4523                                 break;
4524
4525                         list_add_tail(&resv->list, head);
4526                 }
4527         }
4528         rcu_read_unlock();
4529
4530 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4531         if (dev_is_pci(device)) {
4532                 struct pci_dev *pdev = to_pci_dev(device);
4533
4534                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4535                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4536                                         IOMMU_RESV_DIRECT_RELAXABLE,
4537                                         GFP_KERNEL);
4538                         if (reg)
4539                                 list_add_tail(&reg->list, head);
4540                 }
4541         }
4542 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4543
4544         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4545                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4546                                       0, IOMMU_RESV_MSI, GFP_KERNEL);
4547         if (!reg)
4548                 return;
4549         list_add_tail(&reg->list, head);
4550 }
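
/*
 * Editor's illustrative sketch (not part of the upstream driver): callers
 * retrieve the regions built above through the generic
 * iommu_get_resv_regions() interface and release them with
 * iommu_put_resv_regions(). The helper name example_dump_resv_regions() is
 * hypothetical.
 */
static void __maybe_unused example_dump_resv_regions(struct device *dev)
{
        struct iommu_resv_region *region;
        LIST_HEAD(resv_regions);

        iommu_get_resv_regions(dev, &resv_regions);
        list_for_each_entry(region, &resv_regions, list)
                dev_info(dev, "reserved region %pa + %zu, type %d\n",
                         &region->start, region->length, region->type);
        iommu_put_resv_regions(dev, &resv_regions);
}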
4551
4552 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4553 {
4554         if (dev_is_pci(dev))
4555                 return pci_device_group(dev);
4556         return generic_device_group(dev);
4557 }
4558
4559 static int intel_iommu_enable_sva(struct device *dev)
4560 {
4561         struct device_domain_info *info = dev_iommu_priv_get(dev);
4562         struct intel_iommu *iommu;
4563
4564         if (!info || dmar_disabled)
4565                 return -EINVAL;
4566
4567         iommu = info->iommu;
4568         if (!iommu)
4569                 return -EINVAL;
4570
4571         if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4572                 return -ENODEV;
4573
4574         if (!info->pasid_enabled || !info->ats_enabled)
4575                 return -EINVAL;
4576
4577         /*
4578          * Devices with device-specific I/O fault handling should not
4579          * support PCI/PRI. The IOMMU side has no means to check the
4580          * capability of device-specific IOPF. Therefore, the IOMMU can
4581          * only assume that if a device driver enables SVA on a non-PRI
4582          * device, it will handle IOPF in its own way.
4583          */
4584         if (!info->pri_supported)
4585                 return 0;
4586
4587         /* Devices supporting PRI should have it enabled. */
4588         if (!info->pri_enabled)
4589                 return -EINVAL;
4590
4591         return 0;
4592 }
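
/*
 * Editor's illustrative sketch (not part of the upstream driver): a device
 * driver that wants SVA enables the feature through the generic API and then
 * binds an address space, obtaining a PASID it can program into the device.
 * The helper name example_bind_mm() is hypothetical.
 */
static __maybe_unused struct iommu_sva *
example_bind_mm(struct device *dev, struct mm_struct *mm, u32 *pasid)
{
        struct iommu_sva *handle;
        int ret;

        ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA);
        if (ret)
                return ERR_PTR(ret);

        handle = iommu_sva_bind_device(dev, mm);
        if (IS_ERR(handle)) {
                iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA);
                return handle;
        }

        *pasid = iommu_sva_get_pasid(handle);
        return handle;
}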
4593
4594 static int intel_iommu_enable_iopf(struct device *dev)
4595 {
4596         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4597         struct device_domain_info *info = dev_iommu_priv_get(dev);
4598         struct intel_iommu *iommu;
4599         int ret;
4600
4601         if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4602                 return -ENODEV;
4603
4604         if (info->pri_enabled)
4605                 return -EBUSY;
4606
4607         iommu = info->iommu;
4608         if (!iommu)
4609                 return -EINVAL;
4610
4611         /* PASID is required in PRG Response Message. */
4612         if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4613                 return -EINVAL;
4614
4615         ret = pci_reset_pri(pdev);
4616         if (ret)
4617                 return ret;
4618
4619         ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4620         if (ret)
4621                 return ret;
4622
4623         ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4624         if (ret)
4625                 goto iopf_remove_device;
4626
4627         ret = pci_enable_pri(pdev, PRQ_DEPTH);
4628         if (ret)
4629                 goto iopf_unregister_handler;
4630         info->pri_enabled = 1;
4631
4632         return 0;
4633
4634 iopf_unregister_handler:
4635         iommu_unregister_device_fault_handler(dev);
4636 iopf_remove_device:
4637         iopf_queue_remove_device(iommu->iopf_queue, dev);
4638
4639         return ret;
4640 }
4641
4642 static int intel_iommu_disable_iopf(struct device *dev)
4643 {
4644         struct device_domain_info *info = dev_iommu_priv_get(dev);
4645         struct intel_iommu *iommu = info->iommu;
4646
4647         if (!info->pri_enabled)
4648                 return -EINVAL;
4649
4650         /*
4651          * The PCIe spec states that clearing the PRI enable bit stops the
4652          * Page Request Interface from issuing new page requests, but does
4653          * not affect requests that have already been transmitted or are
4654          * queued for transmission. This is supposed to be called after
4655          * the device driver has stopped DMA, all PASIDs have been
4656          * unbound and the outstanding PRQs have been drained.
4657          */
4658         pci_disable_pri(to_pci_dev(dev));
4659         info->pri_enabled = 0;
4660
4661         /*
4662          * With PRI disabled and outstanding PRQs drained, unregistering
4663          * the fault handler and removing the device from the iopf queue
4664          * should never fail.
4665          */
4666         WARN_ON(iommu_unregister_device_fault_handler(dev));
4667         WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4668
4669         return 0;
4670 }
4671
4672 static int
4673 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4674 {
4675         switch (feat) {
4676         case IOMMU_DEV_FEAT_IOPF:
4677                 return intel_iommu_enable_iopf(dev);
4678
4679         case IOMMU_DEV_FEAT_SVA:
4680                 return intel_iommu_enable_sva(dev);
4681
4682         default:
4683                 return -ENODEV;
4684         }
4685 }
4686
4687 static int
4688 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4689 {
4690         switch (feat) {
4691         case IOMMU_DEV_FEAT_IOPF:
4692                 return intel_iommu_disable_iopf(dev);
4693
4694         case IOMMU_DEV_FEAT_SVA:
4695                 return 0;
4696
4697         default:
4698                 return -ENODEV;
4699         }
4700 }
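
/*
 * Editor's illustrative sketch (not part of the upstream driver): a driver
 * that relies on PRI-based I/O page faults (as intel_iommu_enable_sva()
 * above requires for PRI-capable devices) enables IOMMU_DEV_FEAT_IOPF before
 * IOMMU_DEV_FEAT_SVA and tears the features down in reverse order. Both
 * helper names are hypothetical.
 */
static int __maybe_unused example_enable_sva_with_iopf(struct device *dev)
{
        int ret;

        ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF);
        if (ret)
                return ret;

        ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA);
        if (ret)
                iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);

        return ret;
}

static void __maybe_unused example_disable_sva_with_iopf(struct device *dev)
{
        iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA);
        iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
}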
4701
4702 static bool intel_iommu_is_attach_deferred(struct device *dev)
4703 {
4704         struct device_domain_info *info = dev_iommu_priv_get(dev);
4705
4706         return translation_pre_enabled(info->iommu) && !info->domain;
4707 }
4708
4709 /*
4710  * Check that the device does not live on an external facing PCI port that is
4711  * marked as untrusted. Such devices should not be able to apply quirks and
4712  * thus not be able to bypass the IOMMU restrictions.
4713  */
4714 static bool risky_device(struct pci_dev *pdev)
4715 {
4716         if (pdev->untrusted) {
4717                 pci_info(pdev,
4718                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4719                          pdev->vendor, pdev->device);
4720                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4721                 return true;
4722         }
4723         return false;
4724 }
4725
4726 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4727                                       unsigned long iova, size_t size)
4728 {
4729         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4730         unsigned long pages = aligned_nrpages(iova, size);
4731         unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4732         struct iommu_domain_info *info;
4733         unsigned long i;
4734
4735         xa_for_each(&dmar_domain->iommu_array, i, info)
4736                 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4737         return 0;
4738 }
4739
4740 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4741 {
4742         struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4743         struct dev_pasid_info *curr, *dev_pasid = NULL;
4744         struct dmar_domain *dmar_domain;
4745         struct iommu_domain *domain;
4746         unsigned long flags;
4747
4748         domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4749         if (WARN_ON_ONCE(!domain))
4750                 goto out_tear_down;
4751
4752         /*
4753          * The SVA implementation needs to handle its own bookkeeping, such
4754          * as mm notifications. Until that code is consolidated into the
4755          * iommu core, let the Intel SVA code handle it.
4756          */
4757         if (domain->type == IOMMU_DOMAIN_SVA) {
4758                 intel_svm_remove_dev_pasid(dev, pasid);
4759                 goto out_tear_down;
4760         }
4761
4762         dmar_domain = to_dmar_domain(domain);
4763         spin_lock_irqsave(&dmar_domain->lock, flags);
4764         list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4765                 if (curr->dev == dev && curr->pasid == pasid) {
4766                         list_del(&curr->link_domain);
4767                         dev_pasid = curr;
4768                         break;
4769                 }
4770         }
4771         WARN_ON_ONCE(!dev_pasid);
4772         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4773
4774         domain_detach_iommu(dmar_domain, iommu);
4775         intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4776         kfree(dev_pasid);
4777 out_tear_down:
4778         intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4779         intel_drain_pasid_prq(dev, pasid);
4780 }
4781
4782 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4783                                      struct device *dev, ioasid_t pasid)
4784 {
4785         struct device_domain_info *info = dev_iommu_priv_get(dev);
4786         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4787         struct intel_iommu *iommu = info->iommu;
4788         struct dev_pasid_info *dev_pasid;
4789         unsigned long flags;
4790         int ret;
4791
4792         if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4793                 return -EOPNOTSUPP;
4794
4795         if (domain->dirty_ops)
4796                 return -EINVAL;
4797
4798         if (context_copied(iommu, info->bus, info->devfn))
4799                 return -EBUSY;
4800
4801         ret = prepare_domain_attach_device(domain, dev);
4802         if (ret)
4803                 return ret;
4804
4805         dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4806         if (!dev_pasid)
4807                 return -ENOMEM;
4808
4809         ret = domain_attach_iommu(dmar_domain, iommu);
4810         if (ret)
4811                 goto out_free;
4812
4813         if (domain_type_is_si(dmar_domain))
4814                 ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4815                                                      dev, pasid);
4816         else if (dmar_domain->use_first_level)
4817                 ret = domain_setup_first_level(iommu, dmar_domain,
4818                                                dev, pasid);
4819         else
4820                 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4821                                                      dev, pasid);
4822         if (ret)
4823                 goto out_detach_iommu;
4824
4825         dev_pasid->dev = dev;
4826         dev_pasid->pasid = pasid;
4827         spin_lock_irqsave(&dmar_domain->lock, flags);
4828         list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4829         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4830
4831         if (domain->type & __IOMMU_DOMAIN_PAGING)
4832                 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4833
4834         return 0;
4835 out_detach_iommu:
4836         domain_detach_iommu(dmar_domain, iommu);
4837 out_free:
4838         kfree(dev_pasid);
4839         return ret;
4840 }
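
/*
 * Editor's illustrative sketch (not part of the upstream driver): in-kernel
 * users do not call the op above directly; they attach a domain to a PASID
 * through the generic iommu_attach_device_pasid() interface and undo it with
 * iommu_detach_device_pasid(). The helper name example_attach_pasid() is
 * hypothetical.
 */
static int __maybe_unused example_attach_pasid(struct iommu_domain *domain,
                                               struct device *dev,
                                               ioasid_t pasid)
{
        int ret;

        /* Ends up in intel_iommu_set_dev_pasid() for devices behind VT-d. */
        ret = iommu_attach_device_pasid(domain, dev, pasid);
        if (ret)
                return ret;

        /* ... use the PASID-granular translation, then tear it down: */
        iommu_detach_device_pasid(domain, dev, pasid);
        return 0;
}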
4841
4842 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4843 {
4844         struct device_domain_info *info = dev_iommu_priv_get(dev);
4845         struct intel_iommu *iommu = info->iommu;
4846         struct iommu_hw_info_vtd *vtd;
4847
4848         vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4849         if (!vtd)
4850                 return ERR_PTR(-ENOMEM);
4851
4852         vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4853         vtd->cap_reg = iommu->cap;
4854         vtd->ecap_reg = iommu->ecap;
4855         *length = sizeof(*vtd);
4856         *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4857         return vtd;
4858 }
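
/*
 * Editor's illustrative sketch (not part of the upstream driver): the hw_info
 * op is consumed by iommufd, which copies the returned blob to userspace and
 * then frees it. The helper below is hypothetical and takes the iommu_ops
 * pointer explicitly to stay self-contained.
 */
static int __maybe_unused example_report_hw_info(const struct iommu_ops *ops,
                                                 struct device *dev)
{
        u32 length, type;
        void *data;

        if (!ops->hw_info)
                return -EOPNOTSUPP;

        data = ops->hw_info(dev, &length, &type);
        if (IS_ERR(data))
                return PTR_ERR(data);

        dev_info(dev, "hw_info type %u, %u bytes\n", type, length);
        kfree(data);    /* the blob was kzalloc()ed by the op */
        return 0;
}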
4859
4860 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4861                                           bool enable)
4862 {
4863         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4864         struct device_domain_info *info;
4865         int ret;
4866
4867         spin_lock(&dmar_domain->lock);
4868         if (dmar_domain->dirty_tracking == enable)
4869                 goto out_unlock;
4870
4871         list_for_each_entry(info, &dmar_domain->devices, link) {
4872                 ret = intel_pasid_setup_dirty_tracking(info->iommu,
4873                                                        info->domain, info->dev,
4874                                                        IOMMU_NO_PASID, enable);
4875                 if (ret)
4876                         goto err_unwind;
4877         }
4878
4879         dmar_domain->dirty_tracking = enable;
4880 out_unlock:
4881         spin_unlock(&dmar_domain->lock);
4882
4883         return 0;
4884
4885 err_unwind:
4886         list_for_each_entry(info, &dmar_domain->devices, link)
4887                 intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
4888                                                  info->dev, IOMMU_NO_PASID,
4889                                                  dmar_domain->dirty_tracking);
4890         spin_unlock(&dmar_domain->lock);
4891         return ret;
4892 }
4893
4894 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4895                                             unsigned long iova, size_t size,
4896                                             unsigned long flags,
4897                                             struct iommu_dirty_bitmap *dirty)
4898 {
4899         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4900         unsigned long end = iova + size - 1;
4901         unsigned long pgsize;
4902
4903         /*
4904          * The IOMMUFD core calls into a dirty-tracking-disabled domain
4905          * without an IOVA bitmap set in order to clear dirty bits that may
4906          * have been left in PTEs when dirty tracking was stopped. This
4907          * ensures that we never inherit dirtied bits from a previous cycle.
4908          */
4909         if (!dmar_domain->dirty_tracking && dirty->bitmap)
4910                 return -EINVAL;
4911
4912         do {
4913                 struct dma_pte *pte;
4914                 int lvl = 0;
4915
4916                 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4917                                      GFP_ATOMIC);
4918                 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4919                 if (!pte || !dma_pte_present(pte)) {
4920                         iova += pgsize;
4921                         continue;
4922                 }
4923
4924                 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4925                         iommu_dirty_bitmap_record(dirty, iova, pgsize);
4926                 iova += pgsize;
4927         } while (iova < end);
4928
4929         return 0;
4930 }
4931
4932 static const struct iommu_dirty_ops intel_dirty_ops = {
4933         .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4934         .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4935 };
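
/*
 * Editor's illustrative sketch (not part of the upstream driver): a dirty
 * tracking consumer such as iommufd drives these ops through the domain's
 * dirty_ops pointer. The helper name example_sync_dirty() is hypothetical and
 * assumes the caller has already prepared "dirty" (e.g. with
 * iommu_dirty_bitmap_init()).
 */
static int __maybe_unused example_sync_dirty(struct iommu_domain *domain,
                                             unsigned long iova, size_t size,
                                             struct iommu_dirty_bitmap *dirty)
{
        if (!domain->dirty_ops)
                return -EOPNOTSUPP;

        /* Record dirtied ranges and clear the PTE dirty bits as we go. */
        return domain->dirty_ops->read_and_clear_dirty(domain, iova, size,
                                                        0, dirty);
}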
4936
4937 const struct iommu_ops intel_iommu_ops = {
4938         .blocked_domain         = &blocking_domain,
4939         .capable                = intel_iommu_capable,
4940         .hw_info                = intel_iommu_hw_info,
4941         .domain_alloc           = intel_iommu_domain_alloc,
4942         .domain_alloc_user      = intel_iommu_domain_alloc_user,
4943         .probe_device           = intel_iommu_probe_device,
4944         .probe_finalize         = intel_iommu_probe_finalize,
4945         .release_device         = intel_iommu_release_device,
4946         .get_resv_regions       = intel_iommu_get_resv_regions,
4947         .device_group           = intel_iommu_device_group,
4948         .dev_enable_feat        = intel_iommu_dev_enable_feat,
4949         .dev_disable_feat       = intel_iommu_dev_disable_feat,
4950         .is_attach_deferred     = intel_iommu_is_attach_deferred,
4951         .def_domain_type        = device_def_domain_type,
4952         .remove_dev_pasid       = intel_iommu_remove_dev_pasid,
4953         .pgsize_bitmap          = SZ_4K,
4954 #ifdef CONFIG_INTEL_IOMMU_SVM
4955         .page_response          = intel_svm_page_response,
4956 #endif
4957         .default_domain_ops = &(const struct iommu_domain_ops) {
4958                 .attach_dev             = intel_iommu_attach_device,
4959                 .set_dev_pasid          = intel_iommu_set_dev_pasid,
4960                 .map_pages              = intel_iommu_map_pages,
4961                 .unmap_pages            = intel_iommu_unmap_pages,
4962                 .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
4963                 .flush_iotlb_all        = intel_flush_iotlb_all,
4964                 .iotlb_sync             = intel_iommu_tlb_sync,
4965                 .iova_to_phys           = intel_iommu_iova_to_phys,
4966                 .free                   = intel_iommu_domain_free,
4967                 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4968         }
4969 };
4970
4971 static void quirk_iommu_igfx(struct pci_dev *dev)
4972 {
4973         if (risky_device(dev))
4974                 return;
4975
4976         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4977         dmar_map_gfx = 0;
4978 }
4979
4980 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4981 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4982 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4983 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4984 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4985 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4986 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4987 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4988
4989 /* Broadwell igfx malfunctions with dmar */
4990 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4991 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4992 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4993 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4994 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4995 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4996 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4997 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4998 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4999 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5000 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5001 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5002 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5003 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5004 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5005 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5006 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5007 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5008 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5009 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5010 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5011 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5012 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5013 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5014
5015 static void quirk_iommu_rwbf(struct pci_dev *dev)
5016 {
5017         if (risky_device(dev))
5018                 return;
5019
5020         /*
5021          * Mobile 4 Series Chipset neglects to set RWBF capability,
5022          * but needs it. Same seems to hold for the desktop versions.
5023          */
5024         pci_info(dev, "Forcing write-buffer flush capability\n");
5025         rwbf_quirk = 1;
5026 }
5027
5028 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5029 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5030 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5031 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5032 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5033 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5034 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5035
5036 #define GGC 0x52
5037 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5038 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5039 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5040 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5041 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5042 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5043 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5044 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5045
5046 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5047 {
5048         unsigned short ggc;
5049
5050         if (risky_device(dev))
5051                 return;
5052
5053         if (pci_read_config_word(dev, GGC, &ggc))
5054                 return;
5055
5056         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5057                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5058                 dmar_map_gfx = 0;
5059         } else if (dmar_map_gfx) {
5060                 /* we have to ensure the gfx device is idle before we flush */
5061                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5062                 iommu_set_dma_strict();
5063         }
5064 }
5065 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5066 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5067 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5068 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5069
5070 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5071 {
5072         unsigned short ver;
5073
5074         if (!IS_GFX_DEVICE(dev))
5075                 return;
5076
5077         ver = (dev->device >> 8) & 0xff;
5078         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5079             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5080             ver != 0x9a && ver != 0xa7 && ver != 0x7d)
5081                 return;
5082
5083         if (risky_device(dev))
5084                 return;
5085
5086         pci_info(dev, "Skip IOMMU disabling for graphics\n");
5087         iommu_skip_te_disable = 1;
5088 }
5089 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5090
5091 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5092    ISOCH DMAR unit for the Azalia sound device, but not give it any
5093    TLB entries, which causes it to deadlock. Check for that.  We do
5094    this in a function called from init_dmars(), instead of in a PCI
5095    quirk, because we don't want to print the obnoxious "BIOS broken"
5096    message if VT-d is actually disabled.
5097 */
5098 static void __init check_tylersburg_isoch(void)
5099 {
5100         struct pci_dev *pdev;
5101         uint32_t vtisochctrl;
5102
5103         /* If there's no Azalia in the system anyway, forget it. */
5104         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5105         if (!pdev)
5106                 return;
5107
5108         if (risky_device(pdev)) {
5109                 pci_dev_put(pdev);
5110                 return;
5111         }
5112
5113         pci_dev_put(pdev);
5114
5115         /* System Management Registers. Might be hidden, in which case
5116            we can't do the sanity check. But that's OK, because the
5117            known-broken BIOSes _don't_ actually hide it, so far. */
5118         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5119         if (!pdev)
5120                 return;
5121
5122         if (risky_device(pdev)) {
5123                 pci_dev_put(pdev);
5124                 return;
5125         }
5126
5127         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5128                 pci_dev_put(pdev);
5129                 return;
5130         }
5131
5132         pci_dev_put(pdev);
5133
5134         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5135         if (vtisochctrl & 1)
5136                 return;
5137
5138         /* Drop all bits other than the number of TLB entries */
5139         vtisochctrl &= 0x1c;
5140
5141         /* If we have the recommended number of TLB entries (16), fine. */
5142         if (vtisochctrl == 0x10)
5143                 return;
5144
5145         /* Zero TLB entries? You get to ride the short bus to school. */
5146         if (!vtisochctrl) {
5147                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5148                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5149                      dmi_get_system_info(DMI_BIOS_VENDOR),
5150                      dmi_get_system_info(DMI_BIOS_VERSION),
5151                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5152                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5153                 return;
5154         }
5155
5156         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5157                vtisochctrl);
5158 }
5159
5160 /*
5161  * Here we deal with a device TLB defect where the device may inadvertently
5162  * issue an ATS invalidation completion before posted writes that were
5163  * initiated with a translated address and used translations matching the
5164  * invalidation address range, violating the invalidation completion ordering.
5165  * Therefore, any use case that cannot guarantee DMA is stopped before unmap
5166  * is vulnerable to this defect. In other words, any dTLB invalidation that
5167  * is not initiated under the control of the trusted/privileged host device
5168  * driver must use this quirk.
5169  * Device TLBs are invalidated under the following six conditions:
5170  * 1. Device driver does DMA API unmap IOVA
5171  * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
5172  * 3. PASID is torn down, after the PASID cache is flushed, e.g. process
5173  *    exit_mmap() due to crash
5174  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5175  *    the VM has to free pages that were unmapped
5176  * 5. Userspace driver unmaps a DMA buffer
5177  * 6. Cache invalidation in vSVA usage (upcoming)
5178  *
5179  * For #1 and #2, device drivers are responsible for stopping DMA traffic
5180  * before unmap/unbind. For #3, the iommu driver gets the mmu_notifier to
5181  * invalidate the TLB the same way as a normal user unmap, which uses this
5182  * quirk. The dTLB invalidation after a PASID cache flush does not need it.
5183  *
5184  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5185  */
5186 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5187                                unsigned long address, unsigned long mask,
5188                                u32 pasid, u16 qdep)
5189 {
5190         u16 sid;
5191
5192         if (likely(!info->dtlb_extra_inval))
5193                 return;
5194
5195         sid = PCI_DEVID(info->bus, info->devfn);
5196         if (pasid == IOMMU_NO_PASID) {
5197                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5198                                    qdep, address, mask);
5199         } else {
5200                 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5201                                          pasid, qdep, address, mask);
5202         }
5203 }
5204
5205 #define ecmd_get_status_code(res)       (((res) & 0xff) >> 1)
5206
5207 /*
5208  * Function to submit a command to the enhanced command interface. The
5209  * valid enhanced command descriptions are defined in Table 47 of the
5210  * VT-d spec. The VT-d hardware implementation may support some but not
5211  * all commands, which can be determined by checking the Enhanced
5212  * Command Capability Register.
5213  *
5214  * Return values:
5215  *  - 0: Command successful without any error;
5216  *  - Negative: software error value;
5217  *  - Nonzero positive: failure status code defined in Table 48.
5218  */
5219 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5220 {
5221         unsigned long flags;
5222         u64 res;
5223         int ret;
5224
5225         if (!cap_ecmds(iommu->cap))
5226                 return -ENODEV;
5227
5228         raw_spin_lock_irqsave(&iommu->register_lock, flags);
5229
5230         res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5231         if (res & DMA_ECMD_ECRSP_IP) {
5232                 ret = -EBUSY;
5233                 goto err;
5234         }
5235
5236         /*
5237          * Unconditionally write the operand B, because
5238          * - There is no side effect if an ecmd doesn't require an
5239          *   operand B, but we set the register to some value.
5240          * - It's not invoked in any critical path, so the extra MMIO
5241          *   write is not a performance concern.
5242          */
5243         dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5244         dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5245
5246         IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5247                       !(res & DMA_ECMD_ECRSP_IP), res);
5248
5249         if (res & DMA_ECMD_ECRSP_IP) {
5250                 ret = -ETIMEDOUT;
5251                 goto err;
5252         }
5253
5254         ret = ecmd_get_status_code(res);
5255 err:
5256         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5257
5258         return ret;
5259 }
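
/*
 * Editor's illustrative sketch (not part of the upstream driver): how a
 * caller distinguishes the three return classes documented above. Both the
 * helper name and EXAMPLE_ECMD_OP are hypothetical placeholders, not real
 * VT-d command definitions.
 */
#define EXAMPLE_ECMD_OP         0       /* placeholder opcode for illustration */

static int __maybe_unused example_submit_ecmd(struct intel_iommu *iommu, u64 oa)
{
        int ret = ecmd_submit_sync(iommu, EXAMPLE_ECMD_OP, oa, 0);

        if (ret < 0)            /* software error, e.g. -ENODEV or -ETIMEDOUT */
                return ret;
        if (ret > 0)            /* hardware status code from VT-d spec Table 48 */
                return -EIO;

        return 0;               /* command completed successfully */
}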