drivers/iommu/intel/iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dma-iommu.h>
19 #include <linux/dmi.h>
20 #include <linux/intel-iommu.h>
21 #include <linux/intel-svm.h>
22 #include <linux/memory.h>
23 #include <linux/pci.h>
24 #include <linux/pci-ats.h>
25 #include <linux/spinlock.h>
26 #include <linux/syscore_ops.h>
27 #include <linux/tboot.h>
28
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33
34 #define ROOT_SIZE               VTD_PAGE_SIZE
35 #define CONTEXT_SIZE            VTD_PAGE_SIZE
36
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42 #define IOAPIC_RANGE_START      (0xfee00000)
43 #define IOAPIC_RANGE_END        (0xfeefffff)
44 #define IOVA_START_ADDR         (0x1000)
45
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
57                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN          (1)
62
63 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
64
65 /* page table handling */
66 #define LEVEL_STRIDE            (9)
67 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
68
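/*
 * AGAW (adjusted guest address width) encodes the page-table depth: an agaw
 * value of N selects an (N + 2)-level table covering a (30 + 9 * N)-bit
 * address space, i.e. agaw 1/2/3 correspond to 3/4/5-level tables and
 * 39/48/57-bit widths. The helpers below convert between the two forms.
 */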
69 static inline int agaw_to_level(int agaw)
70 {
71         return agaw + 2;
72 }
73
74 static inline int agaw_to_width(int agaw)
75 {
76         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78
79 static inline int width_to_agaw(int width)
80 {
81         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86         return (level - 1) * LEVEL_STRIDE;
87 }
88
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93
94 static inline u64 level_mask(int level)
95 {
96         return -1ULL << level_to_offset_bits(level);
97 }
98
99 static inline u64 level_size(int level)
100 {
101         return 1ULL << level_to_offset_bits(level);
102 }
103
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106         return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
113
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122         return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126         return page_to_dma_pfn(virt_to_page(p));
127 }
128
129 /* global iommu list, set NULL for ignored DMAR units */
130 static struct intel_iommu **g_iommus;
131
132 static void __init check_tylersburg_isoch(void);
133 static int rwbf_quirk;
134 static inline struct device_domain_info *
135 dmar_search_domain_by_dev_info(int segment, int bus, int devfn);
136
137 /*
138  * Set to 1 to panic the kernel if VT-d can't be successfully enabled
139  * (used when the kernel is launched with TXT).
140  */
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
144
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153         if (!(re->lo & 1))
154                 return 0;
155
156         return re->lo & VTD_PAGE_MASK;
157 }
158
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165         if (!(re->hi & 1))
166                 return 0;
167
168         return re->hi & VTD_PAGE_MASK;
169 }
170
171 static inline void context_clear_pasid_enable(struct context_entry *context)
172 {
173         context->lo &= ~(1ULL << 11);
174 }
175
176 static inline bool context_pasid_enabled(struct context_entry *context)
177 {
178         return !!(context->lo & (1ULL << 11));
179 }
180
181 static inline void context_set_copied(struct context_entry *context)
182 {
183         context->hi |= (1ull << 3);
184 }
185
186 static inline bool context_copied(struct context_entry *context)
187 {
188         return !!(context->hi & (1ULL << 3));
189 }
190
191 static inline bool __context_present(struct context_entry *context)
192 {
193         return (context->lo & 1);
194 }
195
196 bool context_present(struct context_entry *context)
197 {
198         return context_pasid_enabled(context) ?
199              __context_present(context) :
200              __context_present(context) && !context_copied(context);
201 }
202
203 static inline void context_set_present(struct context_entry *context)
204 {
205         context->lo |= 1;
206 }
207
208 static inline void context_set_fault_enable(struct context_entry *context)
209 {
210         context->lo &= (((u64)-1) << 2) | 1;
211 }
212
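/*
 * The 2-bit translation type field selects how requests are handled:
 * 0 = second-level translation only, 1 = additionally allow device-TLB (ATS)
 * translation requests, 2 = pass-through (see the CONTEXT_TT_* definitions).
 */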
213 static inline void context_set_translation_type(struct context_entry *context,
214                                                 unsigned long value)
215 {
216         context->lo &= (((u64)-1) << 4) | 3;
217         context->lo |= (value & 3) << 2;
218 }
219
220 static inline void context_set_address_root(struct context_entry *context,
221                                             unsigned long value)
222 {
223         context->lo &= ~VTD_PAGE_MASK;
224         context->lo |= value & VTD_PAGE_MASK;
225 }
226
227 static inline void context_set_address_width(struct context_entry *context,
228                                              unsigned long value)
229 {
230         context->hi |= value & 7;
231 }
232
233 static inline void context_set_domain_id(struct context_entry *context,
234                                          unsigned long value)
235 {
236         context->hi |= (value & ((1 << 16) - 1)) << 8;
237 }
238
239 static inline int context_domain_id(struct context_entry *c)
240 {
241         return (c->hi >> 8) & 0xffff;
242 }
243
244 static inline void context_clear_entry(struct context_entry *context)
245 {
246         context->lo = 0;
247         context->hi = 0;
248 }
249
250 /*
251  * This domain is a static identity mapping domain.
252  *      1. This domain creates a static 1:1 mapping to all usable memory.
253  *      2. It maps to each iommu if successful.
254  *      3. Each iommu maps to this domain if successful.
255  */
256 static struct dmar_domain *si_domain;
257 static int hw_pass_through = 1;
258
259 #define for_each_domain_iommu(idx, domain)                      \
260         for (idx = 0; idx < g_num_of_iommus; idx++)             \
261                 if (domain->iommu_refcnt[idx])
262
263 struct dmar_rmrr_unit {
264         struct list_head list;          /* list of rmrr units   */
265         struct acpi_dmar_header *hdr;   /* ACPI header          */
266         u64     base_address;           /* reserved base address*/
267         u64     end_address;            /* reserved end address */
268         struct dmar_dev_scope *devices; /* target devices */
269         int     devices_cnt;            /* target device count */
270 };
271
272 struct dmar_atsr_unit {
273         struct list_head list;          /* list of ATSR units */
274         struct acpi_dmar_header *hdr;   /* ACPI header */
275         struct dmar_dev_scope *devices; /* target devices */
276         int devices_cnt;                /* target device count */
277         u8 include_all:1;               /* include all ports */
278 };
279
280 struct dmar_satc_unit {
281         struct list_head list;          /* list of SATC units */
282         struct acpi_dmar_header *hdr;   /* ACPI header */
283         struct dmar_dev_scope *devices; /* target devices */
284         struct intel_iommu *iommu;      /* the corresponding iommu */
285         int devices_cnt;                /* target device count */
286         u8 atc_required:1;              /* ATS is required */
287 };
288
289 static LIST_HEAD(dmar_atsr_units);
290 static LIST_HEAD(dmar_rmrr_units);
291 static LIST_HEAD(dmar_satc_units);
292
293 #define for_each_rmrr_units(rmrr) \
294         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
295
296 /* number of IOMMUs in the system, used to size and index g_iommus */
297 static int g_num_of_iommus;
298
299 static void domain_remove_dev_info(struct dmar_domain *domain);
300 static void dmar_remove_one_dev_info(struct device *dev);
301 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
302
303 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
304 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
305
306 int intel_iommu_enabled = 0;
307 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
308
309 static int dmar_map_gfx = 1;
310 static int intel_iommu_superpage = 1;
311 static int iommu_identity_mapping;
312 static int iommu_skip_te_disable;
313
314 #define IDENTMAP_GFX            2
315 #define IDENTMAP_AZALIA         4
316
317 int intel_iommu_gfx_mapped;
318 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
319
320 DEFINE_SPINLOCK(device_domain_lock);
321 static LIST_HEAD(device_domain_list);
322
323 const struct iommu_ops intel_iommu_ops;
324
325 static bool translation_pre_enabled(struct intel_iommu *iommu)
326 {
327         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
328 }
329
330 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
331 {
332         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
333 }
334
335 static void init_translation_status(struct intel_iommu *iommu)
336 {
337         u32 gsts;
338
339         gsts = readl(iommu->reg + DMAR_GSTS_REG);
340         if (gsts & DMA_GSTS_TES)
341                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
342 }
343
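/*
 * Parse the intel_iommu= kernel command line option. Recognized keywords are
 * on, off, igfx_off, sp_off, sm_on, sm_off, tboot_noforce and the deprecated
 * forcedac/strict variants handled below.
 */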
344 static int __init intel_iommu_setup(char *str)
345 {
346         if (!str)
347                 return -EINVAL;
348
349         while (*str) {
350                 if (!strncmp(str, "on", 2)) {
351                         dmar_disabled = 0;
352                         pr_info("IOMMU enabled\n");
353                 } else if (!strncmp(str, "off", 3)) {
354                         dmar_disabled = 1;
355                         no_platform_optin = 1;
356                         pr_info("IOMMU disabled\n");
357                 } else if (!strncmp(str, "igfx_off", 8)) {
358                         dmar_map_gfx = 0;
359                         pr_info("Disable GFX device mapping\n");
360                 } else if (!strncmp(str, "forcedac", 8)) {
361                         pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
362                         iommu_dma_forcedac = true;
363                 } else if (!strncmp(str, "strict", 6)) {
364                         pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
365                         iommu_set_dma_strict();
366                 } else if (!strncmp(str, "sp_off", 6)) {
367                         pr_info("Disable supported super page\n");
368                         intel_iommu_superpage = 0;
369                 } else if (!strncmp(str, "sm_on", 5)) {
370                         pr_info("Enable scalable mode if hardware supports\n");
371                         intel_iommu_sm = 1;
372                 } else if (!strncmp(str, "sm_off", 6)) {
373                         pr_info("Scalable mode is disallowed\n");
374                         intel_iommu_sm = 0;
375                 } else if (!strncmp(str, "tboot_noforce", 13)) {
376                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
377                         intel_iommu_tboot_noforce = 1;
378                 } else {
379                         pr_notice("Unknown option - '%s'\n", str);
380                 }
381
382                 str += strcspn(str, ",");
383                 while (*str == ',')
384                         str++;
385         }
386
387         return 1;
388 }
389 __setup("intel_iommu=", intel_iommu_setup);
390
391 void *alloc_pgtable_page(int node)
392 {
393         struct page *page;
394         void *vaddr = NULL;
395
396         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
397         if (page)
398                 vaddr = page_address(page);
399         return vaddr;
400 }
401
402 void free_pgtable_page(void *vaddr)
403 {
404         free_page((unsigned long)vaddr);
405 }
406
407 static inline int domain_type_is_si(struct dmar_domain *domain)
408 {
409         return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
410 }
411
412 static inline bool domain_use_first_level(struct dmar_domain *domain)
413 {
414         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
415 }
416
417 static inline int domain_pfn_supported(struct dmar_domain *domain,
418                                        unsigned long pfn)
419 {
420         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
421
422         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
423 }
424
425 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
426 {
427         unsigned long sagaw;
428         int agaw;
429
430         sagaw = cap_sagaw(iommu->cap);
431         for (agaw = width_to_agaw(max_gaw);
432              agaw >= 0; agaw--) {
433                 if (test_bit(agaw, &sagaw))
434                         break;
435         }
436
437         return agaw;
438 }
439
440 /*
441  * Calculate max SAGAW for each iommu.
442  */
443 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
444 {
445         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
446 }
447
448 /*
449  * Calculate the agaw for each iommu.
450  * "SAGAW" may differ across iommus, so start from a default agaw and fall
451  * back to a smaller supported agaw for iommus that don't support the default.
452  */
453 int iommu_calculate_agaw(struct intel_iommu *iommu)
454 {
455         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
456 }
457
458 /* This function only returns a single iommu in a domain */
459 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
460 {
461         int iommu_id;
462
463         /* si_domain and vm domain should not get here. */
464         if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
465                 return NULL;
466
467         for_each_domain_iommu(iommu_id, domain)
468                 break;
469
470         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
471                 return NULL;
472
473         return g_iommus[iommu_id];
474 }
475
476 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
477 {
478         return sm_supported(iommu) ?
479                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
480 }
481
482 static void domain_update_iommu_coherency(struct dmar_domain *domain)
483 {
484         struct dmar_drhd_unit *drhd;
485         struct intel_iommu *iommu;
486         bool found = false;
487         int i;
488
489         domain->iommu_coherency = true;
490
491         for_each_domain_iommu(i, domain) {
492                 found = true;
493                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
494                         domain->iommu_coherency = false;
495                         break;
496                 }
497         }
498         if (found)
499                 return;
500
501         /* No hardware attached; use lowest common denominator */
502         rcu_read_lock();
503         for_each_active_iommu(iommu, drhd) {
504                 if (!iommu_paging_structure_coherency(iommu)) {
505                         domain->iommu_coherency = false;
506                         break;
507                 }
508         }
509         rcu_read_unlock();
510 }
511
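/*
 * Return the largest superpage level supported by all active IOMMUs
 * (optionally ignoring @skip): 0 = 4KiB pages only, 1 = up to 2MiB,
 * 2 = up to 1GiB. First-level domains are capped at 2MiB unless the
 * IOMMU advertises first-level 1GiB support.
 */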
512 static int domain_update_iommu_superpage(struct dmar_domain *domain,
513                                          struct intel_iommu *skip)
514 {
515         struct dmar_drhd_unit *drhd;
516         struct intel_iommu *iommu;
517         int mask = 0x3;
518
519         if (!intel_iommu_superpage)
520                 return 0;
521
522         /* set iommu_superpage to the smallest common denominator */
523         rcu_read_lock();
524         for_each_active_iommu(iommu, drhd) {
525                 if (iommu != skip) {
526                         if (domain && domain_use_first_level(domain)) {
527                                 if (!cap_fl1gp_support(iommu->cap))
528                                         mask = 0x1;
529                         } else {
530                                 mask &= cap_super_page_val(iommu->cap);
531                         }
532
533                         if (!mask)
534                                 break;
535                 }
536         }
537         rcu_read_unlock();
538
539         return fls(mask);
540 }
541
542 static int domain_update_device_node(struct dmar_domain *domain)
543 {
544         struct device_domain_info *info;
545         int nid = NUMA_NO_NODE;
546
547         assert_spin_locked(&device_domain_lock);
548
549         if (list_empty(&domain->devices))
550                 return NUMA_NO_NODE;
551
552         list_for_each_entry(info, &domain->devices, link) {
553                 if (!info->dev)
554                         continue;
555
556                 /*
557                  * There could be multiple device NUMA nodes, as devices within
558                  * the same domain may sit behind different IOMMUs. There is no
559                  * perfect answer in such a situation, so we use a first come,
560                  * first served policy.
561                  */
562                 nid = dev_to_node(info->dev);
563                 if (nid != NUMA_NO_NODE)
564                         break;
565         }
566
567         return nid;
568 }
569
570 static void domain_update_iotlb(struct dmar_domain *domain);
571
572 /* Return the super pagesize bitmap if supported. */
573 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
574 {
575         unsigned long bitmap = 0;
576
577         /*
578          * 1-level super page supports page size of 2MiB, 2-level super page
579          * supports page size of both 2MiB and 1GiB.
580          */
581         if (domain->iommu_superpage == 1)
582                 bitmap |= SZ_2M;
583         else if (domain->iommu_superpage == 2)
584                 bitmap |= SZ_2M | SZ_1G;
585
586         return bitmap;
587 }
588
589 /* Some capabilities may be different across iommus */
590 static void domain_update_iommu_cap(struct dmar_domain *domain)
591 {
592         domain_update_iommu_coherency(domain);
593         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
594
595         /*
596          * If RHSA is missing, we should default to the device numa domain
597          * as fall back.
598          */
599         if (domain->nid == NUMA_NO_NODE)
600                 domain->nid = domain_update_device_node(domain);
601
602         /*
603          * First-level translation restricts the input-address to a
604          * canonical address (i.e., address bits 63:N have the same
605          * value as address bit [N-1], where N is 48-bits with 4-level
606          * paging and 57-bits with 5-level paging). Hence, skip bit
607          * [N-1].
608          */
609         if (domain_use_first_level(domain))
610                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
611         else
612                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
613
614         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
615         domain_update_iotlb(domain);
616 }
617
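/*
 * Return the context entry for (bus, devfn), optionally allocating the
 * context table on demand. In scalable mode each root entry is split into a
 * lower and an upper half, each covering 128 devfns, and a scalable-mode
 * context entry is 256 bits wide (two legacy-sized slots), hence the devfn
 * folding and doubling below.
 */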
618 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
619                                          u8 devfn, int alloc)
620 {
621         struct root_entry *root = &iommu->root_entry[bus];
622         struct context_entry *context;
623         u64 *entry;
624
625         entry = &root->lo;
626         if (sm_supported(iommu)) {
627                 if (devfn >= 0x80) {
628                         devfn -= 0x80;
629                         entry = &root->hi;
630                 }
631                 devfn *= 2;
632         }
633         if (*entry & 1)
634                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
635         else {
636                 unsigned long phy_addr;
637                 if (!alloc)
638                         return NULL;
639
640                 context = alloc_pgtable_page(iommu->node);
641                 if (!context)
642                         return NULL;
643
644                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
645                 phy_addr = virt_to_phys((void *)context);
646                 *entry = phy_addr | 1;
647                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
648         }
649         return &context[devfn];
650 }
651
652 /**
653  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
654  *                               sub-hierarchy of a candidate PCI-PCI bridge
655  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
656  * @bridge: the candidate PCI-PCI bridge
657  *
658  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
659  */
660 static bool
661 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
662 {
663         struct pci_dev *pdev, *pbridge;
664
665         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
666                 return false;
667
668         pdev = to_pci_dev(dev);
669         pbridge = to_pci_dev(bridge);
670
671         if (pbridge->subordinate &&
672             pbridge->subordinate->number <= pdev->bus->number &&
673             pbridge->subordinate->busn_res.end >= pdev->bus->number)
674                 return true;
675
676         return false;
677 }
678
679 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
680 {
681         struct dmar_drhd_unit *drhd;
682         u32 vtbar;
683         int rc;
684
685         /* We know that this device on this chipset has its own IOMMU.
686          * If we find it under a different IOMMU, then the BIOS is lying
687          * to us. Hope that the IOMMU for this device is actually
688          * disabled, and it needs no translation...
689          */
690         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
691         if (rc) {
692                 /* "can't" happen */
693                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
694                 return false;
695         }
696         vtbar &= 0xffff0000;
697
698         /* we know that this iommu should be at offset 0xa000 from vtbar */
699         drhd = dmar_find_matched_drhd_unit(pdev);
700         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
701                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
702                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
703                 return true;
704         }
705
706         return false;
707 }
708
709 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
710 {
711         if (!iommu || iommu->drhd->ignored)
712                 return true;
713
714         if (dev_is_pci(dev)) {
715                 struct pci_dev *pdev = to_pci_dev(dev);
716
717                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
718                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
719                     quirk_ioat_snb_local_iommu(pdev))
720                         return true;
721         }
722
723         return false;
724 }
725
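/*
 * Find the IOMMU that covers @dev by searching the DMAR scope tables. PCI
 * VFs aren't listed in the scope tables, so the lookup is done with the PF,
 * but the VF's own bus/devfn is returned via @bus/@devfn. Returns NULL if
 * the device isn't translated by any (non-ignored) IOMMU.
 */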
726 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
727 {
728         struct dmar_drhd_unit *drhd = NULL;
729         struct pci_dev *pdev = NULL;
730         struct intel_iommu *iommu;
731         struct device *tmp;
732         u16 segment = 0;
733         int i;
734
735         if (!dev)
736                 return NULL;
737
738         if (dev_is_pci(dev)) {
739                 struct pci_dev *pf_pdev;
740
741                 pdev = pci_real_dma_dev(to_pci_dev(dev));
742
743                 /* VFs aren't listed in scope tables; we need to look up
744                  * the PF instead to find the IOMMU. */
745                 pf_pdev = pci_physfn(pdev);
746                 dev = &pf_pdev->dev;
747                 segment = pci_domain_nr(pdev->bus);
748         } else if (has_acpi_companion(dev))
749                 dev = &ACPI_COMPANION(dev)->dev;
750
751         rcu_read_lock();
752         for_each_iommu(iommu, drhd) {
753                 if (pdev && segment != drhd->segment)
754                         continue;
755
756                 for_each_active_dev_scope(drhd->devices,
757                                           drhd->devices_cnt, i, tmp) {
758                         if (tmp == dev) {
759                                 /* For a VF use its original BDF# not that of the PF
760                                  * which we used for the IOMMU lookup. Strictly speaking
761                                  * we could do this for all PCI devices; we only need to
762                                  * get the BDF# from the scope table for ACPI matches. */
763                                 if (pdev && pdev->is_virtfn)
764                                         goto got_pdev;
765
766                                 if (bus && devfn) {
767                                         *bus = drhd->devices[i].bus;
768                                         *devfn = drhd->devices[i].devfn;
769                                 }
770                                 goto out;
771                         }
772
773                         if (is_downstream_to_pci_bridge(dev, tmp))
774                                 goto got_pdev;
775                 }
776
777                 if (pdev && drhd->include_all) {
778 got_pdev:
779                         if (bus && devfn) {
780                                 *bus = pdev->bus->number;
781                                 *devfn = pdev->devfn;
782                         }
783                         goto out;
784                 }
785         }
786         iommu = NULL;
787 out:
788         if (iommu_is_dummy(iommu, dev))
789                 iommu = NULL;
790
791         rcu_read_unlock();
792
793         return iommu;
794 }
795
796 static void domain_flush_cache(struct dmar_domain *domain,
797                                void *addr, int size)
798 {
799         if (!domain->iommu_coherency)
800                 clflush_cache_range(addr, size);
801 }
802
803 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
804 {
805         struct context_entry *context;
806         int ret = 0;
807         unsigned long flags;
808
809         spin_lock_irqsave(&iommu->lock, flags);
810         context = iommu_context_addr(iommu, bus, devfn, 0);
811         if (context)
812                 ret = context_present(context);
813         spin_unlock_irqrestore(&iommu->lock, flags);
814         return ret;
815 }
816
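/*
 * Free every context table referenced from the root table, then the root
 * table itself. In scalable mode each root entry also has an upper half, so
 * the devfn 0x80 slot is checked as well.
 */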
817 static void free_context_table(struct intel_iommu *iommu)
818 {
819         int i;
820         unsigned long flags;
821         struct context_entry *context;
822
823         spin_lock_irqsave(&iommu->lock, flags);
824         if (!iommu->root_entry)
825                 goto out;
827         for (i = 0; i < ROOT_ENTRY_NR; i++) {
828                 context = iommu_context_addr(iommu, i, 0, 0);
829                 if (context)
830                         free_pgtable_page(context);
831
832                 if (!sm_supported(iommu))
833                         continue;
834
835                 context = iommu_context_addr(iommu, i, 0x80, 0);
836                 if (context)
837                         free_pgtable_page(context);
838
839         }
840         free_pgtable_page(iommu->root_entry);
841         iommu->root_entry = NULL;
842 out:
843         spin_unlock_irqrestore(&iommu->lock, flags);
844 }
845
846 #ifdef CONFIG_DMAR_DEBUG
847 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
848 {
849         struct device_domain_info *info;
850         struct dma_pte *parent, *pte;
851         struct dmar_domain *domain;
852         int offset, level;
853
854         info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
855         if (!info || !info->domain) {
856                 pr_info("device [%02x:%02x.%d] not probed\n",
857                         bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
858                 return;
859         }
860
861         domain = info->domain;
862         level = agaw_to_level(domain->agaw);
863         parent = domain->pgd;
864         if (!parent) {
865                 pr_info("no page table setup\n");
866                 return;
867         }
868
869         while (1) {
870                 offset = pfn_level_offset(pfn, level);
871                 pte = &parent[offset];
872                 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
873                         pr_info("PTE not present at level %d\n", level);
874                         break;
875                 }
876
877                 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
878
879                 if (level == 1)
880                         break;
881
882                 parent = phys_to_virt(dma_pte_addr(pte));
883                 level--;
884         }
885 }
886
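/*
 * On a DMA fault, dump the root, context and (in scalable mode) PASID
 * directory/table entries for the faulting source-id, then walk the I/O
 * page table for the faulting address.
 */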
887 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
888                           unsigned long long addr, u32 pasid)
889 {
890         struct pasid_dir_entry *dir, *pde;
891         struct pasid_entry *entries, *pte;
892         struct context_entry *ctx_entry;
893         struct root_entry *rt_entry;
894         u8 devfn = source_id & 0xff;
895         u8 bus = source_id >> 8;
896         int i, dir_index, index;
897
898         pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
899
900         /* root entry dump */
901         rt_entry = &iommu->root_entry[bus];
902         if (!rt_entry) {
903                 pr_info("root table entry is not present\n");
904                 return;
905         }
906
907         if (sm_supported(iommu))
908                 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
909                         rt_entry->hi, rt_entry->lo);
910         else
911                 pr_info("root entry: 0x%016llx", rt_entry->lo);
912
913         /* context entry dump */
914         ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
915         if (!ctx_entry) {
916                 pr_info("context table entry is not present\n");
917                 return;
918         }
919
920         pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
921                 ctx_entry->hi, ctx_entry->lo);
922
923         /* legacy mode does not require PASID entries */
924         if (!sm_supported(iommu))
925                 goto pgtable_walk;
926
927         /* get the pointer to pasid directory entry */
928         dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
929         if (!dir) {
930                 pr_info("pasid directory entry is not present\n");
931                 return;
932         }
933         /* For request-without-pasid, get the pasid from context entry */
934         if (intel_iommu_sm && pasid == INVALID_IOASID)
935                 pasid = PASID_RID2PASID;
936
937         dir_index = pasid >> PASID_PDE_SHIFT;
938         pde = &dir[dir_index];
939         pr_info("pasid dir entry: 0x%016llx\n", pde->val);
940
941         /* get the pointer to the pasid table entry */
942         entries = get_pasid_table_from_pde(pde);
943         if (!entries) {
944                 pr_info("pasid table entry is not present\n");
945                 return;
946         }
947         index = pasid & PASID_PTE_MASK;
948         pte = &entries[index];
949         for (i = 0; i < ARRAY_SIZE(pte->val); i++)
950                 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
951
952 pgtable_walk:
953         pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
954 }
955 #endif
956
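/*
 * Walk (and, if necessary, build) the page table down to @pfn. A
 * *target_level of 0 means "find the leaf": stop at the first superpage or
 * non-present entry. A non-zero *target_level stops at that level,
 * allocating missing intermediate tables on the way down. Returns NULL if
 * @pfn is beyond the domain's address width or an allocation fails.
 */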
957 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
958                                       unsigned long pfn, int *target_level)
959 {
960         struct dma_pte *parent, *pte;
961         int level = agaw_to_level(domain->agaw);
962         int offset;
963
964         BUG_ON(!domain->pgd);
965
966         if (!domain_pfn_supported(domain, pfn))
967                 /* Address beyond IOMMU's addressing capabilities. */
968                 return NULL;
969
970         parent = domain->pgd;
971
972         while (1) {
973                 void *tmp_page;
974
975                 offset = pfn_level_offset(pfn, level);
976                 pte = &parent[offset];
977                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
978                         break;
979                 if (level == *target_level)
980                         break;
981
982                 if (!dma_pte_present(pte)) {
983                         uint64_t pteval;
984
985                         tmp_page = alloc_pgtable_page(domain->nid);
986
987                         if (!tmp_page)
988                                 return NULL;
989
990                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
991                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
992                         if (domain_use_first_level(domain)) {
993                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
994                                 if (iommu_is_dma_domain(&domain->domain))
995                                         pteval |= DMA_FL_PTE_ACCESS;
996                         }
997                         if (cmpxchg64(&pte->val, 0ULL, pteval))
998                                 /* Someone else set it while we were thinking; use theirs. */
999                                 free_pgtable_page(tmp_page);
1000                         else
1001                                 domain_flush_cache(domain, pte, sizeof(*pte));
1002                 }
1003                 if (level == 1)
1004                         break;
1005
1006                 parent = phys_to_virt(dma_pte_addr(pte));
1007                 level--;
1008         }
1009
1010         if (!*target_level)
1011                 *target_level = level;
1012
1013         return pte;
1014 }
1015
1016 /* return address's pte at specific level */
1017 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1018                                          unsigned long pfn,
1019                                          int level, int *large_page)
1020 {
1021         struct dma_pte *parent, *pte;
1022         int total = agaw_to_level(domain->agaw);
1023         int offset;
1024
1025         parent = domain->pgd;
1026         while (level <= total) {
1027                 offset = pfn_level_offset(pfn, total);
1028                 pte = &parent[offset];
1029                 if (level == total)
1030                         return pte;
1031
1032                 if (!dma_pte_present(pte)) {
1033                         *large_page = total;
1034                         break;
1035                 }
1036
1037                 if (dma_pte_superpage(pte)) {
1038                         *large_page = total;
1039                         return pte;
1040                 }
1041
1042                 parent = phys_to_virt(dma_pte_addr(pte));
1043                 total--;
1044         }
1045         return NULL;
1046 }
1047
1048 /* clear last level pte; a tlb flush should follow */
1049 static void dma_pte_clear_range(struct dmar_domain *domain,
1050                                 unsigned long start_pfn,
1051                                 unsigned long last_pfn)
1052 {
1053         unsigned int large_page;
1054         struct dma_pte *first_pte, *pte;
1055
1056         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1057         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1058         BUG_ON(start_pfn > last_pfn);
1059
1060         /* we don't need lock here; nobody else touches the iova range */
1061         do {
1062                 large_page = 1;
1063                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1064                 if (!pte) {
1065                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1066                         continue;
1067                 }
1068                 do {
1069                         dma_clear_pte(pte);
1070                         start_pfn += lvl_to_nr_pages(large_page);
1071                         pte++;
1072                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1073
1074                 domain_flush_cache(domain, first_pte,
1075                                    (void *)pte - (void *)first_pte);
1076
1077         } while (start_pfn && start_pfn <= last_pfn);
1078 }
1079
1080 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1081                                int retain_level, struct dma_pte *pte,
1082                                unsigned long pfn, unsigned long start_pfn,
1083                                unsigned long last_pfn)
1084 {
1085         pfn = max(start_pfn, pfn);
1086         pte = &pte[pfn_level_offset(pfn, level)];
1087
1088         do {
1089                 unsigned long level_pfn;
1090                 struct dma_pte *level_pte;
1091
1092                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1093                         goto next;
1094
1095                 level_pfn = pfn & level_mask(level);
1096                 level_pte = phys_to_virt(dma_pte_addr(pte));
1097
1098                 if (level > 2) {
1099                         dma_pte_free_level(domain, level - 1, retain_level,
1100                                            level_pte, level_pfn, start_pfn,
1101                                            last_pfn);
1102                 }
1103
1104                 /*
1105                  * Free the page table if we're below the level we want to
1106                  * retain and the range covers the entire table.
1107                  */
1108                 if (level < retain_level && !(start_pfn > level_pfn ||
1109                       last_pfn < level_pfn + level_size(level) - 1)) {
1110                         dma_clear_pte(pte);
1111                         domain_flush_cache(domain, pte, sizeof(*pte));
1112                         free_pgtable_page(level_pte);
1113                 }
1114 next:
1115                 pfn += level_size(level);
1116         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1117 }
1118
1119 /*
1120  * clear last level (leaf) ptes and free page table pages below the
1121  * level we wish to keep intact.
1122  */
1123 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1124                                    unsigned long start_pfn,
1125                                    unsigned long last_pfn,
1126                                    int retain_level)
1127 {
1128         dma_pte_clear_range(domain, start_pfn, last_pfn);
1129
1130         /* We don't need lock here; nobody else touches the iova range */
1131         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1132                            domain->pgd, 0, start_pfn, last_pfn);
1133
1134         /* free pgd */
1135         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1136                 free_pgtable_page(domain->pgd);
1137                 domain->pgd = NULL;
1138         }
1139 }
1140
1141 /* When a page at a given level is being unlinked from its parent, we don't
1142    need to *modify* it at all. All we need to do is make a list of all the
1143    pages which can be freed just as soon as we've flushed the IOTLB and we
1144    know the hardware page-walk will no longer touch them.
1145    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1146    be freed. */
1147 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1148                                     int level, struct dma_pte *pte,
1149                                     struct list_head *freelist)
1150 {
1151         struct page *pg;
1152
1153         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1154         list_add_tail(&pg->lru, freelist);
1155
1156         if (level == 1)
1157                 return;
1158
1159         pte = page_address(pg);
1160         do {
1161                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1162                         dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1163                 pte++;
1164         } while (!first_pte_in_page(pte));
1165 }
1166
1167 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1168                                 struct dma_pte *pte, unsigned long pfn,
1169                                 unsigned long start_pfn, unsigned long last_pfn,
1170                                 struct list_head *freelist)
1171 {
1172         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1173
1174         pfn = max(start_pfn, pfn);
1175         pte = &pte[pfn_level_offset(pfn, level)];
1176
1177         do {
1178                 unsigned long level_pfn = pfn & level_mask(level);
1179
1180                 if (!dma_pte_present(pte))
1181                         goto next;
1182
1183                 /* If range covers entire pagetable, free it */
1184                 if (start_pfn <= level_pfn &&
1185                     last_pfn >= level_pfn + level_size(level) - 1) {
1186                         /* These subordinate page tables are going away entirely. Don't
1187                            bother to clear them; we're just going to *free* them. */
1188                         if (level > 1 && !dma_pte_superpage(pte))
1189                                 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1190
1191                         dma_clear_pte(pte);
1192                         if (!first_pte)
1193                                 first_pte = pte;
1194                         last_pte = pte;
1195                 } else if (level > 1) {
1196                         /* Recurse down into a level that isn't *entirely* obsolete */
1197                         dma_pte_clear_level(domain, level - 1,
1198                                             phys_to_virt(dma_pte_addr(pte)),
1199                                             level_pfn, start_pfn, last_pfn,
1200                                             freelist);
1201                 }
1202 next:
1203                 pfn = level_pfn + level_size(level);
1204         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1205
1206         if (first_pte)
1207                 domain_flush_cache(domain, first_pte,
1208                                    (void *)++last_pte - (void *)first_pte);
1209 }
1210
1211 /* We can't just free the pages because the IOMMU may still be walking
1212    the page tables, and may have cached the intermediate levels. The
1213    pages can only be freed after the IOTLB flush has been done. */
1214 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1215                          unsigned long last_pfn, struct list_head *freelist)
1216 {
1217         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1218         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1219         BUG_ON(start_pfn > last_pfn);
1220
1221         /* we don't need lock here; nobody else touches the iova range */
1222         dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1223                             domain->pgd, 0, start_pfn, last_pfn, freelist);
1224
1225         /* free pgd */
1226         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1227                 struct page *pgd_page = virt_to_page(domain->pgd);
1228                 list_add_tail(&pgd_page->lru, freelist);
1229                 domain->pgd = NULL;
1230         }
1231 }
1232
1233 /* iommu handling */
1234 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1235 {
1236         struct root_entry *root;
1237         unsigned long flags;
1238
1239         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1240         if (!root) {
1241                 pr_err("Allocating root entry for %s failed\n",
1242                         iommu->name);
1243                 return -ENOMEM;
1244         }
1245
1246         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1247
1248         spin_lock_irqsave(&iommu->lock, flags);
1249         iommu->root_entry = root;
1250         spin_unlock_irqrestore(&iommu->lock, flags);
1251
1252         return 0;
1253 }
1254
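/*
 * Program the root table address into the hardware (setting the SMT bit when
 * scalable mode is in use), issue a Set Root Table Pointer, and then globally
 * invalidate the context cache, PASID cache and IOTLB so that no stale
 * translations survive the switch.
 */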
1255 static void iommu_set_root_entry(struct intel_iommu *iommu)
1256 {
1257         u64 addr;
1258         u32 sts;
1259         unsigned long flag;
1260
1261         addr = virt_to_phys(iommu->root_entry);
1262         if (sm_supported(iommu))
1263                 addr |= DMA_RTADDR_SMT;
1264
1265         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1266         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1267
1268         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1269
1270         /* Make sure hardware complete it */
1271         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1272                       readl, (sts & DMA_GSTS_RTPS), sts);
1273
1274         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1275
1276         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1277         if (sm_supported(iommu))
1278                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1279         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1280 }
1281
1282 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1283 {
1284         u32 val;
1285         unsigned long flag;
1286
1287         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1288                 return;
1289
1290         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1291         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1292
1293         /* Make sure hardware complete it */
1294         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1295                       readl, (!(val & DMA_GSTS_WBFS)), val);
1296
1297         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1298 }
1299
1300 /* return value determines whether we need a write buffer flush */
1301 static void __iommu_flush_context(struct intel_iommu *iommu,
1302                                   u16 did, u16 source_id, u8 function_mask,
1303                                   u64 type)
1304 {
1305         u64 val = 0;
1306         unsigned long flag;
1307
1308         switch (type) {
1309         case DMA_CCMD_GLOBAL_INVL:
1310                 val = DMA_CCMD_GLOBAL_INVL;
1311                 break;
1312         case DMA_CCMD_DOMAIN_INVL:
1313                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1314                 break;
1315         case DMA_CCMD_DEVICE_INVL:
1316                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1317                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1318                 break;
1319         default:
1320                 BUG();
1321         }
1322         val |= DMA_CCMD_ICC;
1323
1324         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1326
1327         /* Make sure hardware complete it */
1328         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1329                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1330
1331         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1332 }
1333
1334 /* return value determines whether we need a write buffer flush */
1335 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1336                                 u64 addr, unsigned int size_order, u64 type)
1337 {
1338         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1339         u64 val = 0, val_iva = 0;
1340         unsigned long flag;
1341
1342         switch (type) {
1343         case DMA_TLB_GLOBAL_FLUSH:
1344                 /* a global flush doesn't need to set IVA_REG */
1345                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1346                 break;
1347         case DMA_TLB_DSI_FLUSH:
1348                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1349                 break;
1350         case DMA_TLB_PSI_FLUSH:
1351                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1352                 /* IH bit is passed in as part of address */
1353                 val_iva = size_order | addr;
1354                 break;
1355         default:
1356                 BUG();
1357         }
1358         /* Note: set drain read/write */
1359 #if 0
1360         /*
1361          * This is probably only here to be extra safe. It looks like we
1362          * can ignore it without any impact.
1363          */
1364         if (cap_read_drain(iommu->cap))
1365                 val |= DMA_TLB_READ_DRAIN;
1366 #endif
1367         if (cap_write_drain(iommu->cap))
1368                 val |= DMA_TLB_WRITE_DRAIN;
1369
1370         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1371         /* Note: Only uses first TLB reg currently */
1372         if (val_iva)
1373                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1374         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1375
1376         /* Make sure hardware complete it */
1377         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1378                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1379
1380         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1381
1382         /* check IOTLB invalidation granularity */
1383         if (DMA_TLB_IAIG(val) == 0)
1384                 pr_err("Flush IOTLB failed\n");
1385         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1386                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1387                         (unsigned long long)DMA_TLB_IIRG(type),
1388                         (unsigned long long)DMA_TLB_IAIG(val));
1389 }
1390
1391 static struct device_domain_info *
1392 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1393                          u8 bus, u8 devfn)
1394 {
1395         struct device_domain_info *info;
1396
1397         assert_spin_locked(&device_domain_lock);
1398
1399         if (!iommu->qi)
1400                 return NULL;
1401
1402         list_for_each_entry(info, &domain->devices, link)
1403                 if (info->iommu == iommu && info->bus == bus &&
1404                     info->devfn == devfn) {
1405                         if (info->ats_supported && info->dev)
1406                                 return info;
1407                         break;
1408                 }
1409
1410         return NULL;
1411 }
1412
1413 static void domain_update_iotlb(struct dmar_domain *domain)
1414 {
1415         struct device_domain_info *info;
1416         bool has_iotlb_device = false;
1417
1418         assert_spin_locked(&device_domain_lock);
1419
1420         list_for_each_entry(info, &domain->devices, link)
1421                 if (info->ats_enabled) {
1422                         has_iotlb_device = true;
1423                         break;
1424                 }
1425
1426         domain->has_iotlb_device = has_iotlb_device;
1427 }
1428
1429 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1430 {
1431         struct pci_dev *pdev;
1432
1433         assert_spin_locked(&device_domain_lock);
1434
1435         if (!info || !dev_is_pci(info->dev))
1436                 return;
1437
1438         pdev = to_pci_dev(info->dev);
1439         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1440          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1441          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1442          * reserved, which should be set to 0.
1443          */
1444         if (!ecap_dit(info->iommu->ecap))
1445                 info->pfsid = 0;
1446         else {
1447                 struct pci_dev *pf_pdev;
1448
1449                 /* pdev will be returned if device is not a vf */
1450                 pf_pdev = pci_physfn(pdev);
1451                 info->pfsid = pci_dev_id(pf_pdev);
1452         }
1453
1454 #ifdef CONFIG_INTEL_IOMMU_SVM
1455         /* The PCIe spec, in its wisdom, declares that the behaviour of
1456            the device if you enable PASID support after ATS support is
1457            undefined. So always enable PASID support on devices which
1458            have it, even if we can't yet know if we're ever going to
1459            use it. */
1460         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1461                 info->pasid_enabled = 1;
1462
1463         if (info->pri_supported &&
1464             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1465             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1466                 info->pri_enabled = 1;
1467 #endif
1468         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1469             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1470                 info->ats_enabled = 1;
1471                 domain_update_iotlb(info->domain);
1472                 info->ats_qdep = pci_ats_queue_depth(pdev);
1473         }
1474 }
1475
1476 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1477 {
1478         struct pci_dev *pdev;
1479
1480         assert_spin_locked(&device_domain_lock);
1481
1482         if (!dev_is_pci(info->dev))
1483                 return;
1484
1485         pdev = to_pci_dev(info->dev);
1486
1487         if (info->ats_enabled) {
1488                 pci_disable_ats(pdev);
1489                 info->ats_enabled = 0;
1490                 domain_update_iotlb(info->domain);
1491         }
1492 #ifdef CONFIG_INTEL_IOMMU_SVM
1493         if (info->pri_enabled) {
1494                 pci_disable_pri(pdev);
1495                 info->pri_enabled = 0;
1496         }
1497         if (info->pasid_enabled) {
1498                 pci_disable_pasid(pdev);
1499                 info->pasid_enabled = 0;
1500         }
1501 #endif
1502 }
1503
1504 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1505                                     u64 addr, unsigned int mask)
1506 {
1507         u16 sid, qdep;
1508
1509         if (!info || !info->ats_enabled)
1510                 return;
1511
1512         sid = info->bus << 8 | info->devfn;
1513         qdep = info->ats_qdep;
1514         qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1515                            qdep, addr, mask);
1516 }
1517
1518 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1519                                   u64 addr, unsigned mask)
1520 {
1521         unsigned long flags;
1522         struct device_domain_info *info;
1523
1524         if (!domain->has_iotlb_device)
1525                 return;
1526
1527         spin_lock_irqsave(&device_domain_lock, flags);
1528         list_for_each_entry(info, &domain->devices, link)
1529                 __iommu_flush_dev_iotlb(info, addr, mask);
1530
1531         spin_unlock_irqrestore(&device_domain_lock, flags);
1532 }
1533
1534 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1535                                   struct dmar_domain *domain,
1536                                   unsigned long pfn, unsigned int pages,
1537                                   int ih, int map)
1538 {
1539         unsigned int aligned_pages = __roundup_pow_of_two(pages);
1540         unsigned int mask = ilog2(aligned_pages);
1541         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1542         u16 did = domain->iommu_did[iommu->seq_id];
1543
1544         BUG_ON(pages == 0);
1545
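             /*
              * A non-zero ih is folded into bit 6 of the address used for the
              * invalidation below; that bit position carries the Invalidation
              * Hint (IH) in the VT-d invalidation address layout.
              */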
1546         if (ih)
1547                 ih = 1 << 6;
1548
1549         if (domain_use_first_level(domain)) {
1550                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1551         } else {
1552                 unsigned long bitmask = aligned_pages - 1;
1553
1554                 /*
1555                  * PSI masks the low order bits of the base address. If the
1556                  * address isn't aligned to the mask, then compute a mask value
1557                  * needed to ensure the target range is flushed.
1558                  */
1559                 if (unlikely(bitmask & pfn)) {
1560                         unsigned long end_pfn = pfn + pages - 1, shared_bits;
1561
1562                         /*
1563                          * Since end_pfn <= pfn + bitmask, the only way bits
1564                          * higher than bitmask can differ in pfn and end_pfn is
1565                          * by carrying. This means after masking out bitmask,
1566                          * high bits starting with the first set bit in
1567                          * shared_bits are all equal in both pfn and end_pfn.
1568                          */
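                             /*
                              * Worked example (illustrative): pfn = 0x1fe and
                              * pages = 4 give bitmask = 0x3 and end_pfn = 0x201.
                              * pfn ^ end_pfn = 0x3ff, so shared_bits is
                              * ~0x3ff & ~0x3, whose lowest set bit is bit 10,
                              * and mask becomes 10. A PSI flush with mask 10
                              * covers pfns 0x000-0x3ff, which includes the
                              * whole 0x1fe-0x201 range.
                              */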
1569                         shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1570                         mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1571                 }
1572
1573                 /*
1574                  * Fall back to a domain-selective flush if PSI is not
1575                  * supported or the size is too big.
1576                  */
1577                 if (!cap_pgsel_inv(iommu->cap) ||
1578                     mask > cap_max_amask_val(iommu->cap))
1579                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1580                                                         DMA_TLB_DSI_FLUSH);
1581                 else
1582                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1583                                                         DMA_TLB_PSI_FLUSH);
1584         }
1585
1586         /*
1587          * In caching mode, changes of pages from non-present to present require
1588          * a flush. However, the device IOTLB doesn't need to be flushed here.
1589          */
1590         if (!cap_caching_mode(iommu->cap) || !map)
1591                 iommu_flush_dev_iotlb(domain, addr, mask);
1592 }
1593
1594 /* Notification for newly created mappings */
1595 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1596                                         struct dmar_domain *domain,
1597                                         unsigned long pfn, unsigned int pages)
1598 {
1599         /*
1600          * It's a non-present to present mapping. Only flush if caching mode
1601          * is enabled and second-level translation is in use.
1602          */
1603         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1604                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1605         else
1606                 iommu_flush_write_buffer(iommu);
1607 }
1608
1609 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1610 {
1611         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1612         int idx;
1613
1614         for_each_domain_iommu(idx, dmar_domain) {
1615                 struct intel_iommu *iommu = g_iommus[idx];
1616                 u16 did = dmar_domain->iommu_did[iommu->seq_id];
1617
1618                 if (domain_use_first_level(dmar_domain))
1619                         qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1620                 else
1621                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1622                                                  DMA_TLB_DSI_FLUSH);
1623
1624                 if (!cap_caching_mode(iommu->cap))
1625                         iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1626         }
1627 }
1628
1629 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1630 {
1631         u32 pmen;
1632         unsigned long flags;
1633
1634         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1635                 return;
1636
1637         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1638         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1639         pmen &= ~DMA_PMEN_EPM;
1640         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1641
1642         /* wait for the protected region status bit to clear */
1643         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1644                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1645
1646         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1647 }
1648
1649 static void iommu_enable_translation(struct intel_iommu *iommu)
1650 {
1651         u32 sts;
1652         unsigned long flags;
1653
1654         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1655         iommu->gcmd |= DMA_GCMD_TE;
1656         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1657
1658         /* Make sure hardware completes it */
1659         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1660                       readl, (sts & DMA_GSTS_TES), sts);
1661
1662         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1663 }
1664
1665 static void iommu_disable_translation(struct intel_iommu *iommu)
1666 {
1667         u32 sts;
1668         unsigned long flag;
1669
1670         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1671             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1672                 return;
1673
1674         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1675         iommu->gcmd &= ~DMA_GCMD_TE;
1676         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1677
1678         /* Make sure hardware completes it */
1679         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1680                       readl, (!(sts & DMA_GSTS_TES)), sts);
1681
1682         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1683 }
1684
1685 static int iommu_init_domains(struct intel_iommu *iommu)
1686 {
1687         u32 ndomains;
1688
1689         ndomains = cap_ndoms(iommu->cap);
1690         pr_debug("%s: Number of Domains supported <%d>\n",
1691                  iommu->name, ndomains);
1692
1693         spin_lock_init(&iommu->lock);
1694
1695         iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1696         if (!iommu->domain_ids)
1697                 return -ENOMEM;
1698
1699         /*
1700          * If Caching mode is set, then invalid translations are tagged
1701          * with domain-id 0, hence we need to pre-allocate it. We also
1702          * use domain-id 0 as a marker for non-allocated domain-id, so
1703          * make sure it is not used for a real domain.
1704          */
1705         set_bit(0, iommu->domain_ids);
1706
1707         /*
1708          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1709          * entry for first-level or pass-through translation modes
1710          * be programmed with a domain id different from those used for
1711          * second-level or nested translation. We reserve a domain id for
1712          * this purpose.
1713          */
1714         if (sm_supported(iommu))
1715                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1716
1717         return 0;
1718 }
1719
1720 static void disable_dmar_iommu(struct intel_iommu *iommu)
1721 {
1722         struct device_domain_info *info, *tmp;
1723         unsigned long flags;
1724
1725         if (!iommu->domain_ids)
1726                 return;
1727
1728         spin_lock_irqsave(&device_domain_lock, flags);
1729         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1730                 if (info->iommu != iommu)
1731                         continue;
1732
1733                 if (!info->dev || !info->domain)
1734                         continue;
1735
1736                 __dmar_remove_one_dev_info(info);
1737         }
1738         spin_unlock_irqrestore(&device_domain_lock, flags);
1739
1740         if (iommu->gcmd & DMA_GCMD_TE)
1741                 iommu_disable_translation(iommu);
1742 }
1743
1744 static void free_dmar_iommu(struct intel_iommu *iommu)
1745 {
1746         if (iommu->domain_ids) {
1747                 bitmap_free(iommu->domain_ids);
1748                 iommu->domain_ids = NULL;
1749         }
1750
1751         g_iommus[iommu->seq_id] = NULL;
1752
1753         /* free context mapping */
1754         free_context_table(iommu);
1755
1756 #ifdef CONFIG_INTEL_IOMMU_SVM
1757         if (pasid_supported(iommu)) {
1758                 if (ecap_prs(iommu->ecap))
1759                         intel_svm_finish_prq(iommu);
1760         }
1761         if (vccap_pasid(iommu->vccap))
1762                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1763
1764 #endif
1765 }
1766
1767 /*
1768  * Check and return whether first level is used by default for
1769  * DMA translation.
1770  */
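     /*
      * In short (as implemented below): legacy mode always uses the second
      * level; if only one of FL/SL is usable across all IOMMUs, that one is
      * chosen; if both are usable, FL is preferred except for
      * IOMMU_DOMAIN_UNMANAGED domains, which keep using SL.
      */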
1771 static bool first_level_by_default(unsigned int type)
1772 {
1773         /* Only SL is available in legacy mode */
1774         if (!scalable_mode_support())
1775                 return false;
1776
1777         /* Only one level (either FL or SL) is available, just use it */
1778         if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1779                 return intel_cap_flts_sanity();
1780
1781         /* Both levels are available, decide based on the domain type */
1782         return type != IOMMU_DOMAIN_UNMANAGED;
1783 }
1784
1785 static struct dmar_domain *alloc_domain(unsigned int type)
1786 {
1787         struct dmar_domain *domain;
1788
1789         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1790         if (!domain)
1791                 return NULL;
1792
1793         domain->nid = NUMA_NO_NODE;
1794         if (first_level_by_default(type))
1795                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1796         domain->has_iotlb_device = false;
1797         INIT_LIST_HEAD(&domain->devices);
1798
1799         return domain;
1800 }
1801
1802 /* Must be called with device_domain_lock and iommu->lock held */
1803 static int domain_attach_iommu(struct dmar_domain *domain,
1804                                struct intel_iommu *iommu)
1805 {
1806         unsigned long ndomains;
1807         int num;
1808
1809         assert_spin_locked(&device_domain_lock);
1810         assert_spin_locked(&iommu->lock);
1811
1812         domain->iommu_refcnt[iommu->seq_id] += 1;
1813         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1814                 ndomains = cap_ndoms(iommu->cap);
1815                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1816
1817                 if (num >= ndomains) {
1818                         pr_err("%s: No free domain ids\n", iommu->name);
1819                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1820                         return -ENOSPC;
1821                 }
1822
1823                 set_bit(num, iommu->domain_ids);
1824                 domain->iommu_did[iommu->seq_id] = num;
1825                 domain->nid                      = iommu->node;
1826                 domain_update_iommu_cap(domain);
1827         }
1828
1829         return 0;
1830 }
1831
1832 static void domain_detach_iommu(struct dmar_domain *domain,
1833                                 struct intel_iommu *iommu)
1834 {
1835         int num;
1836
1837         assert_spin_locked(&device_domain_lock);
1838         assert_spin_locked(&iommu->lock);
1839
1840         domain->iommu_refcnt[iommu->seq_id] -= 1;
1841         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1842                 num = domain->iommu_did[iommu->seq_id];
1843                 clear_bit(num, iommu->domain_ids);
1844                 domain_update_iommu_cap(domain);
1845                 domain->iommu_did[iommu->seq_id] = 0;
1846         }
1847 }
1848
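     /*
      * Round the guest address width up so that (agaw - 12) is a multiple of
      * the 9-bit page-table stride, capping the result at 64. For example
      * (illustrative): gaw = 48 stays 48 (r == 0), while gaw = 40 gives
      * r = 1 and rounds up to 48.
      */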
1849 static inline int guestwidth_to_adjustwidth(int gaw)
1850 {
1851         int agaw;
1852         int r = (gaw - 12) % 9;
1853
1854         if (r == 0)
1855                 agaw = gaw;
1856         else
1857                 agaw = gaw + 9 - r;
1858         if (agaw > 64)
1859                 agaw = 64;
1860         return agaw;
1861 }
1862
1863 static void domain_exit(struct dmar_domain *domain)
1864 {
1865
1866         /* Remove associated devices and clear attached or cached domains */
1867         domain_remove_dev_info(domain);
1868
1869         if (domain->pgd) {
1870                 LIST_HEAD(freelist);
1871
1872                 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1873                 put_pages_list(&freelist);
1874         }
1875
1876         kfree(domain);
1877 }
1878
1879 /*
1880  * Get the PASID directory size for a scalable mode context entry.
1881  * A value of X in the PDTS field of a scalable mode context entry
1882  * indicates a PASID directory with 2^(X + 7) entries.
1883  */
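     /*
      * For example (assuming PASID_PDE_SHIFT is 6, i.e. 64 PASID-table
      * entries per directory entry): a full 20-bit PASID space gives
      * max_pde = 1 << 14, so pds = 14 and the function returns 7,
      * encoding a directory with 2^(7 + 7) = 16384 entries.
      */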
1884 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1885 {
1886         unsigned long pds, max_pde;
1887
1888         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1889         pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1890         if (pds < 7)
1891                 return 0;
1892
1893         return pds - 7;
1894 }
1895
1896 /*
1897  * Set the RID_PASID field of a scalable mode context entry. The
1898  * IOMMU hardware will use the PASID value set in this field for
1899  * DMA translations of DMA requests without PASID.
1900  */
1901 static inline void
1902 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1903 {
1904         context->hi |= pasid & ((1 << 20) - 1);
1905 }
1906
1907 /*
1908  * Set the DTE (Device-TLB Enable) field of a scalable mode context
1909  * entry.
1910  */
1911 static inline void context_set_sm_dte(struct context_entry *context)
1912 {
1913         context->lo |= (1 << 2);
1914 }
1915
1916 /*
1917  * Set the PRE (Page Request Enable) field of a scalable mode context
1918  * entry.
1919  */
1920 static inline void context_set_sm_pre(struct context_entry *context)
1921 {
1922         context->lo |= (1 << 4);
1923 }
1924
1925 /* Convert value to context PASID directory size field coding. */
1926 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1927
1928 static int domain_context_mapping_one(struct dmar_domain *domain,
1929                                       struct intel_iommu *iommu,
1930                                       struct pasid_table *table,
1931                                       u8 bus, u8 devfn)
1932 {
1933         u16 did = domain->iommu_did[iommu->seq_id];
1934         int translation = CONTEXT_TT_MULTI_LEVEL;
1935         struct device_domain_info *info = NULL;
1936         struct context_entry *context;
1937         unsigned long flags;
1938         int ret;
1939
1940         WARN_ON(did == 0);
1941
1942         if (hw_pass_through && domain_type_is_si(domain))
1943                 translation = CONTEXT_TT_PASS_THROUGH;
1944
1945         pr_debug("Set context mapping for %02x:%02x.%d\n",
1946                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1947
1948         BUG_ON(!domain->pgd);
1949
1950         spin_lock_irqsave(&device_domain_lock, flags);
1951         spin_lock(&iommu->lock);
1952
1953         ret = -ENOMEM;
1954         context = iommu_context_addr(iommu, bus, devfn, 1);
1955         if (!context)
1956                 goto out_unlock;
1957
1958         ret = 0;
1959         if (context_present(context))
1960                 goto out_unlock;
1961
1962         /*
1963          * For kdump cases, old valid entries may be cached due to
1964          * in-flight DMA and the copied page table, but there is no
1965          * unmap operation for them, so we need an explicit cache flush
1966          * for the newly-mapped device. At this point the device is
1967          * expected to have finished its reset during driver probe, so
1968          * no in-flight DMA remains and nothing further needs to be done
1969          * hereafter.
1970          */
1971         if (context_copied(context)) {
1972                 u16 did_old = context_domain_id(context);
1973
1974                 if (did_old < cap_ndoms(iommu->cap)) {
1975                         iommu->flush.flush_context(iommu, did_old,
1976                                                    (((u16)bus) << 8) | devfn,
1977                                                    DMA_CCMD_MASK_NOBIT,
1978                                                    DMA_CCMD_DEVICE_INVL);
1979                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1980                                                  DMA_TLB_DSI_FLUSH);
1981                 }
1982         }
1983
1984         context_clear_entry(context);
1985
1986         if (sm_supported(iommu)) {
1987                 unsigned long pds;
1988
1989                 WARN_ON(!table);
1990
1991                 /* Setup the PASID DIR pointer: */
1992                 pds = context_get_sm_pds(table);
1993                 context->lo = (u64)virt_to_phys(table->table) |
1994                                 context_pdts(pds);
1995
1996                 /* Setup the RID_PASID field: */
1997                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
1998
1999                 /*
2000                  * Setup the Device-TLB enable bit and Page request
2001                  * Enable bit:
2002                  */
2003                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2004                 if (info && info->ats_supported)
2005                         context_set_sm_dte(context);
2006                 if (info && info->pri_supported)
2007                         context_set_sm_pre(context);
2008         } else {
2009                 struct dma_pte *pgd = domain->pgd;
2010                 int agaw;
2011
2012                 context_set_domain_id(context, did);
2013
2014                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2015                         /*
2016                          * Skip the top levels of the page tables for an IOMMU whose
2017                          * agaw is smaller than the domain's. Unnecessary for PT mode.
2018                          */
2019                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2020                                 ret = -ENOMEM;
2021                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2022                                 if (!dma_pte_present(pgd))
2023                                         goto out_unlock;
2024                         }
2025
2026                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2027                         if (info && info->ats_supported)
2028                                 translation = CONTEXT_TT_DEV_IOTLB;
2029                         else
2030                                 translation = CONTEXT_TT_MULTI_LEVEL;
2031
2032                         context_set_address_root(context, virt_to_phys(pgd));
2033                         context_set_address_width(context, agaw);
2034                 } else {
2035                         /*
2036                          * In pass-through mode, AW must be programmed to
2037                          * indicate the largest AGAW value supported by the
2038                          * hardware, and ASR is ignored by the hardware.
2039                          */
2040                         context_set_address_width(context, iommu->msagaw);
2041                 }
2042
2043                 context_set_translation_type(context, translation);
2044         }
2045
2046         context_set_fault_enable(context);
2047         context_set_present(context);
2048         if (!ecap_coherent(iommu->ecap))
2049                 clflush_cache_range(context, sizeof(*context));
2050
2051         /*
2052          * It's a non-present to present mapping. If hardware doesn't cache
2053          * non-present entries, we only need to flush the write-buffer. If it
2054          * _does_ cache non-present entries, then it does so in the special
2055          * domain #0, which we have to flush:
2056          */
2057         if (cap_caching_mode(iommu->cap)) {
2058                 iommu->flush.flush_context(iommu, 0,
2059                                            (((u16)bus) << 8) | devfn,
2060                                            DMA_CCMD_MASK_NOBIT,
2061                                            DMA_CCMD_DEVICE_INVL);
2062                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2063         } else {
2064                 iommu_flush_write_buffer(iommu);
2065         }
2066         iommu_enable_dev_iotlb(info);
2067
2068         ret = 0;
2069
2070 out_unlock:
2071         spin_unlock(&iommu->lock);
2072         spin_unlock_irqrestore(&device_domain_lock, flags);
2073
2074         return ret;
2075 }
2076
2077 struct domain_context_mapping_data {
2078         struct dmar_domain *domain;
2079         struct intel_iommu *iommu;
2080         struct pasid_table *table;
2081 };
2082
2083 static int domain_context_mapping_cb(struct pci_dev *pdev,
2084                                      u16 alias, void *opaque)
2085 {
2086         struct domain_context_mapping_data *data = opaque;
2087
2088         return domain_context_mapping_one(data->domain, data->iommu,
2089                                           data->table, PCI_BUS_NUM(alias),
2090                                           alias & 0xff);
2091 }
2092
2093 static int
2094 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2095 {
2096         struct domain_context_mapping_data data;
2097         struct pasid_table *table;
2098         struct intel_iommu *iommu;
2099         u8 bus, devfn;
2100
2101         iommu = device_to_iommu(dev, &bus, &devfn);
2102         if (!iommu)
2103                 return -ENODEV;
2104
2105         table = intel_pasid_get_table(dev);
2106
2107         if (!dev_is_pci(dev))
2108                 return domain_context_mapping_one(domain, iommu, table,
2109                                                   bus, devfn);
2110
2111         data.domain = domain;
2112         data.iommu = iommu;
2113         data.table = table;
2114
2115         return pci_for_each_dma_alias(to_pci_dev(dev),
2116                                       &domain_context_mapping_cb, &data);
2117 }
2118
2119 static int domain_context_mapped_cb(struct pci_dev *pdev,
2120                                     u16 alias, void *opaque)
2121 {
2122         struct intel_iommu *iommu = opaque;
2123
2124         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2125 }
2126
2127 static int domain_context_mapped(struct device *dev)
2128 {
2129         struct intel_iommu *iommu;
2130         u8 bus, devfn;
2131
2132         iommu = device_to_iommu(dev, &bus, &devfn);
2133         if (!iommu)
2134                 return -ENODEV;
2135
2136         if (!dev_is_pci(dev))
2137                 return device_context_mapped(iommu, bus, devfn);
2138
2139         return !pci_for_each_dma_alias(to_pci_dev(dev),
2140                                        domain_context_mapped_cb, iommu);
2141 }
2142
2143 /* Returns the number of VT-d pages, aligned up to the MM page size */
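     /*
      * E.g. (assuming 4KiB MM pages): host_addr = 0x1200 and size = 0xe08
      * span the two MM pages at 0x1000 and 0x2000; the in-page offset 0x200
      * plus the size rounds up to 0x2000, i.e. two 4KiB VT-d pages.
      */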
2144 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2145                                             size_t size)
2146 {
2147         host_addr &= ~PAGE_MASK;
2148         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2149 }
2150
2151 /* Return largest possible superpage level for a given mapping */
2152 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2153                                           unsigned long iov_pfn,
2154                                           unsigned long phy_pfn,
2155                                           unsigned long pages)
2156 {
2157         int support, level = 1;
2158         unsigned long pfnmerge;
2159
2160         support = domain->iommu_superpage;
2161
2162         /* To use a large page, the virtual *and* physical addresses
2163            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2164            of them will mean we have to use smaller pages. So just
2165            merge them and check both at once. */
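             /* For instance, if iov_pfn and phy_pfn are both 2MiB-aligned
                (low 9 bits clear) and at least 512 pages are being mapped,
                the loop below yields level 2, i.e. a 2MiB superpage, subject
                to domain->iommu_superpage. */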
2166         pfnmerge = iov_pfn | phy_pfn;
2167
2168         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2169                 pages >>= VTD_STRIDE_SHIFT;
2170                 if (!pages)
2171                         break;
2172                 pfnmerge >>= VTD_STRIDE_SHIFT;
2173                 level++;
2174                 support--;
2175         }
2176         return level;
2177 }
2178
2179 /*
2180  * Ensure that old small page tables are removed to make room for superpage(s).
2181  * We're going to add new large pages, so make sure we don't remove their parent
2182  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2183  */
2184 static void switch_to_super_page(struct dmar_domain *domain,
2185                                  unsigned long start_pfn,
2186                                  unsigned long end_pfn, int level)
2187 {
2188         unsigned long lvl_pages = lvl_to_nr_pages(level);
2189         struct dma_pte *pte = NULL;
2190         int i;
2191
2192         while (start_pfn <= end_pfn) {
2193                 if (!pte)
2194                         pte = pfn_to_dma_pte(domain, start_pfn, &level);
2195
2196                 if (dma_pte_present(pte)) {
2197                         dma_pte_free_pagetable(domain, start_pfn,
2198                                                start_pfn + lvl_pages - 1,
2199                                                level + 1);
2200
2201                         for_each_domain_iommu(i, domain)
2202                                 iommu_flush_iotlb_psi(g_iommus[i], domain,
2203                                                       start_pfn, lvl_pages,
2204                                                       0, 0);
2205                 }
2206
2207                 pte++;
2208                 start_pfn += lvl_pages;
2209                 if (first_pte_in_page(pte))
2210                         pte = NULL;
2211         }
2212 }
2213
2214 static int
2215 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2216                  unsigned long phys_pfn, unsigned long nr_pages, int prot)
2217 {
2218         struct dma_pte *first_pte = NULL, *pte = NULL;
2219         unsigned int largepage_lvl = 0;
2220         unsigned long lvl_pages = 0;
2221         phys_addr_t pteval;
2222         u64 attr;
2223
2224         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2225
2226         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2227                 return -EINVAL;
2228
2229         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2230         attr |= DMA_FL_PTE_PRESENT;
2231         if (domain_use_first_level(domain)) {
2232                 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2233                 if (prot & DMA_PTE_WRITE)
2234                         attr |= DMA_FL_PTE_DIRTY;
2235         }
2236
2237         pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2238
2239         while (nr_pages > 0) {
2240                 uint64_t tmp;
2241
2242                 if (!pte) {
2243                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2244                                         phys_pfn, nr_pages);
2245
2246                         pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2247                         if (!pte)
2248                                 return -ENOMEM;
2249                         first_pte = pte;
2250
2251                         lvl_pages = lvl_to_nr_pages(largepage_lvl);
2252
2253                         /* It is a large page */
2254                         if (largepage_lvl > 1) {
2255                                 unsigned long end_pfn;
2256                                 unsigned long pages_to_remove;
2257
2258                                 pteval |= DMA_PTE_LARGE_PAGE;
2259                                 pages_to_remove = min_t(unsigned long, nr_pages,
2260                                                         nr_pte_to_next_page(pte) * lvl_pages);
2261                                 end_pfn = iov_pfn + pages_to_remove - 1;
2262                                 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2263                         } else {
2264                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2265                         }
2266
2267                 }
2268                 /* We don't need a lock here; nobody else
2269                  * touches this IOVA range
2270                  */
2271                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2272                 if (tmp) {
2273                         static int dumps = 5;
2274                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2275                                 iov_pfn, tmp, (unsigned long long)pteval);
2276                         if (dumps) {
2277                                 dumps--;
2278                                 debug_dma_dump_mappings(NULL);
2279                         }
2280                         WARN_ON(1);
2281                 }
2282
2283                 nr_pages -= lvl_pages;
2284                 iov_pfn += lvl_pages;
2285                 phys_pfn += lvl_pages;
2286                 pteval += lvl_pages * VTD_PAGE_SIZE;
2287
2288                 /* If the next PTE would be the first in a new page, then we
2289                  * need to flush the cache on the entries we've just written.
2290                  * And then we'll need to recalculate 'pte', so clear it and
2291                  * let it get set again in the if (!pte) block above.
2292                  *
2293                  * If we're done (!nr_pages) we need to flush the cache too.
2294                  *
2295                  * Also if we've been setting superpages, we may need to
2296                  * recalculate 'pte' and switch back to smaller pages for the
2297                  * end of the mapping, if the trailing size is not enough to
2298                  * use another superpage (i.e. nr_pages < lvl_pages).
2299                  */
2300                 pte++;
2301                 if (!nr_pages || first_pte_in_page(pte) ||
2302                     (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2303                         domain_flush_cache(domain, first_pte,
2304                                            (void *)pte - (void *)first_pte);
2305                         pte = NULL;
2306                 }
2307         }
2308
2309         return 0;
2310 }
2311
2312 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2313 {
2314         struct intel_iommu *iommu = info->iommu;
2315         struct context_entry *context;
2316         unsigned long flags;
2317         u16 did_old;
2318
2319         if (!iommu)
2320                 return;
2321
2322         spin_lock_irqsave(&iommu->lock, flags);
2323         context = iommu_context_addr(iommu, bus, devfn, 0);
2324         if (!context) {
2325                 spin_unlock_irqrestore(&iommu->lock, flags);
2326                 return;
2327         }
2328
2329         if (sm_supported(iommu)) {
2330                 if (hw_pass_through && domain_type_is_si(info->domain))
2331                         did_old = FLPT_DEFAULT_DID;
2332                 else
2333                         did_old = info->domain->iommu_did[iommu->seq_id];
2334         } else {
2335                 did_old = context_domain_id(context);
2336         }
2337
2338         context_clear_entry(context);
2339         __iommu_flush_cache(iommu, context, sizeof(*context));
2340         spin_unlock_irqrestore(&iommu->lock, flags);
2341         iommu->flush.flush_context(iommu,
2342                                    did_old,
2343                                    (((u16)bus) << 8) | devfn,
2344                                    DMA_CCMD_MASK_NOBIT,
2345                                    DMA_CCMD_DEVICE_INVL);
2346
2347         if (sm_supported(iommu))
2348                 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2349
2350         iommu->flush.flush_iotlb(iommu,
2351                                  did_old,
2352                                  0,
2353                                  0,
2354                                  DMA_TLB_DSI_FLUSH);
2355
2356         __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2357 }
2358
2359 static void domain_remove_dev_info(struct dmar_domain *domain)
2360 {
2361         struct device_domain_info *info, *tmp;
2362         unsigned long flags;
2363
2364         spin_lock_irqsave(&device_domain_lock, flags);
2365         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2366                 __dmar_remove_one_dev_info(info);
2367         spin_unlock_irqrestore(&device_domain_lock, flags);
2368 }
2369
2370 static inline struct device_domain_info *
2371 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2372 {
2373         struct device_domain_info *info;
2374
2375         list_for_each_entry(info, &device_domain_list, global)
2376                 if (info->segment == segment && info->bus == bus &&
2377                     info->devfn == devfn)
2378                         return info;
2379
2380         return NULL;
2381 }
2382
2383 static int domain_setup_first_level(struct intel_iommu *iommu,
2384                                     struct dmar_domain *domain,
2385                                     struct device *dev,
2386                                     u32 pasid)
2387 {
2388         struct dma_pte *pgd = domain->pgd;
2389         int agaw, level;
2390         int flags = 0;
2391
2392         /*
2393          * Skip the top levels of the page tables for an IOMMU whose
2394          * agaw is smaller than the domain's. Unnecessary for PT mode.
2395          */
2396         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2397                 pgd = phys_to_virt(dma_pte_addr(pgd));
2398                 if (!dma_pte_present(pgd))
2399                         return -ENOMEM;
2400         }
2401
2402         level = agaw_to_level(agaw);
2403         if (level != 4 && level != 5)
2404                 return -EINVAL;
2405
2406         if (pasid != PASID_RID2PASID)
2407                 flags |= PASID_FLAG_SUPERVISOR_MODE;
2408         if (level == 5)
2409                 flags |= PASID_FLAG_FL5LP;
2410
2411         if (domain->force_snooping)
2412                 flags |= PASID_FLAG_PAGE_SNOOP;
2413
2414         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2415                                              domain->iommu_did[iommu->seq_id],
2416                                              flags);
2417 }
2418
2419 static bool dev_is_real_dma_subdevice(struct device *dev)
2420 {
2421         return dev && dev_is_pci(dev) &&
2422                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2423 }
2424
2425 static int iommu_domain_identity_map(struct dmar_domain *domain,
2426                                      unsigned long first_vpfn,
2427                                      unsigned long last_vpfn)
2428 {
2429         /*
2430          * The RMRR range might overlap with a physical memory range,
2431          * so clear it first.
2432          */
2433         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2434
2435         return __domain_mapping(domain, first_vpfn,
2436                                 first_vpfn, last_vpfn - first_vpfn + 1,
2437                                 DMA_PTE_READ|DMA_PTE_WRITE);
2438 }
2439
2440 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2441
2442 static int __init si_domain_init(int hw)
2443 {
2444         struct dmar_rmrr_unit *rmrr;
2445         struct device *dev;
2446         int i, nid, ret;
2447
2448         si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2449         if (!si_domain)
2450                 return -EFAULT;
2451
2452         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2453                 domain_exit(si_domain);
2454                 return -EFAULT;
2455         }
2456
2457         if (hw)
2458                 return 0;
2459
2460         for_each_online_node(nid) {
2461                 unsigned long start_pfn, end_pfn;
2462                 int i;
2463
2464                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2465                         ret = iommu_domain_identity_map(si_domain,
2466                                         mm_to_dma_pfn(start_pfn),
2467                                         mm_to_dma_pfn(end_pfn));
2468                         if (ret)
2469                                 return ret;
2470                 }
2471         }
2472
2473         /*
2474          * Identity map the RMRRs so that devices with RMRRs can also use
2475          * the si_domain.
2476          */
2477         for_each_rmrr_units(rmrr) {
2478                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2479                                           i, dev) {
2480                         unsigned long long start = rmrr->base_address;
2481                         unsigned long long end = rmrr->end_address;
2482
2483                         if (WARN_ON(end < start ||
2484                                     end >> agaw_to_width(si_domain->agaw)))
2485                                 continue;
2486
2487                         ret = iommu_domain_identity_map(si_domain,
2488                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2489                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2490                         if (ret)
2491                                 return ret;
2492                 }
2493         }
2494
2495         return 0;
2496 }
2497
2498 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2499 {
2500         struct device_domain_info *info = dev_iommu_priv_get(dev);
2501         struct intel_iommu *iommu;
2502         unsigned long flags;
2503         u8 bus, devfn;
2504         int ret;
2505
2506         iommu = device_to_iommu(dev, &bus, &devfn);
2507         if (!iommu)
2508                 return -ENODEV;
2509
2510         spin_lock_irqsave(&device_domain_lock, flags);
2511         info->domain = domain;
2512         spin_lock(&iommu->lock);
2513         ret = domain_attach_iommu(domain, iommu);
2514         spin_unlock(&iommu->lock);
2515         if (ret) {
2516                 spin_unlock_irqrestore(&device_domain_lock, flags);
2517                 return ret;
2518         }
2519         list_add(&info->link, &domain->devices);
2520         spin_unlock_irqrestore(&device_domain_lock, flags);
2521
2522         /* PASID table is mandatory for a PCI device in scalable mode. */
2523         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2524                 ret = intel_pasid_alloc_table(dev);
2525                 if (ret) {
2526                         dev_err(dev, "PASID table allocation failed\n");
2527                         dmar_remove_one_dev_info(dev);
2528                         return ret;
2529                 }
2530
2531                 /* Setup the PASID entry for requests without PASID: */
2532                 spin_lock_irqsave(&iommu->lock, flags);
2533                 if (hw_pass_through && domain_type_is_si(domain))
2534                         ret = intel_pasid_setup_pass_through(iommu, domain,
2535                                         dev, PASID_RID2PASID);
2536                 else if (domain_use_first_level(domain))
2537                         ret = domain_setup_first_level(iommu, domain, dev,
2538                                         PASID_RID2PASID);
2539                 else
2540                         ret = intel_pasid_setup_second_level(iommu, domain,
2541                                         dev, PASID_RID2PASID);
2542                 spin_unlock_irqrestore(&iommu->lock, flags);
2543                 if (ret) {
2544                         dev_err(dev, "Setup RID2PASID failed\n");
2545                         dmar_remove_one_dev_info(dev);
2546                         return ret;
2547                 }
2548         }
2549
2550         ret = domain_context_mapping(domain, dev);
2551         if (ret) {
2552                 dev_err(dev, "Domain context map failed\n");
2553                 dmar_remove_one_dev_info(dev);
2554                 return ret;
2555         }
2556
2557         return 0;
2558 }
2559
2560 static bool device_has_rmrr(struct device *dev)
2561 {
2562         struct dmar_rmrr_unit *rmrr;
2563         struct device *tmp;
2564         int i;
2565
2566         rcu_read_lock();
2567         for_each_rmrr_units(rmrr) {
2568                 /*
2569                  * Return TRUE if this RMRR contains the device that
2570                  * is passed in.
2571                  */
2572                 for_each_active_dev_scope(rmrr->devices,
2573                                           rmrr->devices_cnt, i, tmp)
2574                         if (tmp == dev ||
2575                             is_downstream_to_pci_bridge(dev, tmp)) {
2576                                 rcu_read_unlock();
2577                                 return true;
2578                         }
2579         }
2580         rcu_read_unlock();
2581         return false;
2582 }
2583
2584 /**
2585  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2586  * is relaxable (i.e. may be left unenforced under some conditions)
2587  * @dev: device handle
2588  *
2589  * We assume that PCI USB devices with RMRRs have them largely
2590  * for historical reasons and that the RMRR space is not actively used post
2591  * boot.  This exclusion may change if vendors begin to abuse it.
2592  *
2593  * The same exception is made for graphics devices, with the requirement that
2594  * any use of the RMRR regions will be torn down before assigning the device
2595  * to a guest.
2596  *
2597  * Return: true if the RMRR is relaxable, false otherwise
2598  */
2599 static bool device_rmrr_is_relaxable(struct device *dev)
2600 {
2601         struct pci_dev *pdev;
2602
2603         if (!dev_is_pci(dev))
2604                 return false;
2605
2606         pdev = to_pci_dev(dev);
2607         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2608                 return true;
2609         else
2610                 return false;
2611 }
2612
2613 /*
2614  * There are a couple of cases where we need to restrict the functionality of
2615  * devices associated with RMRRs.  The first is when evaluating a device for
2616  * identity mapping because problems exist when devices are moved in and out
2617  * of domains and their respective RMRR information is lost.  This means that
2618  * a device with associated RMRRs will never be in a "passthrough" domain.
2619  * The second is use of the device through the IOMMU API.  This interface
2620  * expects to have full control of the IOVA space for the device.  We cannot
2621  * satisfy both the requirement that RMRR access is maintained and have an
2622  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2623  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2624  * We therefore prevent devices associated with an RMRR from participating in
2625  * the IOMMU API, which eliminates them from device assignment.
2626  *
2627  * In both cases, devices which have relaxable RMRRs are not concerned by this
2628  * restriction. See device_rmrr_is_relaxable comment.
2629  */
2630 static bool device_is_rmrr_locked(struct device *dev)
2631 {
2632         if (!device_has_rmrr(dev))
2633                 return false;
2634
2635         if (device_rmrr_is_relaxable(dev))
2636                 return false;
2637
2638         return true;
2639 }
2640
2641 /*
2642  * Return the required default domain type for a specific device.
2643  *
2644  * @dev: the device in query
2646  *
2647  * Returns:
2648  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2649  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2650  *  - 0: both identity and dynamic domains work for this device
2651  */
2652 static int device_def_domain_type(struct device *dev)
2653 {
2654         if (dev_is_pci(dev)) {
2655                 struct pci_dev *pdev = to_pci_dev(dev);
2656
2657                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2658                         return IOMMU_DOMAIN_IDENTITY;
2659
2660                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2661                         return IOMMU_DOMAIN_IDENTITY;
2662         }
2663
2664         return 0;
2665 }
2666
2667 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2668 {
2669         /*
2670          * Start from a sane IOMMU hardware state.
2671          * If queued invalidation was already initialized by us
2672          * (for example, while enabling interrupt remapping) then
2673          * things are already rolling from a sane state.
2674          */
2675         if (!iommu->qi) {
2676                 /*
2677                  * Clear any previous faults.
2678                  */
2679                 dmar_fault(-1, iommu);
2680                 /*
2681                  * Disable queued invalidation if supported and already enabled
2682                  * before OS handover.
2683                  */
2684                 dmar_disable_qi(iommu);
2685         }
2686
2687         if (dmar_enable_qi(iommu)) {
2688                 /*
2689                  * Queued Invalidate not enabled, use Register Based Invalidate
2690                  */
2691                 iommu->flush.flush_context = __iommu_flush_context;
2692                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2693                 pr_info("%s: Using Register based invalidation\n",
2694                         iommu->name);
2695         } else {
2696                 iommu->flush.flush_context = qi_flush_context;
2697                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2698                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2699         }
2700 }
2701
2702 static int copy_context_table(struct intel_iommu *iommu,
2703                               struct root_entry *old_re,
2704                               struct context_entry **tbl,
2705                               int bus, bool ext)
2706 {
2707         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2708         struct context_entry *new_ce = NULL, ce;
2709         struct context_entry *old_ce = NULL;
2710         struct root_entry re;
2711         phys_addr_t old_ce_phys;
2712
2713         tbl_idx = ext ? bus * 2 : bus;
2714         memcpy(&re, old_re, sizeof(re));
2715
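             /*
              * With extended context entries, each entry occupies two
              * struct context_entry slots, so one 4KiB table only covers 128
              * device functions: the lower half (devfn 0x00-0x7f) and upper
              * half (devfn 0x80-0xff) of each bus end up in tbl[2 * bus] and
              * tbl[2 * bus + 1] respectively (a summary of the loop below).
              */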
2716         for (devfn = 0; devfn < 256; devfn++) {
2717                 /* First calculate the correct index */
2718                 idx = (ext ? devfn * 2 : devfn) % 256;
2719
2720                 if (idx == 0) {
2721                         /* First save what we may have and clean up */
2722                         if (new_ce) {
2723                                 tbl[tbl_idx] = new_ce;
2724                                 __iommu_flush_cache(iommu, new_ce,
2725                                                     VTD_PAGE_SIZE);
2726                                 pos = 1;
2727                         }
2728
2729                         if (old_ce)
2730                                 memunmap(old_ce);
2731
2732                         ret = 0;
2733                         if (devfn < 0x80)
2734                                 old_ce_phys = root_entry_lctp(&re);
2735                         else
2736                                 old_ce_phys = root_entry_uctp(&re);
2737
2738                         if (!old_ce_phys) {
2739                                 if (ext && devfn == 0) {
2740                                         /* No LCTP, try UCTP */
2741                                         devfn = 0x7f;
2742                                         continue;
2743                                 } else {
2744                                         goto out;
2745                                 }
2746                         }
2747
2748                         ret = -ENOMEM;
2749                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2750                                         MEMREMAP_WB);
2751                         if (!old_ce)
2752                                 goto out;
2753
2754                         new_ce = alloc_pgtable_page(iommu->node);
2755                         if (!new_ce)
2756                                 goto out_unmap;
2757
2758                         ret = 0;
2759                 }
2760
2761                 /* Now copy the context entry */
2762                 memcpy(&ce, old_ce + idx, sizeof(ce));
2763
2764                 if (!__context_present(&ce))
2765                         continue;
2766
2767                 did = context_domain_id(&ce);
2768                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2769                         set_bit(did, iommu->domain_ids);
2770
2771                 /*
2772                  * We need a marker for copied context entries. This
2773                  * marker needs to work for the old format as well as
2774                  * for extended context entries.
2775                  *
2776                  * Bit 67 of the context entry is used. In the old
2777                  * format this bit is available to software, in the
2778                  * extended format it is the PGE bit, but PGE is ignored
2779                  * by HW if PASIDs are disabled (and thus still
2780                  * available).
2781                  *
2782                  * So disable PASIDs first and then mark the entry
2783                  * copied. This means that we don't copy PASID
2784                  * translations from the old kernel, but this is fine as
2785                  * faults there are not fatal.
2786                  */
2787                 context_clear_pasid_enable(&ce);
2788                 context_set_copied(&ce);
2789
2790                 new_ce[idx] = ce;
2791         }
2792
2793         tbl[tbl_idx + pos] = new_ce;
2794
2795         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2796
2797 out_unmap:
2798         memunmap(old_ce);
2799
2800 out:
2801         return ret;
2802 }
2803
2804 static int copy_translation_tables(struct intel_iommu *iommu)
2805 {
2806         struct context_entry **ctxt_tbls;
2807         struct root_entry *old_rt;
2808         phys_addr_t old_rt_phys;
2809         int ctxt_table_entries;
2810         unsigned long flags;
2811         u64 rtaddr_reg;
2812         int bus, ret;
2813         bool new_ext, ext;
2814
2815         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2816         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2817         new_ext    = !!ecap_ecs(iommu->ecap);
2818
2819         /*
2820          * The RTT bit can only be changed when translation is disabled,
2821          * but disabling translation would open a window for data
2822          * corruption. So bail out and don't copy anything if we would
2823          * have to change the bit.
2824          */
2825         if (new_ext != ext)
2826                 return -EINVAL;
2827
2828         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2829         if (!old_rt_phys)
2830                 return -EINVAL;
2831
2832         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2833         if (!old_rt)
2834                 return -ENOMEM;
2835
2836         /* This is too big for the stack - allocate it from slab */
2837         ctxt_table_entries = ext ? 512 : 256;
2838         ret = -ENOMEM;
2839         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2840         if (!ctxt_tbls)
2841                 goto out_unmap;
2842
2843         for (bus = 0; bus < 256; bus++) {
2844                 ret = copy_context_table(iommu, &old_rt[bus],
2845                                          ctxt_tbls, bus, ext);
2846                 if (ret) {
2847                         pr_err("%s: Failed to copy context table for bus %d\n",
2848                                 iommu->name, bus);
2849                         continue;
2850                 }
2851         }
2852
2853         spin_lock_irqsave(&iommu->lock, flags);
2854
2855         /* Context tables are copied, now write them to the root_entry table */
2856         for (bus = 0; bus < 256; bus++) {
2857                 int idx = ext ? bus * 2 : bus;
2858                 u64 val;
2859
2860                 if (ctxt_tbls[idx]) {
2861                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2862                         iommu->root_entry[bus].lo = val;
2863                 }
2864
2865                 if (!ext || !ctxt_tbls[idx + 1])
2866                         continue;
2867
2868                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2869                 iommu->root_entry[bus].hi = val;
2870         }
2871
2872         spin_unlock_irqrestore(&iommu->lock, flags);
2873
2874         kfree(ctxt_tbls);
2875
2876         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2877
2878         ret = 0;
2879
2880 out_unmap:
2881         memunmap(old_rt);
2882
2883         return ret;
2884 }
2885
2886 #ifdef CONFIG_INTEL_IOMMU_SVM
2887 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2888 {
2889         struct intel_iommu *iommu = data;
2890         ioasid_t ioasid;
2891
2892         if (!iommu)
2893                 return INVALID_IOASID;
2894         /*
2895          * The VT-d virtual command interface always uses the full 20-bit
2896          * PASID range. The host can partition the guest PASID range based
2897          * on policies, but that is out of the guest's control.
2898          */
2899         if (min < PASID_MIN || max > intel_pasid_max_id)
2900                 return INVALID_IOASID;
2901
2902         if (vcmd_alloc_pasid(iommu, &ioasid))
2903                 return INVALID_IOASID;
2904
2905         return ioasid;
2906 }
2907
2908 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2909 {
2910         struct intel_iommu *iommu = data;
2911
2912         if (!iommu)
2913                 return;
2914         /*
2915          * Sanity checking of the ioasid owner is done at an upper layer,
2916          * e.g. VFIO. We can only free the PASID when all devices are unbound.
2917          */
2918         if (ioasid_find(NULL, ioasid, NULL)) {
2919                 pr_alert("Cannot free active IOASID %d\n", ioasid);
2920                 return;
2921         }
2922         vcmd_free_pasid(iommu, ioasid);
2923 }
2924
2925 static void register_pasid_allocator(struct intel_iommu *iommu)
2926 {
2927         /*
2928          * If we are running in the host, there is no need for a custom
2929          * allocator since PASIDs are allocated host system-wide.
2930          */
2931         if (!cap_caching_mode(iommu->cap))
2932                 return;
2933
2934         if (!sm_supported(iommu)) {
2935                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2936                 return;
2937         }
2938
2939         /*
2940          * Register a custom PASID allocator if we are running in a guest;
2941          * guest PASIDs must be obtained via the virtual command interface.
2942          * There can be multiple vIOMMUs in each guest but only one allocator
2943          * is active. All vIOMMU allocators eventually call the same
2944          * host allocator.
2945          */
2946         if (!vccap_pasid(iommu->vccap))
2947                 return;
2948
2949         pr_info("Register custom PASID allocator\n");
2950         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2951         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2952         iommu->pasid_allocator.pdata = (void *)iommu;
2953         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2954                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2955                 /*
2956                  * Disable scalable mode on this IOMMU if there is no
2957                  * custom allocator. Mixing SM-capable and non-SM
2958                  * vIOMMUs is not supported.
2959                  */
2960                 intel_iommu_sm = 0;
2961         }
2962 }
2963 #endif
2964
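/*
 * Boot-time initialization of all DMAR units: allocate the per-IOMMU
 * bookkeeping, set up queued invalidation, domain ID tracking and root
 * entries (copying the previous kernel's tables in the kdump case),
 * initialize the identity-map domain, and finally enable the page
 * request queue and fault-reporting interrupt on each unit.
 */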
2965 static int __init init_dmars(void)
2966 {
2967         struct dmar_drhd_unit *drhd;
2968         struct intel_iommu *iommu;
2969         int ret;
2970
2971         /*
2972          * for each drhd
2973          *    allocate root
2974          *    initialize and program root entry to not present
2975          * endfor
2976          */
2977         for_each_drhd_unit(drhd) {
2978                 /*
2979                  * No lock needed: this is only incremented in the
2980                  * single-threaded kernel __init code path; all other
2981                  * accesses are read-only.
2982                  */
2983                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
2984                         g_num_of_iommus++;
2985                         continue;
2986                 }
2987                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
2988         }
2989
2990         /* Preallocate enough resources for IOMMU hot-addition */
2991         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
2992                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
2993
2994         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2995                         GFP_KERNEL);
2996         if (!g_iommus) {
2997                 ret = -ENOMEM;
2998                 goto error;
2999         }
3000
3001         ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3002         if (ret)
3003                 goto free_iommu;
3004
3005         for_each_iommu(iommu, drhd) {
3006                 if (drhd->ignored) {
3007                         iommu_disable_translation(iommu);
3008                         continue;
3009                 }
3010
3011                 /*
3012                  * Find the smallest maximum PASID size among all IOMMUs in
3013                  * the system; the system-wide PASID table must be no bigger
3014                  * than the smallest supported size.
3015                  */
3016                 if (pasid_supported(iommu)) {
3017                         u32 temp = 2 << ecap_pss(iommu->ecap);
3018
3019                         intel_pasid_max_id = min_t(u32, temp,
3020                                                    intel_pasid_max_id);
3021                 }
3022
3023                 g_iommus[iommu->seq_id] = iommu;
3024
3025                 intel_iommu_init_qi(iommu);
3026
3027                 ret = iommu_init_domains(iommu);
3028                 if (ret)
3029                         goto free_iommu;
3030
3031                 init_translation_status(iommu);
3032
3033                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3034                         iommu_disable_translation(iommu);
3035                         clear_translation_pre_enabled(iommu);
3036                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3037                                 iommu->name);
3038                 }
3039
3040                 /*
3041                  * TBD:
3042                  * We could share the same root & context tables
3043                  * among all IOMMUs; split that out later if needed.
3044                  */
3045                 ret = iommu_alloc_root_entry(iommu);
3046                 if (ret)
3047                         goto free_iommu;
3048
3049                 if (translation_pre_enabled(iommu)) {
3050                         pr_info("Translation already enabled - trying to copy translation structures\n");
3051
3052                         ret = copy_translation_tables(iommu);
3053                         if (ret) {
3054                                 /*
3055                                  * We found the IOMMU with translation
3056                                  * enabled - but failed to copy over the
3057                                  * old root-entry table. Try to proceed
3058                                  * by disabling translation now and
3059                                  * allocating a clean root-entry table.
3060                                  * This might cause DMAR faults, but
3061                                  * probably the dump will still succeed.
3062                                  */
3063                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3064                                        iommu->name);
3065                                 iommu_disable_translation(iommu);
3066                                 clear_translation_pre_enabled(iommu);
3067                         } else {
3068                                 pr_info("Copied translation tables from previous kernel for %s\n",
3069                                         iommu->name);
3070                         }
3071                 }
3072
3073                 if (!ecap_pass_through(iommu->ecap))
3074                         hw_pass_through = 0;
3075                 intel_svm_check(iommu);
3076         }
3077
3078         /*
3079          * Now that QI is enabled on all IOMMUs, set the root entry and flush
3080          * caches. This is required on some Intel X58 chipsets; otherwise
3081          * flush_context() loops forever and the boot hangs.
3082          */
3083         for_each_active_iommu(iommu, drhd) {
3084                 iommu_flush_write_buffer(iommu);
3085 #ifdef CONFIG_INTEL_IOMMU_SVM
3086                 register_pasid_allocator(iommu);
3087 #endif
3088                 iommu_set_root_entry(iommu);
3089         }
3090
3091 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3092         dmar_map_gfx = 0;
3093 #endif
3094
3095         if (!dmar_map_gfx)
3096                 iommu_identity_mapping |= IDENTMAP_GFX;
3097
3098         check_tylersburg_isoch();
3099
3100         ret = si_domain_init(hw_pass_through);
3101         if (ret)
3102                 goto free_iommu;
3103
3104         /*
3105          * for each drhd
3106          *   enable fault log
3107          *   global invalidate context cache
3108          *   global invalidate iotlb
3109          *   enable translation
3110          */
3111         for_each_iommu(iommu, drhd) {
3112                 if (drhd->ignored) {
3113                         /*
3114                          * we always have to disable PMRs or DMA may fail on
3115                          * this device
3116                          */
3117                         if (force_on)
3118                                 iommu_disable_protect_mem_regions(iommu);
3119                         continue;
3120                 }
3121
3122                 iommu_flush_write_buffer(iommu);
3123
3124 #ifdef CONFIG_INTEL_IOMMU_SVM
3125                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3126                         /*
3127                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3128                          * could cause a lock race, so release the lock here.
3129                          */
3130                         up_write(&dmar_global_lock);
3131                         ret = intel_svm_enable_prq(iommu);
3132                         down_write(&dmar_global_lock);
3133                         if (ret)
3134                                 goto free_iommu;
3135                 }
3136 #endif
3137                 ret = dmar_set_interrupt(iommu);
3138                 if (ret)
3139                         goto free_iommu;
3140         }
3141
3142         return 0;
3143
3144 free_iommu:
3145         for_each_active_iommu(iommu, drhd) {
3146                 disable_dmar_iommu(iommu);
3147                 free_dmar_iommu(iommu);
3148         }
3149
3150         kfree(g_iommus);
3151
3152 error:
3153         return ret;
3154 }
3155
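/*
 * Mark DRHD units that can be skipped: units whose device scope contains
 * no active devices, and (when dmar_map_gfx is clear) units that cover
 * only graphics devices.
 */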
3156 static void __init init_no_remapping_devices(void)
3157 {
3158         struct dmar_drhd_unit *drhd;
3159         struct device *dev;
3160         int i;
3161
3162         for_each_drhd_unit(drhd) {
3163                 if (!drhd->include_all) {
3164                         for_each_active_dev_scope(drhd->devices,
3165                                                   drhd->devices_cnt, i, dev)
3166                                 break;
3167                         /* ignore DMAR unit if no devices exist */
3168                         if (i == drhd->devices_cnt)
3169                                 drhd->ignored = 1;
3170                 }
3171         }
3172
3173         for_each_active_drhd_unit(drhd) {
3174                 if (drhd->include_all)
3175                         continue;
3176
3177                 for_each_active_dev_scope(drhd->devices,
3178                                           drhd->devices_cnt, i, dev)
3179                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3180                                 break;
3181                 if (i < drhd->devices_cnt)
3182                         continue;
3183
3184                 /* This IOMMU has *only* gfx devices. Mark it gfx-dedicated,
3185                    and bypass it entirely if gfx mapping is disabled. */
3186                 drhd->gfx_dedicated = 1;
3187                 if (!dmar_map_gfx)
3188                         drhd->ignored = 1;
3189         }
3190 }
3191
3192 #ifdef CONFIG_SUSPEND
3193 static int init_iommu_hw(void)
3194 {
3195         struct dmar_drhd_unit *drhd;
3196         struct intel_iommu *iommu = NULL;
3197
3198         for_each_active_iommu(iommu, drhd)
3199                 if (iommu->qi)
3200                         dmar_reenable_qi(iommu);
3201
3202         for_each_iommu(iommu, drhd) {
3203                 if (drhd->ignored) {
3204                         /*
3205                          * we always have to disable PMRs or DMA may fail on
3206                          * this device
3207                          */
3208                         if (force_on)
3209                                 iommu_disable_protect_mem_regions(iommu);
3210                         continue;
3211                 }
3212
3213                 iommu_flush_write_buffer(iommu);
3214                 iommu_set_root_entry(iommu);
3215                 iommu_enable_translation(iommu);
3216                 iommu_disable_protect_mem_regions(iommu);
3217         }
3218
3219         return 0;
3220 }
3221
3222 static void iommu_flush_all(void)
3223 {
3224         struct dmar_drhd_unit *drhd;
3225         struct intel_iommu *iommu;
3226
3227         for_each_active_iommu(iommu, drhd) {
3228                 iommu->flush.flush_context(iommu, 0, 0, 0,
3229                                            DMA_CCMD_GLOBAL_INVL);
3230                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3231                                          DMA_TLB_GLOBAL_FLUSH);
3232         }
3233 }
3234
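/*
 * Save the fault-event registers of every active IOMMU and disable
 * translation before the system enters a sleep state; iommu_resume()
 * restores them.
 */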
3235 static int iommu_suspend(void)
3236 {
3237         struct dmar_drhd_unit *drhd;
3238         struct intel_iommu *iommu = NULL;
3239         unsigned long flag;
3240
3241         for_each_active_iommu(iommu, drhd) {
3242                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3243                                              GFP_KERNEL);
3244                 if (!iommu->iommu_state)
3245                         goto nomem;
3246         }
3247
3248         iommu_flush_all();
3249
3250         for_each_active_iommu(iommu, drhd) {
3251                 iommu_disable_translation(iommu);
3252
3253                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3254
3255                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3256                         readl(iommu->reg + DMAR_FECTL_REG);
3257                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3258                         readl(iommu->reg + DMAR_FEDATA_REG);
3259                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3260                         readl(iommu->reg + DMAR_FEADDR_REG);
3261                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3262                         readl(iommu->reg + DMAR_FEUADDR_REG);
3263
3264                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3265         }
3266         return 0;
3267
3268 nomem:
3269         for_each_active_iommu(iommu, drhd)
3270                 kfree(iommu->iommu_state);
3271
3272         return -ENOMEM;
3273 }
3274
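/*
 * Re-initialize the IOMMU hardware after resume and restore the
 * fault-event registers saved by iommu_suspend().
 */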
3275 static void iommu_resume(void)
3276 {
3277         struct dmar_drhd_unit *drhd;
3278         struct intel_iommu *iommu = NULL;
3279         unsigned long flag;
3280
3281         if (init_iommu_hw()) {
3282                 if (force_on)
3283                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3284                 else
3285                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3286                 return;
3287         }
3288
3289         for_each_active_iommu(iommu, drhd) {
3290
3291                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3292
3293                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3294                         iommu->reg + DMAR_FECTL_REG);
3295                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3296                         iommu->reg + DMAR_FEDATA_REG);
3297                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3298                         iommu->reg + DMAR_FEADDR_REG);
3299                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3300                         iommu->reg + DMAR_FEUADDR_REG);
3301
3302                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3303         }
3304
3305         for_each_active_iommu(iommu, drhd)
3306                 kfree(iommu->iommu_state);
3307 }
3308
3309 static struct syscore_ops iommu_syscore_ops = {
3310         .resume         = iommu_resume,
3311         .suspend        = iommu_suspend,
3312 };
3313
3314 static void __init init_iommu_pm_ops(void)
3315 {
3316         register_syscore_ops(&iommu_syscore_ops);
3317 }
3318
3319 #else
3320 static inline void init_iommu_pm_ops(void) {}
3321 #endif  /* CONFIG_SUSPEND */
3322
3323 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3324 {
3325         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3326             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3327             rmrr->end_address <= rmrr->base_address ||
3328             arch_rmrr_sanity_check(rmrr))
3329                 return -EINVAL;
3330
3331         return 0;
3332 }
3333
3334 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3335 {
3336         struct acpi_dmar_reserved_memory *rmrr;
3337         struct dmar_rmrr_unit *rmrru;
3338
3339         rmrr = (struct acpi_dmar_reserved_memory *)header;
3340         if (rmrr_sanity_check(rmrr)) {
3341                 pr_warn(FW_BUG
3342                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3343                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3344                            rmrr->base_address, rmrr->end_address,
3345                            dmi_get_system_info(DMI_BIOS_VENDOR),
3346                            dmi_get_system_info(DMI_BIOS_VERSION),
3347                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3348                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3349         }
3350
3351         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3352         if (!rmrru)
3353                 goto out;
3354
3355         rmrru->hdr = header;
3356
3357         rmrru->base_address = rmrr->base_address;
3358         rmrru->end_address = rmrr->end_address;
3359
3360         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3361                                 ((void *)rmrr) + rmrr->header.length,
3362                                 &rmrru->devices_cnt);
3363         if (rmrru->devices_cnt && rmrru->devices == NULL)
3364                 goto free_rmrru;
3365
3366         list_add(&rmrru->list, &dmar_rmrr_units);
3367
3368         return 0;
3369 free_rmrru:
3370         kfree(rmrru);
3371 out:
3372         return -ENOMEM;
3373 }
3374
3375 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3376 {
3377         struct dmar_atsr_unit *atsru;
3378         struct acpi_dmar_atsr *tmp;
3379
3380         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3381                                 dmar_rcu_check()) {
3382                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3383                 if (atsr->segment != tmp->segment)
3384                         continue;
3385                 if (atsr->header.length != tmp->header.length)
3386                         continue;
3387                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3388                         return atsru;
3389         }
3390
3391         return NULL;
3392 }
3393
3394 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3395 {
3396         struct acpi_dmar_atsr *atsr;
3397         struct dmar_atsr_unit *atsru;
3398
3399         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3400                 return 0;
3401
3402         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3403         atsru = dmar_find_atsr(atsr);
3404         if (atsru)
3405                 return 0;
3406
3407         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3408         if (!atsru)
3409                 return -ENOMEM;
3410
3411         /*
3412          * If the memory was allocated from slab by an ACPI _DSM method, we
3413          * need to copy the contents because the buffer will be freed on
3414          * return.
3415          */
3416         atsru->hdr = (void *)(atsru + 1);
3417         memcpy(atsru->hdr, hdr, hdr->length);
3418         atsru->include_all = atsr->flags & 0x1;
3419         if (!atsru->include_all) {
3420                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3421                                 (void *)atsr + atsr->header.length,
3422                                 &atsru->devices_cnt);
3423                 if (atsru->devices_cnt && atsru->devices == NULL) {
3424                         kfree(atsru);
3425                         return -ENOMEM;
3426                 }
3427         }
3428
3429         list_add_rcu(&atsru->list, &dmar_atsr_units);
3430
3431         return 0;
3432 }
3433
3434 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3435 {
3436         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3437         kfree(atsru);
3438 }
3439
3440 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3441 {
3442         struct acpi_dmar_atsr *atsr;
3443         struct dmar_atsr_unit *atsru;
3444
3445         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3446         atsru = dmar_find_atsr(atsr);
3447         if (atsru) {
3448                 list_del_rcu(&atsru->list);
3449                 synchronize_rcu();
3450                 intel_iommu_free_atsr(atsru);
3451         }
3452
3453         return 0;
3454 }
3455
3456 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3457 {
3458         int i;
3459         struct device *dev;
3460         struct acpi_dmar_atsr *atsr;
3461         struct dmar_atsr_unit *atsru;
3462
3463         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3464         atsru = dmar_find_atsr(atsr);
3465         if (!atsru)
3466                 return 0;
3467
3468         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3469                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3470                                           i, dev)
3471                         return -EBUSY;
3472         }
3473
3474         return 0;
3475 }
3476
3477 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3478 {
3479         struct dmar_satc_unit *satcu;
3480         struct acpi_dmar_satc *tmp;
3481
3482         list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3483                                 dmar_rcu_check()) {
3484                 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3485                 if (satc->segment != tmp->segment)
3486                         continue;
3487                 if (satc->header.length != tmp->header.length)
3488                         continue;
3489                 if (memcmp(satc, tmp, satc->header.length) == 0)
3490                         return satcu;
3491         }
3492
3493         return NULL;
3494 }
3495
3496 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3497 {
3498         struct acpi_dmar_satc *satc;
3499         struct dmar_satc_unit *satcu;
3500
3501         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3502                 return 0;
3503
3504         satc = container_of(hdr, struct acpi_dmar_satc, header);
3505         satcu = dmar_find_satc(satc);
3506         if (satcu)
3507                 return 0;
3508
3509         satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3510         if (!satcu)
3511                 return -ENOMEM;
3512
3513         satcu->hdr = (void *)(satcu + 1);
3514         memcpy(satcu->hdr, hdr, hdr->length);
3515         satcu->atc_required = satc->flags & 0x1;
3516         satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3517                                               (void *)satc + satc->header.length,
3518                                               &satcu->devices_cnt);
3519         if (satcu->devices_cnt && !satcu->devices) {
3520                 kfree(satcu);
3521                 return -ENOMEM;
3522         }
3523         list_add_rcu(&satcu->list, &dmar_satc_units);
3524
3525         return 0;
3526 }
3527
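/*
 * Bring up a hot-added DMAR unit: audit its capabilities against the
 * already-running IOMMUs, allocate domain IDs and a root entry, set up
 * queued invalidation, the page request queue and the fault interrupt,
 * and finally enable translation unless the unit is ignored.
 */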
3528 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3529 {
3530         int sp, ret;
3531         struct intel_iommu *iommu = dmaru->iommu;
3532
3533         if (g_iommus[iommu->seq_id])
3534                 return 0;
3535
3536         ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3537         if (ret)
3538                 goto out;
3539
3540         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3541                 pr_warn("%s: Doesn't support hardware pass through.\n",
3542                         iommu->name);
3543                 return -ENXIO;
3544         }
3545
3546         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3547         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3548                 pr_warn("%s: Doesn't support large page.\n",
3549                         iommu->name);
3550                 return -ENXIO;
3551         }
3552
3553         /*
3554          * Disable translation if already enabled prior to OS handover.
3555          */
3556         if (iommu->gcmd & DMA_GCMD_TE)
3557                 iommu_disable_translation(iommu);
3558
3559         g_iommus[iommu->seq_id] = iommu;
3560         ret = iommu_init_domains(iommu);
3561         if (ret == 0)
3562                 ret = iommu_alloc_root_entry(iommu);
3563         if (ret)
3564                 goto out;
3565
3566         intel_svm_check(iommu);
3567
3568         if (dmaru->ignored) {
3569                 /*
3570                  * we always have to disable PMRs or DMA may fail on this device
3571                  */
3572                 if (force_on)
3573                         iommu_disable_protect_mem_regions(iommu);
3574                 return 0;
3575         }
3576
3577         intel_iommu_init_qi(iommu);
3578         iommu_flush_write_buffer(iommu);
3579
3580 #ifdef CONFIG_INTEL_IOMMU_SVM
3581         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3582                 ret = intel_svm_enable_prq(iommu);
3583                 if (ret)
3584                         goto disable_iommu;
3585         }
3586 #endif
3587         ret = dmar_set_interrupt(iommu);
3588         if (ret)
3589                 goto disable_iommu;
3590
3591         iommu_set_root_entry(iommu);
3592         iommu_enable_translation(iommu);
3593
3594         iommu_disable_protect_mem_regions(iommu);
3595         return 0;
3596
3597 disable_iommu:
3598         disable_dmar_iommu(iommu);
3599 out:
3600         free_dmar_iommu(iommu);
3601         return ret;
3602 }
3603
3604 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3605 {
3606         int ret = 0;
3607         struct intel_iommu *iommu = dmaru->iommu;
3608
3609         if (!intel_iommu_enabled)
3610                 return 0;
3611         if (iommu == NULL)
3612                 return -EINVAL;
3613
3614         if (insert) {
3615                 ret = intel_iommu_add(dmaru);
3616         } else {
3617                 disable_dmar_iommu(iommu);
3618                 free_dmar_iommu(iommu);
3619         }
3620
3621         return ret;
3622 }
3623
3624 static void intel_iommu_free_dmars(void)
3625 {
3626         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3627         struct dmar_atsr_unit *atsru, *atsr_n;
3628         struct dmar_satc_unit *satcu, *satc_n;
3629
3630         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3631                 list_del(&rmrru->list);
3632                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3633                 kfree(rmrru);
3634         }
3635
3636         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3637                 list_del(&atsru->list);
3638                 intel_iommu_free_atsr(atsru);
3639         }
3640         list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3641                 list_del(&satcu->list);
3642                 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3643                 kfree(satcu);
3644         }
3645 }
3646
3647 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3648 {
3649         struct dmar_satc_unit *satcu;
3650         struct acpi_dmar_satc *satc;
3651         struct device *tmp;
3652         int i;
3653
3654         dev = pci_physfn(dev);
3655         rcu_read_lock();
3656
3657         list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3658                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3659                 if (satc->segment != pci_domain_nr(dev->bus))
3660                         continue;
3661                 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3662                         if (to_pci_dev(tmp) == dev)
3663                                 goto out;
3664         }
3665         satcu = NULL;
3666 out:
3667         rcu_read_unlock();
3668         return satcu;
3669 }
3670
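/*
 * Decide whether ATS may be enabled for the device: a SATC entry is
 * authoritative; otherwise walk up to the PCIe root port and check it
 * against the ATSR entries for the device's segment.
 */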
3671 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3672 {
3673         int i, ret = 1;
3674         struct pci_bus *bus;
3675         struct pci_dev *bridge = NULL;
3676         struct device *tmp;
3677         struct acpi_dmar_atsr *atsr;
3678         struct dmar_atsr_unit *atsru;
3679         struct dmar_satc_unit *satcu;
3680
3681         dev = pci_physfn(dev);
3682         satcu = dmar_find_matched_satc_unit(dev);
3683         if (satcu)
3684                 /*
3685                  * This device supports ATS because it is listed in the SATC
3686                  * table. When the IOMMU is in legacy mode, the hardware
3687                  * enables ATS automatically for devices that require it, so
3688                  * the OS must not enable ATS on the device itself to avoid
3689                  * duplicated TLB invalidations.
3690                  */
3691                 return !(satcu->atc_required && !sm_supported(iommu));
3692
3693         for (bus = dev->bus; bus; bus = bus->parent) {
3694                 bridge = bus->self;
3695                 /* If it's an integrated device, allow ATS */
3696                 if (!bridge)
3697                         return 1;
3698                 /* Connected via non-PCIe: no ATS */
3699                 if (!pci_is_pcie(bridge) ||
3700                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3701                         return 0;
3702                 /* If we found the root port, look it up in the ATSR */
3703                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3704                         break;
3705         }
3706
3707         rcu_read_lock();
3708         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3709                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3710                 if (atsr->segment != pci_domain_nr(dev->bus))
3711                         continue;
3712
3713                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3714                         if (tmp == &bridge->dev)
3715                                 goto out;
3716
3717                 if (atsru->include_all)
3718                         goto out;
3719         }
3720         ret = 0;
3721 out:
3722         rcu_read_unlock();
3723
3724         return ret;
3725 }
3726
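/*
 * PCI hotplug notification: keep the cached RMRR, ATSR and SATC device
 * scopes in sync as devices are added to or removed from the system.
 */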
3727 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3728 {
3729         int ret;
3730         struct dmar_rmrr_unit *rmrru;
3731         struct dmar_atsr_unit *atsru;
3732         struct dmar_satc_unit *satcu;
3733         struct acpi_dmar_atsr *atsr;
3734         struct acpi_dmar_reserved_memory *rmrr;
3735         struct acpi_dmar_satc *satc;
3736
3737         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3738                 return 0;
3739
3740         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3741                 rmrr = container_of(rmrru->hdr,
3742                                     struct acpi_dmar_reserved_memory, header);
3743                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3744                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3745                                 ((void *)rmrr) + rmrr->header.length,
3746                                 rmrr->segment, rmrru->devices,
3747                                 rmrru->devices_cnt);
3748                         if (ret < 0)
3749                                 return ret;
3750                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3751                         dmar_remove_dev_scope(info, rmrr->segment,
3752                                 rmrru->devices, rmrru->devices_cnt);
3753                 }
3754         }
3755
3756         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3757                 if (atsru->include_all)
3758                         continue;
3759
3760                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3761                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3762                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3763                                         (void *)atsr + atsr->header.length,
3764                                         atsr->segment, atsru->devices,
3765                                         atsru->devices_cnt);
3766                         if (ret > 0)
3767                                 break;
3768                         else if (ret < 0)
3769                                 return ret;
3770                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3771                         if (dmar_remove_dev_scope(info, atsr->segment,
3772                                         atsru->devices, atsru->devices_cnt))
3773                                 break;
3774                 }
3775         }
3776         list_for_each_entry(satcu, &dmar_satc_units, list) {
3777                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3778                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3779                         ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3780                                         (void *)satc + satc->header.length,
3781                                         satc->segment, satcu->devices,
3782                                         satcu->devices_cnt);
3783                         if (ret > 0)
3784                                 break;
3785                         else if (ret < 0)
3786                                 return ret;
3787                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3788                         if (dmar_remove_dev_scope(info, satc->segment,
3789                                         satcu->devices, satcu->devices_cnt))
3790                                 break;
3791                 }
3792         }
3793
3794         return 0;
3795 }
3796
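/*
 * Memory hotplug notification: extend the si_domain identity map when a
 * range goes online, and unmap and flush it when the range goes offline
 * or the online operation is cancelled.
 */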
3797 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3798                                        unsigned long val, void *v)
3799 {
3800         struct memory_notify *mhp = v;
3801         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3802         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3803                         mhp->nr_pages - 1);
3804
3805         switch (val) {
3806         case MEM_GOING_ONLINE:
3807                 if (iommu_domain_identity_map(si_domain,
3808                                               start_vpfn, last_vpfn)) {
3809                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
3810                                 start_vpfn, last_vpfn);
3811                         return NOTIFY_BAD;
3812                 }
3813                 break;
3814
3815         case MEM_OFFLINE:
3816         case MEM_CANCEL_ONLINE:
3817                 {
3818                         struct dmar_drhd_unit *drhd;
3819                         struct intel_iommu *iommu;
3820                         LIST_HEAD(freelist);
3821
3822                         domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3823
3824                         rcu_read_lock();
3825                         for_each_active_iommu(iommu, drhd)
3826                                 iommu_flush_iotlb_psi(iommu, si_domain,
3827                                         start_vpfn, mhp->nr_pages,
3828                                         list_empty(&freelist), 0);
3829                         rcu_read_unlock();
3830                         put_pages_list(&freelist);
3831                 }
3832                 break;
3833         }
3834
3835         return NOTIFY_OK;
3836 }
3837
3838 static struct notifier_block intel_iommu_memory_nb = {
3839         .notifier_call = intel_iommu_memory_notifier,
3840         .priority = 0
3841 };
3842
3843 static void intel_disable_iommus(void)
3844 {
3845         struct intel_iommu *iommu = NULL;
3846         struct dmar_drhd_unit *drhd;
3847
3848         for_each_iommu(iommu, drhd)
3849                 iommu_disable_translation(iommu);
3850 }
3851
3852 void intel_iommu_shutdown(void)
3853 {
3854         struct dmar_drhd_unit *drhd;
3855         struct intel_iommu *iommu = NULL;
3856
3857         if (no_iommu || dmar_disabled)
3858                 return;
3859
3860         down_write(&dmar_global_lock);
3861
3862         /* Disable PMRs explicitly here. */
3863         for_each_iommu(iommu, drhd)
3864                 iommu_disable_protect_mem_regions(iommu);
3865
3866         /* Make sure the IOMMUs are switched off */
3867         intel_disable_iommus();
3868
3869         up_write(&dmar_global_lock);
3870 }
3871
3872 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3873 {
3874         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3875
3876         return container_of(iommu_dev, struct intel_iommu, iommu);
3877 }
3878
3879 static ssize_t version_show(struct device *dev,
3880                             struct device_attribute *attr, char *buf)
3881 {
3882         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3883         u32 ver = readl(iommu->reg + DMAR_VER_REG);
3884         return sprintf(buf, "%d:%d\n",
3885                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3886 }
3887 static DEVICE_ATTR_RO(version);
3888
3889 static ssize_t address_show(struct device *dev,
3890                             struct device_attribute *attr, char *buf)
3891 {
3892         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3893         return sprintf(buf, "%llx\n", iommu->reg_phys);
3894 }
3895 static DEVICE_ATTR_RO(address);
3896
3897 static ssize_t cap_show(struct device *dev,
3898                         struct device_attribute *attr, char *buf)
3899 {
3900         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3901         return sprintf(buf, "%llx\n", iommu->cap);
3902 }
3903 static DEVICE_ATTR_RO(cap);
3904
3905 static ssize_t ecap_show(struct device *dev,
3906                          struct device_attribute *attr, char *buf)
3907 {
3908         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3909         return sprintf(buf, "%llx\n", iommu->ecap);
3910 }
3911 static DEVICE_ATTR_RO(ecap);
3912
3913 static ssize_t domains_supported_show(struct device *dev,
3914                                       struct device_attribute *attr, char *buf)
3915 {
3916         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3917         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3918 }
3919 static DEVICE_ATTR_RO(domains_supported);
3920
3921 static ssize_t domains_used_show(struct device *dev,
3922                                  struct device_attribute *attr, char *buf)
3923 {
3924         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3925         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3926                                                   cap_ndoms(iommu->cap)));
3927 }
3928 static DEVICE_ATTR_RO(domains_used);
3929
3930 static struct attribute *intel_iommu_attrs[] = {
3931         &dev_attr_version.attr,
3932         &dev_attr_address.attr,
3933         &dev_attr_cap.attr,
3934         &dev_attr_ecap.attr,
3935         &dev_attr_domains_supported.attr,
3936         &dev_attr_domains_used.attr,
3937         NULL,
3938 };
3939
3940 static struct attribute_group intel_iommu_group = {
3941         .name = "intel-iommu",
3942         .attrs = intel_iommu_attrs,
3943 };
3944
3945 const struct attribute_group *intel_iommu_groups[] = {
3946         &intel_iommu_group,
3947         NULL,
3948 };
3949
3950 static inline bool has_external_pci(void)
3951 {
3952         struct pci_dev *pdev = NULL;
3953
3954         for_each_pci_dev(pdev)
3955                 if (pdev->external_facing)
3956                         return true;
3957
3958         return false;
3959 }
3960
3961 static int __init platform_optin_force_iommu(void)
3962 {
3963         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3964                 return 0;
3965
3966         if (no_iommu || dmar_disabled)
3967                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3968
3969         /*
3970          * If the Intel IOMMU is disabled by default, apply the identity
3971          * map to all devices except those marked as untrusted.
3972          */
3973         if (dmar_disabled)
3974                 iommu_set_default_passthrough(false);
3975
3976         dmar_disabled = 0;
3977         no_iommu = 0;
3978
3979         return 1;
3980 }
3981
3982 static int __init probe_acpi_namespace_devices(void)
3983 {
3984         struct dmar_drhd_unit *drhd;
3985         /* To avoid a -Wunused-but-set-variable warning. */
3986         struct intel_iommu *iommu __maybe_unused;
3987         struct device *dev;
3988         int i, ret = 0;
3989
3990         for_each_active_iommu(iommu, drhd) {
3991                 for_each_active_dev_scope(drhd->devices,
3992                                           drhd->devices_cnt, i, dev) {
3993                         struct acpi_device_physical_node *pn;
3994                         struct iommu_group *group;
3995                         struct acpi_device *adev;
3996
3997                         if (dev->bus != &acpi_bus_type)
3998                                 continue;
3999
4000                         adev = to_acpi_device(dev);
4001                         mutex_lock(&adev->physical_node_lock);
4002                         list_for_each_entry(pn,
4003                                             &adev->physical_node_list, node) {
4004                                 group = iommu_group_get(pn->dev);
4005                                 if (group) {
4006                                         iommu_group_put(group);
4007                                         continue;
4008                                 }
4009
4010                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4011                                 ret = iommu_probe_device(pn->dev);
4012                                 if (ret)
4013                                         break;
4014                         }
4015                         mutex_unlock(&adev->physical_node_lock);
4016
4017                         if (ret)
4018                                 return ret;
4019                 }
4020         }
4021
4022         return 0;
4023 }
4024
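/*
 * Main initialization entry point: parse the DMAR tables, honor the
 * tboot/platform opt-in forcing, initialize all DMAR units via
 * init_dmars(), register the IOMMUs with sysfs and the IOMMU core, and
 * enable DMA remapping.
 */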
4025 int __init intel_iommu_init(void)
4026 {
4027         int ret = -ENODEV;
4028         struct dmar_drhd_unit *drhd;
4029         struct intel_iommu *iommu;
4030
4031         /*
4032          * Intel IOMMU is required for a TXT/tboot launch or platform
4033          * opt in, so enforce that.
4034          */
4035         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4036                     platform_optin_force_iommu();
4037
4038         down_write(&dmar_global_lock);
4039         if (dmar_table_init()) {
4040                 if (force_on)
4041                         panic("tboot: Failed to initialize DMAR table\n");
4042                 goto out_free_dmar;
4043         }
4044
4045         if (dmar_dev_scope_init() < 0) {
4046                 if (force_on)
4047                         panic("tboot: Failed to initialize DMAR device scope\n");
4048                 goto out_free_dmar;
4049         }
4050
4051         up_write(&dmar_global_lock);
4052
4053         /*
4054          * The bus notifier takes the dmar_global_lock, so lockdep will
4055          * complain later when we register it under the lock.
4056          */
4057         dmar_register_bus_notifier();
4058
4059         down_write(&dmar_global_lock);
4060
4061         if (!no_iommu)
4062                 intel_iommu_debugfs_init();
4063
4064         if (no_iommu || dmar_disabled) {
4065                 /*
4066                  * We exit the function here to ensure the IOMMU's remapping
4067                  * and mempool aren't set up, which means the IOMMU's PMRs
4068                  * won't be disabled via the call to init_dmars(), so disable
4069                  * them explicitly here. The PMRs were set up by tboot prior
4070                  * to calling SENTER, but the kernel is expected to reset/tear
4071                  * down the PMRs.
4072                  */
4073                 if (intel_iommu_tboot_noforce) {
4074                         for_each_iommu(iommu, drhd)
4075                                 iommu_disable_protect_mem_regions(iommu);
4076                 }
4077
4078                 /*
4079                  * Make sure the IOMMUs are switched off, even when we
4080                  * boot into a kexec kernel and the previous kernel left
4081                  * them enabled.
4082                  */
4083                 intel_disable_iommus();
4084                 goto out_free_dmar;
4085         }
4086
4087         if (list_empty(&dmar_rmrr_units))
4088                 pr_info("No RMRR found\n");
4089
4090         if (list_empty(&dmar_atsr_units))
4091                 pr_info("No ATSR found\n");
4092
4093         if (list_empty(&dmar_satc_units))
4094                 pr_info("No SATC found\n");
4095
4096         if (dmar_map_gfx)
4097                 intel_iommu_gfx_mapped = 1;
4098
4099         init_no_remapping_devices();
4100
4101         ret = init_dmars();
4102         if (ret) {
4103                 if (force_on)
4104                         panic("tboot: Failed to initialize DMARs\n");
4105                 pr_err("Initialization failed\n");
4106                 goto out_free_dmar;
4107         }
4108         up_write(&dmar_global_lock);
4109
4110         init_iommu_pm_ops();
4111
4112         down_read(&dmar_global_lock);
4113         for_each_active_iommu(iommu, drhd) {
4114                 /*
4115                  * The flush queue implementation does not perform
4116                  * page-selective invalidations that are required for efficient
4117                  * TLB flushes in virtual environments.  The benefit of batching
4118                  * is likely to be much lower than the overhead of synchronizing
4119                  * the virtual and physical IOMMU page-tables.
4120                  */
4121                 if (cap_caching_mode(iommu->cap)) {
4122                         pr_info_once("IOMMU batching disallowed due to virtualization\n");
4123                         iommu_set_dma_strict();
4124                 }
4125                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4126                                        intel_iommu_groups,
4127                                        "%s", iommu->name);
4128                 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4129         }
4130         up_read(&dmar_global_lock);
4131
4132         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4133         if (si_domain && !hw_pass_through)
4134                 register_memory_notifier(&intel_iommu_memory_nb);
4135
4136         down_read(&dmar_global_lock);
4137         if (probe_acpi_namespace_devices())
4138                 pr_warn("ACPI name space devices didn't probe correctly\n");
4139
4140         /* Finally, we enable the DMA remapping hardware. */
4141         for_each_iommu(iommu, drhd) {
4142                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4143                         iommu_enable_translation(iommu);
4144
4145                 iommu_disable_protect_mem_regions(iommu);
4146         }
4147         up_read(&dmar_global_lock);
4148
4149         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4150
4151         intel_iommu_enabled = 1;
4152
4153         return 0;
4154
4155 out_free_dmar:
4156         intel_iommu_free_dmars();
4157         up_write(&dmar_global_lock);
4158         return ret;
4159 }
4160
4161 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4162 {
4163         struct device_domain_info *info = opaque;
4164
4165         domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4166         return 0;
4167 }
4168
4169 /*
4170  * NB - intel-iommu lacks any sort of reference counting for the users of
4171  * dependent devices.  If multiple endpoints have intersecting dependent
4172  * devices, unbinding the driver from any one of them may leave the
4173  * others unable to operate.
4174  */
4175 static void domain_context_clear(struct device_domain_info *info)
4176 {
4177         if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4178                 return;
4179
4180         pci_for_each_dma_alias(to_pci_dev(info->dev),
4181                                &domain_context_clear_one_cb, info);
4182 }
4183
4184 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4185 {
4186         struct dmar_domain *domain;
4187         struct intel_iommu *iommu;
4188         unsigned long flags;
4189
4190         assert_spin_locked(&device_domain_lock);
4191
4192         if (WARN_ON(!info))
4193                 return;
4194
4195         iommu = info->iommu;
4196         domain = info->domain;
4197
4198         if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4199                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4200                         intel_pasid_tear_down_entry(iommu, info->dev,
4201                                         PASID_RID2PASID, false);
4202
4203                 iommu_disable_dev_iotlb(info);
4204                 domain_context_clear(info);
4205                 intel_pasid_free_table(info->dev);
4206         }
4207
4208         list_del(&info->link);
4209
4210         spin_lock_irqsave(&iommu->lock, flags);
4211         domain_detach_iommu(domain, iommu);
4212         spin_unlock_irqrestore(&iommu->lock, flags);
4213 }
4214
4215 static void dmar_remove_one_dev_info(struct device *dev)
4216 {
4217         struct device_domain_info *info;
4218         unsigned long flags;
4219
4220         spin_lock_irqsave(&device_domain_lock, flags);
4221         info = dev_iommu_priv_get(dev);
4222         if (info)
4223                 __dmar_remove_one_dev_info(info);
4224         spin_unlock_irqrestore(&device_domain_lock, flags);
4225 }
4226
4227 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4228 {
4229         int adjust_width;
4230
4231         /* calculate AGAW */
4232         domain->gaw = guest_width;
4233         adjust_width = guestwidth_to_adjustwidth(guest_width);
4234         domain->agaw = width_to_agaw(adjust_width);
4235
4236         domain->iommu_coherency = false;
4237         domain->iommu_superpage = 0;
4238         domain->max_addr = 0;
4239
4240         /* always allocate the top pgd */
4241         domain->pgd = alloc_pgtable_page(domain->nid);
4242         if (!domain->pgd)
4243                 return -ENOMEM;
4244         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4245         return 0;
4246 }
4247
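/*
 * Allocate an IOMMU API domain. DMA and unmanaged domains get a fresh
 * dmar_domain with the default address width; identity requests share
 * the single global si_domain.
 */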
4248 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4249 {
4250         struct dmar_domain *dmar_domain;
4251         struct iommu_domain *domain;
4252
4253         switch (type) {
4254         case IOMMU_DOMAIN_DMA:
4255         case IOMMU_DOMAIN_DMA_FQ:
4256         case IOMMU_DOMAIN_UNMANAGED:
4257                 dmar_domain = alloc_domain(type);
4258                 if (!dmar_domain) {
4259                         pr_err("Can't allocate dmar_domain\n");
4260                         return NULL;
4261                 }
4262                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4263                         pr_err("Domain initialization failed\n");
4264                         domain_exit(dmar_domain);
4265                         return NULL;
4266                 }
4267
4268                 domain = &dmar_domain->domain;
4269                 domain->geometry.aperture_start = 0;
4270                 domain->geometry.aperture_end   =
4271                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4272                 domain->geometry.force_aperture = true;
4273
4274                 return domain;
4275         case IOMMU_DOMAIN_IDENTITY:
4276                 return &si_domain->domain;
4277         default:
4278                 return NULL;
4279         }
4280
4281         return NULL;
4282 }
4283
4284 static void intel_iommu_domain_free(struct iommu_domain *domain)
4285 {
4286         if (domain != &si_domain->domain)
4287                 domain_exit(to_dmar_domain(domain));
4288 }
4289
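/*
 * Check that the target IOMMU can address everything already mapped in
 * the domain and, if the IOMMU supports fewer page-table levels than the
 * domain currently uses, strip the unused top levels.
 */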
4290 static int prepare_domain_attach_device(struct iommu_domain *domain,
4291                                         struct device *dev)
4292 {
4293         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4294         struct intel_iommu *iommu;
4295         int addr_width;
4296
4297         iommu = device_to_iommu(dev, NULL, NULL);
4298         if (!iommu)
4299                 return -ENODEV;
4300
4301         if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4302                 return -EOPNOTSUPP;
4303
4304         /* check if this iommu agaw is sufficient for max mapped address */
4305         addr_width = agaw_to_width(iommu->agaw);
4306         if (addr_width > cap_mgaw(iommu->cap))
4307                 addr_width = cap_mgaw(iommu->cap);
4308
4309         if (dmar_domain->max_addr > (1LL << addr_width)) {
4310                 dev_err(dev, "%s: iommu width (%d) is not "
4311                         "sufficient for the mapped address (%llx)\n",
4312                         __func__, addr_width, dmar_domain->max_addr);
4313                 return -EFAULT;
4314         }
4315         dmar_domain->gaw = addr_width;
4316
4317         /*
4318          * Knock out extra levels of page tables if necessary
4319          */
4320         while (iommu->agaw < dmar_domain->agaw) {
4321                 struct dma_pte *pte;
4322
4323                 pte = dmar_domain->pgd;
4324                 if (dma_pte_present(pte)) {
4325                         dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4326                         free_pgtable_page(pte);
4327                 }
4328                 dmar_domain->agaw--;
4329         }
4330
4331         return 0;
4332 }
4333
4334 static int intel_iommu_attach_device(struct iommu_domain *domain,
4335                                      struct device *dev)
4336 {
4337         int ret;
4338
4339         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4340             device_is_rmrr_locked(dev)) {
4341                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4342                 return -EPERM;
4343         }
4344
4345         /* normally dev is not mapped */
4346         if (unlikely(domain_context_mapped(dev))) {
4347                 struct device_domain_info *info = dev_iommu_priv_get(dev);
4348
4349                 if (info->domain)
4350                         dmar_remove_one_dev_info(dev);
4351         }
4352
4353         ret = prepare_domain_attach_device(domain, dev);
4354         if (ret)
4355                 return ret;
4356
4357         return domain_add_dev_info(to_dmar_domain(domain), dev);
4358 }
4359
4360 static void intel_iommu_detach_device(struct iommu_domain *domain,
4361                                       struct device *dev)
4362 {
4363         dmar_remove_one_dev_info(dev);
4364 }
4365
4366 static int intel_iommu_map(struct iommu_domain *domain,
4367                            unsigned long iova, phys_addr_t hpa,
4368                            size_t size, int iommu_prot, gfp_t gfp)
4369 {
4370         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4371         u64 max_addr;
4372         int prot = 0;
4373
4374         if (iommu_prot & IOMMU_READ)
4375                 prot |= DMA_PTE_READ;
4376         if (iommu_prot & IOMMU_WRITE)
4377                 prot |= DMA_PTE_WRITE;
4378         if (dmar_domain->set_pte_snp)
4379                 prot |= DMA_PTE_SNP;
4380
4381         max_addr = iova + size;
4382         if (dmar_domain->max_addr < max_addr) {
4383                 u64 end;
4384
4385                 /* check if minimum agaw is sufficient for mapped address */
4386                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4387                 if (end < max_addr) {
4388                         pr_err("%s: iommu width (%d) is not "
4389                                "sufficient for the mapped address (%llx)\n",
4390                                __func__, dmar_domain->gaw, max_addr);
4391                         return -EFAULT;
4392                 }
4393                 dmar_domain->max_addr = max_addr;
4394         }
4395         /* Round up size to next multiple of PAGE_SIZE, if it and
4396            the low bits of hpa would take us onto the next page */
4397         size = aligned_nrpages(hpa, size);
4398         return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4399                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4400 }
4401
4402 static int intel_iommu_map_pages(struct iommu_domain *domain,
4403                                  unsigned long iova, phys_addr_t paddr,
4404                                  size_t pgsize, size_t pgcount,
4405                                  int prot, gfp_t gfp, size_t *mapped)
4406 {
4407         unsigned long pgshift = __ffs(pgsize);
4408         size_t size = pgcount << pgshift;
4409         int ret;
4410
4411         if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4412                 return -EINVAL;
4413
4414         if (!IS_ALIGNED(iova | paddr, pgsize))
4415                 return -EINVAL;
4416
4417         ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4418         if (!ret && mapped)
4419                 *mapped = size;
4420
4421         return ret;
4422 }
4423
4424 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4425                                 unsigned long iova, size_t size,
4426                                 struct iommu_iotlb_gather *gather)
4427 {
4428         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4429         unsigned long start_pfn, last_pfn;
4430         int level = 0;
4431
4432         /* Cope with horrid API which requires us to unmap more than the
4433            size argument if it happens to be a large-page mapping. */
4434         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4435
4436         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4437                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4438
4439         start_pfn = iova >> VTD_PAGE_SHIFT;
4440         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4441
4442         domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4443
4444         if (dmar_domain->max_addr == iova + size)
4445                 dmar_domain->max_addr = iova;
4446
4447         iommu_iotlb_gather_add_page(domain, gather, iova, size);
4448
4449         return size;
4450 }
4451
4452 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4453                                       unsigned long iova,
4454                                       size_t pgsize, size_t pgcount,
4455                                       struct iommu_iotlb_gather *gather)
4456 {
4457         unsigned long pgshift = __ffs(pgsize);
4458         size_t size = pgcount << pgshift;
4459
4460         return intel_iommu_unmap(domain, iova, size, gather);
4461 }
4462
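     /*
      * Flush the IOTLB for the range collected in @gather on every IOMMU
      * serving this domain, then release the page-table pages queued on the
      * gather freelist.
      */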
4463 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4464                                  struct iommu_iotlb_gather *gather)
4465 {
4466         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4467         unsigned long iova_pfn = IOVA_PFN(gather->start);
4468         size_t size = gather->end - gather->start;
4469         unsigned long start_pfn;
4470         unsigned long nrpages;
4471         int iommu_id;
4472
4473         nrpages = aligned_nrpages(gather->start, size);
4474         start_pfn = mm_to_dma_pfn(iova_pfn);
4475
4476         for_each_domain_iommu(iommu_id, dmar_domain)
4477                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
4478                                       start_pfn, nrpages,
4479                                       list_empty(&gather->freelist), 0);
4480
4481         put_pages_list(&gather->freelist);
4482 }
4483
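     /*
      * Walk the page table to the PTE covering @iova and return the physical
      * address it maps, preserving the offset within the (possibly large)
      * page. Returns 0 if no present PTE covers @iova.
      */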
4484 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4485                                             dma_addr_t iova)
4486 {
4487         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4488         struct dma_pte *pte;
4489         int level = 0;
4490         u64 phys = 0;
4491
4492         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4493         if (pte && dma_pte_present(pte))
4494                 phys = dma_pte_addr(pte) +
4495                         (iova & (BIT_MASK(level_to_offset_bits(level) +
4496                                                 VTD_PAGE_SHIFT) - 1));
4497
4498         return phys;
4499 }
4500
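     /*
      * Force snooping can only be enforced if every IOMMU with devices in
      * this domain supports snoop control (ecap SC). Called with
      * device_domain_lock held.
      */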
4501 static bool domain_support_force_snooping(struct dmar_domain *domain)
4502 {
4503         struct device_domain_info *info;
4504         bool support = true;
4505
4506         assert_spin_locked(&device_domain_lock);
4507         list_for_each_entry(info, &domain->devices, link) {
4508                 if (!ecap_sc_support(info->iommu->ecap)) {
4509                         support = false;
4510                         break;
4511                 }
4512         }
4513
4514         return support;
4515 }
4516
4517 static void domain_set_force_snooping(struct dmar_domain *domain)
4518 {
4519         struct device_domain_info *info;
4520
4521         assert_spin_locked(&device_domain_lock);
4522
4523         /*
4524          * The second-level page table supports per-PTE snoop control; the
4525          * iommu_map() interface handles this by setting the SNP bit.
4526          */
4527         if (!domain_use_first_level(domain)) {
4528                 domain->set_pte_snp = true;
4529                 return;
4530         }
4531
4532         list_for_each_entry(info, &domain->devices, link)
4533                 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4534                                                      PASID_RID2PASID);
4535 }
4536
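     /*
      * Make all DMA through this domain snoop CPU caches. Succeeds only if
      * every attached IOMMU supports snoop control; once force_snooping is
      * set, subsequent calls return true immediately.
      */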
4537 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4538 {
4539         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4540         unsigned long flags;
4541
4542         if (dmar_domain->force_snooping)
4543                 return true;
4544
4545         spin_lock_irqsave(&device_domain_lock, flags);
4546         if (!domain_support_force_snooping(dmar_domain)) {
4547                 spin_unlock_irqrestore(&device_domain_lock, flags);
4548                 return false;
4549         }
4550
4551         domain_set_force_snooping(dmar_domain);
4552         dmar_domain->force_snooping = true;
4553         spin_unlock_irqrestore(&device_domain_lock, flags);
4554
4555         return true;
4556 }
4557
4558 static bool intel_iommu_capable(enum iommu_cap cap)
4559 {
4560         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4561                 return true;
4562         if (cap == IOMMU_CAP_INTR_REMAP)
4563                 return irq_remapping_enabled == 1;
4564         if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
4565                 return dmar_platform_optin();
4566
4567         return false;
4568 }
4569
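     /*
      * Set up per-device IOMMU data: find the IOMMU serving @dev, record its
      * bus/devfn/segment, probe ATS, PASID and PRI capabilities for PCI
      * devices, and link the new info into the global device list.
      */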
4570 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4571 {
4572         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4573         struct device_domain_info *info;
4574         struct intel_iommu *iommu;
4575         unsigned long flags;
4576         u8 bus, devfn;
4577
4578         iommu = device_to_iommu(dev, &bus, &devfn);
4579         if (!iommu)
4580                 return ERR_PTR(-ENODEV);
4581
4582         info = kzalloc(sizeof(*info), GFP_KERNEL);
4583         if (!info)
4584                 return ERR_PTR(-ENOMEM);
4585
4586         if (dev_is_real_dma_subdevice(dev)) {
4587                 info->bus = pdev->bus->number;
4588                 info->devfn = pdev->devfn;
4589                 info->segment = pci_domain_nr(pdev->bus);
4590         } else {
4591                 info->bus = bus;
4592                 info->devfn = devfn;
4593                 info->segment = iommu->segment;
4594         }
4595
4596         info->dev = dev;
4597         info->iommu = iommu;
4598         if (dev_is_pci(dev)) {
4599                 if (ecap_dev_iotlb_support(iommu->ecap) &&
4600                     pci_ats_supported(pdev) &&
4601                     dmar_ats_supported(pdev, iommu))
4602                         info->ats_supported = 1;
4603
4604                 if (sm_supported(iommu)) {
4605                         if (pasid_supported(iommu)) {
4606                                 int features = pci_pasid_features(pdev);
4607
4608                                 if (features >= 0)
4609                                         info->pasid_supported = features | 1;
4610                         }
4611
4612                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
4613                             pci_pri_supported(pdev))
4614                                 info->pri_supported = 1;
4615                 }
4616         }
4617
4618         spin_lock_irqsave(&device_domain_lock, flags);
4619         list_add(&info->global, &device_domain_list);
4620         dev_iommu_priv_set(dev, info);
4621         spin_unlock_irqrestore(&device_domain_lock, flags);
4622
4623         return &iommu->iommu;
4624 }
4625
4626 static void intel_iommu_release_device(struct device *dev)
4627 {
4628         struct device_domain_info *info = dev_iommu_priv_get(dev);
4629         unsigned long flags;
4630
4631         dmar_remove_one_dev_info(dev);
4632
4633         spin_lock_irqsave(&device_domain_lock, flags);
4634         dev_iommu_priv_set(dev, NULL);
4635         list_del(&info->global);
4636         spin_unlock_irqrestore(&device_domain_lock, flags);
4637
4638         kfree(info);
4639         set_dma_ops(dev, NULL);
4640 }
4641
4642 static void intel_iommu_probe_finalize(struct device *dev)
4643 {
4644         set_dma_ops(dev, NULL);
4645         iommu_setup_dma_ops(dev, 0, U64_MAX);
4646 }
4647
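     /*
      * Report reserved regions for @device: any RMRRs that target it (as
      * direct-mapped regions), the low 16MB range for the legacy floppy
      * workaround when enabled, and the IOAPIC MSI window.
      */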
4648 static void intel_iommu_get_resv_regions(struct device *device,
4649                                          struct list_head *head)
4650 {
4651         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4652         struct iommu_resv_region *reg;
4653         struct dmar_rmrr_unit *rmrr;
4654         struct device *i_dev;
4655         int i;
4656
4657         down_read(&dmar_global_lock);
4658         for_each_rmrr_units(rmrr) {
4659                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4660                                           i, i_dev) {
4661                         struct iommu_resv_region *resv;
4662                         enum iommu_resv_type type;
4663                         size_t length;
4664
4665                         if (i_dev != device &&
4666                             !is_downstream_to_pci_bridge(device, i_dev))
4667                                 continue;
4668
4669                         length = rmrr->end_address - rmrr->base_address + 1;
4670
4671                         type = device_rmrr_is_relaxable(device) ?
4672                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4673
4674                         resv = iommu_alloc_resv_region(rmrr->base_address,
4675                                                        length, prot, type);
4676                         if (!resv)
4677                                 break;
4678
4679                         list_add_tail(&resv->list, head);
4680                 }
4681         }
4682         up_read(&dmar_global_lock);
4683
4684 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4685         if (dev_is_pci(device)) {
4686                 struct pci_dev *pdev = to_pci_dev(device);
4687
4688                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4689                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4690                                                    IOMMU_RESV_DIRECT_RELAXABLE);
4691                         if (reg)
4692                                 list_add_tail(&reg->list, head);
4693                 }
4694         }
4695 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4696
4697         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4698                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4699                                       0, IOMMU_RESV_MSI);
4700         if (!reg)
4701                 return;
4702         list_add_tail(&reg->list, head);
4703 }
4704
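     /*
      * Enable PASID support for @dev: set the PASID-enable bit in its context
      * entry (flushing the context cache if the bit was newly set), then turn
      * on device-TLB/PASID support on the device itself if not already done.
      */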
4705 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
4706 {
4707         struct device_domain_info *info = dev_iommu_priv_get(dev);
4708         struct context_entry *context;
4709         struct dmar_domain *domain;
4710         unsigned long flags;
4711         u64 ctx_lo;
4712         int ret;
4713
4714         domain = info->domain;
4715         if (!domain)
4716                 return -EINVAL;
4717
4718         spin_lock_irqsave(&device_domain_lock, flags);
4719         spin_lock(&iommu->lock);
4720
4721         ret = -EINVAL;
4722         if (!info->pasid_supported)
4723                 goto out;
4724
4725         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
4726         if (WARN_ON(!context))
4727                 goto out;
4728
4729         ctx_lo = context[0].lo;
4730
4731         if (!(ctx_lo & CONTEXT_PASIDE)) {
4732                 ctx_lo |= CONTEXT_PASIDE;
4733                 context[0].lo = ctx_lo;
4734                 wmb();
4735                 iommu->flush.flush_context(iommu,
4736                                            domain->iommu_did[iommu->seq_id],
4737                                            PCI_DEVID(info->bus, info->devfn),
4738                                            DMA_CCMD_MASK_NOBIT,
4739                                            DMA_CCMD_DEVICE_INVL);
4740         }
4741
4742         /* Enable PASID support in the device, if it wasn't already */
4743         if (!info->pasid_enabled)
4744                 iommu_enable_dev_iotlb(info);
4745
4746         ret = 0;
4747
4748  out:
4749         spin_unlock(&iommu->lock);
4750         spin_unlock_irqrestore(&device_domain_lock, flags);
4751
4752         return ret;
4753 }
4754
4755 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4756 {
4757         if (dev_is_pci(dev))
4758                 return pci_device_group(dev);
4759         return generic_device_group(dev);
4760 }
4761
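     /*
      * Shared Virtual Addressing requires an SVM-capable IOMMU and a device
      * with PASID, PRI and ATS enabled; when those hold, add the device to
      * the IOMMU's I/O page fault queue and register the fault handler.
      */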
4762 static int intel_iommu_enable_sva(struct device *dev)
4763 {
4764         struct device_domain_info *info = dev_iommu_priv_get(dev);
4765         struct intel_iommu *iommu;
4766         int ret;
4767
4768         if (!info || dmar_disabled)
4769                 return -EINVAL;
4770
4771         iommu = info->iommu;
4772         if (!iommu)
4773                 return -EINVAL;
4774
4775         if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4776                 return -ENODEV;
4777
4778         if (intel_iommu_enable_pasid(iommu, dev))
4779                 return -ENODEV;
4780
4781         if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4782                 return -EINVAL;
4783
4784         ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4785         if (!ret)
4786                 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4787
4788         return ret;
4789 }
4790
4791 static int intel_iommu_disable_sva(struct device *dev)
4792 {
4793         struct device_domain_info *info = dev_iommu_priv_get(dev);
4794         struct intel_iommu *iommu = info->iommu;
4795         int ret;
4796
4797         ret = iommu_unregister_device_fault_handler(dev);
4798         if (!ret)
4799                 ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4800
4801         return ret;
4802 }
4803
4804 static int intel_iommu_enable_iopf(struct device *dev)
4805 {
4806         struct device_domain_info *info = dev_iommu_priv_get(dev);
4807
4808         if (info && info->pri_supported)
4809                 return 0;
4810
4811         return -ENODEV;
4812 }
4813
4814 static int
4815 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4816 {
4817         switch (feat) {
4818         case IOMMU_DEV_FEAT_IOPF:
4819                 return intel_iommu_enable_iopf(dev);
4820
4821         case IOMMU_DEV_FEAT_SVA:
4822                 return intel_iommu_enable_sva(dev);
4823
4824         default:
4825                 return -ENODEV;
4826         }
4827 }
4828
4829 static int
4830 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4831 {
4832         switch (feat) {
4833         case IOMMU_DEV_FEAT_IOPF:
4834                 return 0;
4835
4836         case IOMMU_DEV_FEAT_SVA:
4837                 return intel_iommu_disable_sva(dev);
4838
4839         default:
4840                 return -ENODEV;
4841         }
4842 }
4843
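     /*
      * Defer the domain attach when the IOMMU had translation enabled before
      * this kernel took over (e.g. by a previous kernel during kdump) and the
      * device has not yet been assigned a domain.
      */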
4844 static bool intel_iommu_is_attach_deferred(struct device *dev)
4845 {
4846         struct device_domain_info *info = dev_iommu_priv_get(dev);
4847
4848         return translation_pre_enabled(info->iommu) && !info->domain;
4849 }
4850
4851 /*
4852  * Check that the device does not live on an external facing PCI port that is
4853  * marked as untrusted. Such devices should not have quirks applied to them,
4854  * since a quirk could allow them to bypass the IOMMU restrictions.
4855  */
4856 static bool risky_device(struct pci_dev *pdev)
4857 {
4858         if (pdev->untrusted) {
4859                 pci_info(pdev,
4860                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4861                          pdev->vendor, pdev->device);
4862                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4863                 return true;
4864         }
4865         return false;
4866 }
4867
4868 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4869                                        unsigned long iova, size_t size)
4870 {
4871         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4872         unsigned long pages = aligned_nrpages(iova, size);
4873         unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4874         struct intel_iommu *iommu;
4875         int iommu_id;
4876
4877         for_each_domain_iommu(iommu_id, dmar_domain) {
4878                 iommu = g_iommus[iommu_id];
4879                 __mapping_notify_one(iommu, dmar_domain, pfn, pages);
4880         }
4881 }
4882
4883 const struct iommu_ops intel_iommu_ops = {
4884         .capable                = intel_iommu_capable,
4885         .domain_alloc           = intel_iommu_domain_alloc,
4886         .probe_device           = intel_iommu_probe_device,
4887         .probe_finalize         = intel_iommu_probe_finalize,
4888         .release_device         = intel_iommu_release_device,
4889         .get_resv_regions       = intel_iommu_get_resv_regions,
4890         .put_resv_regions       = generic_iommu_put_resv_regions,
4891         .device_group           = intel_iommu_device_group,
4892         .dev_enable_feat        = intel_iommu_dev_enable_feat,
4893         .dev_disable_feat       = intel_iommu_dev_disable_feat,
4894         .is_attach_deferred     = intel_iommu_is_attach_deferred,
4895         .def_domain_type        = device_def_domain_type,
4896         .pgsize_bitmap          = SZ_4K,
4897 #ifdef CONFIG_INTEL_IOMMU_SVM
4898         .sva_bind               = intel_svm_bind,
4899         .sva_unbind             = intel_svm_unbind,
4900         .sva_get_pasid          = intel_svm_get_pasid,
4901         .page_response          = intel_svm_page_response,
4902 #endif
4903         .default_domain_ops = &(const struct iommu_domain_ops) {
4904                 .attach_dev             = intel_iommu_attach_device,
4905                 .detach_dev             = intel_iommu_detach_device,
4906                 .map_pages              = intel_iommu_map_pages,
4907                 .unmap_pages            = intel_iommu_unmap_pages,
4908                 .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
4909                 .flush_iotlb_all        = intel_flush_iotlb_all,
4910                 .iotlb_sync             = intel_iommu_tlb_sync,
4911                 .iova_to_phys           = intel_iommu_iova_to_phys,
4912                 .free                   = intel_iommu_domain_free,
4913                 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4914         }
4915 };
4916
4917 static void quirk_iommu_igfx(struct pci_dev *dev)
4918 {
4919         if (risky_device(dev))
4920                 return;
4921
4922         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4923         dmar_map_gfx = 0;
4924 }
4925
4926 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4927 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4928 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4929 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4930 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4931 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4932 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4933 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4934
4935 /* Broadwell igfx malfunctions with dmar */
4936 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4937 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4938 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4939 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4940 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4941 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4942 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4943 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4944 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4945 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4946 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4947 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4948 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4949 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4950 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4951 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4952 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4953 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4954 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4955 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4956 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4957 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4958 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4959 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4960
4961 static void quirk_iommu_rwbf(struct pci_dev *dev)
4962 {
4963         if (risky_device(dev))
4964                 return;
4965
4966         /*
4967          * Mobile 4 Series Chipset neglects to set RWBF capability,
4968          * but needs it. Same seems to hold for the desktop versions.
4969          */
4970         pci_info(dev, "Forcing write-buffer flush capability\n");
4971         rwbf_quirk = 1;
4972 }
4973
4974 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4975 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4976 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4977 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4978 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4979 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4980 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4981
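     /*
      * Graphics Control (GGC) register offset and field values used by the
      * Calpella quirk below to check whether the BIOS enabled VT and reserved
      * shadow GTT space for the integrated graphics.
      */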
4982 #define GGC 0x52
4983 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4984 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4985 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4986 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4987 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4988 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4989 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4990 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4991
4992 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4993 {
4994         unsigned short ggc;
4995
4996         if (risky_device(dev))
4997                 return;
4998
4999         if (pci_read_config_word(dev, GGC, &ggc))
5000                 return;
5001
5002         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5003                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5004                 dmar_map_gfx = 0;
5005         } else if (dmar_map_gfx) {
5006                 /* we have to ensure the gfx device is idle before we flush */
5007                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5008                 iommu_set_dma_strict();
5009         }
5010 }
5011 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5012 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5013 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5014 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5015
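     /*
      * For the integrated-graphics generations matched below, keep DMA
      * translation enabled instead of clearing the translation-enable bit
      * when the IOMMU is torn down (iommu_skip_te_disable).
      */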
5016 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5017 {
5018         unsigned short ver;
5019
5020         if (!IS_GFX_DEVICE(dev))
5021                 return;
5022
5023         ver = (dev->device >> 8) & 0xff;
5024         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5025             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5026             ver != 0x9a && ver != 0xa7)
5027                 return;
5028
5029         if (risky_device(dev))
5030                 return;
5031
5032         pci_info(dev, "Skip IOMMU disabling for graphics\n");
5033         iommu_skip_te_disable = 1;
5034 }
5035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5036
5037 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5038    ISOCH DMAR unit for the Azalia sound device, but not give it any
5039    TLB entries, which causes it to deadlock. Check for that.  We do
5040    this in a function called from init_dmars(), instead of in a PCI
5041    quirk, because we don't want to print the obnoxious "BIOS broken"
5042    message if VT-d is actually disabled.
5043 */
5044 static void __init check_tylersburg_isoch(void)
5045 {
5046         struct pci_dev *pdev;
5047         uint32_t vtisochctrl;
5048
5049         /* If there's no Azalia in the system anyway, forget it. */
5050         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5051         if (!pdev)
5052                 return;
5053
5054         if (risky_device(pdev)) {
5055                 pci_dev_put(pdev);
5056                 return;
5057         }
5058
5059         pci_dev_put(pdev);
5060
5061         /* System Management Registers. Might be hidden, in which case
5062            we can't do the sanity check. But that's OK, because the
5063            known-broken BIOSes _don't_ actually hide it, so far. */
5064         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5065         if (!pdev)
5066                 return;
5067
5068         if (risky_device(pdev)) {
5069                 pci_dev_put(pdev);
5070                 return;
5071         }
5072
5073         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5074                 pci_dev_put(pdev);
5075                 return;
5076         }
5077
5078         pci_dev_put(pdev);
5079
5080         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5081         if (vtisochctrl & 1)
5082                 return;
5083
5084         /* Drop all bits other than the number of TLB entries */
5085         vtisochctrl &= 0x1c;
5086
5087         /* If we have the recommended number of TLB entries (16), fine. */
5088         if (vtisochctrl == 0x10)
5089                 return;
5090
5091         /* Zero TLB entries? The BIOS is badly broken; identity-map Azalia instead. */
5092         if (!vtisochctrl) {
5093                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5094                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5095                      dmi_get_system_info(DMI_BIOS_VENDOR),
5096                      dmi_get_system_info(DMI_BIOS_VERSION),
5097                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5098                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5099                 return;
5100         }
5101
5102         pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
5103                 vtisochctrl);
5104 }