2200e3c5d506db04c0690993a0d27449ea097f2e
[linux-2.6-microblaze.git] / drivers / iommu / intel / iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dma-iommu.h>
19 #include <linux/dmi.h>
20 #include <linux/intel-iommu.h>
21 #include <linux/intel-svm.h>
22 #include <linux/memory.h>
23 #include <linux/pci.h>
24 #include <linux/pci-ats.h>
25 #include <linux/spinlock.h>
26 #include <linux/syscore_ops.h>
27 #include <linux/tboot.h>
28
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33
34 #define ROOT_SIZE               VTD_PAGE_SIZE
35 #define CONTEXT_SIZE            VTD_PAGE_SIZE
36
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42 #define IOAPIC_RANGE_START      (0xfee00000)
43 #define IOAPIC_RANGE_END        (0xfeefffff)
44 #define IOVA_START_ADDR         (0x1000)
45
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
57                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN          (1)
62
63 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
64
65 /* page table handling */
66 #define LEVEL_STRIDE            (9)
67 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
68
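/*
 * AGAW (adjusted guest address width) helpers.  The hardware encodes the
 * address width as 30 + 9 * agaw bits, and each AGAW step adds one extra
 * page-table level on top of the two base levels: agaw = 2 means a 48-bit
 * width walked through 4 levels, agaw = 3 a 57-bit width through 5 levels.
 */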
69 static inline int agaw_to_level(int agaw)
70 {
71         return agaw + 2;
72 }
73
74 static inline int agaw_to_width(int agaw)
75 {
76         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78
79 static inline int width_to_agaw(int width)
80 {
81         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86         return (level - 1) * LEVEL_STRIDE;
87 }
88
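/*
 * Each page-table level indexes a 9-bit slice of the DMA PFN: level 1
 * covers bits 8:0, level 2 covers bits 17:9, and so on.  pfn_level_offset()
 * returns that 512-entry table index for the given level.
 */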
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93
94 static inline u64 level_mask(int level)
95 {
96         return -1ULL << level_to_offset_bits(level);
97 }
98
99 static inline u64 level_size(int level)
100 {
101         return 1ULL << level_to_offset_bits(level);
102 }
103
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106         return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
113
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122         return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126         return page_to_dma_pfn(virt_to_page(p));
127 }
128
129 /* global iommu list, set NULL for ignored DMAR units */
130 static struct intel_iommu **g_iommus;
131
132 static void __init check_tylersburg_isoch(void);
133 static int rwbf_quirk;
134 static inline struct device_domain_info *
135 dmar_search_domain_by_dev_info(int segment, int bus, int devfn);
136
137 /*
138  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
139  * (used when kernel is launched w/ TXT)
140  */
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
144
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153         if (!(re->lo & 1))
154                 return 0;
155
156         return re->lo & VTD_PAGE_MASK;
157 }
158
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165         if (!(re->hi & 1))
166                 return 0;
167
168         return re->hi & VTD_PAGE_MASK;
169 }
170
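/*
 * The helpers below manipulate individual context-entry fields as used by
 * the code in this file: in the low 64 bits, bit 0 is the present bit,
 * bits 3:2 select the translation type, bit 11 is the PASID-enable flag
 * and the page-aligned upper bits hold the page-table root pointer; in the
 * high 64 bits, bits 2:0 hold the address width, bit 3 marks an entry
 * copied from a previous kernel and bits 23:8 carry the domain id.
 */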
171 static inline void context_clear_pasid_enable(struct context_entry *context)
172 {
173         context->lo &= ~(1ULL << 11);
174 }
175
176 static inline bool context_pasid_enabled(struct context_entry *context)
177 {
178         return !!(context->lo & (1ULL << 11));
179 }
180
181 static inline void context_set_copied(struct context_entry *context)
182 {
183         context->hi |= (1ull << 3);
184 }
185
186 static inline bool context_copied(struct context_entry *context)
187 {
188         return !!(context->hi & (1ULL << 3));
189 }
190
191 static inline bool __context_present(struct context_entry *context)
192 {
193         return (context->lo & 1);
194 }
195
196 bool context_present(struct context_entry *context)
197 {
198         return context_pasid_enabled(context) ?
199              __context_present(context) :
200              __context_present(context) && !context_copied(context);
201 }
202
203 static inline void context_set_present(struct context_entry *context)
204 {
205         context->lo |= 1;
206 }
207
208 static inline void context_set_fault_enable(struct context_entry *context)
209 {
210         context->lo &= (((u64)-1) << 2) | 1;
211 }
212
213 static inline void context_set_translation_type(struct context_entry *context,
214                                                 unsigned long value)
215 {
216         context->lo &= (((u64)-1) << 4) | 3;
217         context->lo |= (value & 3) << 2;
218 }
219
220 static inline void context_set_address_root(struct context_entry *context,
221                                             unsigned long value)
222 {
223         context->lo &= ~VTD_PAGE_MASK;
224         context->lo |= value & VTD_PAGE_MASK;
225 }
226
227 static inline void context_set_address_width(struct context_entry *context,
228                                              unsigned long value)
229 {
230         context->hi |= value & 7;
231 }
232
233 static inline void context_set_domain_id(struct context_entry *context,
234                                          unsigned long value)
235 {
236         context->hi |= (value & ((1 << 16) - 1)) << 8;
237 }
238
239 static inline int context_domain_id(struct context_entry *c)
240 {
241         return((c->hi >> 8) & 0xffff);
242 }
243
244 static inline void context_clear_entry(struct context_entry *context)
245 {
246         context->lo = 0;
247         context->hi = 0;
248 }
249
250 /*
251  * This domain is a static identity-mapping domain.
252  *      1. This domain creates a static 1:1 mapping to all usable memory.
253  *      2. It maps to each iommu if successful.
254  *      3. Each iommu maps to this domain if successful.
255  */
256 static struct dmar_domain *si_domain;
257 static int hw_pass_through = 1;
258
259 #define for_each_domain_iommu(idx, domain)                      \
260         for (idx = 0; idx < g_num_of_iommus; idx++)             \
261                 if (domain->iommu_refcnt[idx])
262
263 struct dmar_rmrr_unit {
264         struct list_head list;          /* list of rmrr units   */
265         struct acpi_dmar_header *hdr;   /* ACPI header          */
266         u64     base_address;           /* reserved base address*/
267         u64     end_address;            /* reserved end address */
268         struct dmar_dev_scope *devices; /* target devices */
269         int     devices_cnt;            /* target device count */
270 };
271
272 struct dmar_atsr_unit {
273         struct list_head list;          /* list of ATSR units */
274         struct acpi_dmar_header *hdr;   /* ACPI header */
275         struct dmar_dev_scope *devices; /* target devices */
276         int devices_cnt;                /* target device count */
277         u8 include_all:1;               /* include all ports */
278 };
279
280 struct dmar_satc_unit {
281         struct list_head list;          /* list of SATC units */
282         struct acpi_dmar_header *hdr;   /* ACPI header */
283         struct dmar_dev_scope *devices; /* target devices */
284         struct intel_iommu *iommu;      /* the corresponding iommu */
285         int devices_cnt;                /* target device count */
286         u8 atc_required:1;              /* ATS is required */
287 };
288
289 static LIST_HEAD(dmar_atsr_units);
290 static LIST_HEAD(dmar_rmrr_units);
291 static LIST_HEAD(dmar_satc_units);
292
293 #define for_each_rmrr_units(rmrr) \
294         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
295
296 /* number of IOMMUs in the system, used to size and index g_iommus */
297 static int g_num_of_iommus;
298
299 static void domain_remove_dev_info(struct dmar_domain *domain);
300 static void dmar_remove_one_dev_info(struct device *dev);
301 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
302
303 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
304 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
305
306 int intel_iommu_enabled = 0;
307 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
308
309 static int dmar_map_gfx = 1;
310 static int intel_iommu_superpage = 1;
311 static int iommu_identity_mapping;
312 static int iommu_skip_te_disable;
313
314 #define IDENTMAP_GFX            2
315 #define IDENTMAP_AZALIA         4
316
317 int intel_iommu_gfx_mapped;
318 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
319
320 DEFINE_SPINLOCK(device_domain_lock);
321 static LIST_HEAD(device_domain_list);
322
323 /*
324  * Iterate over elements in device_domain_list and call the specified
325  * callback @fn against each element.
326  */
327 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
328                                      void *data), void *data)
329 {
330         int ret = 0;
331         unsigned long flags;
332         struct device_domain_info *info;
333
334         spin_lock_irqsave(&device_domain_lock, flags);
335         list_for_each_entry(info, &device_domain_list, global) {
336                 ret = fn(info, data);
337                 if (ret) {
338                         spin_unlock_irqrestore(&device_domain_lock, flags);
339                         return ret;
340                 }
341         }
342         spin_unlock_irqrestore(&device_domain_lock, flags);
343
344         return 0;
345 }
346
347 const struct iommu_ops intel_iommu_ops;
348
349 static bool translation_pre_enabled(struct intel_iommu *iommu)
350 {
351         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
352 }
353
354 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
355 {
356         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
357 }
358
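/*
 * Record whether DMA translation was already enabled when this kernel took
 * over (TES bit set in the global status register), e.g. by a previous
 * kernel in a kdump scenario.
 */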
359 static void init_translation_status(struct intel_iommu *iommu)
360 {
361         u32 gsts;
362
363         gsts = readl(iommu->reg + DMAR_GSTS_REG);
364         if (gsts & DMA_GSTS_TES)
365                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
366 }
367
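/* Parse the intel_iommu= boot options, e.g. "intel_iommu=on,sm_on". */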
368 static int __init intel_iommu_setup(char *str)
369 {
370         if (!str)
371                 return -EINVAL;
372
373         while (*str) {
374                 if (!strncmp(str, "on", 2)) {
375                         dmar_disabled = 0;
376                         pr_info("IOMMU enabled\n");
377                 } else if (!strncmp(str, "off", 3)) {
378                         dmar_disabled = 1;
379                         no_platform_optin = 1;
380                         pr_info("IOMMU disabled\n");
381                 } else if (!strncmp(str, "igfx_off", 8)) {
382                         dmar_map_gfx = 0;
383                         pr_info("Disable GFX device mapping\n");
384                 } else if (!strncmp(str, "forcedac", 8)) {
385                         pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
386                         iommu_dma_forcedac = true;
387                 } else if (!strncmp(str, "strict", 6)) {
388                         pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
389                         iommu_set_dma_strict();
390                 } else if (!strncmp(str, "sp_off", 6)) {
391                         pr_info("Disable supported super page\n");
392                         intel_iommu_superpage = 0;
393                 } else if (!strncmp(str, "sm_on", 5)) {
394                         pr_info("Enable scalable mode if hardware supports\n");
395                         intel_iommu_sm = 1;
396                 } else if (!strncmp(str, "sm_off", 6)) {
397                         pr_info("Scalable mode is disallowed\n");
398                         intel_iommu_sm = 0;
399                 } else if (!strncmp(str, "tboot_noforce", 13)) {
400                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
401                         intel_iommu_tboot_noforce = 1;
402                 } else {
403                         pr_notice("Unknown option - '%s'\n", str);
404                 }
405
406                 str += strcspn(str, ",");
407                 while (*str == ',')
408                         str++;
409         }
410
411         return 1;
412 }
413 __setup("intel_iommu=", intel_iommu_setup);
414
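/*
 * Allocate a zeroed, page-sized page for page-table (or root/context table)
 * use on @node.  GFP_ATOMIC is used because callers may hold spinlocks.
 */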
415 void *alloc_pgtable_page(int node)
416 {
417         struct page *page;
418         void *vaddr = NULL;
419
420         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
421         if (page)
422                 vaddr = page_address(page);
423         return vaddr;
424 }
425
426 void free_pgtable_page(void *vaddr)
427 {
428         free_page((unsigned long)vaddr);
429 }
430
431 static inline int domain_type_is_si(struct dmar_domain *domain)
432 {
433         return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
434 }
435
436 static inline bool domain_use_first_level(struct dmar_domain *domain)
437 {
438         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
439 }
440
441 static inline int domain_pfn_supported(struct dmar_domain *domain,
442                                        unsigned long pfn)
443 {
444         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
445
446         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
447 }
448
449 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
450 {
451         unsigned long sagaw;
452         int agaw;
453
454         sagaw = cap_sagaw(iommu->cap);
455         for (agaw = width_to_agaw(max_gaw);
456              agaw >= 0; agaw--) {
457                 if (test_bit(agaw, &sagaw))
458                         break;
459         }
460
461         return agaw;
462 }
463
464 /*
465  * Calculate max SAGAW for each iommu.
466  */
467 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
468 {
469         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
470 }
471
472 /*
473  * Calculate the agaw for each iommu.
474  * "SAGAW" may differ across iommus, so start from a default agaw and
475  * fall back to a smaller supported agaw for iommus that don't support it.
476  */
477 int iommu_calculate_agaw(struct intel_iommu *iommu)
478 {
479         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
480 }
481
482 /* This function only returns a single iommu in a domain */
483 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
484 {
485         int iommu_id;
486
487         /* si_domain and vm domain should not get here. */
488         if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
489                 return NULL;
490
491         for_each_domain_iommu(iommu_id, domain)
492                 break;
493
494         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
495                 return NULL;
496
497         return g_iommus[iommu_id];
498 }
499
500 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
501 {
502         return sm_supported(iommu) ?
503                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
504 }
505
506 static void domain_update_iommu_coherency(struct dmar_domain *domain)
507 {
508         struct dmar_drhd_unit *drhd;
509         struct intel_iommu *iommu;
510         bool found = false;
511         int i;
512
513         domain->iommu_coherency = true;
514
515         for_each_domain_iommu(i, domain) {
516                 found = true;
517                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
518                         domain->iommu_coherency = false;
519                         break;
520                 }
521         }
522         if (found)
523                 return;
524
525         /* No hardware attached; use lowest common denominator */
526         rcu_read_lock();
527         for_each_active_iommu(iommu, drhd) {
528                 if (!iommu_paging_structure_coherency(iommu)) {
529                         domain->iommu_coherency = false;
530                         break;
531                 }
532         }
533         rcu_read_unlock();
534 }
535
536 static bool domain_update_iommu_snooping(struct intel_iommu *skip)
537 {
538         struct dmar_drhd_unit *drhd;
539         struct intel_iommu *iommu;
540         bool ret = true;
541
542         rcu_read_lock();
543         for_each_active_iommu(iommu, drhd) {
544                 if (iommu != skip) {
545                         /*
546                          * If the hardware is operating in the scalable mode,
547                          * the snooping control is always supported since we
548                          * always set PASID-table-entry.PGSNP bit if the domain
549                          * is managed outside (UNMANAGED).
550                          */
551                         if (!sm_supported(iommu) &&
552                             !ecap_sc_support(iommu->ecap)) {
553                                 ret = false;
554                                 break;
555                         }
556                 }
557         }
558         rcu_read_unlock();
559
560         return ret;
561 }
562
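/*
 * Return the largest super-page level supported by all active IOMMUs other
 * than @skip (1 = 2MiB, 2 = 1GiB), or 0 when super-pages are disabled or
 * not supported.
 */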
563 static int domain_update_iommu_superpage(struct dmar_domain *domain,
564                                          struct intel_iommu *skip)
565 {
566         struct dmar_drhd_unit *drhd;
567         struct intel_iommu *iommu;
568         int mask = 0x3;
569
570         if (!intel_iommu_superpage)
571                 return 0;
572
573         /* set iommu_superpage to the smallest common denominator */
574         rcu_read_lock();
575         for_each_active_iommu(iommu, drhd) {
576                 if (iommu != skip) {
577                         if (domain && domain_use_first_level(domain)) {
578                                 if (!cap_fl1gp_support(iommu->cap))
579                                         mask = 0x1;
580                         } else {
581                                 mask &= cap_super_page_val(iommu->cap);
582                         }
583
584                         if (!mask)
585                                 break;
586                 }
587         }
588         rcu_read_unlock();
589
590         return fls(mask);
591 }
592
593 static int domain_update_device_node(struct dmar_domain *domain)
594 {
595         struct device_domain_info *info;
596         int nid = NUMA_NO_NODE;
597
598         assert_spin_locked(&device_domain_lock);
599
600         if (list_empty(&domain->devices))
601                 return NUMA_NO_NODE;
602
603         list_for_each_entry(info, &domain->devices, link) {
604                 if (!info->dev)
605                         continue;
606
607                 /*
608                  * There may be multiple device NUMA nodes, since devices within
609                  * the same domain can sit behind different IOMMUs. There is no
610                  * perfect answer in that case, so use a first-come, first-served
611                  * policy.
612                  */
613                 nid = dev_to_node(info->dev);
614                 if (nid != NUMA_NO_NODE)
615                         break;
616         }
617
618         return nid;
619 }
620
621 static void domain_update_iotlb(struct dmar_domain *domain);
622
623 /* Return the super pagesize bitmap if supported. */
624 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
625 {
626         unsigned long bitmap = 0;
627
628         /*
629          * 1-level super page supports page size of 2MiB, 2-level super page
630          * supports page size of both 2MiB and 1GiB.
631          */
632         if (domain->iommu_superpage == 1)
633                 bitmap |= SZ_2M;
634         else if (domain->iommu_superpage == 2)
635                 bitmap |= SZ_2M | SZ_1G;
636
637         return bitmap;
638 }
639
640 /* Some capabilities may be different across iommus */
641 static void domain_update_iommu_cap(struct dmar_domain *domain)
642 {
643         domain_update_iommu_coherency(domain);
644         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
645         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
646
647         /*
648          * If RHSA is missing, default to the NUMA node of an attached
649          * device as a fallback.
650          */
651         if (domain->nid == NUMA_NO_NODE)
652                 domain->nid = domain_update_device_node(domain);
653
654         /*
655          * First-level translation restricts the input-address to a
656          * canonical address (i.e., address bits 63:N have the same
657          * value as address bit [N-1], where N is 48-bits with 4-level
658          * paging and 57-bits with 5-level paging). Hence, skip bit
659          * [N-1].
660          */
661         if (domain_use_first_level(domain))
662                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
663         else
664                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
665
666         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
667         domain_update_iotlb(domain);
668 }
669
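/*
 * Return the context entry for (@bus, @devfn), optionally allocating the
 * context table on demand.  In scalable mode each half of the root entry
 * covers only 128 devfns (devfn >= 0x80 uses the upper pointer) and every
 * device owns two consecutive 16-byte slots, hence the devfn * 2 indexing.
 */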
670 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
671                                          u8 devfn, int alloc)
672 {
673         struct root_entry *root = &iommu->root_entry[bus];
674         struct context_entry *context;
675         u64 *entry;
676
677         entry = &root->lo;
678         if (sm_supported(iommu)) {
679                 if (devfn >= 0x80) {
680                         devfn -= 0x80;
681                         entry = &root->hi;
682                 }
683                 devfn *= 2;
684         }
685         if (*entry & 1)
686                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
687         else {
688                 unsigned long phy_addr;
689                 if (!alloc)
690                         return NULL;
691
692                 context = alloc_pgtable_page(iommu->node);
693                 if (!context)
694                         return NULL;
695
696                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
697                 phy_addr = virt_to_phys((void *)context);
698                 *entry = phy_addr | 1;
699                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
700         }
701         return &context[devfn];
702 }
703
704 /**
705  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
706  *                               sub-hierarchy of a candidate PCI-PCI bridge
707  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
708  * @bridge: the candidate PCI-PCI bridge
709  *
710  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
711  */
712 static bool
713 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
714 {
715         struct pci_dev *pdev, *pbridge;
716
717         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
718                 return false;
719
720         pdev = to_pci_dev(dev);
721         pbridge = to_pci_dev(bridge);
722
723         if (pbridge->subordinate &&
724             pbridge->subordinate->number <= pdev->bus->number &&
725             pbridge->subordinate->busn_res.end >= pdev->bus->number)
726                 return true;
727
728         return false;
729 }
730
731 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
732 {
733         struct dmar_drhd_unit *drhd;
734         u32 vtbar;
735         int rc;
736
737         /* We know that this device on this chipset has its own IOMMU.
738          * If we find it under a different IOMMU, then the BIOS is lying
739          * to us. Hope that the IOMMU for this device is actually
740          * disabled, and it needs no translation...
741          */
742         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
743         if (rc) {
744                 /* "can't" happen */
745                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
746                 return false;
747         }
748         vtbar &= 0xffff0000;
749
750         /* we know that this iommu should be at offset 0xa000 from vtbar */
751         drhd = dmar_find_matched_drhd_unit(pdev);
752         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
753                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
754                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
755                 return true;
756         }
757
758         return false;
759 }
760
761 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
762 {
763         if (!iommu || iommu->drhd->ignored)
764                 return true;
765
766         if (dev_is_pci(dev)) {
767                 struct pci_dev *pdev = to_pci_dev(dev);
768
769                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
770                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
771                     quirk_ioat_snb_local_iommu(pdev))
772                         return true;
773         }
774
775         return false;
776 }
777
778 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
779 {
780         struct dmar_drhd_unit *drhd = NULL;
781         struct pci_dev *pdev = NULL;
782         struct intel_iommu *iommu;
783         struct device *tmp;
784         u16 segment = 0;
785         int i;
786
787         if (!dev)
788                 return NULL;
789
790         if (dev_is_pci(dev)) {
791                 struct pci_dev *pf_pdev;
792
793                 pdev = pci_real_dma_dev(to_pci_dev(dev));
794
795                 /* VFs aren't listed in scope tables; we need to look up
796                  * the PF instead to find the IOMMU. */
797                 pf_pdev = pci_physfn(pdev);
798                 dev = &pf_pdev->dev;
799                 segment = pci_domain_nr(pdev->bus);
800         } else if (has_acpi_companion(dev))
801                 dev = &ACPI_COMPANION(dev)->dev;
802
803         rcu_read_lock();
804         for_each_iommu(iommu, drhd) {
805                 if (pdev && segment != drhd->segment)
806                         continue;
807
808                 for_each_active_dev_scope(drhd->devices,
809                                           drhd->devices_cnt, i, tmp) {
810                         if (tmp == dev) {
811                                 /* For a VF use its original BDF# not that of the PF
812                                  * which we used for the IOMMU lookup. Strictly speaking
813                                  * we could do this for all PCI devices; we only need to
814                                  * get the BDF# from the scope table for ACPI matches. */
815                                 if (pdev && pdev->is_virtfn)
816                                         goto got_pdev;
817
818                                 if (bus && devfn) {
819                                         *bus = drhd->devices[i].bus;
820                                         *devfn = drhd->devices[i].devfn;
821                                 }
822                                 goto out;
823                         }
824
825                         if (is_downstream_to_pci_bridge(dev, tmp))
826                                 goto got_pdev;
827                 }
828
829                 if (pdev && drhd->include_all) {
830 got_pdev:
831                         if (bus && devfn) {
832                                 *bus = pdev->bus->number;
833                                 *devfn = pdev->devfn;
834                         }
835                         goto out;
836                 }
837         }
838         iommu = NULL;
839 out:
840         if (iommu_is_dummy(iommu, dev))
841                 iommu = NULL;
842
843         rcu_read_unlock();
844
845         return iommu;
846 }
847
848 static void domain_flush_cache(struct dmar_domain *domain,
849                                void *addr, int size)
850 {
851         if (!domain->iommu_coherency)
852                 clflush_cache_range(addr, size);
853 }
854
855 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
856 {
857         struct context_entry *context;
858         int ret = 0;
859         unsigned long flags;
860
861         spin_lock_irqsave(&iommu->lock, flags);
862         context = iommu_context_addr(iommu, bus, devfn, 0);
863         if (context)
864                 ret = context_present(context);
865         spin_unlock_irqrestore(&iommu->lock, flags);
866         return ret;
867 }
868
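/*
 * Free every context table still referenced from the root table (both
 * halves of each root entry in scalable mode), then the root table itself.
 */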
869 static void free_context_table(struct intel_iommu *iommu)
870 {
871         int i;
872         unsigned long flags;
873         struct context_entry *context;
874
875         spin_lock_irqsave(&iommu->lock, flags);
876         if (!iommu->root_entry) {
877                 goto out;
878         }
879         for (i = 0; i < ROOT_ENTRY_NR; i++) {
880                 context = iommu_context_addr(iommu, i, 0, 0);
881                 if (context)
882                         free_pgtable_page(context);
883
884                 if (!sm_supported(iommu))
885                         continue;
886
887                 context = iommu_context_addr(iommu, i, 0x80, 0);
888                 if (context)
889                         free_pgtable_page(context);
890
891         }
892         free_pgtable_page(iommu->root_entry);
893         iommu->root_entry = NULL;
894 out:
895         spin_unlock_irqrestore(&iommu->lock, flags);
896 }
897
898 #ifdef CONFIG_DMAR_DEBUG
899 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
900 {
901         struct device_domain_info *info;
902         struct dma_pte *parent, *pte;
903         struct dmar_domain *domain;
904         int offset, level;
905
906         info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
907         if (!info || !info->domain) {
908                 pr_info("device [%02x:%02x.%d] not probed\n",
909                         bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
910                 return;
911         }
912
913         domain = info->domain;
914         level = agaw_to_level(domain->agaw);
915         parent = domain->pgd;
916         if (!parent) {
917                 pr_info("no page table setup\n");
918                 return;
919         }
920
921         while (1) {
922                 offset = pfn_level_offset(pfn, level);
923                 pte = &parent[offset];
924                 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
925                         pr_info("PTE not present at level %d\n", level);
926                         break;
927                 }
928
929                 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
930
931                 if (level == 1)
932                         break;
933
934                 parent = phys_to_virt(dma_pte_addr(pte));
935                 level--;
936         }
937 }
938
939 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
940                           unsigned long long addr, u32 pasid)
941 {
942         struct pasid_dir_entry *dir, *pde;
943         struct pasid_entry *entries, *pte;
944         struct context_entry *ctx_entry;
945         struct root_entry *rt_entry;
946         u8 devfn = source_id & 0xff;
947         u8 bus = source_id >> 8;
948         int i, dir_index, index;
949
950         pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
951
952         /* root entry dump */
953         rt_entry = &iommu->root_entry[bus];
954         if (!rt_entry) {
955                 pr_info("root table entry is not present\n");
956                 return;
957         }
958
959         if (sm_supported(iommu))
960                 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
961                         rt_entry->hi, rt_entry->lo);
962         else
963                 pr_info("root entry: 0x%016llx\n", rt_entry->lo);
964
965         /* context entry dump */
966         ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
967         if (!ctx_entry) {
968                 pr_info("context table entry is not present\n");
969                 return;
970         }
971
972         pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
973                 ctx_entry->hi, ctx_entry->lo);
974
975         /* legacy mode does not require PASID entries */
976         if (!sm_supported(iommu))
977                 goto pgtable_walk;
978
979         /* get the pointer to pasid directory entry */
980         dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
981         if (!dir) {
982                 pr_info("pasid directory entry is not present\n");
983                 return;
984         }
985         /* For request-without-pasid, get the pasid from context entry */
986         if (intel_iommu_sm && pasid == INVALID_IOASID)
987                 pasid = PASID_RID2PASID;
988
989         dir_index = pasid >> PASID_PDE_SHIFT;
990         pde = &dir[dir_index];
991         pr_info("pasid dir entry: 0x%016llx\n", pde->val);
992
993         /* get the pointer to the pasid table entry */
994         entries = get_pasid_table_from_pde(pde);
995         if (!entries) {
996                 pr_info("pasid table entry is not present\n");
997                 return;
998         }
999         index = pasid & PASID_PTE_MASK;
1000         pte = &entries[index];
1001         for (i = 0; i < ARRAY_SIZE(pte->val); i++)
1002                 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
1003
1004 pgtable_walk:
1005         pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
1006 }
1007 #endif
1008
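/*
 * Walk the page table to the PTE mapping @pfn, allocating any missing
 * intermediate levels on the way down.  With *target_level == 0 the walk
 * stops at the first leaf (a present super-page or a non-present entry)
 * and the level reached is returned through *target_level.
 */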
1009 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1010                                       unsigned long pfn, int *target_level)
1011 {
1012         struct dma_pte *parent, *pte;
1013         int level = agaw_to_level(domain->agaw);
1014         int offset;
1015
1016         BUG_ON(!domain->pgd);
1017
1018         if (!domain_pfn_supported(domain, pfn))
1019                 /* Address beyond IOMMU's addressing capabilities. */
1020                 return NULL;
1021
1022         parent = domain->pgd;
1023
1024         while (1) {
1025                 void *tmp_page;
1026
1027                 offset = pfn_level_offset(pfn, level);
1028                 pte = &parent[offset];
1029                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1030                         break;
1031                 if (level == *target_level)
1032                         break;
1033
1034                 if (!dma_pte_present(pte)) {
1035                         uint64_t pteval;
1036
1037                         tmp_page = alloc_pgtable_page(domain->nid);
1038
1039                         if (!tmp_page)
1040                                 return NULL;
1041
1042                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1043                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1044                         if (domain_use_first_level(domain)) {
1045                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1046                                 if (iommu_is_dma_domain(&domain->domain))
1047                                         pteval |= DMA_FL_PTE_ACCESS;
1048                         }
1049                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1050                                 /* Someone else set it while we were thinking; use theirs. */
1051                                 free_pgtable_page(tmp_page);
1052                         else
1053                                 domain_flush_cache(domain, pte, sizeof(*pte));
1054                 }
1055                 if (level == 1)
1056                         break;
1057
1058                 parent = phys_to_virt(dma_pte_addr(pte));
1059                 level--;
1060         }
1061
1062         if (!*target_level)
1063                 *target_level = level;
1064
1065         return pte;
1066 }
1067
1068 /* return address's pte at specific level */
1069 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1070                                          unsigned long pfn,
1071                                          int level, int *large_page)
1072 {
1073         struct dma_pte *parent, *pte;
1074         int total = agaw_to_level(domain->agaw);
1075         int offset;
1076
1077         parent = domain->pgd;
1078         while (level <= total) {
1079                 offset = pfn_level_offset(pfn, total);
1080                 pte = &parent[offset];
1081                 if (level == total)
1082                         return pte;
1083
1084                 if (!dma_pte_present(pte)) {
1085                         *large_page = total;
1086                         break;
1087                 }
1088
1089                 if (dma_pte_superpage(pte)) {
1090                         *large_page = total;
1091                         return pte;
1092                 }
1093
1094                 parent = phys_to_virt(dma_pte_addr(pte));
1095                 total--;
1096         }
1097         return NULL;
1098 }
1099
1100 /* clear last level pte, a tlb flush should be followed */
1101 static void dma_pte_clear_range(struct dmar_domain *domain,
1102                                 unsigned long start_pfn,
1103                                 unsigned long last_pfn)
1104 {
1105         unsigned int large_page;
1106         struct dma_pte *first_pte, *pte;
1107
1108         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1109         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1110         BUG_ON(start_pfn > last_pfn);
1111
1112         /* we don't need lock here; nobody else touches the iova range */
1113         do {
1114                 large_page = 1;
1115                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1116                 if (!pte) {
1117                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1118                         continue;
1119                 }
1120                 do {
1121                         dma_clear_pte(pte);
1122                         start_pfn += lvl_to_nr_pages(large_page);
1123                         pte++;
1124                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1125
1126                 domain_flush_cache(domain, first_pte,
1127                                    (void *)pte - (void *)first_pte);
1128
1129         } while (start_pfn && start_pfn <= last_pfn);
1130 }
1131
1132 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1133                                int retain_level, struct dma_pte *pte,
1134                                unsigned long pfn, unsigned long start_pfn,
1135                                unsigned long last_pfn)
1136 {
1137         pfn = max(start_pfn, pfn);
1138         pte = &pte[pfn_level_offset(pfn, level)];
1139
1140         do {
1141                 unsigned long level_pfn;
1142                 struct dma_pte *level_pte;
1143
1144                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1145                         goto next;
1146
1147                 level_pfn = pfn & level_mask(level);
1148                 level_pte = phys_to_virt(dma_pte_addr(pte));
1149
1150                 if (level > 2) {
1151                         dma_pte_free_level(domain, level - 1, retain_level,
1152                                            level_pte, level_pfn, start_pfn,
1153                                            last_pfn);
1154                 }
1155
1156                 /*
1157                  * Free the page table if we're below the level we want to
1158                  * retain and the range covers the entire table.
1159                  */
1160                 if (level < retain_level && !(start_pfn > level_pfn ||
1161                       last_pfn < level_pfn + level_size(level) - 1)) {
1162                         dma_clear_pte(pte);
1163                         domain_flush_cache(domain, pte, sizeof(*pte));
1164                         free_pgtable_page(level_pte);
1165                 }
1166 next:
1167                 pfn += level_size(level);
1168         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1169 }
1170
1171 /*
1172  * clear last level (leaf) ptes and free page table pages below the
1173  * level we wish to keep intact.
1174  */
1175 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1176                                    unsigned long start_pfn,
1177                                    unsigned long last_pfn,
1178                                    int retain_level)
1179 {
1180         dma_pte_clear_range(domain, start_pfn, last_pfn);
1181
1182         /* We don't need lock here; nobody else touches the iova range */
1183         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1184                            domain->pgd, 0, start_pfn, last_pfn);
1185
1186         /* free pgd */
1187         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1188                 free_pgtable_page(domain->pgd);
1189                 domain->pgd = NULL;
1190         }
1191 }
1192
1193 /* When a page at a given level is being unlinked from its parent, we don't
1194    need to *modify* it at all. All we need to do is make a list of all the
1195    pages which can be freed just as soon as we've flushed the IOTLB and we
1196    know the hardware page-walk will no longer touch them.
1197    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1198    be freed. */
1199 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1200                                     int level, struct dma_pte *pte,
1201                                     struct list_head *freelist)
1202 {
1203         struct page *pg;
1204
1205         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1206         list_add_tail(&pg->lru, freelist);
1207
1208         if (level == 1)
1209                 return;
1210
1211         pte = page_address(pg);
1212         do {
1213                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1214                         dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1215                 pte++;
1216         } while (!first_pte_in_page(pte));
1217 }
1218
1219 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1220                                 struct dma_pte *pte, unsigned long pfn,
1221                                 unsigned long start_pfn, unsigned long last_pfn,
1222                                 struct list_head *freelist)
1223 {
1224         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1225
1226         pfn = max(start_pfn, pfn);
1227         pte = &pte[pfn_level_offset(pfn, level)];
1228
1229         do {
1230                 unsigned long level_pfn = pfn & level_mask(level);
1231
1232                 if (!dma_pte_present(pte))
1233                         goto next;
1234
1235                 /* If range covers entire pagetable, free it */
1236                 if (start_pfn <= level_pfn &&
1237                     last_pfn >= level_pfn + level_size(level) - 1) {
1238                         /* These subordinate page tables are going away entirely. Don't
1239                            bother to clear them; we're just going to *free* them. */
1240                         if (level > 1 && !dma_pte_superpage(pte))
1241                                 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1242
1243                         dma_clear_pte(pte);
1244                         if (!first_pte)
1245                                 first_pte = pte;
1246                         last_pte = pte;
1247                 } else if (level > 1) {
1248                         /* Recurse down into a level that isn't *entirely* obsolete */
1249                         dma_pte_clear_level(domain, level - 1,
1250                                             phys_to_virt(dma_pte_addr(pte)),
1251                                             level_pfn, start_pfn, last_pfn,
1252                                             freelist);
1253                 }
1254 next:
1255                 pfn = level_pfn + level_size(level);
1256         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1257
1258         if (first_pte)
1259                 domain_flush_cache(domain, first_pte,
1260                                    (void *)++last_pte - (void *)first_pte);
1261 }
1262
1263 /* We can't just free the pages because the IOMMU may still be walking
1264    the page tables, and may have cached the intermediate levels. The
1265    pages can only be freed after the IOTLB flush has been done. */
1266 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1267                          unsigned long last_pfn, struct list_head *freelist)
1268 {
1269         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1270         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1271         BUG_ON(start_pfn > last_pfn);
1272
1273         /* we don't need lock here; nobody else touches the iova range */
1274         dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1275                             domain->pgd, 0, start_pfn, last_pfn, freelist);
1276
1277         /* free pgd */
1278         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1279                 struct page *pgd_page = virt_to_page(domain->pgd);
1280                 list_add_tail(&pgd_page->lru, freelist);
1281                 domain->pgd = NULL;
1282         }
1283 }
1284
1285 /* iommu handling */
1286 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1287 {
1288         struct root_entry *root;
1289         unsigned long flags;
1290
1291         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1292         if (!root) {
1293                 pr_err("Allocating root entry for %s failed\n",
1294                         iommu->name);
1295                 return -ENOMEM;
1296         }
1297
1298         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1299
1300         spin_lock_irqsave(&iommu->lock, flags);
1301         iommu->root_entry = root;
1302         spin_unlock_irqrestore(&iommu->lock, flags);
1303
1304         return 0;
1305 }
1306
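/*
 * Program the root (or scalable-mode) table address into the hardware and
 * issue the global context-cache, PASID-cache and IOTLB invalidations that
 * must follow a root-table pointer update.
 */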
1307 static void iommu_set_root_entry(struct intel_iommu *iommu)
1308 {
1309         u64 addr;
1310         u32 sts;
1311         unsigned long flag;
1312
1313         addr = virt_to_phys(iommu->root_entry);
1314         if (sm_supported(iommu))
1315                 addr |= DMA_RTADDR_SMT;
1316
1317         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1318         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1319
1320         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1321
1322         /* Make sure hardware complete it */
1323         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1324                       readl, (sts & DMA_GSTS_RTPS), sts);
1325
1326         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1327
1328         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1329         if (sm_supported(iommu))
1330                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1331         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1332 }
1333
1334 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1335 {
1336         u32 val;
1337         unsigned long flag;
1338
1339         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1340                 return;
1341
1342         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1343         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1344
1345         /* Make sure hardware complete it */
1346         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1347                       readl, (!(val & DMA_GSTS_WBFS)), val);
1348
1349         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1350 }
1351
1352 /* return value determines if we need a write buffer flush */
1353 static void __iommu_flush_context(struct intel_iommu *iommu,
1354                                   u16 did, u16 source_id, u8 function_mask,
1355                                   u64 type)
1356 {
1357         u64 val = 0;
1358         unsigned long flag;
1359
1360         switch (type) {
1361         case DMA_CCMD_GLOBAL_INVL:
1362                 val = DMA_CCMD_GLOBAL_INVL;
1363                 break;
1364         case DMA_CCMD_DOMAIN_INVL:
1365                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1366                 break;
1367         case DMA_CCMD_DEVICE_INVL:
1368                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1369                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1370                 break;
1371         default:
1372                 BUG();
1373         }
1374         val |= DMA_CCMD_ICC;
1375
1376         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1377         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1378
1379         /* Make sure hardware complete it */
1380         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1381                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1382
1383         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1384 }
1385
1386 /* return value determines if we need a write buffer flush */
1387 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1388                                 u64 addr, unsigned int size_order, u64 type)
1389 {
1390         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1391         u64 val = 0, val_iva = 0;
1392         unsigned long flag;
1393
1394         switch (type) {
1395         case DMA_TLB_GLOBAL_FLUSH:
1396                 /* global flush doesn't need set IVA_REG */
1397                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1398                 break;
1399         case DMA_TLB_DSI_FLUSH:
1400                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1401                 break;
1402         case DMA_TLB_PSI_FLUSH:
1403                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1404                 /* IH bit is passed in as part of address */
1405                 val_iva = size_order | addr;
1406                 break;
1407         default:
1408                 BUG();
1409         }
1410         /* Note: set drain read/write */
1411 #if 0
1412         /*
1413          * This is probably only needed to be extra safe; it looks like we
1414          * can ignore it without any impact.
1415          */
1416         if (cap_read_drain(iommu->cap))
1417                 val |= DMA_TLB_READ_DRAIN;
1418 #endif
1419         if (cap_write_drain(iommu->cap))
1420                 val |= DMA_TLB_WRITE_DRAIN;
1421
1422         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1423         /* Note: Only uses first TLB reg currently */
1424         if (val_iva)
1425                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1426         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1427
1428         /* Make sure hardware complete it */
1429         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1430                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1431
1432         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1433
1434         /* check IOTLB invalidation granularity */
1435         if (DMA_TLB_IAIG(val) == 0)
1436                 pr_err("Flush IOTLB failed\n");
1437         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1438                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1439                         (unsigned long long)DMA_TLB_IIRG(type),
1440                         (unsigned long long)DMA_TLB_IAIG(val));
1441 }
1442
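/*
 * Find the device attached to @domain at (@bus, @devfn) behind @iommu and
 * return it only if it supports ATS, so that the caller can enable and
 * later invalidate its device IOTLB.
 */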
1443 static struct device_domain_info *
1444 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1445                          u8 bus, u8 devfn)
1446 {
1447         struct device_domain_info *info;
1448
1449         assert_spin_locked(&device_domain_lock);
1450
1451         if (!iommu->qi)
1452                 return NULL;
1453
1454         list_for_each_entry(info, &domain->devices, link)
1455                 if (info->iommu == iommu && info->bus == bus &&
1456                     info->devfn == devfn) {
1457                         if (info->ats_supported && info->dev)
1458                                 return info;
1459                         break;
1460                 }
1461
1462         return NULL;
1463 }
1464
1465 static void domain_update_iotlb(struct dmar_domain *domain)
1466 {
1467         struct device_domain_info *info;
1468         bool has_iotlb_device = false;
1469
1470         assert_spin_locked(&device_domain_lock);
1471
1472         list_for_each_entry(info, &domain->devices, link)
1473                 if (info->ats_enabled) {
1474                         has_iotlb_device = true;
1475                         break;
1476                 }
1477
1478         domain->has_iotlb_device = has_iotlb_device;
1479 }
1480
1481 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1482 {
1483         struct pci_dev *pdev;
1484
1485         assert_spin_locked(&device_domain_lock);
1486
1487         if (!info || !dev_is_pci(info->dev))
1488                 return;
1489
1490         pdev = to_pci_dev(info->dev);
1491         /* For IOMMUs that support device IOTLB throttling (DIT), we assign a
1492          * PFSID to the invalidation descriptors of a VF so that the IOMMU HW
1493          * can gauge queue depth at the PF level. If DIT is not supported,
1494          * PFSID is treated as reserved and must be set to 0.
1495          */
1496         if (!ecap_dit(info->iommu->ecap))
1497                 info->pfsid = 0;
1498         else {
1499                 struct pci_dev *pf_pdev;
1500
1501                 /* pdev will be returned if device is not a vf */
1502                 pf_pdev = pci_physfn(pdev);
1503                 info->pfsid = pci_dev_id(pf_pdev);
1504         }
1505
1506 #ifdef CONFIG_INTEL_IOMMU_SVM
1507         /* The PCIe spec, in its wisdom, declares that the behaviour of the
1508            device is undefined if you enable PASID support after ATS support.
1509            So always enable PASID support on devices which
1510            have it, even if we can't yet know if we're ever going to
1511            use it. */
1512         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1513                 info->pasid_enabled = 1;
1514
1515         if (info->pri_supported &&
1516             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1517             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1518                 info->pri_enabled = 1;
1519 #endif
1520         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1521             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1522                 info->ats_enabled = 1;
1523                 domain_update_iotlb(info->domain);
1524                 info->ats_qdep = pci_ats_queue_depth(pdev);
1525         }
1526 }
1527
1528 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1529 {
1530         struct pci_dev *pdev;
1531
1532         assert_spin_locked(&device_domain_lock);
1533
1534         if (!dev_is_pci(info->dev))
1535                 return;
1536
1537         pdev = to_pci_dev(info->dev);
1538
1539         if (info->ats_enabled) {
1540                 pci_disable_ats(pdev);
1541                 info->ats_enabled = 0;
1542                 domain_update_iotlb(info->domain);
1543         }
1544 #ifdef CONFIG_INTEL_IOMMU_SVM
1545         if (info->pri_enabled) {
1546                 pci_disable_pri(pdev);
1547                 info->pri_enabled = 0;
1548         }
1549         if (info->pasid_enabled) {
1550                 pci_disable_pasid(pdev);
1551                 info->pasid_enabled = 0;
1552         }
1553 #endif
1554 }
1555
1556 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1557                                     u64 addr, unsigned int mask)
1558 {
1559         u16 sid, qdep;
1560
1561         if (!info || !info->ats_enabled)
1562                 return;
1563
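        /* The source-id is the PCI bus number in bits 15:8 and devfn in bits 7:0. */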
1564         sid = info->bus << 8 | info->devfn;
1565         qdep = info->ats_qdep;
1566         qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1567                            qdep, addr, mask);
1568 }
1569
1570 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1571                                   u64 addr, unsigned mask)
1572 {
1573         unsigned long flags;
1574         struct device_domain_info *info;
1575
1576         if (!domain->has_iotlb_device)
1577                 return;
1578
1579         spin_lock_irqsave(&device_domain_lock, flags);
1580         list_for_each_entry(info, &domain->devices, link)
1581                 __iommu_flush_dev_iotlb(info, addr, mask);
1582
1583         spin_unlock_irqrestore(&device_domain_lock, flags);
1584 }
1585
1586 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1587                                   struct dmar_domain *domain,
1588                                   unsigned long pfn, unsigned int pages,
1589                                   int ih, int map)
1590 {
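        /*
         * "mask" is the number of 4KiB pages to invalidate, rounded up to a
         * power of two and expressed as its log2 (the address-mask form used
         * for page-selective invalidation). "ih" is folded into bit 6 of the
         * address as the invalidation hint.
         */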
1591         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1592         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1593         u16 did = domain->iommu_did[iommu->seq_id];
1594
1595         BUG_ON(pages == 0);
1596
1597         if (ih)
1598                 ih = 1 << 6;
1599
1600         if (domain_use_first_level(domain)) {
1601                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1602         } else {
1603                 /*
1604                  * Fall back to domain-selective flush if PSI is not supported
1605                  * or the size is too big. PSI requires the page count to be a
1606                  * power of two and the base address to be naturally aligned to it.
1607                  */
1608                 if (!cap_pgsel_inv(iommu->cap) ||
1609                     mask > cap_max_amask_val(iommu->cap))
1610                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1611                                                         DMA_TLB_DSI_FLUSH);
1612                 else
1613                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1614                                                         DMA_TLB_PSI_FLUSH);
1615         }
1616
1617         /*
1618          * In caching mode, changes of pages from non-present to present require a
1619          * flush. However, the device IOTLB doesn't need to be flushed in this case.
1620          */
1621         if (!cap_caching_mode(iommu->cap) || !map)
1622                 iommu_flush_dev_iotlb(domain, addr, mask);
1623 }
1624
1625 /* Notification for newly created mappings */
1626 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1627                                         struct dmar_domain *domain,
1628                                         unsigned long pfn, unsigned int pages)
1629 {
1630         /*
1631          * It's a non-present to present mapping. Only flush if we are in
1632          * caching mode and using second-level translation.
1633          */
1634         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1635                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1636         else
1637                 iommu_flush_write_buffer(iommu);
1638 }
1639
1640 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1641 {
1642         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1643         int idx;
1644
1645         for_each_domain_iommu(idx, dmar_domain) {
1646                 struct intel_iommu *iommu = g_iommus[idx];
1647                 u16 did = dmar_domain->iommu_did[iommu->seq_id];
1648
1649                 if (domain_use_first_level(dmar_domain))
1650                         qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1651                 else
1652                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1653                                                  DMA_TLB_DSI_FLUSH);
1654
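                /*
                 * Address 0 with MAX_AGAW_PFN_WIDTH as the mask covers the
                 * whole address space, i.e. a full device-TLB invalidation.
                 */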
1655                 if (!cap_caching_mode(iommu->cap))
1656                         iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1657         }
1658 }
1659
1660 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1661 {
1662         u32 pmen;
1663         unsigned long flags;
1664
1665         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1666                 return;
1667
1668         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1669         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1670         pmen &= ~DMA_PMEN_EPM;
1671         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1672
1673         /* wait for the protected region status bit to clear */
1674         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1675                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1676
1677         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1678 }
1679
1680 static void iommu_enable_translation(struct intel_iommu *iommu)
1681 {
1682         u32 sts;
1683         unsigned long flags;
1684
1685         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1686         iommu->gcmd |= DMA_GCMD_TE;
1687         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1688
1689         /* Make sure hardware completes it */
1690         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1691                       readl, (sts & DMA_GSTS_TES), sts);
1692
1693         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1694 }
1695
1696 static void iommu_disable_translation(struct intel_iommu *iommu)
1697 {
1698         u32 sts;
1699         unsigned long flag;
1700
1701         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1702             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1703                 return;
1704
1705         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1706         iommu->gcmd &= ~DMA_GCMD_TE;
1707         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1708
1709         /* Make sure hardware completes it */
1710         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1711                       readl, (!(sts & DMA_GSTS_TES)), sts);
1712
1713         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1714 }
1715
1716 static int iommu_init_domains(struct intel_iommu *iommu)
1717 {
1718         u32 ndomains;
1719
1720         ndomains = cap_ndoms(iommu->cap);
1721         pr_debug("%s: Number of Domains supported <%d>\n",
1722                  iommu->name, ndomains);
1723
1724         spin_lock_init(&iommu->lock);
1725
1726         iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1727         if (!iommu->domain_ids)
1728                 return -ENOMEM;
1729
1730         /*
1731          * If Caching mode is set, then invalid translations are tagged
1732          * with domain-id 0, hence we need to pre-allocate it. We also
1733          * use domain-id 0 as a marker for non-allocated domain-id, so
1734          * make sure it is not used for a real domain.
1735          */
1736         set_bit(0, iommu->domain_ids);
1737
1738         /*
1739          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1740          * entry for first-level or pass-through translation modes should
1741          * be programmed with a domain id different from those used for
1742          * second-level or nested translation. We reserve a domain id for
1743          * this purpose.
1744          */
1745         if (sm_supported(iommu))
1746                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1747
1748         return 0;
1749 }
1750
1751 static void disable_dmar_iommu(struct intel_iommu *iommu)
1752 {
1753         struct device_domain_info *info, *tmp;
1754         unsigned long flags;
1755
1756         if (!iommu->domain_ids)
1757                 return;
1758
1759         spin_lock_irqsave(&device_domain_lock, flags);
1760         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1761                 if (info->iommu != iommu)
1762                         continue;
1763
1764                 if (!info->dev || !info->domain)
1765                         continue;
1766
1767                 __dmar_remove_one_dev_info(info);
1768         }
1769         spin_unlock_irqrestore(&device_domain_lock, flags);
1770
1771         if (iommu->gcmd & DMA_GCMD_TE)
1772                 iommu_disable_translation(iommu);
1773 }
1774
1775 static void free_dmar_iommu(struct intel_iommu *iommu)
1776 {
1777         if (iommu->domain_ids) {
1778                 bitmap_free(iommu->domain_ids);
1779                 iommu->domain_ids = NULL;
1780         }
1781
1782         g_iommus[iommu->seq_id] = NULL;
1783
1784         /* free context mapping */
1785         free_context_table(iommu);
1786
1787 #ifdef CONFIG_INTEL_IOMMU_SVM
1788         if (pasid_supported(iommu)) {
1789                 if (ecap_prs(iommu->ecap))
1790                         intel_svm_finish_prq(iommu);
1791         }
1792         if (vccap_pasid(iommu->vccap))
1793                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1794
1795 #endif
1796 }
1797
1798 /*
1799  * Check and return whether the first level is used by default for
1800  * DMA translation.
1801  */
1802 static bool first_level_by_default(unsigned int type)
1803 {
1804         /* Only SL is available in legacy mode */
1805         if (!scalable_mode_support())
1806                 return false;
1807
1808         /* Only one level (either FL or SL) is available, so just use it */
1809         if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1810                 return intel_cap_flts_sanity();
1811
1812         /* Both levels are available, decide it based on domain type */
1813         return type != IOMMU_DOMAIN_UNMANAGED;
1814 }
1815
1816 static struct dmar_domain *alloc_domain(unsigned int type)
1817 {
1818         struct dmar_domain *domain;
1819
1820         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1821         if (!domain)
1822                 return NULL;
1823
1824         domain->nid = NUMA_NO_NODE;
1825         if (first_level_by_default(type))
1826                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1827         domain->has_iotlb_device = false;
1828         INIT_LIST_HEAD(&domain->devices);
1829
1830         return domain;
1831 }
1832
1833 /* Must be called with device_domain_lock and iommu->lock held */
1834 static int domain_attach_iommu(struct dmar_domain *domain,
1835                                struct intel_iommu *iommu)
1836 {
1837         unsigned long ndomains;
1838         int num;
1839
1840         assert_spin_locked(&device_domain_lock);
1841         assert_spin_locked(&iommu->lock);
1842
1843         domain->iommu_refcnt[iommu->seq_id] += 1;
1844         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1845                 ndomains = cap_ndoms(iommu->cap);
1846                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1847
1848                 if (num >= ndomains) {
1849                         pr_err("%s: No free domain ids\n", iommu->name);
1850                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1851                         return -ENOSPC;
1852                 }
1853
1854                 set_bit(num, iommu->domain_ids);
1855                 domain->iommu_did[iommu->seq_id] = num;
1856                 domain->nid                      = iommu->node;
1857                 domain_update_iommu_cap(domain);
1858         }
1859
1860         return 0;
1861 }
1862
1863 static void domain_detach_iommu(struct dmar_domain *domain,
1864                                 struct intel_iommu *iommu)
1865 {
1866         int num;
1867
1868         assert_spin_locked(&device_domain_lock);
1869         assert_spin_locked(&iommu->lock);
1870
1871         domain->iommu_refcnt[iommu->seq_id] -= 1;
1872         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1873                 num = domain->iommu_did[iommu->seq_id];
1874                 clear_bit(num, iommu->domain_ids);
1875                 domain_update_iommu_cap(domain);
1876                 domain->iommu_did[iommu->seq_id] = 0;
1877         }
1878 }
1879
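/*
 * Round the guest address width up to the next width for which (gaw - 12)
 * is a multiple of 9 (one 9-bit page-table level per step), capped at 64.
 * For example (illustrative): gaw = 48 stays 48, while gaw = 42 rounds up
 * to 48.
 */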
1880 static inline int guestwidth_to_adjustwidth(int gaw)
1881 {
1882         int agaw;
1883         int r = (gaw - 12) % 9;
1884
1885         if (r == 0)
1886                 agaw = gaw;
1887         else
1888                 agaw = gaw + 9 - r;
1889         if (agaw > 64)
1890                 agaw = 64;
1891         return agaw;
1892 }
1893
1894 static void domain_exit(struct dmar_domain *domain)
1895 {
1896
1897         /* Remove associated devices and clear attached or cached domains */
1898         domain_remove_dev_info(domain);
1899
1900         if (domain->pgd) {
1901                 LIST_HEAD(freelist);
1902
1903                 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1904                 put_pages_list(&freelist);
1905         }
1906
1907         kfree(domain);
1908 }
1909
1910 /*
1911  * Get the PASID directory size for scalable mode context entry.
1912  * Value of X in the PDTS field of a scalable mode context entry
1913  * indicates PASID directory with 2^(X + 7) entries.
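 * For example (illustrative), a max_pde value with only bit 14 set gives
 * pds = 14 and a return value of 7, i.e. X = 7 encodes a directory of
 * 2^14 entries.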
1914  */
1915 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1916 {
1917         unsigned long pds, max_pde;
1918
1919         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1920         pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1921         if (pds < 7)
1922                 return 0;
1923
1924         return pds - 7;
1925 }
1926
1927 /*
1928  * Set the RID_PASID field of a scalable mode context entry. The
1929  * IOMMU hardware will use the PASID value set in this field to
1930  * translate DMA requests that arrive without a PASID.
1931  */
1932 static inline void
1933 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1934 {
1935         context->hi |= pasid & ((1 << 20) - 1);
1936 }
1937
1938 /*
1939  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1940  * entry.
1941  */
1942 static inline void context_set_sm_dte(struct context_entry *context)
1943 {
1944         context->lo |= (1 << 2);
1945 }
1946
1947 /*
1948  * Set the PRE(Page Request Enable) field of a scalable mode context
1949  * entry.
1950  */
1951 static inline void context_set_sm_pre(struct context_entry *context)
1952 {
1953         context->lo |= (1 << 4);
1954 }
1955
1956 /* Convert value to context PASID directory size field coding. */
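/* The encoded value lands in bits 11:9 of the low 64 bits of the context entry. */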
1957 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1958
1959 static int domain_context_mapping_one(struct dmar_domain *domain,
1960                                       struct intel_iommu *iommu,
1961                                       struct pasid_table *table,
1962                                       u8 bus, u8 devfn)
1963 {
1964         u16 did = domain->iommu_did[iommu->seq_id];
1965         int translation = CONTEXT_TT_MULTI_LEVEL;
1966         struct device_domain_info *info = NULL;
1967         struct context_entry *context;
1968         unsigned long flags;
1969         int ret;
1970
1971         WARN_ON(did == 0);
1972
1973         if (hw_pass_through && domain_type_is_si(domain))
1974                 translation = CONTEXT_TT_PASS_THROUGH;
1975
1976         pr_debug("Set context mapping for %02x:%02x.%d\n",
1977                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1978
1979         BUG_ON(!domain->pgd);
1980
1981         spin_lock_irqsave(&device_domain_lock, flags);
1982         spin_lock(&iommu->lock);
1983
1984         ret = -ENOMEM;
1985         context = iommu_context_addr(iommu, bus, devfn, 1);
1986         if (!context)
1987                 goto out_unlock;
1988
1989         ret = 0;
1990         if (context_present(context))
1991                 goto out_unlock;
1992
1993         /*
1994          * For kdump cases, old valid entries may be cached due to the
1995          * in-flight DMA and copied pgtable, but there is no unmapping
1996          * behaviour for them, thus we need an explicit cache flush for
1997          * the newly-mapped device. For kdump, at this point, the device
1998          * is supposed to have finished its reset at its driver's probe stage,
1999          * so no in-flight DMA will exist, and we don't need to worry about it
2000          * hereafter.
2001          */
2002         if (context_copied(context)) {
2003                 u16 did_old = context_domain_id(context);
2004
2005                 if (did_old < cap_ndoms(iommu->cap)) {
2006                         iommu->flush.flush_context(iommu, did_old,
2007                                                    (((u16)bus) << 8) | devfn,
2008                                                    DMA_CCMD_MASK_NOBIT,
2009                                                    DMA_CCMD_DEVICE_INVL);
2010                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2011                                                  DMA_TLB_DSI_FLUSH);
2012                 }
2013         }
2014
2015         context_clear_entry(context);
2016
2017         if (sm_supported(iommu)) {
2018                 unsigned long pds;
2019
2020                 WARN_ON(!table);
2021
2022                 /* Setup the PASID DIR pointer: */
2023                 pds = context_get_sm_pds(table);
2024                 context->lo = (u64)virt_to_phys(table->table) |
2025                                 context_pdts(pds);
2026
2027                 /* Setup the RID_PASID field: */
2028                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2029
2030                 /*
2031                  * Setup the Device-TLB enable bit and Page request
2032                  * Enable bit:
2033                  */
2034                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2035                 if (info && info->ats_supported)
2036                         context_set_sm_dte(context);
2037                 if (info && info->pri_supported)
2038                         context_set_sm_pre(context);
2039         } else {
2040                 struct dma_pte *pgd = domain->pgd;
2041                 int agaw;
2042
2043                 context_set_domain_id(context, did);
2044
2045                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2046                         /*
2047                          * Skip top levels of page tables for an IOMMU which has
2048                          * a smaller agaw than the default. Unnecessary for PT mode.
2049                          */
2050                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2051                                 ret = -ENOMEM;
2052                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2053                                 if (!dma_pte_present(pgd))
2054                                         goto out_unlock;
2055                         }
2056
2057                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2058                         if (info && info->ats_supported)
2059                                 translation = CONTEXT_TT_DEV_IOTLB;
2060                         else
2061                                 translation = CONTEXT_TT_MULTI_LEVEL;
2062
2063                         context_set_address_root(context, virt_to_phys(pgd));
2064                         context_set_address_width(context, agaw);
2065                 } else {
2066                         /*
2067                          * In pass through mode, AW must be programmed to
2068                          * indicate the largest AGAW value supported by
2069                          * hardware. And ASR is ignored by hardware.
2070                          */
2071                         context_set_address_width(context, iommu->msagaw);
2072                 }
2073
2074                 context_set_translation_type(context, translation);
2075         }
2076
2077         context_set_fault_enable(context);
2078         context_set_present(context);
2079         if (!ecap_coherent(iommu->ecap))
2080                 clflush_cache_range(context, sizeof(*context));
2081
2082         /*
2083          * It's a non-present to present mapping. If hardware doesn't cache
2084          * non-present entries we only need to flush the write-buffer. If it
2085          * _does_ cache non-present entries, then it does so in the special
2086          * domain #0, which we have to flush:
2087          */
2088         if (cap_caching_mode(iommu->cap)) {
2089                 iommu->flush.flush_context(iommu, 0,
2090                                            (((u16)bus) << 8) | devfn,
2091                                            DMA_CCMD_MASK_NOBIT,
2092                                            DMA_CCMD_DEVICE_INVL);
2093                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2094         } else {
2095                 iommu_flush_write_buffer(iommu);
2096         }
2097         iommu_enable_dev_iotlb(info);
2098
2099         ret = 0;
2100
2101 out_unlock:
2102         spin_unlock(&iommu->lock);
2103         spin_unlock_irqrestore(&device_domain_lock, flags);
2104
2105         return ret;
2106 }
2107
2108 struct domain_context_mapping_data {
2109         struct dmar_domain *domain;
2110         struct intel_iommu *iommu;
2111         struct pasid_table *table;
2112 };
2113
2114 static int domain_context_mapping_cb(struct pci_dev *pdev,
2115                                      u16 alias, void *opaque)
2116 {
2117         struct domain_context_mapping_data *data = opaque;
2118
2119         return domain_context_mapping_one(data->domain, data->iommu,
2120                                           data->table, PCI_BUS_NUM(alias),
2121                                           alias & 0xff);
2122 }
2123
2124 static int
2125 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2126 {
2127         struct domain_context_mapping_data data;
2128         struct pasid_table *table;
2129         struct intel_iommu *iommu;
2130         u8 bus, devfn;
2131
2132         iommu = device_to_iommu(dev, &bus, &devfn);
2133         if (!iommu)
2134                 return -ENODEV;
2135
2136         table = intel_pasid_get_table(dev);
2137
2138         if (!dev_is_pci(dev))
2139                 return domain_context_mapping_one(domain, iommu, table,
2140                                                   bus, devfn);
2141
2142         data.domain = domain;
2143         data.iommu = iommu;
2144         data.table = table;
2145
2146         return pci_for_each_dma_alias(to_pci_dev(dev),
2147                                       &domain_context_mapping_cb, &data);
2148 }
2149
2150 static int domain_context_mapped_cb(struct pci_dev *pdev,
2151                                     u16 alias, void *opaque)
2152 {
2153         struct intel_iommu *iommu = opaque;
2154
2155         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2156 }
2157
2158 static int domain_context_mapped(struct device *dev)
2159 {
2160         struct intel_iommu *iommu;
2161         u8 bus, devfn;
2162
2163         iommu = device_to_iommu(dev, &bus, &devfn);
2164         if (!iommu)
2165                 return -ENODEV;
2166
2167         if (!dev_is_pci(dev))
2168                 return device_context_mapped(iommu, bus, devfn);
2169
2170         return !pci_for_each_dma_alias(to_pci_dev(dev),
2171                                        domain_context_mapped_cb, iommu);
2172 }
2173
2174 /* Returns the number of VT-d pages, but aligned to the MM page size */
2175 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2176                                             size_t size)
2177 {
2178         host_addr &= ~PAGE_MASK;
2179         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2180 }
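/*
 * Example for aligned_nrpages() (illustrative): host_addr = 0x1234 and
 * size = 0x2000 leave an in-page offset of 0x234, so the result is
 * PAGE_ALIGN(0x234 + 0x2000) >> VTD_PAGE_SHIFT == 3 with 4KiB MM pages.
 */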
2181
2182 /* Return largest possible superpage level for a given mapping */
2183 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2184                                           unsigned long iov_pfn,
2185                                           unsigned long phy_pfn,
2186                                           unsigned long pages)
2187 {
2188         int support, level = 1;
2189         unsigned long pfnmerge;
2190
2191         support = domain->iommu_superpage;
2192
2193         /* To use a large page, the virtual *and* physical addresses
2194            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2195            of them will mean we have to use smaller pages. So just
2196            merge them and check both at once. */
2197         pfnmerge = iov_pfn | phy_pfn;
2198
2199         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2200                 pages >>= VTD_STRIDE_SHIFT;
2201                 if (!pages)
2202                         break;
2203                 pfnmerge >>= VTD_STRIDE_SHIFT;
2204                 level++;
2205                 support--;
2206         }
2207         return level;
2208 }
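/*
 * Example for hardware_largepage_caps() (illustrative): with iov_pfn and
 * phy_pfn both multiples of 512 (2MiB aligned), pages = 512 and
 * domain->iommu_superpage >= 1, the function returns level 2, i.e. a 2MiB
 * superpage can be used for the mapping.
 */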
2209
2210 /*
2211  * Ensure that old small page tables are removed to make room for superpage(s).
2212  * We're going to add new large pages, so make sure we don't remove their parent
2213  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2214  */
2215 static void switch_to_super_page(struct dmar_domain *domain,
2216                                  unsigned long start_pfn,
2217                                  unsigned long end_pfn, int level)
2218 {
2219         unsigned long lvl_pages = lvl_to_nr_pages(level);
2220         struct dma_pte *pte = NULL;
2221         int i;
2222
2223         while (start_pfn <= end_pfn) {
2224                 if (!pte)
2225                         pte = pfn_to_dma_pte(domain, start_pfn, &level);
2226
2227                 if (dma_pte_present(pte)) {
2228                         dma_pte_free_pagetable(domain, start_pfn,
2229                                                start_pfn + lvl_pages - 1,
2230                                                level + 1);
2231
2232                         for_each_domain_iommu(i, domain)
2233                                 iommu_flush_iotlb_psi(g_iommus[i], domain,
2234                                                       start_pfn, lvl_pages,
2235                                                       0, 0);
2236                 }
2237
2238                 pte++;
2239                 start_pfn += lvl_pages;
2240                 if (first_pte_in_page(pte))
2241                         pte = NULL;
2242         }
2243 }
2244
2245 static int
2246 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2247                  unsigned long phys_pfn, unsigned long nr_pages, int prot)
2248 {
2249         struct dma_pte *first_pte = NULL, *pte = NULL;
2250         unsigned int largepage_lvl = 0;
2251         unsigned long lvl_pages = 0;
2252         phys_addr_t pteval;
2253         u64 attr;
2254
2255         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2256
2257         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2258                 return -EINVAL;
2259
2260         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2261         attr |= DMA_FL_PTE_PRESENT;
2262         if (domain_use_first_level(domain)) {
2263                 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2264                 if (prot & DMA_PTE_WRITE)
2265                         attr |= DMA_FL_PTE_DIRTY;
2266         }
2267
2268         pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2269
2270         while (nr_pages > 0) {
2271                 uint64_t tmp;
2272
2273                 if (!pte) {
2274                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2275                                         phys_pfn, nr_pages);
2276
2277                         pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2278                         if (!pte)
2279                                 return -ENOMEM;
2280                         first_pte = pte;
2281
2282                         lvl_pages = lvl_to_nr_pages(largepage_lvl);
2283
2284                         /* It is a large page */
2285                         if (largepage_lvl > 1) {
2286                                 unsigned long end_pfn;
2287                                 unsigned long pages_to_remove;
2288
2289                                 pteval |= DMA_PTE_LARGE_PAGE;
2290                                 pages_to_remove = min_t(unsigned long, nr_pages,
2291                                                         nr_pte_to_next_page(pte) * lvl_pages);
2292                                 end_pfn = iov_pfn + pages_to_remove - 1;
2293                                 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2294                         } else {
2295                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2296                         }
2297
2298                 }
2299                 /* We don't need a lock here; nobody else
2300                  * touches this IOVA range.
2301                  */
2302                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2303                 if (tmp) {
2304                         static int dumps = 5;
2305                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2306                                 iov_pfn, tmp, (unsigned long long)pteval);
2307                         if (dumps) {
2308                                 dumps--;
2309                                 debug_dma_dump_mappings(NULL);
2310                         }
2311                         WARN_ON(1);
2312                 }
2313
2314                 nr_pages -= lvl_pages;
2315                 iov_pfn += lvl_pages;
2316                 phys_pfn += lvl_pages;
2317                 pteval += lvl_pages * VTD_PAGE_SIZE;
2318
2319                 /* If the next PTE would be the first in a new page, then we
2320                  * need to flush the cache on the entries we've just written.
2321                  * And then we'll need to recalculate 'pte', so clear it and
2322                  * let it get set again in the if (!pte) block above.
2323                  *
2324                  * If we're done (!nr_pages) we need to flush the cache too.
2325                  *
2326                  * Also if we've been setting superpages, we may need to
2327                  * recalculate 'pte' and switch back to smaller pages for the
2328                  * end of the mapping, if the trailing size is not enough to
2329                  * use another superpage (i.e. nr_pages < lvl_pages).
2330                  */
2331                 pte++;
2332                 if (!nr_pages || first_pte_in_page(pte) ||
2333                     (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2334                         domain_flush_cache(domain, first_pte,
2335                                            (void *)pte - (void *)first_pte);
2336                         pte = NULL;
2337                 }
2338         }
2339
2340         return 0;
2341 }
2342
2343 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2344 {
2345         struct intel_iommu *iommu = info->iommu;
2346         struct context_entry *context;
2347         unsigned long flags;
2348         u16 did_old;
2349
2350         if (!iommu)
2351                 return;
2352
2353         spin_lock_irqsave(&iommu->lock, flags);
2354         context = iommu_context_addr(iommu, bus, devfn, 0);
2355         if (!context) {
2356                 spin_unlock_irqrestore(&iommu->lock, flags);
2357                 return;
2358         }
2359
2360         if (sm_supported(iommu)) {
2361                 if (hw_pass_through && domain_type_is_si(info->domain))
2362                         did_old = FLPT_DEFAULT_DID;
2363                 else
2364                         did_old = info->domain->iommu_did[iommu->seq_id];
2365         } else {
2366                 did_old = context_domain_id(context);
2367         }
2368
2369         context_clear_entry(context);
2370         __iommu_flush_cache(iommu, context, sizeof(*context));
2371         spin_unlock_irqrestore(&iommu->lock, flags);
2372         iommu->flush.flush_context(iommu,
2373                                    did_old,
2374                                    (((u16)bus) << 8) | devfn,
2375                                    DMA_CCMD_MASK_NOBIT,
2376                                    DMA_CCMD_DEVICE_INVL);
2377
2378         if (sm_supported(iommu))
2379                 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2380
2381         iommu->flush.flush_iotlb(iommu,
2382                                  did_old,
2383                                  0,
2384                                  0,
2385                                  DMA_TLB_DSI_FLUSH);
2386
2387         __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2388 }
2389
2390 static void domain_remove_dev_info(struct dmar_domain *domain)
2391 {
2392         struct device_domain_info *info, *tmp;
2393         unsigned long flags;
2394
2395         spin_lock_irqsave(&device_domain_lock, flags);
2396         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2397                 __dmar_remove_one_dev_info(info);
2398         spin_unlock_irqrestore(&device_domain_lock, flags);
2399 }
2400
2401 static inline struct device_domain_info *
2402 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2403 {
2404         struct device_domain_info *info;
2405
2406         list_for_each_entry(info, &device_domain_list, global)
2407                 if (info->segment == segment && info->bus == bus &&
2408                     info->devfn == devfn)
2409                         return info;
2410
2411         return NULL;
2412 }
2413
2414 static int domain_setup_first_level(struct intel_iommu *iommu,
2415                                     struct dmar_domain *domain,
2416                                     struct device *dev,
2417                                     u32 pasid)
2418 {
2419         struct dma_pte *pgd = domain->pgd;
2420         int agaw, level;
2421         int flags = 0;
2422
2423         /*
2424          * Skip top levels of page tables for an IOMMU which has
2425          * a smaller agaw than the default. Unnecessary for PT mode.
2426          */
2427         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2428                 pgd = phys_to_virt(dma_pte_addr(pgd));
2429                 if (!dma_pte_present(pgd))
2430                         return -ENOMEM;
2431         }
2432
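        /*
         * Only 4-level (48-bit) and 5-level (57-bit) page tables are valid
         * for first-level translation, so reject anything else.
         */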
2433         level = agaw_to_level(agaw);
2434         if (level != 4 && level != 5)
2435                 return -EINVAL;
2436
2437         if (pasid != PASID_RID2PASID)
2438                 flags |= PASID_FLAG_SUPERVISOR_MODE;
2439         if (level == 5)
2440                 flags |= PASID_FLAG_FL5LP;
2441
2442         if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2443                 flags |= PASID_FLAG_PAGE_SNOOP;
2444
2445         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2446                                              domain->iommu_did[iommu->seq_id],
2447                                              flags);
2448 }
2449
2450 static bool dev_is_real_dma_subdevice(struct device *dev)
2451 {
2452         return dev && dev_is_pci(dev) &&
2453                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2454 }
2455
2456 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2457                                                     int bus, int devfn,
2458                                                     struct device *dev,
2459                                                     struct dmar_domain *domain)
2460 {
2461         struct device_domain_info *info = dev_iommu_priv_get(dev);
2462         unsigned long flags;
2463         int ret;
2464
2465         spin_lock_irqsave(&device_domain_lock, flags);
2466         info->domain = domain;
2467         spin_lock(&iommu->lock);
2468         ret = domain_attach_iommu(domain, iommu);
2469         spin_unlock(&iommu->lock);
2470         if (ret) {
2471                 spin_unlock_irqrestore(&device_domain_lock, flags);
2472                 return NULL;
2473         }
2474         list_add(&info->link, &domain->devices);
2475         spin_unlock_irqrestore(&device_domain_lock, flags);
2476
2477         /* PASID table is mandatory for a PCI device in scalable mode. */
2478         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2479                 ret = intel_pasid_alloc_table(dev);
2480                 if (ret) {
2481                         dev_err(dev, "PASID table allocation failed\n");
2482                         dmar_remove_one_dev_info(dev);
2483                         return NULL;
2484                 }
2485
2486                 /* Setup the PASID entry for requests without PASID: */
2487                 spin_lock_irqsave(&iommu->lock, flags);
2488                 if (hw_pass_through && domain_type_is_si(domain))
2489                         ret = intel_pasid_setup_pass_through(iommu, domain,
2490                                         dev, PASID_RID2PASID);
2491                 else if (domain_use_first_level(domain))
2492                         ret = domain_setup_first_level(iommu, domain, dev,
2493                                         PASID_RID2PASID);
2494                 else
2495                         ret = intel_pasid_setup_second_level(iommu, domain,
2496                                         dev, PASID_RID2PASID);
2497                 spin_unlock_irqrestore(&iommu->lock, flags);
2498                 if (ret) {
2499                         dev_err(dev, "Setup RID2PASID failed\n");
2500                         dmar_remove_one_dev_info(dev);
2501                         return NULL;
2502                 }
2503         }
2504
2505         if (dev && domain_context_mapping(domain, dev)) {
2506                 dev_err(dev, "Domain context map failed\n");
2507                 dmar_remove_one_dev_info(dev);
2508                 return NULL;
2509         }
2510
2511         return domain;
2512 }
2513
2514 static int iommu_domain_identity_map(struct dmar_domain *domain,
2515                                      unsigned long first_vpfn,
2516                                      unsigned long last_vpfn)
2517 {
2518         /*
2519          * The RMRR range might overlap with the physical memory range,
2520          * so clear it first.
2521          */
2522         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2523
2524         return __domain_mapping(domain, first_vpfn,
2525                                 first_vpfn, last_vpfn - first_vpfn + 1,
2526                                 DMA_PTE_READ|DMA_PTE_WRITE);
2527 }
2528
2529 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2530
2531 static int __init si_domain_init(int hw)
2532 {
2533         struct dmar_rmrr_unit *rmrr;
2534         struct device *dev;
2535         int i, nid, ret;
2536
2537         si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2538         if (!si_domain)
2539                 return -EFAULT;
2540
2541         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2542                 domain_exit(si_domain);
2543                 return -EFAULT;
2544         }
2545
2546         if (hw)
2547                 return 0;
2548
2549         for_each_online_node(nid) {
2550                 unsigned long start_pfn, end_pfn;
2551                 int i;
2552
2553                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2554                         ret = iommu_domain_identity_map(si_domain,
2555                                         mm_to_dma_pfn(start_pfn),
2556                                         mm_to_dma_pfn(end_pfn));
2557                         if (ret)
2558                                 return ret;
2559                 }
2560         }
2561
2562         /*
2563          * Identity map the RMRRs so that devices with RMRRs can also use
2564          * the si_domain.
2565          */
2566         for_each_rmrr_units(rmrr) {
2567                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2568                                           i, dev) {
2569                         unsigned long long start = rmrr->base_address;
2570                         unsigned long long end = rmrr->end_address;
2571
2572                         if (WARN_ON(end < start ||
2573                                     end >> agaw_to_width(si_domain->agaw)))
2574                                 continue;
2575
2576                         ret = iommu_domain_identity_map(si_domain,
2577                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2578                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2579                         if (ret)
2580                                 return ret;
2581                 }
2582         }
2583
2584         return 0;
2585 }
2586
2587 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2588 {
2589         struct dmar_domain *ndomain;
2590         struct intel_iommu *iommu;
2591         u8 bus, devfn;
2592
2593         iommu = device_to_iommu(dev, &bus, &devfn);
2594         if (!iommu)
2595                 return -ENODEV;
2596
2597         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2598         if (ndomain != domain)
2599                 return -EBUSY;
2600
2601         return 0;
2602 }
2603
2604 static bool device_has_rmrr(struct device *dev)
2605 {
2606         struct dmar_rmrr_unit *rmrr;
2607         struct device *tmp;
2608         int i;
2609
2610         rcu_read_lock();
2611         for_each_rmrr_units(rmrr) {
2612                 /*
2613                  * Return TRUE if this RMRR contains the device that
2614                  * is passed in.
2615                  */
2616                 for_each_active_dev_scope(rmrr->devices,
2617                                           rmrr->devices_cnt, i, tmp)
2618                         if (tmp == dev ||
2619                             is_downstream_to_pci_bridge(dev, tmp)) {
2620                                 rcu_read_unlock();
2621                                 return true;
2622                         }
2623         }
2624         rcu_read_unlock();
2625         return false;
2626 }
2627
2628 /**
2629  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2630  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2631  * @dev: device handle
2632  *
2633  * We assume that PCI USB devices with RMRRs have them largely
2634  * for historical reasons and that the RMRR space is not actively used post
2635  * boot.  This exclusion may change if vendors begin to abuse it.
2636  *
2637  * The same exception is made for graphics devices, with the requirement that
2638  * any use of the RMRR regions will be torn down before assigning the device
2639  * to a guest.
2640  *
2641  * Return: true if the RMRR is relaxable, false otherwise
2642  */
2643 static bool device_rmrr_is_relaxable(struct device *dev)
2644 {
2645         struct pci_dev *pdev;
2646
2647         if (!dev_is_pci(dev))
2648                 return false;
2649
2650         pdev = to_pci_dev(dev);
2651         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2652                 return true;
2653         else
2654                 return false;
2655 }
2656
2657 /*
2658  * There are a couple of cases where we need to restrict the functionality of
2659  * devices associated with RMRRs.  The first is when evaluating a device for
2660  * identity mapping because problems exist when devices are moved in and out
2661  * of domains and their respective RMRR information is lost.  This means that
2662  * a device with associated RMRRs will never be in a "passthrough" domain.
2663  * The second is use of the device through the IOMMU API.  This interface
2664  * expects to have full control of the IOVA space for the device.  We cannot
2665  * satisfy both the requirement that RMRR access is maintained and have an
2666  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2667  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2668  * We therefore prevent devices associated with an RMRR from participating in
2669  * the IOMMU API, which eliminates them from device assignment.
2670  *
2671  * In both cases, devices which have relaxable RMRRs are not concerned by this
2672  * restriction. See device_rmrr_is_relaxable comment.
2673  */
2674 static bool device_is_rmrr_locked(struct device *dev)
2675 {
2676         if (!device_has_rmrr(dev))
2677                 return false;
2678
2679         if (device_rmrr_is_relaxable(dev))
2680                 return false;
2681
2682         return true;
2683 }
2684
2685 /*
2686  * Return the required default domain type for a specific device.
2687  *
2688  * @dev: the device in question
2689  *
2690  * Returns:
2691  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2692  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2694  *  - 0: both identity and dynamic domains work for this device
2695  */
2696 static int device_def_domain_type(struct device *dev)
2697 {
2698         if (dev_is_pci(dev)) {
2699                 struct pci_dev *pdev = to_pci_dev(dev);
2700
2701                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2702                         return IOMMU_DOMAIN_IDENTITY;
2703
2704                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2705                         return IOMMU_DOMAIN_IDENTITY;
2706         }
2707
2708         return 0;
2709 }
2710
2711 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2712 {
2713         /*
2714          * Start from a sane IOMMU hardware state.
2715          * If queued invalidation was already initialized by us
2716          * (for example, while enabling interrupt remapping) then
2717          * things are already rolling from a sane state.
2718          */
2719         if (!iommu->qi) {
2720                 /*
2721                  * Clear any previous faults.
2722                  */
2723                 dmar_fault(-1, iommu);
2724                 /*
2725                  * Disable queued invalidation if supported and already enabled
2726                  * before OS handover.
2727                  */
2728                 dmar_disable_qi(iommu);
2729         }
2730
2731         if (dmar_enable_qi(iommu)) {
2732                 /*
2733                  * Queued invalidation is not enabled, so use register-based invalidation
2734                  */
2735                 iommu->flush.flush_context = __iommu_flush_context;
2736                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2737                 pr_info("%s: Using Register based invalidation\n",
2738                         iommu->name);
2739         } else {
2740                 iommu->flush.flush_context = qi_flush_context;
2741                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2742                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2743         }
2744 }
2745
2746 static int copy_context_table(struct intel_iommu *iommu,
2747                               struct root_entry *old_re,
2748                               struct context_entry **tbl,
2749                               int bus, bool ext)
2750 {
2751         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2752         struct context_entry *new_ce = NULL, ce;
2753         struct context_entry *old_ce = NULL;
2754         struct root_entry re;
2755         phys_addr_t old_ce_phys;
2756
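        /*
         * With the extended root/context table each root entry carries two
         * context-table pointers (lower for devfn 0x00-0x7f, upper for
         * 0x80-0xff) and the context entries are twice as large, hence the
         * "* 2" factors in the indexing below.
         */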
2757         tbl_idx = ext ? bus * 2 : bus;
2758         memcpy(&re, old_re, sizeof(re));
2759
2760         for (devfn = 0; devfn < 256; devfn++) {
2761                 /* First calculate the correct index */
2762                 idx = (ext ? devfn * 2 : devfn) % 256;
2763
2764                 if (idx == 0) {
2765                         /* First save what we may have and clean up */
2766                         if (new_ce) {
2767                                 tbl[tbl_idx] = new_ce;
2768                                 __iommu_flush_cache(iommu, new_ce,
2769                                                     VTD_PAGE_SIZE);
2770                                 pos = 1;
2771                         }
2772
2773                         if (old_ce)
2774                                 memunmap(old_ce);
2775
2776                         ret = 0;
2777                         if (devfn < 0x80)
2778                                 old_ce_phys = root_entry_lctp(&re);
2779                         else
2780                                 old_ce_phys = root_entry_uctp(&re);
2781
2782                         if (!old_ce_phys) {
2783                                 if (ext && devfn == 0) {
2784                                         /* No LCTP, try UCTP */
2785                                         devfn = 0x7f;
2786                                         continue;
2787                                 } else {
2788                                         goto out;
2789                                 }
2790                         }
2791
2792                         ret = -ENOMEM;
2793                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2794                                         MEMREMAP_WB);
2795                         if (!old_ce)
2796                                 goto out;
2797
2798                         new_ce = alloc_pgtable_page(iommu->node);
2799                         if (!new_ce)
2800                                 goto out_unmap;
2801
2802                         ret = 0;
2803                 }
2804
2805                 /* Now copy the context entry */
2806                 memcpy(&ce, old_ce + idx, sizeof(ce));
2807
2808                 if (!__context_present(&ce))
2809                         continue;
2810
2811                 did = context_domain_id(&ce);
2812                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2813                         set_bit(did, iommu->domain_ids);
2814
2815                 /*
2816                  * We need a marker for copied context entries. This
2817                  * marker needs to work for the old format as well as
2818                  * for extended context entries.
2819                  *
2820                  * Bit 67 of the context entry is used. In the old
2821                  * format this bit is available to software, in the
2822                  * extended format it is the PGE bit, but PGE is ignored
2823                  * by HW if PASIDs are disabled (and thus still
2824                  * available).
2825                  *
2826                  * So disable PASIDs first and then mark the entry
2827                  * copied. This means that we don't copy PASID
2828                  * translations from the old kernel, but this is fine as
2829                  * faults there are not fatal.
2830                  */
2831                 context_clear_pasid_enable(&ce);
2832                 context_set_copied(&ce);
2833
2834                 new_ce[idx] = ce;
2835         }
2836
2837         tbl[tbl_idx + pos] = new_ce;
2838
2839         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2840
2841 out_unmap:
2842         memunmap(old_ce);
2843
2844 out:
2845         return ret;
2846 }
2847
2848 static int copy_translation_tables(struct intel_iommu *iommu)
2849 {
2850         struct context_entry **ctxt_tbls;
2851         struct root_entry *old_rt;
2852         phys_addr_t old_rt_phys;
2853         int ctxt_table_entries;
2854         unsigned long flags;
2855         u64 rtaddr_reg;
2856         int bus, ret;
2857         bool new_ext, ext;
2858
2859         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2860         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2861         new_ext    = !!ecap_ecs(iommu->ecap);
2862
2863         /*
2864          * but disabling translation means opening a window for data
2865          * but disabling translation means to open a window for data
2866          * corruption. So bail out and don't copy anything if we would
2867          * have to change the bit.
2868          */
2869         if (new_ext != ext)
2870                 return -EINVAL;
2871
2872         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2873         if (!old_rt_phys)
2874                 return -EINVAL;
2875
2876         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2877         if (!old_rt)
2878                 return -ENOMEM;
2879
2880         /* This is too big for the stack - allocate it from slab */
2881         ctxt_table_entries = ext ? 512 : 256;
2882         ret = -ENOMEM;
2883         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2884         if (!ctxt_tbls)
2885                 goto out_unmap;
2886
2887         for (bus = 0; bus < 256; bus++) {
2888                 ret = copy_context_table(iommu, &old_rt[bus],
2889                                          ctxt_tbls, bus, ext);
2890                 if (ret) {
2891                         pr_err("%s: Failed to copy context table for bus %d\n",
2892                                 iommu->name, bus);
2893                         continue;
2894                 }
2895         }
2896
2897         spin_lock_irqsave(&iommu->lock, flags);
2898
2899         /* Context tables are copied, now write them to the root_entry table */
2900         for (bus = 0; bus < 256; bus++) {
2901                 int idx = ext ? bus * 2 : bus;
2902                 u64 val;
2903
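                /* Bit 0 marks the context-table pointer as present. */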
2904                 if (ctxt_tbls[idx]) {
2905                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2906                         iommu->root_entry[bus].lo = val;
2907                 }
2908
2909                 if (!ext || !ctxt_tbls[idx + 1])
2910                         continue;
2911
2912                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2913                 iommu->root_entry[bus].hi = val;
2914         }
2915
2916         spin_unlock_irqrestore(&iommu->lock, flags);
2917
2918         kfree(ctxt_tbls);
2919
2920         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2921
2922         ret = 0;
2923
2924 out_unmap:
2925         memunmap(old_rt);
2926
2927         return ret;
2928 }
2929
2930 #ifdef CONFIG_INTEL_IOMMU_SVM
2931 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2932 {
2933         struct intel_iommu *iommu = data;
2934         ioasid_t ioasid;
2935
2936         if (!iommu)
2937                 return INVALID_IOASID;
2938         /*
2939          * The VT-d virtual command interface always uses the full 20-bit
2940          * PASID range. The host can partition the guest PASID range based
2941          * on policies, but this is out of the guest's control.
2942          */
2943         if (min < PASID_MIN || max > intel_pasid_max_id)
2944                 return INVALID_IOASID;
2945
2946         if (vcmd_alloc_pasid(iommu, &ioasid))
2947                 return INVALID_IOASID;
2948
2949         return ioasid;
2950 }
2951
2952 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2953 {
2954         struct intel_iommu *iommu = data;
2955
2956         if (!iommu)
2957                 return;
2958         /*
2959          * Sanity checking of the ioasid owner is done at the upper layer,
2960          * e.g. VFIO. We can only free the PASID when all devices are unbound.
2961          */
2962         if (ioasid_find(NULL, ioasid, NULL)) {
2963                 pr_alert("Cannot free active IOASID %d\n", ioasid);
2964                 return;
2965         }
2966         vcmd_free_pasid(iommu, ioasid);
2967 }
2968
2969 static void register_pasid_allocator(struct intel_iommu *iommu)
2970 {
2971         /*
2972          * If we are running in the host, there is no need for a custom
2973          * allocator because PASIDs are allocated system-wide by the host.
2974          */
2975         if (!cap_caching_mode(iommu->cap))
2976                 return;
2977
2978         if (!sm_supported(iommu)) {
2979                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2980                 return;
2981         }
2982
2983         /*
2984          * Register a custom PASID allocator if we are running in a guest;
2985          * guest PASIDs must be obtained via the virtual command interface.
2986          * There can be multiple vIOMMUs in each guest but only one allocator
2987          * is active. All vIOMMU allocators eventually call the same host
2988          * allocator.
2989          */
2990         if (!vccap_pasid(iommu->vccap))
2991                 return;
2992
2993         pr_info("Register custom PASID allocator\n");
2994         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2995         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2996         iommu->pasid_allocator.pdata = (void *)iommu;
2997         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2998                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2999                 /*
3000                  * Disable scalable mode on this IOMMU if there
3001                  * is no custom allocator. Mixing SM-capable and
3002                  * non-SM vIOMMUs is not supported.
3003                  */
3004                 intel_iommu_sm = 0;
3005         }
3006 }
3007 #endif
3008
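/*
 * Boot-time initialization of all DMAR units: allocate the per-IOMMU
 * bookkeeping and root entries, optionally copy translation tables
 * from the previous (kdump) kernel, set up the static identity domain
 * and fault reporting, and program the root entry on every unit.
 */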
3009 static int __init init_dmars(void)
3010 {
3011         struct dmar_drhd_unit *drhd;
3012         struct intel_iommu *iommu;
3013         int ret;
3014
3015         /*
3016          * for each drhd
3017          *    allocate root
3018          *    initialize and program root entry to not present
3019          * endfor
3020          */
3021         for_each_drhd_unit(drhd) {
3022                 /*
3023                  * Lock not needed: this is only incremented in the
3024                  * single-threaded kernel __init code path; all other
3025                  * accesses are read-only.
3026                  */
3027                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3028                         g_num_of_iommus++;
3029                         continue;
3030                 }
3031                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3032         }
3033
3034         /* Preallocate enough resources for IOMMU hot-addition */
3035         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3036                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3037
3038         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3039                         GFP_KERNEL);
3040         if (!g_iommus) {
3041                 ret = -ENOMEM;
3042                 goto error;
3043         }
3044
3045         ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3046         if (ret)
3047                 goto free_iommu;
3048
3049         for_each_iommu(iommu, drhd) {
3050                 if (drhd->ignored) {
3051                         iommu_disable_translation(iommu);
3052                         continue;
3053                 }
3054
3055                 /*
3056                  * Find the max PASID size of all IOMMUs in the system.
3057                  * We need to ensure the system PASID table is no bigger
3058                  * than the smallest supported size.
3059                  */
3060                 if (pasid_supported(iommu)) {
3061                         u32 temp = 2 << ecap_pss(iommu->ecap);
3062
3063                         intel_pasid_max_id = min_t(u32, temp,
3064                                                    intel_pasid_max_id);
3065                 }
3066
3067                 g_iommus[iommu->seq_id] = iommu;
3068
3069                 intel_iommu_init_qi(iommu);
3070
3071                 ret = iommu_init_domains(iommu);
3072                 if (ret)
3073                         goto free_iommu;
3074
3075                 init_translation_status(iommu);
3076
3077                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3078                         iommu_disable_translation(iommu);
3079                         clear_translation_pre_enabled(iommu);
3080                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3081                                 iommu->name);
3082                 }
3083
3084                 /*
3085                  * TBD:
3086                  * we could share the same root & context tables
3087                  * among all IOMMUs. Needs to be split out later.
3088                  */
3089                 ret = iommu_alloc_root_entry(iommu);
3090                 if (ret)
3091                         goto free_iommu;
3092
3093                 if (translation_pre_enabled(iommu)) {
3094                         pr_info("Translation already enabled - trying to copy translation structures\n");
3095
3096                         ret = copy_translation_tables(iommu);
3097                         if (ret) {
3098                                 /*
3099                                  * We found the IOMMU with translation
3100                                  * enabled - but failed to copy over the
3101                                  * old root-entry table. Try to proceed
3102                                  * by disabling translation now and
3103                                  * allocating a clean root-entry table.
3104                                  * This might cause DMAR faults, but
3105                                  * probably the dump will still succeed.
3106                                  */
3107                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3108                                        iommu->name);
3109                                 iommu_disable_translation(iommu);
3110                                 clear_translation_pre_enabled(iommu);
3111                         } else {
3112                                 pr_info("Copied translation tables from previous kernel for %s\n",
3113                                         iommu->name);
3114                         }
3115                 }
3116
3117                 if (!ecap_pass_through(iommu->ecap))
3118                         hw_pass_through = 0;
3119                 intel_svm_check(iommu);
3120         }
3121
3122         /*
3123          * Now that qi is enabled on all iommus, set the root entry and flush
3124          * caches. This is required on some Intel X58 chipsets; otherwise the
3125          * flush_context function will loop forever and the boot hangs.
3126          */
3127         for_each_active_iommu(iommu, drhd) {
3128                 iommu_flush_write_buffer(iommu);
3129 #ifdef CONFIG_INTEL_IOMMU_SVM
3130                 register_pasid_allocator(iommu);
3131 #endif
3132                 iommu_set_root_entry(iommu);
3133         }
3134
3135 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3136         dmar_map_gfx = 0;
3137 #endif
3138
3139         if (!dmar_map_gfx)
3140                 iommu_identity_mapping |= IDENTMAP_GFX;
3141
3142         check_tylersburg_isoch();
3143
3144         ret = si_domain_init(hw_pass_through);
3145         if (ret)
3146                 goto free_iommu;
3147
3148         /*
3149          * for each drhd
3150          *   enable fault log
3151          *   global invalidate context cache
3152          *   global invalidate iotlb
3153          *   enable translation
3154          */
3155         for_each_iommu(iommu, drhd) {
3156                 if (drhd->ignored) {
3157                         /*
3158                          * we always have to disable PMRs or DMA may fail on
3159                          * this device
3160                          */
3161                         if (force_on)
3162                                 iommu_disable_protect_mem_regions(iommu);
3163                         continue;
3164                 }
3165
3166                 iommu_flush_write_buffer(iommu);
3167
3168 #ifdef CONFIG_INTEL_IOMMU_SVM
3169                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3170                         /*
3171                          * Calling dmar_alloc_hwirq() with dmar_global_lock
3172                          * held could cause a lock race, so drop the lock here.
3173                          */
3174                         up_write(&dmar_global_lock);
3175                         ret = intel_svm_enable_prq(iommu);
3176                         down_write(&dmar_global_lock);
3177                         if (ret)
3178                                 goto free_iommu;
3179                 }
3180 #endif
3181                 ret = dmar_set_interrupt(iommu);
3182                 if (ret)
3183                         goto free_iommu;
3184         }
3185
3186         return 0;
3187
3188 free_iommu:
3189         for_each_active_iommu(iommu, drhd) {
3190                 disable_dmar_iommu(iommu);
3191                 free_dmar_iommu(iommu);
3192         }
3193
3194         kfree(g_iommus);
3195
3196 error:
3197         return ret;
3198 }
3199
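/*
 * Mark DRHD units that have no devices in their scope as ignored, and
 * flag units that cover *only* graphics devices so they can be
 * bypassed when gfx mapping is disabled.
 */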
3200 static void __init init_no_remapping_devices(void)
3201 {
3202         struct dmar_drhd_unit *drhd;
3203         struct device *dev;
3204         int i;
3205
3206         for_each_drhd_unit(drhd) {
3207                 if (!drhd->include_all) {
3208                         for_each_active_dev_scope(drhd->devices,
3209                                                   drhd->devices_cnt, i, dev)
3210                                 break;
3211                         /* ignore DMAR unit if no devices exist */
3212                         if (i == drhd->devices_cnt)
3213                                 drhd->ignored = 1;
3214                 }
3215         }
3216
3217         for_each_active_drhd_unit(drhd) {
3218                 if (drhd->include_all)
3219                         continue;
3220
3221                 for_each_active_dev_scope(drhd->devices,
3222                                           drhd->devices_cnt, i, dev)
3223                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3224                                 break;
3225                 if (i < drhd->devices_cnt)
3226                         continue;
3227
3228                 /* This IOMMU has *only* gfx devices. Mark it as gfx
3229                    dedicated, and bypass it entirely if gfx mapping is off. */
3230                 drhd->gfx_dedicated = 1;
3231                 if (!dmar_map_gfx)
3232                         drhd->ignored = 1;
3233         }
3234 }
3235
3236 #ifdef CONFIG_SUSPEND
3237 static int init_iommu_hw(void)
3238 {
3239         struct dmar_drhd_unit *drhd;
3240         struct intel_iommu *iommu = NULL;
3241
3242         for_each_active_iommu(iommu, drhd)
3243                 if (iommu->qi)
3244                         dmar_reenable_qi(iommu);
3245
3246         for_each_iommu(iommu, drhd) {
3247                 if (drhd->ignored) {
3248                         /*
3249                          * we always have to disable PMRs or DMA may fail on
3250                          * this device
3251                          */
3252                         if (force_on)
3253                                 iommu_disable_protect_mem_regions(iommu);
3254                         continue;
3255                 }
3256
3257                 iommu_flush_write_buffer(iommu);
3258                 iommu_set_root_entry(iommu);
3259                 iommu_enable_translation(iommu);
3260                 iommu_disable_protect_mem_regions(iommu);
3261         }
3262
3263         return 0;
3264 }
3265
3266 static void iommu_flush_all(void)
3267 {
3268         struct dmar_drhd_unit *drhd;
3269         struct intel_iommu *iommu;
3270
3271         for_each_active_iommu(iommu, drhd) {
3272                 iommu->flush.flush_context(iommu, 0, 0, 0,
3273                                            DMA_CCMD_GLOBAL_INVL);
3274                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3275                                          DMA_TLB_GLOBAL_FLUSH);
3276         }
3277 }
3278
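/*
 * Flush and disable translation on every active IOMMU and save its
 * fault-event registers; iommu_resume() restores them on wakeup.
 */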
3279 static int iommu_suspend(void)
3280 {
3281         struct dmar_drhd_unit *drhd;
3282         struct intel_iommu *iommu = NULL;
3283         unsigned long flag;
3284
3285         for_each_active_iommu(iommu, drhd) {
3286                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3287                                              GFP_KERNEL);
3288                 if (!iommu->iommu_state)
3289                         goto nomem;
3290         }
3291
3292         iommu_flush_all();
3293
3294         for_each_active_iommu(iommu, drhd) {
3295                 iommu_disable_translation(iommu);
3296
3297                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3298
3299                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3300                         readl(iommu->reg + DMAR_FECTL_REG);
3301                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3302                         readl(iommu->reg + DMAR_FEDATA_REG);
3303                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3304                         readl(iommu->reg + DMAR_FEADDR_REG);
3305                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3306                         readl(iommu->reg + DMAR_FEUADDR_REG);
3307
3308                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3309         }
3310         return 0;
3311
3312 nomem:
3313         for_each_active_iommu(iommu, drhd)
3314                 kfree(iommu->iommu_state);
3315
3316         return -ENOMEM;
3317 }
3318
3319 static void iommu_resume(void)
3320 {
3321         struct dmar_drhd_unit *drhd;
3322         struct intel_iommu *iommu = NULL;
3323         unsigned long flag;
3324
3325         if (init_iommu_hw()) {
3326                 if (force_on)
3327                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3328                 else
3329                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3330                 return;
3331         }
3332
3333         for_each_active_iommu(iommu, drhd) {
3334
3335                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3336
3337                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3338                         iommu->reg + DMAR_FECTL_REG);
3339                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3340                         iommu->reg + DMAR_FEDATA_REG);
3341                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3342                         iommu->reg + DMAR_FEADDR_REG);
3343                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3344                         iommu->reg + DMAR_FEUADDR_REG);
3345
3346                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3347         }
3348
3349         for_each_active_iommu(iommu, drhd)
3350                 kfree(iommu->iommu_state);
3351 }
3352
3353 static struct syscore_ops iommu_syscore_ops = {
3354         .resume         = iommu_resume,
3355         .suspend        = iommu_suspend,
3356 };
3357
3358 static void __init init_iommu_pm_ops(void)
3359 {
3360         register_syscore_ops(&iommu_syscore_ops);
3361 }
3362
3363 #else
3364 static inline void init_iommu_pm_ops(void) {}
3365 #endif  /* CONFIG_SUSPEND */
3366
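/*
 * An RMRR must describe a non-empty, page-aligned region; anything
 * else is treated as a firmware bug by the caller (FW_BUG warning
 * plus taint).
 */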
3367 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3368 {
3369         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3370             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3371             rmrr->end_address <= rmrr->base_address ||
3372             arch_rmrr_sanity_check(rmrr))
3373                 return -EINVAL;
3374
3375         return 0;
3376 }
3377
3378 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3379 {
3380         struct acpi_dmar_reserved_memory *rmrr;
3381         struct dmar_rmrr_unit *rmrru;
3382
3383         rmrr = (struct acpi_dmar_reserved_memory *)header;
3384         if (rmrr_sanity_check(rmrr)) {
3385                 pr_warn(FW_BUG
3386                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3387                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3388                            rmrr->base_address, rmrr->end_address,
3389                            dmi_get_system_info(DMI_BIOS_VENDOR),
3390                            dmi_get_system_info(DMI_BIOS_VERSION),
3391                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3392                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3393         }
3394
3395         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3396         if (!rmrru)
3397                 goto out;
3398
3399         rmrru->hdr = header;
3400
3401         rmrru->base_address = rmrr->base_address;
3402         rmrru->end_address = rmrr->end_address;
3403
3404         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3405                                 ((void *)rmrr) + rmrr->header.length,
3406                                 &rmrru->devices_cnt);
3407         if (rmrru->devices_cnt && rmrru->devices == NULL)
3408                 goto free_rmrru;
3409
3410         list_add(&rmrru->list, &dmar_rmrr_units);
3411
3412         return 0;
3413 free_rmrru:
3414         kfree(rmrru);
3415 out:
3416         return -ENOMEM;
3417 }
3418
3419 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3420 {
3421         struct dmar_atsr_unit *atsru;
3422         struct acpi_dmar_atsr *tmp;
3423
3424         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3425                                 dmar_rcu_check()) {
3426                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3427                 if (atsr->segment != tmp->segment)
3428                         continue;
3429                 if (atsr->header.length != tmp->header.length)
3430                         continue;
3431                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3432                         return atsru;
3433         }
3434
3435         return NULL;
3436 }
3437
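/*
 * Parse one ATSR structure from the DMAR table and add it to
 * dmar_atsr_units. The header is copied because an ACPI _DSM supplied
 * buffer may be freed on return; duplicates are silently ignored.
 */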
3438 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3439 {
3440         struct acpi_dmar_atsr *atsr;
3441         struct dmar_atsr_unit *atsru;
3442
3443         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3444                 return 0;
3445
3446         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3447         atsru = dmar_find_atsr(atsr);
3448         if (atsru)
3449                 return 0;
3450
3451         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3452         if (!atsru)
3453                 return -ENOMEM;
3454
3455         /*
3456          * If the memory was allocated from slab by the ACPI _DSM method, we
3457          * need to copy the content because the buffer will be freed on
3458          * return.
3459          */
3460         atsru->hdr = (void *)(atsru + 1);
3461         memcpy(atsru->hdr, hdr, hdr->length);
3462         atsru->include_all = atsr->flags & 0x1;
3463         if (!atsru->include_all) {
3464                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3465                                 (void *)atsr + atsr->header.length,
3466                                 &atsru->devices_cnt);
3467                 if (atsru->devices_cnt && atsru->devices == NULL) {
3468                         kfree(atsru);
3469                         return -ENOMEM;
3470                 }
3471         }
3472
3473         list_add_rcu(&atsru->list, &dmar_atsr_units);
3474
3475         return 0;
3476 }
3477
3478 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3479 {
3480         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3481         kfree(atsru);
3482 }
3483
3484 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3485 {
3486         struct acpi_dmar_atsr *atsr;
3487         struct dmar_atsr_unit *atsru;
3488
3489         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3490         atsru = dmar_find_atsr(atsr);
3491         if (atsru) {
3492                 list_del_rcu(&atsru->list);
3493                 synchronize_rcu();
3494                 intel_iommu_free_atsr(atsru);
3495         }
3496
3497         return 0;
3498 }
3499
3500 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3501 {
3502         int i;
3503         struct device *dev;
3504         struct acpi_dmar_atsr *atsr;
3505         struct dmar_atsr_unit *atsru;
3506
3507         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3508         atsru = dmar_find_atsr(atsr);
3509         if (!atsru)
3510                 return 0;
3511
3512         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3513                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3514                                           i, dev)
3515                         return -EBUSY;
3516         }
3517
3518         return 0;
3519 }
3520
3521 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3522 {
3523         struct dmar_satc_unit *satcu;
3524         struct acpi_dmar_satc *tmp;
3525
3526         list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3527                                 dmar_rcu_check()) {
3528                 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3529                 if (satc->segment != tmp->segment)
3530                         continue;
3531                 if (satc->header.length != tmp->header.length)
3532                         continue;
3533                 if (memcmp(satc, tmp, satc->header.length) == 0)
3534                         return satcu;
3535         }
3536
3537         return NULL;
3538 }
3539
3540 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3541 {
3542         struct acpi_dmar_satc *satc;
3543         struct dmar_satc_unit *satcu;
3544
3545         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3546                 return 0;
3547
3548         satc = container_of(hdr, struct acpi_dmar_satc, header);
3549         satcu = dmar_find_satc(satc);
3550         if (satcu)
3551                 return 0;
3552
3553         satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3554         if (!satcu)
3555                 return -ENOMEM;
3556
3557         satcu->hdr = (void *)(satcu + 1);
3558         memcpy(satcu->hdr, hdr, hdr->length);
3559         satcu->atc_required = satc->flags & 0x1;
3560         satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3561                                               (void *)satc + satc->header.length,
3562                                               &satcu->devices_cnt);
3563         if (satcu->devices_cnt && !satcu->devices) {
3564                 kfree(satcu);
3565                 return -ENOMEM;
3566         }
3567         list_add_rcu(&satcu->list, &dmar_satc_units);
3568
3569         return 0;
3570 }
3571
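/*
 * Bring up a hot-added DMAR unit: audit its capabilities against the
 * running configuration, allocate domain IDs and a root entry and,
 * unless the unit is ignored, enable queued invalidation, fault
 * interrupts and translation.
 */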
3572 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3573 {
3574         int sp, ret;
3575         struct intel_iommu *iommu = dmaru->iommu;
3576
3577         if (g_iommus[iommu->seq_id])
3578                 return 0;
3579
3580         ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3581         if (ret)
3582                 goto out;
3583
3584         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3585                 pr_warn("%s: Doesn't support hardware pass through.\n",
3586                         iommu->name);
3587                 return -ENXIO;
3588         }
3589         if (!ecap_sc_support(iommu->ecap) &&
3590             domain_update_iommu_snooping(iommu)) {
3591                 pr_warn("%s: Doesn't support snooping.\n",
3592                         iommu->name);
3593                 return -ENXIO;
3594         }
3595         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3596         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3597                 pr_warn("%s: Doesn't support large page.\n",
3598                         iommu->name);
3599                 return -ENXIO;
3600         }
3601
3602         /*
3603          * Disable translation if already enabled prior to OS handover.
3604          */
3605         if (iommu->gcmd & DMA_GCMD_TE)
3606                 iommu_disable_translation(iommu);
3607
3608         g_iommus[iommu->seq_id] = iommu;
3609         ret = iommu_init_domains(iommu);
3610         if (ret == 0)
3611                 ret = iommu_alloc_root_entry(iommu);
3612         if (ret)
3613                 goto out;
3614
3615         intel_svm_check(iommu);
3616
3617         if (dmaru->ignored) {
3618                 /*
3619                  * we always have to disable PMRs or DMA may fail on this device
3620                  */
3621                 if (force_on)
3622                         iommu_disable_protect_mem_regions(iommu);
3623                 return 0;
3624         }
3625
3626         intel_iommu_init_qi(iommu);
3627         iommu_flush_write_buffer(iommu);
3628
3629 #ifdef CONFIG_INTEL_IOMMU_SVM
3630         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3631                 ret = intel_svm_enable_prq(iommu);
3632                 if (ret)
3633                         goto disable_iommu;
3634         }
3635 #endif
3636         ret = dmar_set_interrupt(iommu);
3637         if (ret)
3638                 goto disable_iommu;
3639
3640         iommu_set_root_entry(iommu);
3641         iommu_enable_translation(iommu);
3642
3643         iommu_disable_protect_mem_regions(iommu);
3644         return 0;
3645
3646 disable_iommu:
3647         disable_dmar_iommu(iommu);
3648 out:
3649         free_dmar_iommu(iommu);
3650         return ret;
3651 }
3652
3653 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3654 {
3655         int ret = 0;
3656         struct intel_iommu *iommu = dmaru->iommu;
3657
3658         if (!intel_iommu_enabled)
3659                 return 0;
3660         if (iommu == NULL)
3661                 return -EINVAL;
3662
3663         if (insert) {
3664                 ret = intel_iommu_add(dmaru);
3665         } else {
3666                 disable_dmar_iommu(iommu);
3667                 free_dmar_iommu(iommu);
3668         }
3669
3670         return ret;
3671 }
3672
3673 static void intel_iommu_free_dmars(void)
3674 {
3675         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3676         struct dmar_atsr_unit *atsru, *atsr_n;
3677         struct dmar_satc_unit *satcu, *satc_n;
3678
3679         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3680                 list_del(&rmrru->list);
3681                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3682                 kfree(rmrru);
3683         }
3684
3685         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3686                 list_del(&atsru->list);
3687                 intel_iommu_free_atsr(atsru);
3688         }
3689         list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3690                 list_del(&satcu->list);
3691                 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3692                 kfree(satcu);
3693         }
3694 }
3695
3696 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3697 {
3698         struct dmar_satc_unit *satcu;
3699         struct acpi_dmar_satc *satc;
3700         struct device *tmp;
3701         int i;
3702
3703         dev = pci_physfn(dev);
3704         rcu_read_lock();
3705
3706         list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3707                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3708                 if (satc->segment != pci_domain_nr(dev->bus))
3709                         continue;
3710                 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3711                         if (to_pci_dev(tmp) == dev)
3712                                 goto out;
3713         }
3714         satcu = NULL;
3715 out:
3716         rcu_read_unlock();
3717         return satcu;
3718 }
3719
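/*
 * Decide whether the OS may enable ATS for @dev. A matching SATC
 * entry decides by itself; otherwise walk up to the PCIe root port
 * and look for it in the scope of an ATSR (or an include_all ATSR)
 * on the same segment.
 */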
3720 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3721 {
3722         int i, ret = 1;
3723         struct pci_bus *bus;
3724         struct pci_dev *bridge = NULL;
3725         struct device *tmp;
3726         struct acpi_dmar_atsr *atsr;
3727         struct dmar_atsr_unit *atsru;
3728         struct dmar_satc_unit *satcu;
3729
3730         dev = pci_physfn(dev);
3731         satcu = dmar_find_matched_satc_unit(dev);
3732         if (satcu)
3733                 /*
3734                  * This device supports ATS as it is listed in the SATC
3735                  * table. When the IOMMU is in legacy mode, the hardware
3736                  * enables ATS automatically for devices that require it,
3737                  * so the OS should not enable ATS on this device, to
3738                  * avoid duplicated TLB invalidations.
3739                  */
3740                 return !(satcu->atc_required && !sm_supported(iommu));
3741
3742         for (bus = dev->bus; bus; bus = bus->parent) {
3743                 bridge = bus->self;
3744                 /* If it's an integrated device, allow ATS */
3745                 if (!bridge)
3746                         return 1;
3747                 /* Connected via non-PCIe: no ATS */
3748                 if (!pci_is_pcie(bridge) ||
3749                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3750                         return 0;
3751                 /* If we found the root port, look it up in the ATSR */
3752                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3753                         break;
3754         }
3755
3756         rcu_read_lock();
3757         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3758                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3759                 if (atsr->segment != pci_domain_nr(dev->bus))
3760                         continue;
3761
3762                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3763                         if (tmp == &bridge->dev)
3764                                 goto out;
3765
3766                 if (atsru->include_all)
3767                         goto out;
3768         }
3769         ret = 0;
3770 out:
3771         rcu_read_unlock();
3772
3773         return ret;
3774 }
3775
3776 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3777 {
3778         int ret;
3779         struct dmar_rmrr_unit *rmrru;
3780         struct dmar_atsr_unit *atsru;
3781         struct dmar_satc_unit *satcu;
3782         struct acpi_dmar_atsr *atsr;
3783         struct acpi_dmar_reserved_memory *rmrr;
3784         struct acpi_dmar_satc *satc;
3785
3786         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3787                 return 0;
3788
3789         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3790                 rmrr = container_of(rmrru->hdr,
3791                                     struct acpi_dmar_reserved_memory, header);
3792                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3793                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3794                                 ((void *)rmrr) + rmrr->header.length,
3795                                 rmrr->segment, rmrru->devices,
3796                                 rmrru->devices_cnt);
3797                         if (ret < 0)
3798                                 return ret;
3799                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3800                         dmar_remove_dev_scope(info, rmrr->segment,
3801                                 rmrru->devices, rmrru->devices_cnt);
3802                 }
3803         }
3804
3805         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3806                 if (atsru->include_all)
3807                         continue;
3808
3809                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3810                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3811                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3812                                         (void *)atsr + atsr->header.length,
3813                                         atsr->segment, atsru->devices,
3814                                         atsru->devices_cnt);
3815                         if (ret > 0)
3816                                 break;
3817                         else if (ret < 0)
3818                                 return ret;
3819                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3820                         if (dmar_remove_dev_scope(info, atsr->segment,
3821                                         atsru->devices, atsru->devices_cnt))
3822                                 break;
3823                 }
3824         }
3825         list_for_each_entry(satcu, &dmar_satc_units, list) {
3826                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3827                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3828                         ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3829                                         (void *)satc + satc->header.length,
3830                                         satc->segment, satcu->devices,
3831                                         satcu->devices_cnt);
3832                         if (ret > 0)
3833                                 break;
3834                         else if (ret < 0)
3835                                 return ret;
3836                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3837                         if (dmar_remove_dev_scope(info, satc->segment,
3838                                         satcu->devices, satcu->devices_cnt))
3839                                 break;
3840                 }
3841         }
3842
3843         return 0;
3844 }
3845
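/*
 * Memory hotplug notifier: extend the si_domain identity map when a
 * range goes online, and unmap/flush it again when the range goes
 * offline or onlining is cancelled.
 */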
3846 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3847                                        unsigned long val, void *v)
3848 {
3849         struct memory_notify *mhp = v;
3850         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3851         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3852                         mhp->nr_pages - 1);
3853
3854         switch (val) {
3855         case MEM_GOING_ONLINE:
3856                 if (iommu_domain_identity_map(si_domain,
3857                                               start_vpfn, last_vpfn)) {
3858                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
3859                                 start_vpfn, last_vpfn);
3860                         return NOTIFY_BAD;
3861                 }
3862                 break;
3863
3864         case MEM_OFFLINE:
3865         case MEM_CANCEL_ONLINE:
3866                 {
3867                         struct dmar_drhd_unit *drhd;
3868                         struct intel_iommu *iommu;
3869                         LIST_HEAD(freelist);
3870
3871                         domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3872
3873                         rcu_read_lock();
3874                         for_each_active_iommu(iommu, drhd)
3875                                 iommu_flush_iotlb_psi(iommu, si_domain,
3876                                         start_vpfn, mhp->nr_pages,
3877                                         list_empty(&freelist), 0);
3878                         rcu_read_unlock();
3879                         put_pages_list(&freelist);
3880                 }
3881                 break;
3882         }
3883
3884         return NOTIFY_OK;
3885 }
3886
3887 static struct notifier_block intel_iommu_memory_nb = {
3888         .notifier_call = intel_iommu_memory_notifier,
3889         .priority = 0
3890 };
3891
3892 static void intel_disable_iommus(void)
3893 {
3894         struct intel_iommu *iommu = NULL;
3895         struct dmar_drhd_unit *drhd;
3896
3897         for_each_iommu(iommu, drhd)
3898                 iommu_disable_translation(iommu);
3899 }
3900
3901 void intel_iommu_shutdown(void)
3902 {
3903         struct dmar_drhd_unit *drhd;
3904         struct intel_iommu *iommu = NULL;
3905
3906         if (no_iommu || dmar_disabled)
3907                 return;
3908
3909         down_write(&dmar_global_lock);
3910
3911         /* Disable PMRs explicitly here. */
3912         for_each_iommu(iommu, drhd)
3913                 iommu_disable_protect_mem_regions(iommu);
3914
3915         /* Make sure the IOMMUs are switched off */
3916         intel_disable_iommus();
3917
3918         up_write(&dmar_global_lock);
3919 }
3920
3921 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3922 {
3923         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3924
3925         return container_of(iommu_dev, struct intel_iommu, iommu);
3926 }
3927
3928 static ssize_t version_show(struct device *dev,
3929                             struct device_attribute *attr, char *buf)
3930 {
3931         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3932         u32 ver = readl(iommu->reg + DMAR_VER_REG);
3933         return sprintf(buf, "%d:%d\n",
3934                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3935 }
3936 static DEVICE_ATTR_RO(version);
3937
3938 static ssize_t address_show(struct device *dev,
3939                             struct device_attribute *attr, char *buf)
3940 {
3941         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3942         return sprintf(buf, "%llx\n", iommu->reg_phys);
3943 }
3944 static DEVICE_ATTR_RO(address);
3945
3946 static ssize_t cap_show(struct device *dev,
3947                         struct device_attribute *attr, char *buf)
3948 {
3949         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3950         return sprintf(buf, "%llx\n", iommu->cap);
3951 }
3952 static DEVICE_ATTR_RO(cap);
3953
3954 static ssize_t ecap_show(struct device *dev,
3955                          struct device_attribute *attr, char *buf)
3956 {
3957         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3958         return sprintf(buf, "%llx\n", iommu->ecap);
3959 }
3960 static DEVICE_ATTR_RO(ecap);
3961
3962 static ssize_t domains_supported_show(struct device *dev,
3963                                       struct device_attribute *attr, char *buf)
3964 {
3965         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3966         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3967 }
3968 static DEVICE_ATTR_RO(domains_supported);
3969
3970 static ssize_t domains_used_show(struct device *dev,
3971                                  struct device_attribute *attr, char *buf)
3972 {
3973         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3974         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3975                                                   cap_ndoms(iommu->cap)));
3976 }
3977 static DEVICE_ATTR_RO(domains_used);
3978
3979 static struct attribute *intel_iommu_attrs[] = {
3980         &dev_attr_version.attr,
3981         &dev_attr_address.attr,
3982         &dev_attr_cap.attr,
3983         &dev_attr_ecap.attr,
3984         &dev_attr_domains_supported.attr,
3985         &dev_attr_domains_used.attr,
3986         NULL,
3987 };
3988
3989 static struct attribute_group intel_iommu_group = {
3990         .name = "intel-iommu",
3991         .attrs = intel_iommu_attrs,
3992 };
3993
3994 const struct attribute_group *intel_iommu_groups[] = {
3995         &intel_iommu_group,
3996         NULL,
3997 };
3998
3999 static inline bool has_external_pci(void)
4000 {
4001         struct pci_dev *pdev = NULL;
4002
4003         for_each_pci_dev(pdev)
4004                 if (pdev->external_facing)
4005                         return true;
4006
4007         return false;
4008 }
4009
4010 static int __init platform_optin_force_iommu(void)
4011 {
4012         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4013                 return 0;
4014
4015         if (no_iommu || dmar_disabled)
4016                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4017
4018         /*
4019          * If Intel-IOMMU is disabled by default, we will apply the identity
4020          * map to all devices except those marked as untrusted.
4021          */
4022         if (dmar_disabled)
4023                 iommu_set_default_passthrough(false);
4024
4025         dmar_disabled = 0;
4026         no_iommu = 0;
4027
4028         return 1;
4029 }
4030
4031 static int __init probe_acpi_namespace_devices(void)
4032 {
4033         struct dmar_drhd_unit *drhd;
4034         /* To avoid a -Wunused-but-set-variable warning. */
4035         struct intel_iommu *iommu __maybe_unused;
4036         struct device *dev;
4037         int i, ret = 0;
4038
4039         for_each_active_iommu(iommu, drhd) {
4040                 for_each_active_dev_scope(drhd->devices,
4041                                           drhd->devices_cnt, i, dev) {
4042                         struct acpi_device_physical_node *pn;
4043                         struct iommu_group *group;
4044                         struct acpi_device *adev;
4045
4046                         if (dev->bus != &acpi_bus_type)
4047                                 continue;
4048
4049                         adev = to_acpi_device(dev);
4050                         mutex_lock(&adev->physical_node_lock);
4051                         list_for_each_entry(pn,
4052                                             &adev->physical_node_list, node) {
4053                                 group = iommu_group_get(pn->dev);
4054                                 if (group) {
4055                                         iommu_group_put(group);
4056                                         continue;
4057                                 }
4058
4059                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4060                                 ret = iommu_probe_device(pn->dev);
4061                                 if (ret)
4062                                         break;
4063                         }
4064                         mutex_unlock(&adev->physical_node_lock);
4065
4066                         if (ret)
4067                                 return ret;
4068                 }
4069         }
4070
4071         return 0;
4072 }
4073
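/*
 * Main entry point for VT-d initialization: parse the DMAR table,
 * honour tboot/platform opt-in forcing, run init_dmars(), register
 * sysfs entries, iommu devices and notifiers, and finally enable
 * translation on all remaining units.
 */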
4074 int __init intel_iommu_init(void)
4075 {
4076         int ret = -ENODEV;
4077         struct dmar_drhd_unit *drhd;
4078         struct intel_iommu *iommu;
4079
4080         /*
4081          * Intel IOMMU is required for a TXT/tboot launch or platform
4082          * opt in, so enforce that.
4083          */
4084         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4085                     platform_optin_force_iommu();
4086
4087         down_write(&dmar_global_lock);
4088         if (dmar_table_init()) {
4089                 if (force_on)
4090                         panic("tboot: Failed to initialize DMAR table\n");
4091                 goto out_free_dmar;
4092         }
4093
4094         if (dmar_dev_scope_init() < 0) {
4095                 if (force_on)
4096                         panic("tboot: Failed to initialize DMAR device scope\n");
4097                 goto out_free_dmar;
4098         }
4099
4100         up_write(&dmar_global_lock);
4101
4102         /*
4103          * The bus notifier takes the dmar_global_lock, so lockdep will
4104          * complain later when we register it under the lock.
4105          */
4106         dmar_register_bus_notifier();
4107
4108         down_write(&dmar_global_lock);
4109
4110         if (!no_iommu)
4111                 intel_iommu_debugfs_init();
4112
4113         if (no_iommu || dmar_disabled) {
4114                 /*
4115                  * We exit the function here to ensure IOMMU's remapping and
4116                  * mempool aren't setup, which means that the IOMMU's PMRs
4117                  * won't be disabled via the call to init_dmars(). So disable
4118                  * it explicitly here. The PMRs were setup by tboot prior to
4119                  * calling SENTER, but the kernel is expected to reset/tear
4120                  * down the PMRs.
4121                  */
4122                 if (intel_iommu_tboot_noforce) {
4123                         for_each_iommu(iommu, drhd)
4124                                 iommu_disable_protect_mem_regions(iommu);
4125                 }
4126
4127                 /*
4128                  * Make sure the IOMMUs are switched off, even when we
4129                  * boot into a kexec kernel and the previous kernel left
4130                  * them enabled
4131                  */
4132                 intel_disable_iommus();
4133                 goto out_free_dmar;
4134         }
4135
4136         if (list_empty(&dmar_rmrr_units))
4137                 pr_info("No RMRR found\n");
4138
4139         if (list_empty(&dmar_atsr_units))
4140                 pr_info("No ATSR found\n");
4141
4142         if (list_empty(&dmar_satc_units))
4143                 pr_info("No SATC found\n");
4144
4145         if (dmar_map_gfx)
4146                 intel_iommu_gfx_mapped = 1;
4147
4148         init_no_remapping_devices();
4149
4150         ret = init_dmars();
4151         if (ret) {
4152                 if (force_on)
4153                         panic("tboot: Failed to initialize DMARs\n");
4154                 pr_err("Initialization failed\n");
4155                 goto out_free_dmar;
4156         }
4157         up_write(&dmar_global_lock);
4158
4159         init_iommu_pm_ops();
4160
4161         down_read(&dmar_global_lock);
4162         for_each_active_iommu(iommu, drhd) {
4163                 /*
4164                  * The flush queue implementation does not perform
4165                  * page-selective invalidations that are required for efficient
4166                  * TLB flushes in virtual environments.  The benefit of batching
4167                  * is likely to be much lower than the overhead of synchronizing
4168                  * the virtual and physical IOMMU page-tables.
4169                  */
4170                 if (cap_caching_mode(iommu->cap)) {
4171                         pr_info_once("IOMMU batching disallowed due to virtualization\n");
4172                         iommu_set_dma_strict();
4173                 }
4174                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4175                                        intel_iommu_groups,
4176                                        "%s", iommu->name);
4177                 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4178         }
4179         up_read(&dmar_global_lock);
4180
4181         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4182         if (si_domain && !hw_pass_through)
4183                 register_memory_notifier(&intel_iommu_memory_nb);
4184
4185         down_read(&dmar_global_lock);
4186         if (probe_acpi_namespace_devices())
4187                 pr_warn("ACPI name space devices didn't probe correctly\n");
4188
4189         /* Finally, we enable the DMA remapping hardware. */
4190         for_each_iommu(iommu, drhd) {
4191                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4192                         iommu_enable_translation(iommu);
4193
4194                 iommu_disable_protect_mem_regions(iommu);
4195         }
4196         up_read(&dmar_global_lock);
4197
4198         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4199
4200         intel_iommu_enabled = 1;
4201
4202         return 0;
4203
4204 out_free_dmar:
4205         intel_iommu_free_dmars();
4206         up_write(&dmar_global_lock);
4207         return ret;
4208 }
4209
4210 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4211 {
4212         struct device_domain_info *info = opaque;
4213
4214         domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4215         return 0;
4216 }
4217
4218 /*
4219  * NB - intel-iommu lacks any sort of reference counting for the users of
4220  * dependent devices.  If multiple endpoints have intersecting dependent
4221  * devices, unbinding the driver from any one of them will possibly leave
4222  * the others unable to operate.
4223  */
4224 static void domain_context_clear(struct device_domain_info *info)
4225 {
4226         if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4227                 return;
4228
4229         pci_for_each_dma_alias(to_pci_dev(info->dev),
4230                                &domain_context_clear_one_cb, info);
4231 }
4232
4233 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4234 {
4235         struct dmar_domain *domain;
4236         struct intel_iommu *iommu;
4237         unsigned long flags;
4238
4239         assert_spin_locked(&device_domain_lock);
4240
4241         if (WARN_ON(!info))
4242                 return;
4243
4244         iommu = info->iommu;
4245         domain = info->domain;
4246
4247         if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4248                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4249                         intel_pasid_tear_down_entry(iommu, info->dev,
4250                                         PASID_RID2PASID, false);
4251
4252                 iommu_disable_dev_iotlb(info);
4253                 domain_context_clear(info);
4254                 intel_pasid_free_table(info->dev);
4255         }
4256
4257         list_del(&info->link);
4258
4259         spin_lock_irqsave(&iommu->lock, flags);
4260         domain_detach_iommu(domain, iommu);
4261         spin_unlock_irqrestore(&iommu->lock, flags);
4262 }
4263
4264 static void dmar_remove_one_dev_info(struct device *dev)
4265 {
4266         struct device_domain_info *info;
4267         unsigned long flags;
4268
4269         spin_lock_irqsave(&device_domain_lock, flags);
4270         info = dev_iommu_priv_get(dev);
4271         if (info)
4272                 __dmar_remove_one_dev_info(info);
4273         spin_unlock_irqrestore(&device_domain_lock, flags);
4274 }
4275
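/*
 * Minimal initialization of a freshly allocated dmar_domain: derive
 * the AGAW from the requested guest address width and allocate the
 * top-level page directory.
 */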
4276 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4277 {
4278         int adjust_width;
4279
4280         /* calculate AGAW */
4281         domain->gaw = guest_width;
4282         adjust_width = guestwidth_to_adjustwidth(guest_width);
4283         domain->agaw = width_to_agaw(adjust_width);
4284
4285         domain->iommu_coherency = false;
4286         domain->iommu_snooping = false;
4287         domain->iommu_superpage = 0;
4288         domain->max_addr = 0;
4289
4290         /* always allocate the top pgd */
4291         domain->pgd = alloc_pgtable_page(domain->nid);
4292         if (!domain->pgd)
4293                 return -ENOMEM;
4294         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4295         return 0;
4296 }
4297
4298 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4299 {
4300         struct dmar_domain *dmar_domain;
4301         struct iommu_domain *domain;
4302
4303         switch (type) {
4304         case IOMMU_DOMAIN_DMA:
4305         case IOMMU_DOMAIN_DMA_FQ:
4306         case IOMMU_DOMAIN_UNMANAGED:
4307                 dmar_domain = alloc_domain(type);
4308                 if (!dmar_domain) {
4309                         pr_err("Can't allocate dmar_domain\n");
4310                         return NULL;
4311                 }
4312                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4313                         pr_err("Domain initialization failed\n");
4314                         domain_exit(dmar_domain);
4315                         return NULL;
4316                 }
4317
4318                 domain = &dmar_domain->domain;
4319                 domain->geometry.aperture_start = 0;
4320                 domain->geometry.aperture_end   =
4321                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4322                 domain->geometry.force_aperture = true;
4323
4324                 return domain;
4325         case IOMMU_DOMAIN_IDENTITY:
4326                 return &si_domain->domain;
4327         default:
4328                 return NULL;
4329         }
4330
4331         return NULL;
4332 }
4333
4334 static void intel_iommu_domain_free(struct iommu_domain *domain)
4335 {
4336         if (domain != &si_domain->domain)
4337                 domain_exit(to_dmar_domain(domain));
4338 }
4339
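/*
 * Check that this IOMMU can address everything already mapped in the
 * domain, then trim the domain's page-table depth to what the IOMMU
 * supports before the actual attach.
 */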
4340 static int prepare_domain_attach_device(struct iommu_domain *domain,
4341                                         struct device *dev)
4342 {
4343         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4344         struct intel_iommu *iommu;
4345         int addr_width;
4346
4347         iommu = device_to_iommu(dev, NULL, NULL);
4348         if (!iommu)
4349                 return -ENODEV;
4350
4351         /* check if this iommu agaw is sufficient for max mapped address */
4352         addr_width = agaw_to_width(iommu->agaw);
4353         if (addr_width > cap_mgaw(iommu->cap))
4354                 addr_width = cap_mgaw(iommu->cap);
4355
4356         if (dmar_domain->max_addr > (1LL << addr_width)) {
4357                 dev_err(dev, "%s: iommu width (%d) is not "
4358                         "sufficient for the mapped address (%llx)\n",
4359                         __func__, addr_width, dmar_domain->max_addr);
4360                 return -EFAULT;
4361         }
4362         dmar_domain->gaw = addr_width;
4363
4364         /*
4365          * Knock out extra levels of page tables if necessary
4366          */
4367         while (iommu->agaw < dmar_domain->agaw) {
4368                 struct dma_pte *pte;
4369
4370                 pte = dmar_domain->pgd;
4371                 if (dma_pte_present(pte)) {
4372                         dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4373                         free_pgtable_page(pte);
4374                 }
4375                 dmar_domain->agaw--;
4376         }
4377
4378         return 0;
4379 }
4380
4381 static int intel_iommu_attach_device(struct iommu_domain *domain,
4382                                      struct device *dev)
4383 {
4384         int ret;
4385
4386         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4387             device_is_rmrr_locked(dev)) {
4388                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4389                 return -EPERM;
4390         }
4391
4392         /* normally dev is not mapped */
4393         if (unlikely(domain_context_mapped(dev))) {
4394                 struct device_domain_info *info = dev_iommu_priv_get(dev);
4395
4396                 if (info->domain)
4397                         dmar_remove_one_dev_info(dev);
4398         }
4399
4400         ret = prepare_domain_attach_device(domain, dev);
4401         if (ret)
4402                 return ret;
4403
4404         return domain_add_dev_info(to_dmar_domain(domain), dev);
4405 }
4406
4407 static void intel_iommu_detach_device(struct iommu_domain *domain,
4408                                       struct device *dev)
4409 {
4410         dmar_remove_one_dev_info(dev);
4411 }
4412
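/*
 * Core mapping helper for the IOMMU API: translate IOMMU_* protection
 * flags into DMA PTE bits, update the domain's max_addr bookkeeping,
 * and hand off to __domain_mapping() in VTD page units.
 */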
4413 static int intel_iommu_map(struct iommu_domain *domain,
4414                            unsigned long iova, phys_addr_t hpa,
4415                            size_t size, int iommu_prot, gfp_t gfp)
4416 {
4417         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4418         u64 max_addr;
4419         int prot = 0;
4420
4421         if (iommu_prot & IOMMU_READ)
4422                 prot |= DMA_PTE_READ;
4423         if (iommu_prot & IOMMU_WRITE)
4424                 prot |= DMA_PTE_WRITE;
4425         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4426                 prot |= DMA_PTE_SNP;
4427
4428         max_addr = iova + size;
4429         if (dmar_domain->max_addr < max_addr) {
4430                 u64 end;
4431
4432                 /* check if minimum agaw is sufficient for mapped address */
4433                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4434                 if (end < max_addr) {
4435                         pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4436                                __func__, dmar_domain->gaw,
4437                                max_addr);
4438                         return -EFAULT;
4439                 }
4440                 dmar_domain->max_addr = max_addr;
4441         }
4442         /* Round up size to next multiple of PAGE_SIZE, if it and
4443            the low bits of hpa would take us onto the next page */
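        /*
         * For example, hpa = 0x1234 with size = 0x2000 touches three 4KiB
         * pages, so three page-table entries must be written below.
         */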
4444         size = aligned_nrpages(hpa, size);
4445         return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4446                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4447 }
4448
4449 static int intel_iommu_map_pages(struct iommu_domain *domain,
4450                                  unsigned long iova, phys_addr_t paddr,
4451                                  size_t pgsize, size_t pgcount,
4452                                  int prot, gfp_t gfp, size_t *mapped)
4453 {
4454         unsigned long pgshift = __ffs(pgsize);
4455         size_t size = pgcount << pgshift;
4456         int ret;
4457
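        /*
         * Only page sizes the VT-d page table can express natively are
         * accepted here: 4KiB PTEs plus 2MiB and 1GiB superpages.
         */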
4458         if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4459                 return -EINVAL;
4460
4461         if (!IS_ALIGNED(iova | paddr, pgsize))
4462                 return -EINVAL;
4463
4464         ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4465         if (!ret && mapped)
4466                 *mapped = size;
4467
4468         return ret;
4469 }
4470
4471 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4472                                 unsigned long iova, size_t size,
4473                                 struct iommu_iotlb_gather *gather)
4474 {
4475         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4476         unsigned long start_pfn, last_pfn;
4477         int level = 0;
4478
4479         /* Cope with horrid API which requires us to unmap more than the
4480            size argument if it happens to be a large-page mapping. */
4481         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4482
4483         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4484                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4485
4486         start_pfn = iova >> VTD_PAGE_SHIFT;
4487         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4488
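        /*
         * Page-table pages torn down here are queued on gather->freelist and
         * are only released in intel_iommu_tlb_sync() after the IOTLB flush,
         * so the hardware never walks already-freed tables.
         */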
4489         domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4490
4491         if (dmar_domain->max_addr == iova + size)
4492                 dmar_domain->max_addr = iova;
4493
4494         iommu_iotlb_gather_add_page(domain, gather, iova, size);
4495
4496         return size;
4497 }
4498
4499 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4500                                       unsigned long iova,
4501                                       size_t pgsize, size_t pgcount,
4502                                       struct iommu_iotlb_gather *gather)
4503 {
4504         unsigned long pgshift = __ffs(pgsize);
4505         size_t size = pgcount << pgshift;
4506
4507         return intel_iommu_unmap(domain, iova, size, gather);
4508 }
4509
4510 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4511                                  struct iommu_iotlb_gather *gather)
4512 {
4513         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4514         unsigned long iova_pfn = IOVA_PFN(gather->start);
4515         size_t size = gather->end - gather->start;
4516         unsigned long start_pfn;
4517         unsigned long nrpages;
4518         int iommu_id;
4519
4520         nrpages = aligned_nrpages(gather->start, size);
4521         start_pfn = mm_to_dma_pfn(iova_pfn);
4522
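        /*
         * Flush the gathered range on every IOMMU this domain is attached
         * to. An empty freelist means no page-table pages were freed, which
         * is passed along as the invalidation hint to the PSI flush.
         */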
4523         for_each_domain_iommu(iommu_id, dmar_domain)
4524                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
4525                                       start_pfn, nrpages,
4526                                       list_empty(&gather->freelist), 0);
4527
4528         put_pages_list(&gather->freelist);
4529 }
4530
4531 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4532                                             dma_addr_t iova)
4533 {
4534         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4535         struct dma_pte *pte;
4536         int level = 0;
4537         u64 phys = 0;
4538
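        /*
         * Walk to the leaf entry covering this IOVA. For superpage mappings
         * the walk stops at level > 1, so the offset within the 2MiB/1GiB
         * page is recovered from the low bits of the IOVA.
         */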
4539         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4540         if (pte && dma_pte_present(pte))
4541                 phys = dma_pte_addr(pte) +
4542                         (iova & (BIT_MASK(level_to_offset_bits(level) +
4543                                                 VTD_PAGE_SHIFT) - 1));
4544
4545         return phys;
4546 }
4547
4548 static bool intel_iommu_capable(enum iommu_cap cap)
4549 {
4550         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4551                 return domain_update_iommu_snooping(NULL);
4552         if (cap == IOMMU_CAP_INTR_REMAP)
4553                 return irq_remapping_enabled == 1;
4554
4555         return false;
4556 }
4557
4558 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4559 {
4560         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4561         struct device_domain_info *info;
4562         struct intel_iommu *iommu;
4563         unsigned long flags;
4564         u8 bus, devfn;
4565
4566         iommu = device_to_iommu(dev, &bus, &devfn);
4567         if (!iommu)
4568                 return ERR_PTR(-ENODEV);
4569
4570         info = kzalloc(sizeof(*info), GFP_KERNEL);
4571         if (!info)
4572                 return ERR_PTR(-ENOMEM);
4573
4574         if (dev_is_real_dma_subdevice(dev)) {
4575                 info->bus = pdev->bus->number;
4576                 info->devfn = pdev->devfn;
4577                 info->segment = pci_domain_nr(pdev->bus);
4578         } else {
4579                 info->bus = bus;
4580                 info->devfn = devfn;
4581                 info->segment = iommu->segment;
4582         }
4583
4584         info->dev = dev;
4585         info->iommu = iommu;
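        /*
         * Record which optional features the device/IOMMU pair can use: ATS
         * gives the device its own IOTLB, and in scalable mode PASID and PRI
         * are probed as well since SVA later depends on them.
         */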
4586         if (dev_is_pci(dev)) {
4587                 if (ecap_dev_iotlb_support(iommu->ecap) &&
4588                     pci_ats_supported(pdev) &&
4589                     dmar_ats_supported(pdev, iommu))
4590                         info->ats_supported = 1;
4591
4592                 if (sm_supported(iommu)) {
4593                         if (pasid_supported(iommu)) {
4594                                 int features = pci_pasid_features(pdev);
4595
4596                                 if (features >= 0)
4597                                         info->pasid_supported = features | 1;
4598                         }
4599
4600                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
4601                             pci_pri_supported(pdev))
4602                                 info->pri_supported = 1;
4603                 }
4604         }
4605
4606         spin_lock_irqsave(&device_domain_lock, flags);
4607         list_add(&info->global, &device_domain_list);
4608         dev_iommu_priv_set(dev, info);
4609         spin_unlock_irqrestore(&device_domain_lock, flags);
4610
4611         return &iommu->iommu;
4612 }
4613
4614 static void intel_iommu_release_device(struct device *dev)
4615 {
4616         struct device_domain_info *info = dev_iommu_priv_get(dev);
4617         unsigned long flags;
4618
4619         dmar_remove_one_dev_info(dev);
4620
4621         spin_lock_irqsave(&device_domain_lock, flags);
4622         dev_iommu_priv_set(dev, NULL);
4623         list_del(&info->global);
4624         spin_unlock_irqrestore(&device_domain_lock, flags);
4625
4626         kfree(info);
4627         set_dma_ops(dev, NULL);
4628 }
4629
4630 static void intel_iommu_probe_finalize(struct device *dev)
4631 {
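        /*
         * Clear any previously installed DMA ops first; iommu_setup_dma_ops()
         * then installs the IOMMU DMA ops for devices in a translated default
         * domain and leaves passthrough devices on direct mapping.
         */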
4632         set_dma_ops(dev, NULL);
4633         iommu_setup_dma_ops(dev, 0, U64_MAX);
4634 }
4635
4636 static void intel_iommu_get_resv_regions(struct device *device,
4637                                          struct list_head *head)
4638 {
4639         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4640         struct iommu_resv_region *reg;
4641         struct dmar_rmrr_unit *rmrr;
4642         struct device *i_dev;
4643         int i;
4644
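        /*
         * Report every RMRR that targets this device (or a bridge above it)
         * as a direct-mapped reserved region so its identity mapping is kept.
         * USB and GPU RMRRs are marked relaxable: they are only needed until
         * a driver takes over the device.
         */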
4645         down_read(&dmar_global_lock);
4646         for_each_rmrr_units(rmrr) {
4647                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4648                                           i, i_dev) {
4649                         struct iommu_resv_region *resv;
4650                         enum iommu_resv_type type;
4651                         size_t length;
4652
4653                         if (i_dev != device &&
4654                             !is_downstream_to_pci_bridge(device, i_dev))
4655                                 continue;
4656
4657                         length = rmrr->end_address - rmrr->base_address + 1;
4658
4659                         type = device_rmrr_is_relaxable(device) ?
4660                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4661
4662                         resv = iommu_alloc_resv_region(rmrr->base_address,
4663                                                        length, prot, type);
4664                         if (!resv)
4665                                 break;
4666
4667                         list_add_tail(&resv->list, head);
4668                 }
4669         }
4670         up_read(&dmar_global_lock);
4671
4672 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4673         if (dev_is_pci(device)) {
4674                 struct pci_dev *pdev = to_pci_dev(device);
4675
4676                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4677                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4678                                                    IOMMU_RESV_DIRECT_RELAXABLE);
4679                         if (reg)
4680                                 list_add_tail(&reg->list, head);
4681                 }
4682         }
4683 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4684
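        /*
         * Always reserve the IOAPIC/MSI address window so no IOVA allocation
         * can ever collide with interrupt message addresses.
         */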
4685         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4686                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4687                                       0, IOMMU_RESV_MSI);
4688         if (!reg)
4689                 return;
4690         list_add_tail(&reg->list, head);
4691 }
4692
4693 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
4694 {
4695         struct device_domain_info *info = dev_iommu_priv_get(dev);
4696         struct context_entry *context;
4697         struct dmar_domain *domain;
4698         unsigned long flags;
4699         u64 ctx_lo;
4700         int ret;
4701
4702         domain = info->domain;
4703         if (!domain)
4704                 return -EINVAL;
4705
4706         spin_lock_irqsave(&device_domain_lock, flags);
4707         spin_lock(&iommu->lock);
4708
4709         ret = -EINVAL;
4710         if (!info->pasid_supported)
4711                 goto out;
4712
4713         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
4714         if (WARN_ON(!context))
4715                 goto out;
4716
4717         ctx_lo = context[0].lo;
4718
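        /*
         * Flip the PASID-enable bit in the context entry if it is not set
         * yet, then flush the context cache so the hardware observes the
         * change before any PASID-tagged requests arrive.
         */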
4719         if (!(ctx_lo & CONTEXT_PASIDE)) {
4720                 ctx_lo |= CONTEXT_PASIDE;
4721                 context[0].lo = ctx_lo;
4722                 wmb();
4723                 iommu->flush.flush_context(iommu,
4724                                            domain->iommu_did[iommu->seq_id],
4725                                            PCI_DEVID(info->bus, info->devfn),
4726                                            DMA_CCMD_MASK_NOBIT,
4727                                            DMA_CCMD_DEVICE_INVL);
4728         }
4729
4730         /* Enable PASID support in the device, if it wasn't already */
4731         if (!info->pasid_enabled)
4732                 iommu_enable_dev_iotlb(info);
4733
4734         ret = 0;
4735
4736  out:
4737         spin_unlock(&iommu->lock);
4738         spin_unlock_irqrestore(&device_domain_lock, flags);
4739
4740         return ret;
4741 }
4742
4743 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4744 {
4745         if (dev_is_pci(dev))
4746                 return pci_device_group(dev);
4747         return generic_device_group(dev);
4748 }
4749
4750 static int intel_iommu_enable_sva(struct device *dev)
4751 {
4752         struct device_domain_info *info = dev_iommu_priv_get(dev);
4753         struct intel_iommu *iommu;
4754         int ret;
4755
4756         if (!info || dmar_disabled)
4757                 return -EINVAL;
4758
4759         iommu = info->iommu;
4760         if (!iommu)
4761                 return -EINVAL;
4762
4763         if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4764                 return -ENODEV;
4765
4766         if (intel_iommu_enable_pasid(iommu, dev))
4767                 return -ENODEV;
4768
4769         if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4770                 return -EINVAL;
4771
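        /*
         * Recoverable faults for SVA arrive via the page request queue, so
         * hook the device up to this IOMMU's IOPF queue and route its faults
         * to the shared iommu_queue_iopf() handler.
         */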
4772         ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4773         if (!ret)
4774                 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4775
4776         return ret;
4777 }
4778
4779 static int intel_iommu_disable_sva(struct device *dev)
4780 {
4781         struct device_domain_info *info = dev_iommu_priv_get(dev);
4782         struct intel_iommu *iommu = info->iommu;
4783         int ret;
4784
4785         ret = iommu_unregister_device_fault_handler(dev);
4786         if (!ret)
4787                 ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4788
4789         return ret;
4790 }
4791
4792 static int intel_iommu_enable_iopf(struct device *dev)
4793 {
4794         struct device_domain_info *info = dev_iommu_priv_get(dev);
4795
4796         if (info && info->pri_supported)
4797                 return 0;
4798
4799         return -ENODEV;
4800 }
4801
4802 static int
4803 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4804 {
4805         switch (feat) {
4806         case IOMMU_DEV_FEAT_IOPF:
4807                 return intel_iommu_enable_iopf(dev);
4808
4809         case IOMMU_DEV_FEAT_SVA:
4810                 return intel_iommu_enable_sva(dev);
4811
4812         default:
4813                 return -ENODEV;
4814         }
4815 }
4816
4817 static int
4818 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4819 {
4820         switch (feat) {
4821         case IOMMU_DEV_FEAT_IOPF:
4822                 return 0;
4823
4824         case IOMMU_DEV_FEAT_SVA:
4825                 return intel_iommu_disable_sva(dev);
4826
4827         default:
4828                 return -ENODEV;
4829         }
4830 }
4831
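/*
 * When the IOMMU was already translating at boot (e.g. a kdump kernel reusing
 * the old tables), attaching the default domain is deferred until the device
 * first issues DMA, so the mappings copied from the previous kernel remain
 * usable until then.
 */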
4832 static bool intel_iommu_is_attach_deferred(struct device *dev)
4833 {
4834         struct device_domain_info *info = dev_iommu_priv_get(dev);
4835
4836         return translation_pre_enabled(info->iommu) && !info->domain;
4837 }
4838
4839 /*
4840  * Check that the device does not live on an external facing PCI port that is
4841  * marked as untrusted. Such devices should not be able to apply quirks and
4842  * thus not be able to bypass the IOMMU restrictions.
4843  */
4844 static bool risky_device(struct pci_dev *pdev)
4845 {
4846         if (pdev->untrusted) {
4847                 pci_info(pdev,
4848                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4849                          pdev->vendor, pdev->device);
4850                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4851                 return true;
4852         }
4853         return false;
4854 }
4855
4856 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4857                                        unsigned long iova, size_t size)
4858 {
4859         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4860         unsigned long pages = aligned_nrpages(iova, size);
4861         unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4862         struct intel_iommu *iommu;
4863         int iommu_id;
4864
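        /*
         * Newly created mappings must be pushed out explicitly on
         * caching-mode hardware (and past any write buffers), so notify
         * every IOMMU that serves this domain about the fresh range.
         */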
4865         for_each_domain_iommu(iommu_id, dmar_domain) {
4866                 iommu = g_iommus[iommu_id];
4867                 __mapping_notify_one(iommu, dmar_domain, pfn, pages);
4868         }
4869 }
4870
4871 const struct iommu_ops intel_iommu_ops = {
4872         .capable                = intel_iommu_capable,
4873         .domain_alloc           = intel_iommu_domain_alloc,
4874         .probe_device           = intel_iommu_probe_device,
4875         .probe_finalize         = intel_iommu_probe_finalize,
4876         .release_device         = intel_iommu_release_device,
4877         .get_resv_regions       = intel_iommu_get_resv_regions,
4878         .put_resv_regions       = generic_iommu_put_resv_regions,
4879         .device_group           = intel_iommu_device_group,
4880         .dev_enable_feat        = intel_iommu_dev_enable_feat,
4881         .dev_disable_feat       = intel_iommu_dev_disable_feat,
4882         .is_attach_deferred     = intel_iommu_is_attach_deferred,
4883         .def_domain_type        = device_def_domain_type,
4884         .pgsize_bitmap          = SZ_4K,
4885 #ifdef CONFIG_INTEL_IOMMU_SVM
4886         .sva_bind               = intel_svm_bind,
4887         .sva_unbind             = intel_svm_unbind,
4888         .sva_get_pasid          = intel_svm_get_pasid,
4889         .page_response          = intel_svm_page_response,
4890 #endif
4891         .default_domain_ops = &(const struct iommu_domain_ops) {
4892                 .attach_dev             = intel_iommu_attach_device,
4893                 .detach_dev             = intel_iommu_detach_device,
4894                 .map_pages              = intel_iommu_map_pages,
4895                 .unmap_pages            = intel_iommu_unmap_pages,
4896                 .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
4897                 .flush_iotlb_all        = intel_flush_iotlb_all,
4898                 .iotlb_sync             = intel_iommu_tlb_sync,
4899                 .iova_to_phys           = intel_iommu_iova_to_phys,
4900                 .free                   = intel_iommu_domain_free,
4901         }
4902 };
4903
4904 static void quirk_iommu_igfx(struct pci_dev *dev)
4905 {
4906         if (risky_device(dev))
4907                 return;
4908
4909         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4910         dmar_map_gfx = 0;
4911 }
4912
4913 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4914 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4915 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4916 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4917 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4918 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4919 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4920 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4921
4922 /* Broadwell igfx malfunctions with dmar */
4923 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4924 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4925 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4926 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4927 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4928 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4929 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4930 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4931 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4932 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4933 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4934 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4935 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4936 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4937 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4938 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4939 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4940 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4941 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4942 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4943 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4944 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4945 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4946 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4947
4948 static void quirk_iommu_rwbf(struct pci_dev *dev)
4949 {
4950         if (risky_device(dev))
4951                 return;
4952
4953         /*
4954          * Mobile 4 Series Chipset neglects to set RWBF capability,
4955          * but needs it. Same seems to hold for the desktop versions.
4956          */
4957         pci_info(dev, "Forcing write-buffer flush capability\n");
4958         rwbf_quirk = 1;
4959 }
4960
4961 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4962 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4963 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4964 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4965 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4966 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4967 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4968
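/*
 * GGC is the graphics control register on these integrated-graphics host
 * bridges; as interpreted by quirk_calpella_no_shadow_gtt() below, the bits
 * masked here describe how much GTT stolen memory the BIOS set aside and
 * whether that includes space for the VT-d shadow GTT.
 */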
4969 #define GGC 0x52
4970 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4971 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4972 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4973 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4974 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4975 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4976 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4977 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4978
4979 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4980 {
4981         unsigned short ggc;
4982
4983         if (risky_device(dev))
4984                 return;
4985
4986         if (pci_read_config_word(dev, GGC, &ggc))
4987                 return;
4988
4989         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4990                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4991                 dmar_map_gfx = 0;
4992         } else if (dmar_map_gfx) {
4993                 /* we have to ensure the gfx device is idle before we flush */
4994                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4995                 iommu_set_dma_strict();
4996         }
4997 }
4998 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4999 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5000 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5001 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5002
5003 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5004 {
5005         unsigned short ver;
5006
5007         if (!IS_GFX_DEVICE(dev))
5008                 return;
5009
5010         ver = (dev->device >> 8) & 0xff;
5011         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5012             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5013             ver != 0x9a && ver != 0xa7)
5014                 return;
5015
5016         if (risky_device(dev))
5017                 return;
5018
5019         pci_info(dev, "Skip IOMMU disabling for graphics\n");
5020         iommu_skip_te_disable = 1;
5021 }
5022 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5023
5024 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5025    ISOCH DMAR unit for the Azalia sound device, but not give it any
5026    TLB entries, which causes it to deadlock. Check for that.  We do
5027    this in a function called from init_dmars(), instead of in a PCI
5028    quirk, because we don't want to print the obnoxious "BIOS broken"
5029    message if VT-d is actually disabled.
5030 */
5031 static void __init check_tylersburg_isoch(void)
5032 {
5033         struct pci_dev *pdev;
5034         uint32_t vtisochctrl;
5035
5036         /* If there's no Azalia in the system anyway, forget it. */
5037         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5038         if (!pdev)
5039                 return;
5040
5041         if (risky_device(pdev)) {
5042                 pci_dev_put(pdev);
5043                 return;
5044         }
5045
5046         pci_dev_put(pdev);
5047
5048         /* System Management Registers. Might be hidden, in which case
5049            we can't do the sanity check. But that's OK, because the
5050            known-broken BIOSes _don't_ actually hide it, so far. */
5051         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5052         if (!pdev)
5053                 return;
5054
5055         if (risky_device(pdev)) {
5056                 pci_dev_put(pdev);
5057                 return;
5058         }
5059
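        /*
         * Read what the code calls the VT isoch control register at config
         * offset 0x188; the checks below only care about bit 0 (Azalia routed
         * to the non-isoch unit) and the TLB-entry count in bits 4:2.
         */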
5060         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5061                 pci_dev_put(pdev);
5062                 return;
5063         }
5064
5065         pci_dev_put(pdev);
5066
5067         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5068         if (vtisochctrl & 1)
5069                 return;
5070
5071         /* Drop all bits other than the number of TLB entries */
5072         vtisochctrl &= 0x1c;
5073
5074         /* If we have the recommended number of TLB entries (16), fine. */
5075         if (vtisochctrl == 0x10)
5076                 return;
5077
5078         /* Zero TLB entries? Warn and identity-map Azalia to work around it. */
5079         if (!vtisochctrl) {
5080                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5081                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5082                      dmi_get_system_info(DMI_BIOS_VENDOR),
5083                      dmi_get_system_info(DMI_BIOS_VERSION),
5084                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5085                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5086                 return;
5087         }
5088
5089         pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
5090                 vtisochctrl);
5091 }