Merge tag 'docs-5.10-2' of git://git.lwn.net/linux
[linux-2.6-microblaze.git] / drivers / iommu / intel / iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-map-ops.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware support
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
105
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline u64 level_mask(int level)
132 {
133         return -1ULL << level_to_offset_bits(level);
134 }
135
136 static inline u64 level_size(int level)
137 {
138         return 1ULL << level_to_offset_bits(level);
139 }
140
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic kernel if can't successfully enable VT-d
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
289
290 /*
291  * This domain is a statically identity mapping domain.
292  *      1. This domain creats a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu mapps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 #define for_each_domain_iommu(idx, domain)                      \
300         for (idx = 0; idx < g_num_of_iommus; idx++)             \
301                 if (domain->iommu_refcnt[idx])
302
303 struct dmar_rmrr_unit {
304         struct list_head list;          /* list of rmrr units   */
305         struct acpi_dmar_header *hdr;   /* ACPI header          */
306         u64     base_address;           /* reserved base address*/
307         u64     end_address;            /* reserved end address */
308         struct dmar_dev_scope *devices; /* target devices */
309         int     devices_cnt;            /* target device count */
310 };
311
312 struct dmar_atsr_unit {
313         struct list_head list;          /* list of ATSR units */
314         struct acpi_dmar_header *hdr;   /* ACPI header */
315         struct dmar_dev_scope *devices; /* target devices */
316         int devices_cnt;                /* target device count */
317         u8 include_all:1;               /* include all ports */
318 };
319
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322
323 #define for_each_rmrr_units(rmrr) \
324         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325
326 /* bitmap for indexing intel_iommus */
327 static int g_num_of_iommus;
328
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334                                      struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336                                             dma_addr_t iova);
337
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
360
361 #define IDENTMAP_GFX            2
362 #define IDENTMAP_AZALIA         4
363
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
369 {
370         struct device_domain_info *info;
371
372         if (!dev)
373                 return NULL;
374
375         info = dev_iommu_priv_get(dev);
376         if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
377                 return NULL;
378
379         return info;
380 }
381
382 DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
384
385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
386                                 to_pci_dev(d)->untrusted)
387
388 /*
389  * Iterate over elements in device_domain_list and call the specified
390  * callback @fn against each element.
391  */
392 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393                                      void *data), void *data)
394 {
395         int ret = 0;
396         unsigned long flags;
397         struct device_domain_info *info;
398
399         spin_lock_irqsave(&device_domain_lock, flags);
400         list_for_each_entry(info, &device_domain_list, global) {
401                 ret = fn(info, data);
402                 if (ret) {
403                         spin_unlock_irqrestore(&device_domain_lock, flags);
404                         return ret;
405                 }
406         }
407         spin_unlock_irqrestore(&device_domain_lock, flags);
408
409         return 0;
410 }
411
412 const struct iommu_ops intel_iommu_ops;
413
414 static bool translation_pre_enabled(struct intel_iommu *iommu)
415 {
416         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 }
418
419 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
420 {
421         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 }
423
424 static void init_translation_status(struct intel_iommu *iommu)
425 {
426         u32 gsts;
427
428         gsts = readl(iommu->reg + DMAR_GSTS_REG);
429         if (gsts & DMA_GSTS_TES)
430                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 }
432
433 static int __init intel_iommu_setup(char *str)
434 {
435         if (!str)
436                 return -EINVAL;
437         while (*str) {
438                 if (!strncmp(str, "on", 2)) {
439                         dmar_disabled = 0;
440                         pr_info("IOMMU enabled\n");
441                 } else if (!strncmp(str, "off", 3)) {
442                         dmar_disabled = 1;
443                         no_platform_optin = 1;
444                         pr_info("IOMMU disabled\n");
445                 } else if (!strncmp(str, "igfx_off", 8)) {
446                         dmar_map_gfx = 0;
447                         pr_info("Disable GFX device mapping\n");
448                 } else if (!strncmp(str, "forcedac", 8)) {
449                         pr_info("Forcing DAC for PCI devices\n");
450                         dmar_forcedac = 1;
451                 } else if (!strncmp(str, "strict", 6)) {
452                         pr_info("Disable batched IOTLB flush\n");
453                         intel_iommu_strict = 1;
454                 } else if (!strncmp(str, "sp_off", 6)) {
455                         pr_info("Disable supported super page\n");
456                         intel_iommu_superpage = 0;
457                 } else if (!strncmp(str, "sm_on", 5)) {
458                         pr_info("Intel-IOMMU: scalable mode supported\n");
459                         intel_iommu_sm = 1;
460                 } else if (!strncmp(str, "tboot_noforce", 13)) {
461                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462                         intel_iommu_tboot_noforce = 1;
463                 } else if (!strncmp(str, "nobounce", 8)) {
464                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
465                         intel_no_bounce = 1;
466                 }
467
468                 str += strcspn(str, ",");
469                 while (*str == ',')
470                         str++;
471         }
472         return 0;
473 }
474 __setup("intel_iommu=", intel_iommu_setup);
475
476 static struct kmem_cache *iommu_domain_cache;
477 static struct kmem_cache *iommu_devinfo_cache;
478
479 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
480 {
481         struct dmar_domain **domains;
482         int idx = did >> 8;
483
484         domains = iommu->domains[idx];
485         if (!domains)
486                 return NULL;
487
488         return domains[did & 0xff];
489 }
490
491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492                              struct dmar_domain *domain)
493 {
494         struct dmar_domain **domains;
495         int idx = did >> 8;
496
497         if (!iommu->domains[idx]) {
498                 size_t size = 256 * sizeof(struct dmar_domain *);
499                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500         }
501
502         domains = iommu->domains[idx];
503         if (WARN_ON(!domains))
504                 return;
505         else
506                 domains[did & 0xff] = domain;
507 }
508
509 void *alloc_pgtable_page(int node)
510 {
511         struct page *page;
512         void *vaddr = NULL;
513
514         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515         if (page)
516                 vaddr = page_address(page);
517         return vaddr;
518 }
519
520 void free_pgtable_page(void *vaddr)
521 {
522         free_page((unsigned long)vaddr);
523 }
524
525 static inline void *alloc_domain_mem(void)
526 {
527         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528 }
529
530 static void free_domain_mem(void *vaddr)
531 {
532         kmem_cache_free(iommu_domain_cache, vaddr);
533 }
534
535 static inline void * alloc_devinfo_mem(void)
536 {
537         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538 }
539
540 static inline void free_devinfo_mem(void *vaddr)
541 {
542         kmem_cache_free(iommu_devinfo_cache, vaddr);
543 }
544
545 static inline int domain_type_is_si(struct dmar_domain *domain)
546 {
547         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
548 }
549
550 static inline bool domain_use_first_level(struct dmar_domain *domain)
551 {
552         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
553 }
554
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556                                        unsigned long pfn)
557 {
558         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559
560         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564 {
565         unsigned long sagaw;
566         int agaw = -1;
567
568         sagaw = cap_sagaw(iommu->cap);
569         for (agaw = width_to_agaw(max_gaw);
570              agaw >= 0; agaw--) {
571                 if (test_bit(agaw, &sagaw))
572                         break;
573         }
574
575         return agaw;
576 }
577
578 /*
579  * Calculate max SAGAW for each iommu.
580  */
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582 {
583         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584 }
585
586 /*
587  * calculate agaw for each iommu.
588  * "SAGAW" may be different across iommus, use a default agaw, and
589  * get a supported less agaw for iommus that don't support the default agaw.
590  */
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
592 {
593         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594 }
595
596 /* This functionin only returns single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598 {
599         int iommu_id;
600
601         /* si_domain and vm domain should not get here. */
602         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603                 return NULL;
604
605         for_each_domain_iommu(iommu_id, domain)
606                 break;
607
608         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609                 return NULL;
610
611         return g_iommus[iommu_id];
612 }
613
614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
615 {
616         return sm_supported(iommu) ?
617                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
618 }
619
620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
621 {
622         struct dmar_drhd_unit *drhd;
623         struct intel_iommu *iommu;
624         bool found = false;
625         int i;
626
627         domain->iommu_coherency = 1;
628
629         for_each_domain_iommu(i, domain) {
630                 found = true;
631                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
632                         domain->iommu_coherency = 0;
633                         break;
634                 }
635         }
636         if (found)
637                 return;
638
639         /* No hardware attached; use lowest common denominator */
640         rcu_read_lock();
641         for_each_active_iommu(iommu, drhd) {
642                 if (!iommu_paging_structure_coherency(iommu)) {
643                         domain->iommu_coherency = 0;
644                         break;
645                 }
646         }
647         rcu_read_unlock();
648 }
649
650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
651 {
652         struct dmar_drhd_unit *drhd;
653         struct intel_iommu *iommu;
654         int ret = 1;
655
656         rcu_read_lock();
657         for_each_active_iommu(iommu, drhd) {
658                 if (iommu != skip) {
659                         if (!ecap_sc_support(iommu->ecap)) {
660                                 ret = 0;
661                                 break;
662                         }
663                 }
664         }
665         rcu_read_unlock();
666
667         return ret;
668 }
669
670 static int domain_update_iommu_superpage(struct dmar_domain *domain,
671                                          struct intel_iommu *skip)
672 {
673         struct dmar_drhd_unit *drhd;
674         struct intel_iommu *iommu;
675         int mask = 0x3;
676
677         if (!intel_iommu_superpage) {
678                 return 0;
679         }
680
681         /* set iommu_superpage to the smallest common denominator */
682         rcu_read_lock();
683         for_each_active_iommu(iommu, drhd) {
684                 if (iommu != skip) {
685                         if (domain && domain_use_first_level(domain)) {
686                                 if (!cap_fl1gp_support(iommu->cap))
687                                         mask = 0x1;
688                         } else {
689                                 mask &= cap_super_page_val(iommu->cap);
690                         }
691
692                         if (!mask)
693                                 break;
694                 }
695         }
696         rcu_read_unlock();
697
698         return fls(mask);
699 }
700
701 static int domain_update_device_node(struct dmar_domain *domain)
702 {
703         struct device_domain_info *info;
704         int nid = NUMA_NO_NODE;
705
706         assert_spin_locked(&device_domain_lock);
707
708         if (list_empty(&domain->devices))
709                 return NUMA_NO_NODE;
710
711         list_for_each_entry(info, &domain->devices, link) {
712                 if (!info->dev)
713                         continue;
714
715                 /*
716                  * There could possibly be multiple device numa nodes as devices
717                  * within the same domain may sit behind different IOMMUs. There
718                  * isn't perfect answer in such situation, so we select first
719                  * come first served policy.
720                  */
721                 nid = dev_to_node(info->dev);
722                 if (nid != NUMA_NO_NODE)
723                         break;
724         }
725
726         return nid;
727 }
728
729 /* Some capabilities may be different across iommus */
730 static void domain_update_iommu_cap(struct dmar_domain *domain)
731 {
732         domain_update_iommu_coherency(domain);
733         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
734         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
735
736         /*
737          * If RHSA is missing, we should default to the device numa domain
738          * as fall back.
739          */
740         if (domain->nid == NUMA_NO_NODE)
741                 domain->nid = domain_update_device_node(domain);
742 }
743
744 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
745                                          u8 devfn, int alloc)
746 {
747         struct root_entry *root = &iommu->root_entry[bus];
748         struct context_entry *context;
749         u64 *entry;
750
751         entry = &root->lo;
752         if (sm_supported(iommu)) {
753                 if (devfn >= 0x80) {
754                         devfn -= 0x80;
755                         entry = &root->hi;
756                 }
757                 devfn *= 2;
758         }
759         if (*entry & 1)
760                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
761         else {
762                 unsigned long phy_addr;
763                 if (!alloc)
764                         return NULL;
765
766                 context = alloc_pgtable_page(iommu->node);
767                 if (!context)
768                         return NULL;
769
770                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
771                 phy_addr = virt_to_phys((void *)context);
772                 *entry = phy_addr | 1;
773                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
774         }
775         return &context[devfn];
776 }
777
778 static bool attach_deferred(struct device *dev)
779 {
780         return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
781 }
782
783 /**
784  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
785  *                               sub-hierarchy of a candidate PCI-PCI bridge
786  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
787  * @bridge: the candidate PCI-PCI bridge
788  *
789  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
790  */
791 static bool
792 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
793 {
794         struct pci_dev *pdev, *pbridge;
795
796         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
797                 return false;
798
799         pdev = to_pci_dev(dev);
800         pbridge = to_pci_dev(bridge);
801
802         if (pbridge->subordinate &&
803             pbridge->subordinate->number <= pdev->bus->number &&
804             pbridge->subordinate->busn_res.end >= pdev->bus->number)
805                 return true;
806
807         return false;
808 }
809
810 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
811 {
812         struct dmar_drhd_unit *drhd;
813         u32 vtbar;
814         int rc;
815
816         /* We know that this device on this chipset has its own IOMMU.
817          * If we find it under a different IOMMU, then the BIOS is lying
818          * to us. Hope that the IOMMU for this device is actually
819          * disabled, and it needs no translation...
820          */
821         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
822         if (rc) {
823                 /* "can't" happen */
824                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
825                 return false;
826         }
827         vtbar &= 0xffff0000;
828
829         /* we know that the this iommu should be at offset 0xa000 from vtbar */
830         drhd = dmar_find_matched_drhd_unit(pdev);
831         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
832                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
833                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
834                 return true;
835         }
836
837         return false;
838 }
839
840 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
841 {
842         if (!iommu || iommu->drhd->ignored)
843                 return true;
844
845         if (dev_is_pci(dev)) {
846                 struct pci_dev *pdev = to_pci_dev(dev);
847
848                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
849                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
850                     quirk_ioat_snb_local_iommu(pdev))
851                         return true;
852         }
853
854         return false;
855 }
856
857 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
858 {
859         struct dmar_drhd_unit *drhd = NULL;
860         struct pci_dev *pdev = NULL;
861         struct intel_iommu *iommu;
862         struct device *tmp;
863         u16 segment = 0;
864         int i;
865
866         if (!dev)
867                 return NULL;
868
869         if (dev_is_pci(dev)) {
870                 struct pci_dev *pf_pdev;
871
872                 pdev = pci_real_dma_dev(to_pci_dev(dev));
873
874                 /* VFs aren't listed in scope tables; we need to look up
875                  * the PF instead to find the IOMMU. */
876                 pf_pdev = pci_physfn(pdev);
877                 dev = &pf_pdev->dev;
878                 segment = pci_domain_nr(pdev->bus);
879         } else if (has_acpi_companion(dev))
880                 dev = &ACPI_COMPANION(dev)->dev;
881
882         rcu_read_lock();
883         for_each_iommu(iommu, drhd) {
884                 if (pdev && segment != drhd->segment)
885                         continue;
886
887                 for_each_active_dev_scope(drhd->devices,
888                                           drhd->devices_cnt, i, tmp) {
889                         if (tmp == dev) {
890                                 /* For a VF use its original BDF# not that of the PF
891                                  * which we used for the IOMMU lookup. Strictly speaking
892                                  * we could do this for all PCI devices; we only need to
893                                  * get the BDF# from the scope table for ACPI matches. */
894                                 if (pdev && pdev->is_virtfn)
895                                         goto got_pdev;
896
897                                 if (bus && devfn) {
898                                         *bus = drhd->devices[i].bus;
899                                         *devfn = drhd->devices[i].devfn;
900                                 }
901                                 goto out;
902                         }
903
904                         if (is_downstream_to_pci_bridge(dev, tmp))
905                                 goto got_pdev;
906                 }
907
908                 if (pdev && drhd->include_all) {
909                 got_pdev:
910                         if (bus && devfn) {
911                                 *bus = pdev->bus->number;
912                                 *devfn = pdev->devfn;
913                         }
914                         goto out;
915                 }
916         }
917         iommu = NULL;
918  out:
919         if (iommu_is_dummy(iommu, dev))
920                 iommu = NULL;
921
922         rcu_read_unlock();
923
924         return iommu;
925 }
926
927 static void domain_flush_cache(struct dmar_domain *domain,
928                                void *addr, int size)
929 {
930         if (!domain->iommu_coherency)
931                 clflush_cache_range(addr, size);
932 }
933
934 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
935 {
936         struct context_entry *context;
937         int ret = 0;
938         unsigned long flags;
939
940         spin_lock_irqsave(&iommu->lock, flags);
941         context = iommu_context_addr(iommu, bus, devfn, 0);
942         if (context)
943                 ret = context_present(context);
944         spin_unlock_irqrestore(&iommu->lock, flags);
945         return ret;
946 }
947
948 static void free_context_table(struct intel_iommu *iommu)
949 {
950         int i;
951         unsigned long flags;
952         struct context_entry *context;
953
954         spin_lock_irqsave(&iommu->lock, flags);
955         if (!iommu->root_entry) {
956                 goto out;
957         }
958         for (i = 0; i < ROOT_ENTRY_NR; i++) {
959                 context = iommu_context_addr(iommu, i, 0, 0);
960                 if (context)
961                         free_pgtable_page(context);
962
963                 if (!sm_supported(iommu))
964                         continue;
965
966                 context = iommu_context_addr(iommu, i, 0x80, 0);
967                 if (context)
968                         free_pgtable_page(context);
969
970         }
971         free_pgtable_page(iommu->root_entry);
972         iommu->root_entry = NULL;
973 out:
974         spin_unlock_irqrestore(&iommu->lock, flags);
975 }
976
977 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
978                                       unsigned long pfn, int *target_level)
979 {
980         struct dma_pte *parent, *pte;
981         int level = agaw_to_level(domain->agaw);
982         int offset;
983
984         BUG_ON(!domain->pgd);
985
986         if (!domain_pfn_supported(domain, pfn))
987                 /* Address beyond IOMMU's addressing capabilities. */
988                 return NULL;
989
990         parent = domain->pgd;
991
992         while (1) {
993                 void *tmp_page;
994
995                 offset = pfn_level_offset(pfn, level);
996                 pte = &parent[offset];
997                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
998                         break;
999                 if (level == *target_level)
1000                         break;
1001
1002                 if (!dma_pte_present(pte)) {
1003                         uint64_t pteval;
1004
1005                         tmp_page = alloc_pgtable_page(domain->nid);
1006
1007                         if (!tmp_page)
1008                                 return NULL;
1009
1010                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1011                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1012                         if (domain_use_first_level(domain))
1013                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1014                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1015                                 /* Someone else set it while we were thinking; use theirs. */
1016                                 free_pgtable_page(tmp_page);
1017                         else
1018                                 domain_flush_cache(domain, pte, sizeof(*pte));
1019                 }
1020                 if (level == 1)
1021                         break;
1022
1023                 parent = phys_to_virt(dma_pte_addr(pte));
1024                 level--;
1025         }
1026
1027         if (!*target_level)
1028                 *target_level = level;
1029
1030         return pte;
1031 }
1032
1033 /* return address's pte at specific level */
1034 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1035                                          unsigned long pfn,
1036                                          int level, int *large_page)
1037 {
1038         struct dma_pte *parent, *pte;
1039         int total = agaw_to_level(domain->agaw);
1040         int offset;
1041
1042         parent = domain->pgd;
1043         while (level <= total) {
1044                 offset = pfn_level_offset(pfn, total);
1045                 pte = &parent[offset];
1046                 if (level == total)
1047                         return pte;
1048
1049                 if (!dma_pte_present(pte)) {
1050                         *large_page = total;
1051                         break;
1052                 }
1053
1054                 if (dma_pte_superpage(pte)) {
1055                         *large_page = total;
1056                         return pte;
1057                 }
1058
1059                 parent = phys_to_virt(dma_pte_addr(pte));
1060                 total--;
1061         }
1062         return NULL;
1063 }
1064
1065 /* clear last level pte, a tlb flush should be followed */
1066 static void dma_pte_clear_range(struct dmar_domain *domain,
1067                                 unsigned long start_pfn,
1068                                 unsigned long last_pfn)
1069 {
1070         unsigned int large_page;
1071         struct dma_pte *first_pte, *pte;
1072
1073         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1074         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1075         BUG_ON(start_pfn > last_pfn);
1076
1077         /* we don't need lock here; nobody else touches the iova range */
1078         do {
1079                 large_page = 1;
1080                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1081                 if (!pte) {
1082                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1083                         continue;
1084                 }
1085                 do {
1086                         dma_clear_pte(pte);
1087                         start_pfn += lvl_to_nr_pages(large_page);
1088                         pte++;
1089                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1090
1091                 domain_flush_cache(domain, first_pte,
1092                                    (void *)pte - (void *)first_pte);
1093
1094         } while (start_pfn && start_pfn <= last_pfn);
1095 }
1096
1097 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1098                                int retain_level, struct dma_pte *pte,
1099                                unsigned long pfn, unsigned long start_pfn,
1100                                unsigned long last_pfn)
1101 {
1102         pfn = max(start_pfn, pfn);
1103         pte = &pte[pfn_level_offset(pfn, level)];
1104
1105         do {
1106                 unsigned long level_pfn;
1107                 struct dma_pte *level_pte;
1108
1109                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1110                         goto next;
1111
1112                 level_pfn = pfn & level_mask(level);
1113                 level_pte = phys_to_virt(dma_pte_addr(pte));
1114
1115                 if (level > 2) {
1116                         dma_pte_free_level(domain, level - 1, retain_level,
1117                                            level_pte, level_pfn, start_pfn,
1118                                            last_pfn);
1119                 }
1120
1121                 /*
1122                  * Free the page table if we're below the level we want to
1123                  * retain and the range covers the entire table.
1124                  */
1125                 if (level < retain_level && !(start_pfn > level_pfn ||
1126                       last_pfn < level_pfn + level_size(level) - 1)) {
1127                         dma_clear_pte(pte);
1128                         domain_flush_cache(domain, pte, sizeof(*pte));
1129                         free_pgtable_page(level_pte);
1130                 }
1131 next:
1132                 pfn += level_size(level);
1133         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1134 }
1135
1136 /*
1137  * clear last level (leaf) ptes and free page table pages below the
1138  * level we wish to keep intact.
1139  */
1140 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1141                                    unsigned long start_pfn,
1142                                    unsigned long last_pfn,
1143                                    int retain_level)
1144 {
1145         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1146         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1147         BUG_ON(start_pfn > last_pfn);
1148
1149         dma_pte_clear_range(domain, start_pfn, last_pfn);
1150
1151         /* We don't need lock here; nobody else touches the iova range */
1152         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1153                            domain->pgd, 0, start_pfn, last_pfn);
1154
1155         /* free pgd */
1156         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1157                 free_pgtable_page(domain->pgd);
1158                 domain->pgd = NULL;
1159         }
1160 }
1161
1162 /* When a page at a given level is being unlinked from its parent, we don't
1163    need to *modify* it at all. All we need to do is make a list of all the
1164    pages which can be freed just as soon as we've flushed the IOTLB and we
1165    know the hardware page-walk will no longer touch them.
1166    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1167    be freed. */
1168 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1169                                             int level, struct dma_pte *pte,
1170                                             struct page *freelist)
1171 {
1172         struct page *pg;
1173
1174         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1175         pg->freelist = freelist;
1176         freelist = pg;
1177
1178         if (level == 1)
1179                 return freelist;
1180
1181         pte = page_address(pg);
1182         do {
1183                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1184                         freelist = dma_pte_list_pagetables(domain, level - 1,
1185                                                            pte, freelist);
1186                 pte++;
1187         } while (!first_pte_in_page(pte));
1188
1189         return freelist;
1190 }
1191
1192 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1193                                         struct dma_pte *pte, unsigned long pfn,
1194                                         unsigned long start_pfn,
1195                                         unsigned long last_pfn,
1196                                         struct page *freelist)
1197 {
1198         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1199
1200         pfn = max(start_pfn, pfn);
1201         pte = &pte[pfn_level_offset(pfn, level)];
1202
1203         do {
1204                 unsigned long level_pfn;
1205
1206                 if (!dma_pte_present(pte))
1207                         goto next;
1208
1209                 level_pfn = pfn & level_mask(level);
1210
1211                 /* If range covers entire pagetable, free it */
1212                 if (start_pfn <= level_pfn &&
1213                     last_pfn >= level_pfn + level_size(level) - 1) {
1214                         /* These suborbinate page tables are going away entirely. Don't
1215                            bother to clear them; we're just going to *free* them. */
1216                         if (level > 1 && !dma_pte_superpage(pte))
1217                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1218
1219                         dma_clear_pte(pte);
1220                         if (!first_pte)
1221                                 first_pte = pte;
1222                         last_pte = pte;
1223                 } else if (level > 1) {
1224                         /* Recurse down into a level that isn't *entirely* obsolete */
1225                         freelist = dma_pte_clear_level(domain, level - 1,
1226                                                        phys_to_virt(dma_pte_addr(pte)),
1227                                                        level_pfn, start_pfn, last_pfn,
1228                                                        freelist);
1229                 }
1230 next:
1231                 pfn += level_size(level);
1232         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1233
1234         if (first_pte)
1235                 domain_flush_cache(domain, first_pte,
1236                                    (void *)++last_pte - (void *)first_pte);
1237
1238         return freelist;
1239 }
1240
1241 /* We can't just free the pages because the IOMMU may still be walking
1242    the page tables, and may have cached the intermediate levels. The
1243    pages can only be freed after the IOTLB flush has been done. */
1244 static struct page *domain_unmap(struct dmar_domain *domain,
1245                                  unsigned long start_pfn,
1246                                  unsigned long last_pfn)
1247 {
1248         struct page *freelist;
1249
1250         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1251         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1252         BUG_ON(start_pfn > last_pfn);
1253
1254         /* we don't need lock here; nobody else touches the iova range */
1255         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1256                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1257
1258         /* free pgd */
1259         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1260                 struct page *pgd_page = virt_to_page(domain->pgd);
1261                 pgd_page->freelist = freelist;
1262                 freelist = pgd_page;
1263
1264                 domain->pgd = NULL;
1265         }
1266
1267         return freelist;
1268 }
1269
1270 static void dma_free_pagelist(struct page *freelist)
1271 {
1272         struct page *pg;
1273
1274         while ((pg = freelist)) {
1275                 freelist = pg->freelist;
1276                 free_pgtable_page(page_address(pg));
1277         }
1278 }
1279
1280 static void iova_entry_free(unsigned long data)
1281 {
1282         struct page *freelist = (struct page *)data;
1283
1284         dma_free_pagelist(freelist);
1285 }
1286
1287 /* iommu handling */
1288 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1289 {
1290         struct root_entry *root;
1291         unsigned long flags;
1292
1293         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1294         if (!root) {
1295                 pr_err("Allocating root entry for %s failed\n",
1296                         iommu->name);
1297                 return -ENOMEM;
1298         }
1299
1300         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1301
1302         spin_lock_irqsave(&iommu->lock, flags);
1303         iommu->root_entry = root;
1304         spin_unlock_irqrestore(&iommu->lock, flags);
1305
1306         return 0;
1307 }
1308
1309 static void iommu_set_root_entry(struct intel_iommu *iommu)
1310 {
1311         u64 addr;
1312         u32 sts;
1313         unsigned long flag;
1314
1315         addr = virt_to_phys(iommu->root_entry);
1316         if (sm_supported(iommu))
1317                 addr |= DMA_RTADDR_SMT;
1318
1319         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1320         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1321
1322         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1323
1324         /* Make sure hardware complete it */
1325         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1326                       readl, (sts & DMA_GSTS_RTPS), sts);
1327
1328         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1329 }
1330
1331 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1332 {
1333         u32 val;
1334         unsigned long flag;
1335
1336         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1337                 return;
1338
1339         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1341
1342         /* Make sure hardware complete it */
1343         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1344                       readl, (!(val & DMA_GSTS_WBFS)), val);
1345
1346         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1347 }
1348
1349 /* return value determine if we need a write buffer flush */
1350 static void __iommu_flush_context(struct intel_iommu *iommu,
1351                                   u16 did, u16 source_id, u8 function_mask,
1352                                   u64 type)
1353 {
1354         u64 val = 0;
1355         unsigned long flag;
1356
1357         switch (type) {
1358         case DMA_CCMD_GLOBAL_INVL:
1359                 val = DMA_CCMD_GLOBAL_INVL;
1360                 break;
1361         case DMA_CCMD_DOMAIN_INVL:
1362                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1363                 break;
1364         case DMA_CCMD_DEVICE_INVL:
1365                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1366                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1367                 break;
1368         default:
1369                 BUG();
1370         }
1371         val |= DMA_CCMD_ICC;
1372
1373         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1374         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1375
1376         /* Make sure hardware complete it */
1377         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1378                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1379
1380         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1381 }
1382
1383 /* return value determine if we need a write buffer flush */
1384 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1385                                 u64 addr, unsigned int size_order, u64 type)
1386 {
1387         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1388         u64 val = 0, val_iva = 0;
1389         unsigned long flag;
1390
1391         switch (type) {
1392         case DMA_TLB_GLOBAL_FLUSH:
1393                 /* global flush doesn't need set IVA_REG */
1394                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1395                 break;
1396         case DMA_TLB_DSI_FLUSH:
1397                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1398                 break;
1399         case DMA_TLB_PSI_FLUSH:
1400                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1401                 /* IH bit is passed in as part of address */
1402                 val_iva = size_order | addr;
1403                 break;
1404         default:
1405                 BUG();
1406         }
1407         /* Note: set drain read/write */
1408 #if 0
1409         /*
1410          * This is probably to be super secure.. Looks like we can
1411          * ignore it without any impact.
1412          */
1413         if (cap_read_drain(iommu->cap))
1414                 val |= DMA_TLB_READ_DRAIN;
1415 #endif
1416         if (cap_write_drain(iommu->cap))
1417                 val |= DMA_TLB_WRITE_DRAIN;
1418
1419         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1420         /* Note: Only uses first TLB reg currently */
1421         if (val_iva)
1422                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1423         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1424
1425         /* Make sure hardware complete it */
1426         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1427                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1428
1429         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1430
1431         /* check IOTLB invalidation granularity */
1432         if (DMA_TLB_IAIG(val) == 0)
1433                 pr_err("Flush IOTLB failed\n");
1434         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1435                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1436                         (unsigned long long)DMA_TLB_IIRG(type),
1437                         (unsigned long long)DMA_TLB_IAIG(val));
1438 }
1439
1440 static struct device_domain_info *
1441 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1442                          u8 bus, u8 devfn)
1443 {
1444         struct device_domain_info *info;
1445
1446         assert_spin_locked(&device_domain_lock);
1447
1448         if (!iommu->qi)
1449                 return NULL;
1450
1451         list_for_each_entry(info, &domain->devices, link)
1452                 if (info->iommu == iommu && info->bus == bus &&
1453                     info->devfn == devfn) {
1454                         if (info->ats_supported && info->dev)
1455                                 return info;
1456                         break;
1457                 }
1458
1459         return NULL;
1460 }
1461
1462 static void domain_update_iotlb(struct dmar_domain *domain)
1463 {
1464         struct device_domain_info *info;
1465         bool has_iotlb_device = false;
1466
1467         assert_spin_locked(&device_domain_lock);
1468
1469         list_for_each_entry(info, &domain->devices, link) {
1470                 struct pci_dev *pdev;
1471
1472                 if (!info->dev || !dev_is_pci(info->dev))
1473                         continue;
1474
1475                 pdev = to_pci_dev(info->dev);
1476                 if (pdev->ats_enabled) {
1477                         has_iotlb_device = true;
1478                         break;
1479                 }
1480         }
1481
1482         domain->has_iotlb_device = has_iotlb_device;
1483 }
1484
1485 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1486 {
1487         struct pci_dev *pdev;
1488
1489         assert_spin_locked(&device_domain_lock);
1490
1491         if (!info || !dev_is_pci(info->dev))
1492                 return;
1493
1494         pdev = to_pci_dev(info->dev);
1495         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1496          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1497          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1498          * reserved, which should be set to 0.
1499          */
1500         if (!ecap_dit(info->iommu->ecap))
1501                 info->pfsid = 0;
1502         else {
1503                 struct pci_dev *pf_pdev;
1504
1505                 /* pdev will be returned if device is not a vf */
1506                 pf_pdev = pci_physfn(pdev);
1507                 info->pfsid = pci_dev_id(pf_pdev);
1508         }
1509
1510 #ifdef CONFIG_INTEL_IOMMU_SVM
1511         /* The PCIe spec, in its wisdom, declares that the behaviour of
1512            the device if you enable PASID support after ATS support is
1513            undefined. So always enable PASID support on devices which
1514            have it, even if we can't yet know if we're ever going to
1515            use it. */
1516         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1517                 info->pasid_enabled = 1;
1518
1519         if (info->pri_supported &&
1520             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1521             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1522                 info->pri_enabled = 1;
1523 #endif
1524         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1525             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1526                 info->ats_enabled = 1;
1527                 domain_update_iotlb(info->domain);
1528                 info->ats_qdep = pci_ats_queue_depth(pdev);
1529         }
1530 }
1531
1532 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1533 {
1534         struct pci_dev *pdev;
1535
1536         assert_spin_locked(&device_domain_lock);
1537
1538         if (!dev_is_pci(info->dev))
1539                 return;
1540
1541         pdev = to_pci_dev(info->dev);
1542
1543         if (info->ats_enabled) {
1544                 pci_disable_ats(pdev);
1545                 info->ats_enabled = 0;
1546                 domain_update_iotlb(info->domain);
1547         }
1548 #ifdef CONFIG_INTEL_IOMMU_SVM
1549         if (info->pri_enabled) {
1550                 pci_disable_pri(pdev);
1551                 info->pri_enabled = 0;
1552         }
1553         if (info->pasid_enabled) {
1554                 pci_disable_pasid(pdev);
1555                 info->pasid_enabled = 0;
1556         }
1557 #endif
1558 }
1559
1560 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1561                                   u64 addr, unsigned mask)
1562 {
1563         u16 sid, qdep;
1564         unsigned long flags;
1565         struct device_domain_info *info;
1566
1567         if (!domain->has_iotlb_device)
1568                 return;
1569
1570         spin_lock_irqsave(&device_domain_lock, flags);
1571         list_for_each_entry(info, &domain->devices, link) {
1572                 if (!info->ats_enabled)
1573                         continue;
1574
1575                 sid = info->bus << 8 | info->devfn;
1576                 qdep = info->ats_qdep;
1577                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1578                                 qdep, addr, mask);
1579         }
1580         spin_unlock_irqrestore(&device_domain_lock, flags);
1581 }
1582
1583 static void domain_flush_piotlb(struct intel_iommu *iommu,
1584                                 struct dmar_domain *domain,
1585                                 u64 addr, unsigned long npages, bool ih)
1586 {
1587         u16 did = domain->iommu_did[iommu->seq_id];
1588
1589         if (domain->default_pasid)
1590                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1591                                 addr, npages, ih);
1592
1593         if (!list_empty(&domain->devices))
1594                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1595 }
1596
1597 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1598                                   struct dmar_domain *domain,
1599                                   unsigned long pfn, unsigned int pages,
1600                                   int ih, int map)
1601 {
1602         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1603         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1604         u16 did = domain->iommu_did[iommu->seq_id];
1605
1606         BUG_ON(pages == 0);
1607
1608         if (ih)
1609                 ih = 1 << 6;
1610
1611         if (domain_use_first_level(domain)) {
1612                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1613         } else {
1614                 /*
1615                  * Fallback to domain selective flush if no PSI support or
1616                  * the size is too big. PSI requires page size to be 2 ^ x,
1617                  * and the base address is naturally aligned to the size.
1618                  */
1619                 if (!cap_pgsel_inv(iommu->cap) ||
1620                     mask > cap_max_amask_val(iommu->cap))
1621                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1622                                                         DMA_TLB_DSI_FLUSH);
1623                 else
1624                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1625                                                         DMA_TLB_PSI_FLUSH);
1626         }
1627
1628         /*
1629          * In caching mode, changes of pages from non-present to present require
1630          * flush. However, device IOTLB doesn't need to be flushed in this case.
1631          */
1632         if (!cap_caching_mode(iommu->cap) || !map)
1633                 iommu_flush_dev_iotlb(domain, addr, mask);
1634 }
1635
1636 /* Notification for newly created mappings */
1637 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1638                                         struct dmar_domain *domain,
1639                                         unsigned long pfn, unsigned int pages)
1640 {
1641         /*
1642          * It's a non-present to present mapping. Only flush if caching mode
1643          * and second level.
1644          */
1645         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1646                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1647         else
1648                 iommu_flush_write_buffer(iommu);
1649 }
1650
1651 static void iommu_flush_iova(struct iova_domain *iovad)
1652 {
1653         struct dmar_domain *domain;
1654         int idx;
1655
1656         domain = container_of(iovad, struct dmar_domain, iovad);
1657
1658         for_each_domain_iommu(idx, domain) {
1659                 struct intel_iommu *iommu = g_iommus[idx];
1660                 u16 did = domain->iommu_did[iommu->seq_id];
1661
1662                 if (domain_use_first_level(domain))
1663                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1664                 else
1665                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1666                                                  DMA_TLB_DSI_FLUSH);
1667
1668                 if (!cap_caching_mode(iommu->cap))
1669                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1670                                               0, MAX_AGAW_PFN_WIDTH);
1671         }
1672 }
1673
1674 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1675 {
1676         u32 pmen;
1677         unsigned long flags;
1678
1679         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1680                 return;
1681
1682         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1683         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1684         pmen &= ~DMA_PMEN_EPM;
1685         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1686
1687         /* wait for the protected region status bit to clear */
1688         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1689                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1690
1691         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1692 }
1693
1694 static void iommu_enable_translation(struct intel_iommu *iommu)
1695 {
1696         u32 sts;
1697         unsigned long flags;
1698
1699         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1700         iommu->gcmd |= DMA_GCMD_TE;
1701         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1702
1703         /* Make sure hardware complete it */
1704         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1705                       readl, (sts & DMA_GSTS_TES), sts);
1706
1707         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1708 }
1709
1710 static void iommu_disable_translation(struct intel_iommu *iommu)
1711 {
1712         u32 sts;
1713         unsigned long flag;
1714
1715         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1716             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1717                 return;
1718
1719         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1720         iommu->gcmd &= ~DMA_GCMD_TE;
1721         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1722
1723         /* Make sure hardware complete it */
1724         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1725                       readl, (!(sts & DMA_GSTS_TES)), sts);
1726
1727         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1728 }
1729
1730 static int iommu_init_domains(struct intel_iommu *iommu)
1731 {
1732         u32 ndomains, nlongs;
1733         size_t size;
1734
1735         ndomains = cap_ndoms(iommu->cap);
1736         pr_debug("%s: Number of Domains supported <%d>\n",
1737                  iommu->name, ndomains);
1738         nlongs = BITS_TO_LONGS(ndomains);
1739
1740         spin_lock_init(&iommu->lock);
1741
1742         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1743         if (!iommu->domain_ids) {
1744                 pr_err("%s: Allocating domain id array failed\n",
1745                        iommu->name);
1746                 return -ENOMEM;
1747         }
1748
1749         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1750         iommu->domains = kzalloc(size, GFP_KERNEL);
1751
1752         if (iommu->domains) {
1753                 size = 256 * sizeof(struct dmar_domain *);
1754                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1755         }
1756
1757         if (!iommu->domains || !iommu->domains[0]) {
1758                 pr_err("%s: Allocating domain array failed\n",
1759                        iommu->name);
1760                 kfree(iommu->domain_ids);
1761                 kfree(iommu->domains);
1762                 iommu->domain_ids = NULL;
1763                 iommu->domains    = NULL;
1764                 return -ENOMEM;
1765         }
1766
1767         /*
1768          * If Caching mode is set, then invalid translations are tagged
1769          * with domain-id 0, hence we need to pre-allocate it. We also
1770          * use domain-id 0 as a marker for non-allocated domain-id, so
1771          * make sure it is not used for a real domain.
1772          */
1773         set_bit(0, iommu->domain_ids);
1774
1775         /*
1776          * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1777          * entry for first-level or pass-through translation modes should
1778          * be programmed with a domain id different from those used for
1779          * second-level or nested translation. We reserve a domain id for
1780          * this purpose.
1781          */
1782         if (sm_supported(iommu))
1783                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1784
1785         return 0;
1786 }
1787
1788 static void disable_dmar_iommu(struct intel_iommu *iommu)
1789 {
1790         struct device_domain_info *info, *tmp;
1791         unsigned long flags;
1792
1793         if (!iommu->domains || !iommu->domain_ids)
1794                 return;
1795
1796         spin_lock_irqsave(&device_domain_lock, flags);
1797         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1798                 if (info->iommu != iommu)
1799                         continue;
1800
1801                 if (!info->dev || !info->domain)
1802                         continue;
1803
1804                 __dmar_remove_one_dev_info(info);
1805         }
1806         spin_unlock_irqrestore(&device_domain_lock, flags);
1807
1808         if (iommu->gcmd & DMA_GCMD_TE)
1809                 iommu_disable_translation(iommu);
1810 }
1811
1812 static void free_dmar_iommu(struct intel_iommu *iommu)
1813 {
1814         if ((iommu->domains) && (iommu->domain_ids)) {
1815                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1816                 int i;
1817
1818                 for (i = 0; i < elems; i++)
1819                         kfree(iommu->domains[i]);
1820                 kfree(iommu->domains);
1821                 kfree(iommu->domain_ids);
1822                 iommu->domains = NULL;
1823                 iommu->domain_ids = NULL;
1824         }
1825
1826         g_iommus[iommu->seq_id] = NULL;
1827
1828         /* free context mapping */
1829         free_context_table(iommu);
1830
1831 #ifdef CONFIG_INTEL_IOMMU_SVM
1832         if (pasid_supported(iommu)) {
1833                 if (ecap_prs(iommu->ecap))
1834                         intel_svm_finish_prq(iommu);
1835         }
1836         if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1837                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1838
1839 #endif
1840 }
1841
1842 /*
1843  * Check and return whether first level is used by default for
1844  * DMA translation.
1845  */
1846 static bool first_level_by_default(void)
1847 {
1848         struct dmar_drhd_unit *drhd;
1849         struct intel_iommu *iommu;
1850         static int first_level_support = -1;
1851
1852         if (likely(first_level_support != -1))
1853                 return first_level_support;
1854
1855         first_level_support = 1;
1856
1857         rcu_read_lock();
1858         for_each_active_iommu(iommu, drhd) {
1859                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1860                         first_level_support = 0;
1861                         break;
1862                 }
1863         }
1864         rcu_read_unlock();
1865
1866         return first_level_support;
1867 }
1868
1869 static struct dmar_domain *alloc_domain(int flags)
1870 {
1871         struct dmar_domain *domain;
1872
1873         domain = alloc_domain_mem();
1874         if (!domain)
1875                 return NULL;
1876
1877         memset(domain, 0, sizeof(*domain));
1878         domain->nid = NUMA_NO_NODE;
1879         domain->flags = flags;
1880         if (first_level_by_default())
1881                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1882         domain->has_iotlb_device = false;
1883         INIT_LIST_HEAD(&domain->devices);
1884
1885         return domain;
1886 }
1887
1888 /* Must be called with iommu->lock */
1889 static int domain_attach_iommu(struct dmar_domain *domain,
1890                                struct intel_iommu *iommu)
1891 {
1892         unsigned long ndomains;
1893         int num;
1894
1895         assert_spin_locked(&device_domain_lock);
1896         assert_spin_locked(&iommu->lock);
1897
1898         domain->iommu_refcnt[iommu->seq_id] += 1;
1899         domain->iommu_count += 1;
1900         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1901                 ndomains = cap_ndoms(iommu->cap);
1902                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1903
1904                 if (num >= ndomains) {
1905                         pr_err("%s: No free domain ids\n", iommu->name);
1906                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1907                         domain->iommu_count -= 1;
1908                         return -ENOSPC;
1909                 }
1910
1911                 set_bit(num, iommu->domain_ids);
1912                 set_iommu_domain(iommu, num, domain);
1913
1914                 domain->iommu_did[iommu->seq_id] = num;
1915                 domain->nid                      = iommu->node;
1916
1917                 domain_update_iommu_cap(domain);
1918         }
1919
1920         return 0;
1921 }
1922
1923 static int domain_detach_iommu(struct dmar_domain *domain,
1924                                struct intel_iommu *iommu)
1925 {
1926         int num, count;
1927
1928         assert_spin_locked(&device_domain_lock);
1929         assert_spin_locked(&iommu->lock);
1930
1931         domain->iommu_refcnt[iommu->seq_id] -= 1;
1932         count = --domain->iommu_count;
1933         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1934                 num = domain->iommu_did[iommu->seq_id];
1935                 clear_bit(num, iommu->domain_ids);
1936                 set_iommu_domain(iommu, num, NULL);
1937
1938                 domain_update_iommu_cap(domain);
1939                 domain->iommu_did[iommu->seq_id] = 0;
1940         }
1941
1942         return count;
1943 }
1944
1945 static struct iova_domain reserved_iova_list;
1946 static struct lock_class_key reserved_rbtree_key;
1947
1948 static int dmar_init_reserved_ranges(void)
1949 {
1950         struct pci_dev *pdev = NULL;
1951         struct iova *iova;
1952         int i;
1953
1954         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1955
1956         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1957                 &reserved_rbtree_key);
1958
1959         /* IOAPIC ranges shouldn't be accessed by DMA */
1960         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1961                 IOVA_PFN(IOAPIC_RANGE_END));
1962         if (!iova) {
1963                 pr_err("Reserve IOAPIC range failed\n");
1964                 return -ENODEV;
1965         }
1966
1967         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1968         for_each_pci_dev(pdev) {
1969                 struct resource *r;
1970
1971                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1972                         r = &pdev->resource[i];
1973                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1974                                 continue;
1975                         iova = reserve_iova(&reserved_iova_list,
1976                                             IOVA_PFN(r->start),
1977                                             IOVA_PFN(r->end));
1978                         if (!iova) {
1979                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1980                                 return -ENODEV;
1981                         }
1982                 }
1983         }
1984         return 0;
1985 }
1986
1987 static inline int guestwidth_to_adjustwidth(int gaw)
1988 {
1989         int agaw;
1990         int r = (gaw - 12) % 9;
1991
1992         if (r == 0)
1993                 agaw = gaw;
1994         else
1995                 agaw = gaw + 9 - r;
1996         if (agaw > 64)
1997                 agaw = 64;
1998         return agaw;
1999 }
2000
2001 static void domain_exit(struct dmar_domain *domain)
2002 {
2003
2004         /* Remove associated devices and clear attached or cached domains */
2005         domain_remove_dev_info(domain);
2006
2007         /* destroy iovas */
2008         if (domain->domain.type == IOMMU_DOMAIN_DMA)
2009                 put_iova_domain(&domain->iovad);
2010
2011         if (domain->pgd) {
2012                 struct page *freelist;
2013
2014                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2015                 dma_free_pagelist(freelist);
2016         }
2017
2018         free_domain_mem(domain);
2019 }
2020
2021 /*
2022  * Get the PASID directory size for scalable mode context entry.
2023  * Value of X in the PDTS field of a scalable mode context entry
2024  * indicates PASID directory with 2^(X + 7) entries.
2025  */
2026 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2027 {
2028         int pds, max_pde;
2029
2030         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2031         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2032         if (pds < 7)
2033                 return 0;
2034
2035         return pds - 7;
2036 }
2037
2038 /*
2039  * Set the RID_PASID field of a scalable mode context entry. The
2040  * IOMMU hardware will use the PASID value set in this field for
2041  * DMA translations of DMA requests without PASID.
2042  */
2043 static inline void
2044 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2045 {
2046         context->hi |= pasid & ((1 << 20) - 1);
2047 }
2048
2049 /*
2050  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2051  * entry.
2052  */
2053 static inline void context_set_sm_dte(struct context_entry *context)
2054 {
2055         context->lo |= (1 << 2);
2056 }
2057
2058 /*
2059  * Set the PRE(Page Request Enable) field of a scalable mode context
2060  * entry.
2061  */
2062 static inline void context_set_sm_pre(struct context_entry *context)
2063 {
2064         context->lo |= (1 << 4);
2065 }
2066
2067 /* Convert value to context PASID directory size field coding. */
2068 #define context_pdts(pds)       (((pds) & 0x7) << 9)
2069
2070 static int domain_context_mapping_one(struct dmar_domain *domain,
2071                                       struct intel_iommu *iommu,
2072                                       struct pasid_table *table,
2073                                       u8 bus, u8 devfn)
2074 {
2075         u16 did = domain->iommu_did[iommu->seq_id];
2076         int translation = CONTEXT_TT_MULTI_LEVEL;
2077         struct device_domain_info *info = NULL;
2078         struct context_entry *context;
2079         unsigned long flags;
2080         int ret;
2081
2082         WARN_ON(did == 0);
2083
2084         if (hw_pass_through && domain_type_is_si(domain))
2085                 translation = CONTEXT_TT_PASS_THROUGH;
2086
2087         pr_debug("Set context mapping for %02x:%02x.%d\n",
2088                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2089
2090         BUG_ON(!domain->pgd);
2091
2092         spin_lock_irqsave(&device_domain_lock, flags);
2093         spin_lock(&iommu->lock);
2094
2095         ret = -ENOMEM;
2096         context = iommu_context_addr(iommu, bus, devfn, 1);
2097         if (!context)
2098                 goto out_unlock;
2099
2100         ret = 0;
2101         if (context_present(context))
2102                 goto out_unlock;
2103
2104         /*
2105          * For kdump cases, old valid entries may be cached due to the
2106          * in-flight DMA and copied pgtable, but there is no unmapping
2107          * behaviour for them, thus we need an explicit cache flush for
2108          * the newly-mapped device. For kdump, at this point, the device
2109          * is supposed to finish reset at its driver probe stage, so no
2110          * in-flight DMA will exist, and we don't need to worry anymore
2111          * hereafter.
2112          */
2113         if (context_copied(context)) {
2114                 u16 did_old = context_domain_id(context);
2115
2116                 if (did_old < cap_ndoms(iommu->cap)) {
2117                         iommu->flush.flush_context(iommu, did_old,
2118                                                    (((u16)bus) << 8) | devfn,
2119                                                    DMA_CCMD_MASK_NOBIT,
2120                                                    DMA_CCMD_DEVICE_INVL);
2121                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2122                                                  DMA_TLB_DSI_FLUSH);
2123                 }
2124         }
2125
2126         context_clear_entry(context);
2127
2128         if (sm_supported(iommu)) {
2129                 unsigned long pds;
2130
2131                 WARN_ON(!table);
2132
2133                 /* Setup the PASID DIR pointer: */
2134                 pds = context_get_sm_pds(table);
2135                 context->lo = (u64)virt_to_phys(table->table) |
2136                                 context_pdts(pds);
2137
2138                 /* Setup the RID_PASID field: */
2139                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2140
2141                 /*
2142                  * Setup the Device-TLB enable bit and Page request
2143                  * Enable bit:
2144                  */
2145                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2146                 if (info && info->ats_supported)
2147                         context_set_sm_dte(context);
2148                 if (info && info->pri_supported)
2149                         context_set_sm_pre(context);
2150         } else {
2151                 struct dma_pte *pgd = domain->pgd;
2152                 int agaw;
2153
2154                 context_set_domain_id(context, did);
2155
2156                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2157                         /*
2158                          * Skip top levels of page tables for iommu which has
2159                          * less agaw than default. Unnecessary for PT mode.
2160                          */
2161                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2162                                 ret = -ENOMEM;
2163                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2164                                 if (!dma_pte_present(pgd))
2165                                         goto out_unlock;
2166                         }
2167
2168                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2169                         if (info && info->ats_supported)
2170                                 translation = CONTEXT_TT_DEV_IOTLB;
2171                         else
2172                                 translation = CONTEXT_TT_MULTI_LEVEL;
2173
2174                         context_set_address_root(context, virt_to_phys(pgd));
2175                         context_set_address_width(context, agaw);
2176                 } else {
2177                         /*
2178                          * In pass through mode, AW must be programmed to
2179                          * indicate the largest AGAW value supported by
2180                          * hardware. And ASR is ignored by hardware.
2181                          */
2182                         context_set_address_width(context, iommu->msagaw);
2183                 }
2184
2185                 context_set_translation_type(context, translation);
2186         }
2187
2188         context_set_fault_enable(context);
2189         context_set_present(context);
2190         if (!ecap_coherent(iommu->ecap))
2191                 clflush_cache_range(context, sizeof(*context));
2192
2193         /*
2194          * It's a non-present to present mapping. If hardware doesn't cache
2195          * non-present entry we only need to flush the write-buffer. If the
2196          * _does_ cache non-present entries, then it does so in the special
2197          * domain #0, which we have to flush:
2198          */
2199         if (cap_caching_mode(iommu->cap)) {
2200                 iommu->flush.flush_context(iommu, 0,
2201                                            (((u16)bus) << 8) | devfn,
2202                                            DMA_CCMD_MASK_NOBIT,
2203                                            DMA_CCMD_DEVICE_INVL);
2204                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2205         } else {
2206                 iommu_flush_write_buffer(iommu);
2207         }
2208         iommu_enable_dev_iotlb(info);
2209
2210         ret = 0;
2211
2212 out_unlock:
2213         spin_unlock(&iommu->lock);
2214         spin_unlock_irqrestore(&device_domain_lock, flags);
2215
2216         return ret;
2217 }
2218
2219 struct domain_context_mapping_data {
2220         struct dmar_domain *domain;
2221         struct intel_iommu *iommu;
2222         struct pasid_table *table;
2223 };
2224
2225 static int domain_context_mapping_cb(struct pci_dev *pdev,
2226                                      u16 alias, void *opaque)
2227 {
2228         struct domain_context_mapping_data *data = opaque;
2229
2230         return domain_context_mapping_one(data->domain, data->iommu,
2231                                           data->table, PCI_BUS_NUM(alias),
2232                                           alias & 0xff);
2233 }
2234
2235 static int
2236 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2237 {
2238         struct domain_context_mapping_data data;
2239         struct pasid_table *table;
2240         struct intel_iommu *iommu;
2241         u8 bus, devfn;
2242
2243         iommu = device_to_iommu(dev, &bus, &devfn);
2244         if (!iommu)
2245                 return -ENODEV;
2246
2247         table = intel_pasid_get_table(dev);
2248
2249         if (!dev_is_pci(dev))
2250                 return domain_context_mapping_one(domain, iommu, table,
2251                                                   bus, devfn);
2252
2253         data.domain = domain;
2254         data.iommu = iommu;
2255         data.table = table;
2256
2257         return pci_for_each_dma_alias(to_pci_dev(dev),
2258                                       &domain_context_mapping_cb, &data);
2259 }
2260
2261 static int domain_context_mapped_cb(struct pci_dev *pdev,
2262                                     u16 alias, void *opaque)
2263 {
2264         struct intel_iommu *iommu = opaque;
2265
2266         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2267 }
2268
2269 static int domain_context_mapped(struct device *dev)
2270 {
2271         struct intel_iommu *iommu;
2272         u8 bus, devfn;
2273
2274         iommu = device_to_iommu(dev, &bus, &devfn);
2275         if (!iommu)
2276                 return -ENODEV;
2277
2278         if (!dev_is_pci(dev))
2279                 return device_context_mapped(iommu, bus, devfn);
2280
2281         return !pci_for_each_dma_alias(to_pci_dev(dev),
2282                                        domain_context_mapped_cb, iommu);
2283 }
2284
2285 /* Returns a number of VTD pages, but aligned to MM page size */
2286 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2287                                             size_t size)
2288 {
2289         host_addr &= ~PAGE_MASK;
2290         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2291 }
2292
2293 /* Return largest possible superpage level for a given mapping */
2294 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2295                                           unsigned long iov_pfn,
2296                                           unsigned long phy_pfn,
2297                                           unsigned long pages)
2298 {
2299         int support, level = 1;
2300         unsigned long pfnmerge;
2301
2302         support = domain->iommu_superpage;
2303
2304         /* To use a large page, the virtual *and* physical addresses
2305            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2306            of them will mean we have to use smaller pages. So just
2307            merge them and check both at once. */
2308         pfnmerge = iov_pfn | phy_pfn;
2309
2310         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2311                 pages >>= VTD_STRIDE_SHIFT;
2312                 if (!pages)
2313                         break;
2314                 pfnmerge >>= VTD_STRIDE_SHIFT;
2315                 level++;
2316                 support--;
2317         }
2318         return level;
2319 }
2320
2321 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2322                             struct scatterlist *sg, unsigned long phys_pfn,
2323                             unsigned long nr_pages, int prot)
2324 {
2325         struct dma_pte *first_pte = NULL, *pte = NULL;
2326         phys_addr_t pteval;
2327         unsigned long sg_res = 0;
2328         unsigned int largepage_lvl = 0;
2329         unsigned long lvl_pages = 0;
2330         u64 attr;
2331
2332         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2333
2334         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2335                 return -EINVAL;
2336
2337         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2338         if (domain_use_first_level(domain))
2339                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2340
2341         if (!sg) {
2342                 sg_res = nr_pages;
2343                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2344         }
2345
2346         while (nr_pages > 0) {
2347                 uint64_t tmp;
2348
2349                 if (!sg_res) {
2350                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2351
2352                         sg_res = aligned_nrpages(sg->offset, sg->length);
2353                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2354                         sg->dma_length = sg->length;
2355                         pteval = (sg_phys(sg) - pgoff) | attr;
2356                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2357                 }
2358
2359                 if (!pte) {
2360                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2361
2362                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2363                         if (!pte)
2364                                 return -ENOMEM;
2365                         /* It is large page*/
2366                         if (largepage_lvl > 1) {
2367                                 unsigned long nr_superpages, end_pfn;
2368
2369                                 pteval |= DMA_PTE_LARGE_PAGE;
2370                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2371
2372                                 nr_superpages = sg_res / lvl_pages;
2373                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2374
2375                                 /*
2376                                  * Ensure that old small page tables are
2377                                  * removed to make room for superpage(s).
2378                                  * We're adding new large pages, so make sure
2379                                  * we don't remove their parent tables.
2380                                  */
2381                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2382                                                        largepage_lvl + 1);
2383                         } else {
2384                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2385                         }
2386
2387                 }
2388                 /* We don't need lock here, nobody else
2389                  * touches the iova range
2390                  */
2391                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2392                 if (tmp) {
2393                         static int dumps = 5;
2394                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2395                                 iov_pfn, tmp, (unsigned long long)pteval);
2396                         if (dumps) {
2397                                 dumps--;
2398                                 debug_dma_dump_mappings(NULL);
2399                         }
2400                         WARN_ON(1);
2401                 }
2402
2403                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2404
2405                 BUG_ON(nr_pages < lvl_pages);
2406                 BUG_ON(sg_res < lvl_pages);
2407
2408                 nr_pages -= lvl_pages;
2409                 iov_pfn += lvl_pages;
2410                 phys_pfn += lvl_pages;
2411                 pteval += lvl_pages * VTD_PAGE_SIZE;
2412                 sg_res -= lvl_pages;
2413
2414                 /* If the next PTE would be the first in a new page, then we
2415                    need to flush the cache on the entries we've just written.
2416                    And then we'll need to recalculate 'pte', so clear it and
2417                    let it get set again in the if (!pte) block above.
2418
2419                    If we're done (!nr_pages) we need to flush the cache too.
2420
2421                    Also if we've been setting superpages, we may need to
2422                    recalculate 'pte' and switch back to smaller pages for the
2423                    end of the mapping, if the trailing size is not enough to
2424                    use another superpage (i.e. sg_res < lvl_pages). */
2425                 pte++;
2426                 if (!nr_pages || first_pte_in_page(pte) ||
2427                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2428                         domain_flush_cache(domain, first_pte,
2429                                            (void *)pte - (void *)first_pte);
2430                         pte = NULL;
2431                 }
2432
2433                 if (!sg_res && nr_pages)
2434                         sg = sg_next(sg);
2435         }
2436         return 0;
2437 }
2438
2439 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2440                           struct scatterlist *sg, unsigned long phys_pfn,
2441                           unsigned long nr_pages, int prot)
2442 {
2443         int iommu_id, ret;
2444         struct intel_iommu *iommu;
2445
2446         /* Do the real mapping first */
2447         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2448         if (ret)
2449                 return ret;
2450
2451         for_each_domain_iommu(iommu_id, domain) {
2452                 iommu = g_iommus[iommu_id];
2453                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2454         }
2455
2456         return 0;
2457 }
2458
2459 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2460                                     struct scatterlist *sg, unsigned long nr_pages,
2461                                     int prot)
2462 {
2463         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2464 }
2465
2466 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2467                                      unsigned long phys_pfn, unsigned long nr_pages,
2468                                      int prot)
2469 {
2470         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2471 }
2472
2473 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2474 {
2475         unsigned long flags;
2476         struct context_entry *context;
2477         u16 did_old;
2478
2479         if (!iommu)
2480                 return;
2481
2482         spin_lock_irqsave(&iommu->lock, flags);
2483         context = iommu_context_addr(iommu, bus, devfn, 0);
2484         if (!context) {
2485                 spin_unlock_irqrestore(&iommu->lock, flags);
2486                 return;
2487         }
2488         did_old = context_domain_id(context);
2489         context_clear_entry(context);
2490         __iommu_flush_cache(iommu, context, sizeof(*context));
2491         spin_unlock_irqrestore(&iommu->lock, flags);
2492         iommu->flush.flush_context(iommu,
2493                                    did_old,
2494                                    (((u16)bus) << 8) | devfn,
2495                                    DMA_CCMD_MASK_NOBIT,
2496                                    DMA_CCMD_DEVICE_INVL);
2497         iommu->flush.flush_iotlb(iommu,
2498                                  did_old,
2499                                  0,
2500                                  0,
2501                                  DMA_TLB_DSI_FLUSH);
2502 }
2503
2504 static inline void unlink_domain_info(struct device_domain_info *info)
2505 {
2506         assert_spin_locked(&device_domain_lock);
2507         list_del(&info->link);
2508         list_del(&info->global);
2509         if (info->dev)
2510                 dev_iommu_priv_set(info->dev, NULL);
2511 }
2512
2513 static void domain_remove_dev_info(struct dmar_domain *domain)
2514 {
2515         struct device_domain_info *info, *tmp;
2516         unsigned long flags;
2517
2518         spin_lock_irqsave(&device_domain_lock, flags);
2519         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2520                 __dmar_remove_one_dev_info(info);
2521         spin_unlock_irqrestore(&device_domain_lock, flags);
2522 }
2523
2524 struct dmar_domain *find_domain(struct device *dev)
2525 {
2526         struct device_domain_info *info;
2527
2528         if (unlikely(attach_deferred(dev)))
2529                 return NULL;
2530
2531         /* No lock here, assumes no domain exit in normal case */
2532         info = get_domain_info(dev);
2533         if (likely(info))
2534                 return info->domain;
2535
2536         return NULL;
2537 }
2538
2539 static void do_deferred_attach(struct device *dev)
2540 {
2541         struct iommu_domain *domain;
2542
2543         dev_iommu_priv_set(dev, NULL);
2544         domain = iommu_get_domain_for_dev(dev);
2545         if (domain)
2546                 intel_iommu_attach_device(domain, dev);
2547 }
2548
2549 static inline struct device_domain_info *
2550 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2551 {
2552         struct device_domain_info *info;
2553
2554         list_for_each_entry(info, &device_domain_list, global)
2555                 if (info->segment == segment && info->bus == bus &&
2556                     info->devfn == devfn)
2557                         return info;
2558
2559         return NULL;
2560 }
2561
2562 static int domain_setup_first_level(struct intel_iommu *iommu,
2563                                     struct dmar_domain *domain,
2564                                     struct device *dev,
2565                                     u32 pasid)
2566 {
2567         int flags = PASID_FLAG_SUPERVISOR_MODE;
2568         struct dma_pte *pgd = domain->pgd;
2569         int agaw, level;
2570
2571         /*
2572          * Skip top levels of page tables for iommu which has
2573          * less agaw than default. Unnecessary for PT mode.
2574          */
2575         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2576                 pgd = phys_to_virt(dma_pte_addr(pgd));
2577                 if (!dma_pte_present(pgd))
2578                         return -ENOMEM;
2579         }
2580
2581         level = agaw_to_level(agaw);
2582         if (level != 4 && level != 5)
2583                 return -EINVAL;
2584
2585         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2586
2587         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2588                                              domain->iommu_did[iommu->seq_id],
2589                                              flags);
2590 }
2591
2592 static bool dev_is_real_dma_subdevice(struct device *dev)
2593 {
2594         return dev && dev_is_pci(dev) &&
2595                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2596 }
2597
2598 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2599                                                     int bus, int devfn,
2600                                                     struct device *dev,
2601                                                     struct dmar_domain *domain)
2602 {
2603         struct dmar_domain *found = NULL;
2604         struct device_domain_info *info;
2605         unsigned long flags;
2606         int ret;
2607
2608         info = alloc_devinfo_mem();
2609         if (!info)
2610                 return NULL;
2611
2612         if (!dev_is_real_dma_subdevice(dev)) {
2613                 info->bus = bus;
2614                 info->devfn = devfn;
2615                 info->segment = iommu->segment;
2616         } else {
2617                 struct pci_dev *pdev = to_pci_dev(dev);
2618
2619                 info->bus = pdev->bus->number;
2620                 info->devfn = pdev->devfn;
2621                 info->segment = pci_domain_nr(pdev->bus);
2622         }
2623
2624         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2625         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2626         info->ats_qdep = 0;
2627         info->dev = dev;
2628         info->domain = domain;
2629         info->iommu = iommu;
2630         info->pasid_table = NULL;
2631         info->auxd_enabled = 0;
2632         INIT_LIST_HEAD(&info->auxiliary_domains);
2633
2634         if (dev && dev_is_pci(dev)) {
2635                 struct pci_dev *pdev = to_pci_dev(info->dev);
2636
2637                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2638                     pci_ats_supported(pdev) &&
2639                     dmar_find_matched_atsr_unit(pdev))
2640                         info->ats_supported = 1;
2641
2642                 if (sm_supported(iommu)) {
2643                         if (pasid_supported(iommu)) {
2644                                 int features = pci_pasid_features(pdev);
2645                                 if (features >= 0)
2646                                         info->pasid_supported = features | 1;
2647                         }
2648
2649                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2650                             pci_pri_supported(pdev))
2651                                 info->pri_supported = 1;
2652                 }
2653         }
2654
2655         spin_lock_irqsave(&device_domain_lock, flags);
2656         if (dev)
2657                 found = find_domain(dev);
2658
2659         if (!found) {
2660                 struct device_domain_info *info2;
2661                 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2662                                                        info->devfn);
2663                 if (info2) {
2664                         found      = info2->domain;
2665                         info2->dev = dev;
2666                 }
2667         }
2668
2669         if (found) {
2670                 spin_unlock_irqrestore(&device_domain_lock, flags);
2671                 free_devinfo_mem(info);
2672                 /* Caller must free the original domain */
2673                 return found;
2674         }
2675
2676         spin_lock(&iommu->lock);
2677         ret = domain_attach_iommu(domain, iommu);
2678         spin_unlock(&iommu->lock);
2679
2680         if (ret) {
2681                 spin_unlock_irqrestore(&device_domain_lock, flags);
2682                 free_devinfo_mem(info);
2683                 return NULL;
2684         }
2685
2686         list_add(&info->link, &domain->devices);
2687         list_add(&info->global, &device_domain_list);
2688         if (dev)
2689                 dev_iommu_priv_set(dev, info);
2690         spin_unlock_irqrestore(&device_domain_lock, flags);
2691
2692         /* PASID table is mandatory for a PCI device in scalable mode. */
2693         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2694                 ret = intel_pasid_alloc_table(dev);
2695                 if (ret) {
2696                         dev_err(dev, "PASID table allocation failed\n");
2697                         dmar_remove_one_dev_info(dev);
2698                         return NULL;
2699                 }
2700
2701                 /* Setup the PASID entry for requests without PASID: */
2702                 spin_lock_irqsave(&iommu->lock, flags);
2703                 if (hw_pass_through && domain_type_is_si(domain))
2704                         ret = intel_pasid_setup_pass_through(iommu, domain,
2705                                         dev, PASID_RID2PASID);
2706                 else if (domain_use_first_level(domain))
2707                         ret = domain_setup_first_level(iommu, domain, dev,
2708                                         PASID_RID2PASID);
2709                 else
2710                         ret = intel_pasid_setup_second_level(iommu, domain,
2711                                         dev, PASID_RID2PASID);
2712                 spin_unlock_irqrestore(&iommu->lock, flags);
2713                 if (ret) {
2714                         dev_err(dev, "Setup RID2PASID failed\n");
2715                         dmar_remove_one_dev_info(dev);
2716                         return NULL;
2717                 }
2718         }
2719
2720         if (dev && domain_context_mapping(domain, dev)) {
2721                 dev_err(dev, "Domain context map failed\n");
2722                 dmar_remove_one_dev_info(dev);
2723                 return NULL;
2724         }
2725
2726         return domain;
2727 }
2728
2729 static int iommu_domain_identity_map(struct dmar_domain *domain,
2730                                      unsigned long first_vpfn,
2731                                      unsigned long last_vpfn)
2732 {
2733         /*
2734          * RMRR range might have overlap with physical memory range,
2735          * clear it first
2736          */
2737         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2738
2739         return __domain_mapping(domain, first_vpfn, NULL,
2740                                 first_vpfn, last_vpfn - first_vpfn + 1,
2741                                 DMA_PTE_READ|DMA_PTE_WRITE);
2742 }
2743
2744 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2745
2746 static int __init si_domain_init(int hw)
2747 {
2748         struct dmar_rmrr_unit *rmrr;
2749         struct device *dev;
2750         int i, nid, ret;
2751
2752         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2753         if (!si_domain)
2754                 return -EFAULT;
2755
2756         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2757                 domain_exit(si_domain);
2758                 return -EFAULT;
2759         }
2760
2761         if (hw)
2762                 return 0;
2763
2764         for_each_online_node(nid) {
2765                 unsigned long start_pfn, end_pfn;
2766                 int i;
2767
2768                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2769                         ret = iommu_domain_identity_map(si_domain,
2770                                         mm_to_dma_pfn(start_pfn),
2771                                         mm_to_dma_pfn(end_pfn));
2772                         if (ret)
2773                                 return ret;
2774                 }
2775         }
2776
2777         /*
2778          * Identity map the RMRRs so that devices with RMRRs could also use
2779          * the si_domain.
2780          */
2781         for_each_rmrr_units(rmrr) {
2782                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2783                                           i, dev) {
2784                         unsigned long long start = rmrr->base_address;
2785                         unsigned long long end = rmrr->end_address;
2786
2787                         if (WARN_ON(end < start ||
2788                                     end >> agaw_to_width(si_domain->agaw)))
2789                                 continue;
2790
2791                         ret = iommu_domain_identity_map(si_domain,
2792                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2793                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2794                         if (ret)
2795                                 return ret;
2796                 }
2797         }
2798
2799         return 0;
2800 }
2801
2802 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2803 {
2804         struct dmar_domain *ndomain;
2805         struct intel_iommu *iommu;
2806         u8 bus, devfn;
2807
2808         iommu = device_to_iommu(dev, &bus, &devfn);
2809         if (!iommu)
2810                 return -ENODEV;
2811
2812         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2813         if (ndomain != domain)
2814                 return -EBUSY;
2815
2816         return 0;
2817 }
2818
2819 static bool device_has_rmrr(struct device *dev)
2820 {
2821         struct dmar_rmrr_unit *rmrr;
2822         struct device *tmp;
2823         int i;
2824
2825         rcu_read_lock();
2826         for_each_rmrr_units(rmrr) {
2827                 /*
2828                  * Return TRUE if this RMRR contains the device that
2829                  * is passed in.
2830                  */
2831                 for_each_active_dev_scope(rmrr->devices,
2832                                           rmrr->devices_cnt, i, tmp)
2833                         if (tmp == dev ||
2834                             is_downstream_to_pci_bridge(dev, tmp)) {
2835                                 rcu_read_unlock();
2836                                 return true;
2837                         }
2838         }
2839         rcu_read_unlock();
2840         return false;
2841 }
2842
2843 /**
2844  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2845  * is relaxable (ie. is allowed to be not enforced under some conditions)
2846  * @dev: device handle
2847  *
2848  * We assume that PCI USB devices with RMRRs have them largely
2849  * for historical reasons and that the RMRR space is not actively used post
2850  * boot.  This exclusion may change if vendors begin to abuse it.
2851  *
2852  * The same exception is made for graphics devices, with the requirement that
2853  * any use of the RMRR regions will be torn down before assigning the device
2854  * to a guest.
2855  *
2856  * Return: true if the RMRR is relaxable, false otherwise
2857  */
2858 static bool device_rmrr_is_relaxable(struct device *dev)
2859 {
2860         struct pci_dev *pdev;
2861
2862         if (!dev_is_pci(dev))
2863                 return false;
2864
2865         pdev = to_pci_dev(dev);
2866         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2867                 return true;
2868         else
2869                 return false;
2870 }
2871
2872 /*
2873  * There are a couple cases where we need to restrict the functionality of
2874  * devices associated with RMRRs.  The first is when evaluating a device for
2875  * identity mapping because problems exist when devices are moved in and out
2876  * of domains and their respective RMRR information is lost.  This means that
2877  * a device with associated RMRRs will never be in a "passthrough" domain.
2878  * The second is use of the device through the IOMMU API.  This interface
2879  * expects to have full control of the IOVA space for the device.  We cannot
2880  * satisfy both the requirement that RMRR access is maintained and have an
2881  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2882  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2883  * We therefore prevent devices associated with an RMRR from participating in
2884  * the IOMMU API, which eliminates them from device assignment.
2885  *
2886  * In both cases, devices which have relaxable RMRRs are not concerned by this
2887  * restriction. See device_rmrr_is_relaxable comment.
2888  */
2889 static bool device_is_rmrr_locked(struct device *dev)
2890 {
2891         if (!device_has_rmrr(dev))
2892                 return false;
2893
2894         if (device_rmrr_is_relaxable(dev))
2895                 return false;
2896
2897         return true;
2898 }
2899
2900 /*
2901  * Return the required default domain type for a specific device.
2902  *
2903  * @dev: the device in query
2904  * @startup: true if this is during early boot
2905  *
2906  * Returns:
2907  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2908  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2909  *  - 0: both identity and dynamic domains work for this device
2910  */
2911 static int device_def_domain_type(struct device *dev)
2912 {
2913         if (dev_is_pci(dev)) {
2914                 struct pci_dev *pdev = to_pci_dev(dev);
2915
2916                 /*
2917                  * Prevent any device marked as untrusted from getting
2918                  * placed into the statically identity mapping domain.
2919                  */
2920                 if (pdev->untrusted)
2921                         return IOMMU_DOMAIN_DMA;
2922
2923                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2924                         return IOMMU_DOMAIN_IDENTITY;
2925
2926                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2927                         return IOMMU_DOMAIN_IDENTITY;
2928         }
2929
2930         return 0;
2931 }
2932
2933 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2934 {
2935         /*
2936          * Start from the sane iommu hardware state.
2937          * If the queued invalidation is already initialized by us
2938          * (for example, while enabling interrupt-remapping) then
2939          * we got the things already rolling from a sane state.
2940          */
2941         if (!iommu->qi) {
2942                 /*
2943                  * Clear any previous faults.
2944                  */
2945                 dmar_fault(-1, iommu);
2946                 /*
2947                  * Disable queued invalidation if supported and already enabled
2948                  * before OS handover.
2949                  */
2950                 dmar_disable_qi(iommu);
2951         }
2952
2953         if (dmar_enable_qi(iommu)) {
2954                 /*
2955                  * Queued Invalidate not enabled, use Register Based Invalidate
2956                  */
2957                 iommu->flush.flush_context = __iommu_flush_context;
2958                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2959                 pr_info("%s: Using Register based invalidation\n",
2960                         iommu->name);
2961         } else {
2962                 iommu->flush.flush_context = qi_flush_context;
2963                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2964                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2965         }
2966 }
2967
2968 static int copy_context_table(struct intel_iommu *iommu,
2969                               struct root_entry *old_re,
2970                               struct context_entry **tbl,
2971                               int bus, bool ext)
2972 {
2973         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2974         struct context_entry *new_ce = NULL, ce;
2975         struct context_entry *old_ce = NULL;
2976         struct root_entry re;
2977         phys_addr_t old_ce_phys;
2978
2979         tbl_idx = ext ? bus * 2 : bus;
2980         memcpy(&re, old_re, sizeof(re));
2981
2982         for (devfn = 0; devfn < 256; devfn++) {
2983                 /* First calculate the correct index */
2984                 idx = (ext ? devfn * 2 : devfn) % 256;
2985
2986                 if (idx == 0) {
2987                         /* First save what we may have and clean up */
2988                         if (new_ce) {
2989                                 tbl[tbl_idx] = new_ce;
2990                                 __iommu_flush_cache(iommu, new_ce,
2991                                                     VTD_PAGE_SIZE);
2992                                 pos = 1;
2993                         }
2994
2995                         if (old_ce)
2996                                 memunmap(old_ce);
2997
2998                         ret = 0;
2999                         if (devfn < 0x80)
3000                                 old_ce_phys = root_entry_lctp(&re);
3001                         else
3002                                 old_ce_phys = root_entry_uctp(&re);
3003
3004                         if (!old_ce_phys) {
3005                                 if (ext && devfn == 0) {
3006                                         /* No LCTP, try UCTP */
3007                                         devfn = 0x7f;
3008                                         continue;
3009                                 } else {
3010                                         goto out;
3011                                 }
3012                         }
3013
3014                         ret = -ENOMEM;
3015                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3016                                         MEMREMAP_WB);
3017                         if (!old_ce)
3018                                 goto out;
3019
3020                         new_ce = alloc_pgtable_page(iommu->node);
3021                         if (!new_ce)
3022                                 goto out_unmap;
3023
3024                         ret = 0;
3025                 }
3026
3027                 /* Now copy the context entry */
3028                 memcpy(&ce, old_ce + idx, sizeof(ce));
3029
3030                 if (!__context_present(&ce))
3031                         continue;
3032
3033                 did = context_domain_id(&ce);
3034                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3035                         set_bit(did, iommu->domain_ids);
3036
3037                 /*
3038                  * We need a marker for copied context entries. This
3039                  * marker needs to work for the old format as well as
3040                  * for extended context entries.
3041                  *
3042                  * Bit 67 of the context entry is used. In the old
3043                  * format this bit is available to software, in the
3044                  * extended format it is the PGE bit, but PGE is ignored
3045                  * by HW if PASIDs are disabled (and thus still
3046                  * available).
3047                  *
3048                  * So disable PASIDs first and then mark the entry
3049                  * copied. This means that we don't copy PASID
3050                  * translations from the old kernel, but this is fine as
3051                  * faults there are not fatal.
3052                  */
3053                 context_clear_pasid_enable(&ce);
3054                 context_set_copied(&ce);
3055
3056                 new_ce[idx] = ce;
3057         }
3058
3059         tbl[tbl_idx + pos] = new_ce;
3060
3061         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3062
3063 out_unmap:
3064         memunmap(old_ce);
3065
3066 out:
3067         return ret;
3068 }
3069
3070 static int copy_translation_tables(struct intel_iommu *iommu)
3071 {
3072         struct context_entry **ctxt_tbls;
3073         struct root_entry *old_rt;
3074         phys_addr_t old_rt_phys;
3075         int ctxt_table_entries;
3076         unsigned long flags;
3077         u64 rtaddr_reg;
3078         int bus, ret;
3079         bool new_ext, ext;
3080
3081         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3082         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3083         new_ext    = !!ecap_ecs(iommu->ecap);
3084
3085         /*
3086          * The RTT bit can only be changed when translation is disabled,
3087          * but disabling translation means to open a window for data
3088          * corruption. So bail out and don't copy anything if we would
3089          * have to change the bit.
3090          */
3091         if (new_ext != ext)
3092                 return -EINVAL;
3093
3094         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3095         if (!old_rt_phys)
3096                 return -EINVAL;
3097
3098         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3099         if (!old_rt)
3100                 return -ENOMEM;
3101
3102         /* This is too big for the stack - allocate it from slab */
3103         ctxt_table_entries = ext ? 512 : 256;
3104         ret = -ENOMEM;
3105         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3106         if (!ctxt_tbls)
3107                 goto out_unmap;
3108
3109         for (bus = 0; bus < 256; bus++) {
3110                 ret = copy_context_table(iommu, &old_rt[bus],
3111                                          ctxt_tbls, bus, ext);
3112                 if (ret) {
3113                         pr_err("%s: Failed to copy context table for bus %d\n",
3114                                 iommu->name, bus);
3115                         continue;
3116                 }
3117         }
3118
3119         spin_lock_irqsave(&iommu->lock, flags);
3120
3121         /* Context tables are copied, now write them to the root_entry table */
3122         for (bus = 0; bus < 256; bus++) {
3123                 int idx = ext ? bus * 2 : bus;
3124                 u64 val;
3125
3126                 if (ctxt_tbls[idx]) {
3127                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3128                         iommu->root_entry[bus].lo = val;
3129                 }
3130
3131                 if (!ext || !ctxt_tbls[idx + 1])
3132                         continue;
3133
3134                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3135                 iommu->root_entry[bus].hi = val;
3136         }
3137
3138         spin_unlock_irqrestore(&iommu->lock, flags);
3139
3140         kfree(ctxt_tbls);
3141
3142         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3143
3144         ret = 0;
3145
3146 out_unmap:
3147         memunmap(old_rt);
3148
3149         return ret;
3150 }
3151
3152 #ifdef CONFIG_INTEL_IOMMU_SVM
3153 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3154 {
3155         struct intel_iommu *iommu = data;
3156         ioasid_t ioasid;
3157
3158         if (!iommu)
3159                 return INVALID_IOASID;
3160         /*
3161          * VT-d virtual command interface always uses the full 20 bit
3162          * PASID range. Host can partition guest PASID range based on
3163          * policies but it is out of guest's control.
3164          */
3165         if (min < PASID_MIN || max > intel_pasid_max_id)
3166                 return INVALID_IOASID;
3167
3168         if (vcmd_alloc_pasid(iommu, &ioasid))
3169                 return INVALID_IOASID;
3170
3171         return ioasid;
3172 }
3173
3174 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3175 {
3176         struct intel_iommu *iommu = data;
3177
3178         if (!iommu)
3179                 return;
3180         /*
3181          * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
3182          * We can only free the PASID when all the devices are unbound.
3183          */
3184         if (ioasid_find(NULL, ioasid, NULL)) {
3185                 pr_alert("Cannot free active IOASID %d\n", ioasid);
3186                 return;
3187         }
3188         vcmd_free_pasid(iommu, ioasid);
3189 }
3190
3191 static void register_pasid_allocator(struct intel_iommu *iommu)
3192 {
3193         /*
3194          * If we are running in the host, no need for custom allocator
3195          * in that PASIDs are allocated from the host system-wide.
3196          */
3197         if (!cap_caching_mode(iommu->cap))
3198                 return;
3199
3200         if (!sm_supported(iommu)) {
3201                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3202                 return;
3203         }
3204
3205         /*
3206          * Register a custom PASID allocator if we are running in a guest,
3207          * guest PASID must be obtained via virtual command interface.
3208          * There can be multiple vIOMMUs in each guest but only one allocator
3209          * is active. All vIOMMU allocators will eventually be calling the same
3210          * host allocator.
3211          */
3212         if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3213                 return;
3214
3215         pr_info("Register custom PASID allocator\n");
3216         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3217         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3218         iommu->pasid_allocator.pdata = (void *)iommu;
3219         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3220                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3221                 /*
3222                  * Disable scalable mode on this IOMMU if there
3223                  * is no custom allocator. Mixing SM capable vIOMMU
3224                  * and non-SM vIOMMU are not supported.
3225                  */
3226                 intel_iommu_sm = 0;
3227         }
3228 }
3229 #endif
3230
3231 static int __init init_dmars(void)
3232 {
3233         struct dmar_drhd_unit *drhd;
3234         struct intel_iommu *iommu;
3235         int ret;
3236
3237         /*
3238          * for each drhd
3239          *    allocate root
3240          *    initialize and program root entry to not present
3241          * endfor
3242          */
3243         for_each_drhd_unit(drhd) {
3244                 /*
3245                  * lock not needed as this is only incremented in the single
3246                  * threaded kernel __init code path all other access are read
3247                  * only
3248                  */
3249                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3250                         g_num_of_iommus++;
3251                         continue;
3252                 }
3253                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3254         }
3255
3256         /* Preallocate enough resources for IOMMU hot-addition */
3257         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3258                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3259
3260         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3261                         GFP_KERNEL);
3262         if (!g_iommus) {
3263                 pr_err("Allocating global iommu array failed\n");
3264                 ret = -ENOMEM;
3265                 goto error;
3266         }
3267
3268         for_each_iommu(iommu, drhd) {
3269                 if (drhd->ignored) {
3270                         iommu_disable_translation(iommu);
3271                         continue;
3272                 }
3273
3274                 /*
3275                  * Find the max pasid size of all IOMMU's in the system.
3276                  * We need to ensure the system pasid table is no bigger
3277                  * than the smallest supported.
3278                  */
3279                 if (pasid_supported(iommu)) {
3280                         u32 temp = 2 << ecap_pss(iommu->ecap);
3281
3282                         intel_pasid_max_id = min_t(u32, temp,
3283                                                    intel_pasid_max_id);
3284                 }
3285
3286                 g_iommus[iommu->seq_id] = iommu;
3287
3288                 intel_iommu_init_qi(iommu);
3289
3290                 ret = iommu_init_domains(iommu);
3291                 if (ret)
3292                         goto free_iommu;
3293
3294                 init_translation_status(iommu);
3295
3296                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3297                         iommu_disable_translation(iommu);
3298                         clear_translation_pre_enabled(iommu);
3299                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3300                                 iommu->name);
3301                 }
3302
3303                 /*
3304                  * TBD:
3305                  * we could share the same root & context tables
3306                  * among all IOMMU's. Need to Split it later.
3307                  */
3308                 ret = iommu_alloc_root_entry(iommu);
3309                 if (ret)
3310                         goto free_iommu;
3311
3312                 if (translation_pre_enabled(iommu)) {
3313                         pr_info("Translation already enabled - trying to copy translation structures\n");
3314
3315                         ret = copy_translation_tables(iommu);
3316                         if (ret) {
3317                                 /*
3318                                  * We found the IOMMU with translation
3319                                  * enabled - but failed to copy over the
3320                                  * old root-entry table. Try to proceed
3321                                  * by disabling translation now and
3322                                  * allocating a clean root-entry table.
3323                                  * This might cause DMAR faults, but
3324                                  * probably the dump will still succeed.
3325                                  */
3326                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3327                                        iommu->name);
3328                                 iommu_disable_translation(iommu);
3329                                 clear_translation_pre_enabled(iommu);
3330                         } else {
3331                                 pr_info("Copied translation tables from previous kernel for %s\n",
3332                                         iommu->name);
3333                         }
3334                 }
3335
3336                 if (!ecap_pass_through(iommu->ecap))
3337                         hw_pass_through = 0;
3338                 intel_svm_check(iommu);
3339         }
3340
3341         /*
3342          * Now that qi is enabled on all iommus, set the root entry and flush
3343          * caches. This is required on some Intel X58 chipsets, otherwise the
3344          * flush_context function will loop forever and the boot hangs.
3345          */
3346         for_each_active_iommu(iommu, drhd) {
3347                 iommu_flush_write_buffer(iommu);
3348 #ifdef CONFIG_INTEL_IOMMU_SVM
3349                 register_pasid_allocator(iommu);
3350 #endif
3351                 iommu_set_root_entry(iommu);
3352                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3353                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3354         }
3355
3356 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3357         dmar_map_gfx = 0;
3358 #endif
3359
3360         if (!dmar_map_gfx)
3361                 iommu_identity_mapping |= IDENTMAP_GFX;
3362
3363         check_tylersburg_isoch();
3364
3365         ret = si_domain_init(hw_pass_through);
3366         if (ret)
3367                 goto free_iommu;
3368
3369         /*
3370          * for each drhd
3371          *   enable fault log
3372          *   global invalidate context cache
3373          *   global invalidate iotlb
3374          *   enable translation
3375          */
3376         for_each_iommu(iommu, drhd) {
3377                 if (drhd->ignored) {
3378                         /*
3379                          * we always have to disable PMRs or DMA may fail on
3380                          * this device
3381                          */
3382                         if (force_on)
3383                                 iommu_disable_protect_mem_regions(iommu);
3384                         continue;
3385                 }
3386
3387                 iommu_flush_write_buffer(iommu);
3388
3389 #ifdef CONFIG_INTEL_IOMMU_SVM
3390                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3391                         /*
3392                          * Call dmar_alloc_hwirq() with dmar_global_lock held,
3393                          * could cause possible lock race condition.
3394                          */
3395                         up_write(&dmar_global_lock);
3396                         ret = intel_svm_enable_prq(iommu);
3397                         down_write(&dmar_global_lock);
3398                         if (ret)
3399                                 goto free_iommu;
3400                 }
3401 #endif
3402                 ret = dmar_set_interrupt(iommu);
3403                 if (ret)
3404                         goto free_iommu;
3405         }
3406
3407         return 0;
3408
3409 free_iommu:
3410         for_each_active_iommu(iommu, drhd) {
3411                 disable_dmar_iommu(iommu);
3412                 free_dmar_iommu(iommu);
3413         }
3414
3415         kfree(g_iommus);
3416
3417 error:
3418         return ret;
3419 }
3420
3421 /* This takes a number of _MM_ pages, not VTD pages */
3422 static unsigned long intel_alloc_iova(struct device *dev,
3423                                      struct dmar_domain *domain,
3424                                      unsigned long nrpages, uint64_t dma_mask)
3425 {
3426         unsigned long iova_pfn;
3427
3428         /*
3429          * Restrict dma_mask to the width that the iommu can handle.
3430          * First-level translation restricts the input-address to a
3431          * canonical address (i.e., address bits 63:N have the same
3432          * value as address bit [N-1], where N is 48-bits with 4-level
3433          * paging and 57-bits with 5-level paging). Hence, skip bit
3434          * [N-1].
3435          */
3436         if (domain_use_first_level(domain))
3437                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3438                                  dma_mask);
3439         else
3440                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3441                                  dma_mask);
3442
3443         /* Ensure we reserve the whole size-aligned region */
3444         nrpages = __roundup_pow_of_two(nrpages);
3445
3446         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3447                 /*
3448                  * First try to allocate an io virtual address in
3449                  * DMA_BIT_MASK(32) and if that fails then try allocating
3450                  * from higher range
3451                  */
3452                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3453                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3454                 if (iova_pfn)
3455                         return iova_pfn;
3456         }
3457         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3458                                    IOVA_PFN(dma_mask), true);
3459         if (unlikely(!iova_pfn)) {
3460                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3461                              nrpages);
3462                 return 0;
3463         }
3464
3465         return iova_pfn;
3466 }
3467
3468 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3469                                      size_t size, int dir, u64 dma_mask)
3470 {
3471         struct dmar_domain *domain;
3472         phys_addr_t start_paddr;
3473         unsigned long iova_pfn;
3474         int prot = 0;
3475         int ret;
3476         struct intel_iommu *iommu;
3477         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3478
3479         BUG_ON(dir == DMA_NONE);
3480
3481         if (unlikely(attach_deferred(dev)))
3482                 do_deferred_attach(dev);
3483
3484         domain = find_domain(dev);
3485         if (!domain)
3486                 return DMA_MAPPING_ERROR;
3487
3488         iommu = domain_get_iommu(domain);
3489         size = aligned_nrpages(paddr, size);
3490
3491         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3492         if (!iova_pfn)
3493                 goto error;
3494
3495         /*
3496          * Check if DMAR supports zero-length reads on write only
3497          * mappings..
3498          */
3499         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3500                         !cap_zlr(iommu->cap))
3501                 prot |= DMA_PTE_READ;
3502         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3503                 prot |= DMA_PTE_WRITE;
3504         /*
3505          * paddr - (paddr + size) might be partial page, we should map the whole
3506          * page.  Note: if two part of one page are separately mapped, we
3507          * might have two guest_addr mapping to the same host paddr, but this
3508          * is not a big problem
3509          */
3510         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3511                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3512         if (ret)
3513                 goto error;
3514
3515         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3516         start_paddr += paddr & ~PAGE_MASK;
3517
3518         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3519
3520         return start_paddr;
3521
3522 error:
3523         if (iova_pfn)
3524                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3525         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3526                 size, (unsigned long long)paddr, dir);
3527         return DMA_MAPPING_ERROR;
3528 }
3529
3530 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3531                                  unsigned long offset, size_t size,
3532                                  enum dma_data_direction dir,
3533                                  unsigned long attrs)
3534 {
3535         return __intel_map_single(dev, page_to_phys(page) + offset,
3536                                   size, dir, *dev->dma_mask);
3537 }
3538
3539 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3540                                      size_t size, enum dma_data_direction dir,
3541                                      unsigned long attrs)
3542 {
3543         return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3544 }
3545
3546 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3547 {
3548         struct dmar_domain *domain;
3549         unsigned long start_pfn, last_pfn;
3550         unsigned long nrpages;
3551         unsigned long iova_pfn;
3552         struct intel_iommu *iommu;
3553         struct page *freelist;
3554         struct pci_dev *pdev = NULL;
3555
3556         domain = find_domain(dev);
3557         BUG_ON(!domain);
3558
3559         iommu = domain_get_iommu(domain);
3560
3561         iova_pfn = IOVA_PFN(dev_addr);
3562
3563         nrpages = aligned_nrpages(dev_addr, size);
3564         start_pfn = mm_to_dma_pfn(iova_pfn);
3565         last_pfn = start_pfn + nrpages - 1;
3566
3567         if (dev_is_pci(dev))
3568                 pdev = to_pci_dev(dev);
3569
3570         freelist = domain_unmap(domain, start_pfn, last_pfn);
3571         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3572                         !has_iova_flush_queue(&domain->iovad)) {
3573                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3574                                       nrpages, !freelist, 0);
3575                 /* free iova */
3576                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3577                 dma_free_pagelist(freelist);
3578         } else {
3579                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3580                            (unsigned long)freelist);
3581                 /*
3582                  * queue up the release of the unmap to save the 1/6th of the
3583                  * cpu used up by the iotlb flush operation...
3584                  */
3585         }
3586
3587         trace_unmap_single(dev, dev_addr, size);
3588 }
3589
3590 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3591                              size_t size, enum dma_data_direction dir,
3592                              unsigned long attrs)
3593 {
3594         intel_unmap(dev, dev_addr, size);
3595 }
3596
3597 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3598                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3599 {
3600         intel_unmap(dev, dev_addr, size);
3601 }
3602
3603 static void *intel_alloc_coherent(struct device *dev, size_t size,
3604                                   dma_addr_t *dma_handle, gfp_t flags,
3605                                   unsigned long attrs)
3606 {
3607         struct page *page = NULL;
3608         int order;
3609
3610         if (unlikely(attach_deferred(dev)))
3611                 do_deferred_attach(dev);
3612
3613         size = PAGE_ALIGN(size);
3614         order = get_order(size);
3615
3616         if (gfpflags_allow_blocking(flags)) {
3617                 unsigned int count = size >> PAGE_SHIFT;
3618
3619                 page = dma_alloc_from_contiguous(dev, count, order,
3620                                                  flags & __GFP_NOWARN);
3621         }
3622
3623         if (!page)
3624                 page = alloc_pages(flags, order);
3625         if (!page)
3626                 return NULL;
3627         memset(page_address(page), 0, size);
3628
3629         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3630                                          DMA_BIDIRECTIONAL,
3631                                          dev->coherent_dma_mask);
3632         if (*dma_handle != DMA_MAPPING_ERROR)
3633                 return page_address(page);
3634         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3635                 __free_pages(page, order);
3636
3637         return NULL;
3638 }
3639
3640 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3641                                 dma_addr_t dma_handle, unsigned long attrs)
3642 {
3643         int order;
3644         struct page *page = virt_to_page(vaddr);
3645
3646         size = PAGE_ALIGN(size);
3647         order = get_order(size);
3648
3649         intel_unmap(dev, dma_handle, size);
3650         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3651                 __free_pages(page, order);
3652 }
3653
3654 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3655                            int nelems, enum dma_data_direction dir,
3656                            unsigned long attrs)
3657 {
3658         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3659         unsigned long nrpages = 0;
3660         struct scatterlist *sg;
3661         int i;
3662
3663         for_each_sg(sglist, sg, nelems, i) {
3664                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3665         }
3666
3667         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3668
3669         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3670 }
3671
3672 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3673                         enum dma_data_direction dir, unsigned long attrs)
3674 {
3675         int i;
3676         struct dmar_domain *domain;
3677         size_t size = 0;
3678         int prot = 0;
3679         unsigned long iova_pfn;
3680         int ret;
3681         struct scatterlist *sg;
3682         unsigned long start_vpfn;
3683         struct intel_iommu *iommu;
3684
3685         BUG_ON(dir == DMA_NONE);
3686
3687         if (unlikely(attach_deferred(dev)))
3688                 do_deferred_attach(dev);
3689
3690         domain = find_domain(dev);
3691         if (!domain)
3692                 return 0;
3693
3694         iommu = domain_get_iommu(domain);
3695
3696         for_each_sg(sglist, sg, nelems, i)
3697                 size += aligned_nrpages(sg->offset, sg->length);
3698
3699         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3700                                 *dev->dma_mask);
3701         if (!iova_pfn) {
3702                 sglist->dma_length = 0;
3703                 return 0;
3704         }
3705
3706         /*
3707          * Check if DMAR supports zero-length reads on write only
3708          * mappings..
3709          */
3710         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3711                         !cap_zlr(iommu->cap))
3712                 prot |= DMA_PTE_READ;
3713         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3714                 prot |= DMA_PTE_WRITE;
3715
3716         start_vpfn = mm_to_dma_pfn(iova_pfn);
3717
3718         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3719         if (unlikely(ret)) {
3720                 dma_pte_free_pagetable(domain, start_vpfn,
3721                                        start_vpfn + size - 1,
3722                                        agaw_to_level(domain->agaw) + 1);
3723                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3724                 return 0;
3725         }
3726
3727         for_each_sg(sglist, sg, nelems, i)
3728                 trace_map_sg(dev, i + 1, nelems, sg);
3729
3730         return nelems;
3731 }
3732
3733 static u64 intel_get_required_mask(struct device *dev)
3734 {
3735         return DMA_BIT_MASK(32);
3736 }
3737
3738 static const struct dma_map_ops intel_dma_ops = {
3739         .alloc = intel_alloc_coherent,
3740         .free = intel_free_coherent,
3741         .map_sg = intel_map_sg,
3742         .unmap_sg = intel_unmap_sg,
3743         .map_page = intel_map_page,
3744         .unmap_page = intel_unmap_page,
3745         .map_resource = intel_map_resource,
3746         .unmap_resource = intel_unmap_resource,
3747         .dma_supported = dma_direct_supported,
3748         .mmap = dma_common_mmap,
3749         .get_sgtable = dma_common_get_sgtable,
3750         .alloc_pages = dma_common_alloc_pages,
3751         .free_pages = dma_common_free_pages,
3752         .get_required_mask = intel_get_required_mask,
3753 };
3754
3755 static void
3756 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3757                    enum dma_data_direction dir, enum dma_sync_target target)
3758 {
3759         struct dmar_domain *domain;
3760         phys_addr_t tlb_addr;
3761
3762         domain = find_domain(dev);
3763         if (WARN_ON(!domain))
3764                 return;
3765
3766         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3767         if (is_swiotlb_buffer(tlb_addr))
3768                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3769 }
3770
3771 static dma_addr_t
3772 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3773                   enum dma_data_direction dir, unsigned long attrs,
3774                   u64 dma_mask)
3775 {
3776         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3777         struct dmar_domain *domain;
3778         struct intel_iommu *iommu;
3779         unsigned long iova_pfn;
3780         unsigned long nrpages;
3781         phys_addr_t tlb_addr;
3782         int prot = 0;
3783         int ret;
3784
3785         if (unlikely(attach_deferred(dev)))
3786                 do_deferred_attach(dev);
3787
3788         domain = find_domain(dev);
3789
3790         if (WARN_ON(dir == DMA_NONE || !domain))
3791                 return DMA_MAPPING_ERROR;
3792
3793         iommu = domain_get_iommu(domain);
3794         if (WARN_ON(!iommu))
3795                 return DMA_MAPPING_ERROR;
3796
3797         nrpages = aligned_nrpages(0, size);
3798         iova_pfn = intel_alloc_iova(dev, domain,
3799                                     dma_to_mm_pfn(nrpages), dma_mask);
3800         if (!iova_pfn)
3801                 return DMA_MAPPING_ERROR;
3802
3803         /*
3804          * Check if DMAR supports zero-length reads on write only
3805          * mappings..
3806          */
3807         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3808                         !cap_zlr(iommu->cap))
3809                 prot |= DMA_PTE_READ;
3810         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3811                 prot |= DMA_PTE_WRITE;
3812
3813         /*
3814          * If both the physical buffer start address and size are
3815          * page aligned, we don't need to use a bounce page.
3816          */
3817         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3818                 tlb_addr = swiotlb_tbl_map_single(dev,
3819                                 phys_to_dma_unencrypted(dev, io_tlb_start),
3820                                 paddr, size, aligned_size, dir, attrs);
3821                 if (tlb_addr == DMA_MAPPING_ERROR) {
3822                         goto swiotlb_error;
3823                 } else {
3824                         /* Cleanup the padding area. */
3825                         void *padding_start = phys_to_virt(tlb_addr);
3826                         size_t padding_size = aligned_size;
3827
3828                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3829                             (dir == DMA_TO_DEVICE ||
3830                              dir == DMA_BIDIRECTIONAL)) {
3831                                 padding_start += size;
3832                                 padding_size -= size;
3833                         }
3834
3835                         memset(padding_start, 0, padding_size);
3836                 }
3837         } else {
3838                 tlb_addr = paddr;
3839         }
3840
3841         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3842                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3843         if (ret)
3844                 goto mapping_error;
3845
3846         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3847
3848         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3849
3850 mapping_error:
3851         if (is_swiotlb_buffer(tlb_addr))
3852                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3853                                          aligned_size, dir, attrs);
3854 swiotlb_error:
3855         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3856         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3857                 size, (unsigned long long)paddr, dir);
3858
3859         return DMA_MAPPING_ERROR;
3860 }
3861
3862 static void
3863 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3864                     enum dma_data_direction dir, unsigned long attrs)
3865 {
3866         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3867         struct dmar_domain *domain;
3868         phys_addr_t tlb_addr;
3869
3870         domain = find_domain(dev);
3871         if (WARN_ON(!domain))
3872                 return;
3873
3874         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3875         if (WARN_ON(!tlb_addr))
3876                 return;
3877
3878         intel_unmap(dev, dev_addr, size);
3879         if (is_swiotlb_buffer(tlb_addr))
3880                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3881                                          aligned_size, dir, attrs);
3882
3883         trace_bounce_unmap_single(dev, dev_addr, size);
3884 }
3885
3886 static dma_addr_t
3887 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3888                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3889 {
3890         return bounce_map_single(dev, page_to_phys(page) + offset,
3891                                  size, dir, attrs, *dev->dma_mask);
3892 }
3893
3894 static dma_addr_t
3895 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3896                     enum dma_data_direction dir, unsigned long attrs)
3897 {
3898         return bounce_map_single(dev, phys_addr, size,
3899                                  dir, attrs, *dev->dma_mask);
3900 }
3901
3902 static void
3903 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3904                   enum dma_data_direction dir, unsigned long attrs)
3905 {
3906         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3907 }
3908
3909 static void
3910 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3911                       enum dma_data_direction dir, unsigned long attrs)
3912 {
3913         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3914 }
3915
3916 static void
3917 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3918                 enum dma_data_direction dir, unsigned long attrs)
3919 {
3920         struct scatterlist *sg;
3921         int i;
3922
3923         for_each_sg(sglist, sg, nelems, i)
3924                 bounce_unmap_page(dev, sg->dma_address,
3925                                   sg_dma_len(sg), dir, attrs);
3926 }
3927
3928 static int
3929 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3930               enum dma_data_direction dir, unsigned long attrs)
3931 {
3932         int i;
3933         struct scatterlist *sg;
3934
3935         for_each_sg(sglist, sg, nelems, i) {
3936                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3937                                                   sg->offset, sg->length,
3938                                                   dir, attrs);
3939                 if (sg->dma_address == DMA_MAPPING_ERROR)
3940                         goto out_unmap;
3941                 sg_dma_len(sg) = sg->length;
3942         }
3943
3944         for_each_sg(sglist, sg, nelems, i)
3945                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3946
3947         return nelems;
3948
3949 out_unmap:
3950         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3951         return 0;
3952 }
3953
3954 static void
3955 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3956                            size_t size, enum dma_data_direction dir)
3957 {
3958         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3959 }
3960
3961 static void
3962 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3963                               size_t size, enum dma_data_direction dir)
3964 {
3965         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3966 }
3967
3968 static void
3969 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3970                        int nelems, enum dma_data_direction dir)
3971 {
3972         struct scatterlist *sg;
3973         int i;
3974
3975         for_each_sg(sglist, sg, nelems, i)
3976                 bounce_sync_single(dev, sg_dma_address(sg),
3977                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
3978 }
3979
3980 static void
3981 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3982                           int nelems, enum dma_data_direction dir)
3983 {
3984         struct scatterlist *sg;
3985         int i;
3986
3987         for_each_sg(sglist, sg, nelems, i)
3988                 bounce_sync_single(dev, sg_dma_address(sg),
3989                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3990 }
3991
3992 static const struct dma_map_ops bounce_dma_ops = {
3993         .alloc                  = intel_alloc_coherent,
3994         .free                   = intel_free_coherent,
3995         .map_sg                 = bounce_map_sg,
3996         .unmap_sg               = bounce_unmap_sg,
3997         .map_page               = bounce_map_page,
3998         .unmap_page             = bounce_unmap_page,
3999         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
4000         .sync_single_for_device = bounce_sync_single_for_device,
4001         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4002         .sync_sg_for_device     = bounce_sync_sg_for_device,
4003         .map_resource           = bounce_map_resource,
4004         .unmap_resource         = bounce_unmap_resource,
4005         .alloc_pages            = dma_common_alloc_pages,
4006         .free_pages             = dma_common_free_pages,
4007         .dma_supported          = dma_direct_supported,
4008 };
4009
4010 static inline int iommu_domain_cache_init(void)
4011 {
4012         int ret = 0;
4013
4014         iommu_domain_cache = kmem_cache_create("iommu_domain",
4015                                          sizeof(struct dmar_domain),
4016                                          0,
4017                                          SLAB_HWCACHE_ALIGN,
4018
4019                                          NULL);
4020         if (!iommu_domain_cache) {
4021                 pr_err("Couldn't create iommu_domain cache\n");
4022                 ret = -ENOMEM;
4023         }
4024
4025         return ret;
4026 }
4027
4028 static inline int iommu_devinfo_cache_init(void)
4029 {
4030         int ret = 0;
4031
4032         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4033                                          sizeof(struct device_domain_info),
4034                                          0,
4035                                          SLAB_HWCACHE_ALIGN,
4036                                          NULL);
4037         if (!iommu_devinfo_cache) {
4038                 pr_err("Couldn't create devinfo cache\n");
4039                 ret = -ENOMEM;
4040         }
4041
4042         return ret;
4043 }
4044
4045 static int __init iommu_init_mempool(void)
4046 {
4047         int ret;
4048         ret = iova_cache_get();
4049         if (ret)
4050                 return ret;
4051
4052         ret = iommu_domain_cache_init();
4053         if (ret)
4054                 goto domain_error;
4055
4056         ret = iommu_devinfo_cache_init();
4057         if (!ret)
4058                 return ret;
4059
4060         kmem_cache_destroy(iommu_domain_cache);
4061 domain_error:
4062         iova_cache_put();
4063
4064         return -ENOMEM;
4065 }
4066
4067 static void __init iommu_exit_mempool(void)
4068 {
4069         kmem_cache_destroy(iommu_devinfo_cache);
4070         kmem_cache_destroy(iommu_domain_cache);
4071         iova_cache_put();
4072 }
4073
4074 static void __init init_no_remapping_devices(void)
4075 {
4076         struct dmar_drhd_unit *drhd;
4077         struct device *dev;
4078         int i;
4079
4080         for_each_drhd_unit(drhd) {
4081                 if (!drhd->include_all) {
4082                         for_each_active_dev_scope(drhd->devices,
4083                                                   drhd->devices_cnt, i, dev)
4084                                 break;
4085                         /* ignore DMAR unit if no devices exist */
4086                         if (i == drhd->devices_cnt)
4087                                 drhd->ignored = 1;
4088                 }
4089         }
4090
4091         for_each_active_drhd_unit(drhd) {
4092                 if (drhd->include_all)
4093                         continue;
4094
4095                 for_each_active_dev_scope(drhd->devices,
4096                                           drhd->devices_cnt, i, dev)
4097                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4098                                 break;
4099                 if (i < drhd->devices_cnt)
4100                         continue;
4101
4102                 /* This IOMMU has *only* gfx devices. Either bypass it or
4103                    set the gfx_mapped flag, as appropriate */
4104                 drhd->gfx_dedicated = 1;
4105                 if (!dmar_map_gfx)
4106                         drhd->ignored = 1;
4107         }
4108 }
4109
4110 #ifdef CONFIG_SUSPEND
4111 static int init_iommu_hw(void)
4112 {
4113         struct dmar_drhd_unit *drhd;
4114         struct intel_iommu *iommu = NULL;
4115
4116         for_each_active_iommu(iommu, drhd)
4117                 if (iommu->qi)
4118                         dmar_reenable_qi(iommu);
4119
4120         for_each_iommu(iommu, drhd) {
4121                 if (drhd->ignored) {
4122                         /*
4123                          * we always have to disable PMRs or DMA may fail on
4124                          * this device
4125                          */
4126                         if (force_on)
4127                                 iommu_disable_protect_mem_regions(iommu);
4128                         continue;
4129                 }
4130
4131                 iommu_flush_write_buffer(iommu);
4132
4133                 iommu_set_root_entry(iommu);
4134
4135                 iommu->flush.flush_context(iommu, 0, 0, 0,
4136                                            DMA_CCMD_GLOBAL_INVL);
4137                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4138                 iommu_enable_translation(iommu);
4139                 iommu_disable_protect_mem_regions(iommu);
4140         }
4141
4142         return 0;
4143 }
4144
4145 static void iommu_flush_all(void)
4146 {
4147         struct dmar_drhd_unit *drhd;
4148         struct intel_iommu *iommu;
4149
4150         for_each_active_iommu(iommu, drhd) {
4151                 iommu->flush.flush_context(iommu, 0, 0, 0,
4152                                            DMA_CCMD_GLOBAL_INVL);
4153                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4154                                          DMA_TLB_GLOBAL_FLUSH);
4155         }
4156 }
4157
4158 static int iommu_suspend(void)
4159 {
4160         struct dmar_drhd_unit *drhd;
4161         struct intel_iommu *iommu = NULL;
4162         unsigned long flag;
4163
4164         for_each_active_iommu(iommu, drhd) {
4165                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4166                                                  GFP_ATOMIC);
4167                 if (!iommu->iommu_state)
4168                         goto nomem;
4169         }
4170
4171         iommu_flush_all();
4172
4173         for_each_active_iommu(iommu, drhd) {
4174                 iommu_disable_translation(iommu);
4175
4176                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4177
4178                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4179                         readl(iommu->reg + DMAR_FECTL_REG);
4180                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4181                         readl(iommu->reg + DMAR_FEDATA_REG);
4182                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4183                         readl(iommu->reg + DMAR_FEADDR_REG);
4184                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4185                         readl(iommu->reg + DMAR_FEUADDR_REG);
4186
4187                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4188         }
4189         return 0;
4190
4191 nomem:
4192         for_each_active_iommu(iommu, drhd)
4193                 kfree(iommu->iommu_state);
4194
4195         return -ENOMEM;
4196 }
4197
4198 static void iommu_resume(void)
4199 {
4200         struct dmar_drhd_unit *drhd;
4201         struct intel_iommu *iommu = NULL;
4202         unsigned long flag;
4203
4204         if (init_iommu_hw()) {
4205                 if (force_on)
4206                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4207                 else
4208                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4209                 return;
4210         }
4211
4212         for_each_active_iommu(iommu, drhd) {
4213
4214                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4215
4216                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4217                         iommu->reg + DMAR_FECTL_REG);
4218                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4219                         iommu->reg + DMAR_FEDATA_REG);
4220                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4221                         iommu->reg + DMAR_FEADDR_REG);
4222                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4223                         iommu->reg + DMAR_FEUADDR_REG);
4224
4225                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4226         }
4227
4228         for_each_active_iommu(iommu, drhd)
4229                 kfree(iommu->iommu_state);
4230 }
4231
4232 static struct syscore_ops iommu_syscore_ops = {
4233         .resume         = iommu_resume,
4234         .suspend        = iommu_suspend,
4235 };
4236
4237 static void __init init_iommu_pm_ops(void)
4238 {
4239         register_syscore_ops(&iommu_syscore_ops);
4240 }
4241
4242 #else
4243 static inline void init_iommu_pm_ops(void) {}
4244 #endif  /* CONFIG_PM */
4245
4246 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4247 {
4248         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4249             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4250             rmrr->end_address <= rmrr->base_address ||
4251             arch_rmrr_sanity_check(rmrr))
4252                 return -EINVAL;
4253
4254         return 0;
4255 }
4256
4257 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4258 {
4259         struct acpi_dmar_reserved_memory *rmrr;
4260         struct dmar_rmrr_unit *rmrru;
4261
4262         rmrr = (struct acpi_dmar_reserved_memory *)header;
4263         if (rmrr_sanity_check(rmrr)) {
4264                 pr_warn(FW_BUG
4265                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4266                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4267                            rmrr->base_address, rmrr->end_address,
4268                            dmi_get_system_info(DMI_BIOS_VENDOR),
4269                            dmi_get_system_info(DMI_BIOS_VERSION),
4270                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4271                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4272         }
4273
4274         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4275         if (!rmrru)
4276                 goto out;
4277
4278         rmrru->hdr = header;
4279
4280         rmrru->base_address = rmrr->base_address;
4281         rmrru->end_address = rmrr->end_address;
4282
4283         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4284                                 ((void *)rmrr) + rmrr->header.length,
4285                                 &rmrru->devices_cnt);
4286         if (rmrru->devices_cnt && rmrru->devices == NULL)
4287                 goto free_rmrru;
4288
4289         list_add(&rmrru->list, &dmar_rmrr_units);
4290
4291         return 0;
4292 free_rmrru:
4293         kfree(rmrru);
4294 out:
4295         return -ENOMEM;
4296 }
4297
4298 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4299 {
4300         struct dmar_atsr_unit *atsru;
4301         struct acpi_dmar_atsr *tmp;
4302
4303         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4304                                 dmar_rcu_check()) {
4305                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4306                 if (atsr->segment != tmp->segment)
4307                         continue;
4308                 if (atsr->header.length != tmp->header.length)
4309                         continue;
4310                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4311                         return atsru;
4312         }
4313
4314         return NULL;
4315 }
4316
4317 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4318 {
4319         struct acpi_dmar_atsr *atsr;
4320         struct dmar_atsr_unit *atsru;
4321
4322         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4323                 return 0;
4324
4325         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4326         atsru = dmar_find_atsr(atsr);
4327         if (atsru)
4328                 return 0;
4329
4330         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4331         if (!atsru)
4332                 return -ENOMEM;
4333
4334         /*
4335          * If memory is allocated from slab by ACPI _DSM method, we need to
4336          * copy the memory content because the memory buffer will be freed
4337          * on return.
4338          */
4339         atsru->hdr = (void *)(atsru + 1);
4340         memcpy(atsru->hdr, hdr, hdr->length);
4341         atsru->include_all = atsr->flags & 0x1;
4342         if (!atsru->include_all) {
4343                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4344                                 (void *)atsr + atsr->header.length,
4345                                 &atsru->devices_cnt);
4346                 if (atsru->devices_cnt && atsru->devices == NULL) {
4347                         kfree(atsru);
4348                         return -ENOMEM;
4349                 }
4350         }
4351
4352         list_add_rcu(&atsru->list, &dmar_atsr_units);
4353
4354         return 0;
4355 }
4356
4357 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4358 {
4359         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4360         kfree(atsru);
4361 }
4362
4363 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4364 {
4365         struct acpi_dmar_atsr *atsr;
4366         struct dmar_atsr_unit *atsru;
4367
4368         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4369         atsru = dmar_find_atsr(atsr);
4370         if (atsru) {
4371                 list_del_rcu(&atsru->list);
4372                 synchronize_rcu();
4373                 intel_iommu_free_atsr(atsru);
4374         }
4375
4376         return 0;
4377 }
4378
4379 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4380 {
4381         int i;
4382         struct device *dev;
4383         struct acpi_dmar_atsr *atsr;
4384         struct dmar_atsr_unit *atsru;
4385
4386         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4387         atsru = dmar_find_atsr(atsr);
4388         if (!atsru)
4389                 return 0;
4390
4391         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4392                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4393                                           i, dev)
4394                         return -EBUSY;
4395         }
4396
4397         return 0;
4398 }
4399
4400 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4401 {
4402         int sp, ret;
4403         struct intel_iommu *iommu = dmaru->iommu;
4404
4405         if (g_iommus[iommu->seq_id])
4406                 return 0;
4407
4408         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4409                 pr_warn("%s: Doesn't support hardware pass through.\n",
4410                         iommu->name);
4411                 return -ENXIO;
4412         }
4413         if (!ecap_sc_support(iommu->ecap) &&
4414             domain_update_iommu_snooping(iommu)) {
4415                 pr_warn("%s: Doesn't support snooping.\n",
4416                         iommu->name);
4417                 return -ENXIO;
4418         }
4419         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4420         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4421                 pr_warn("%s: Doesn't support large page.\n",
4422                         iommu->name);
4423                 return -ENXIO;
4424         }
4425
4426         /*
4427          * Disable translation if already enabled prior to OS handover.
4428          */
4429         if (iommu->gcmd & DMA_GCMD_TE)
4430                 iommu_disable_translation(iommu);
4431
4432         g_iommus[iommu->seq_id] = iommu;
4433         ret = iommu_init_domains(iommu);
4434         if (ret == 0)
4435                 ret = iommu_alloc_root_entry(iommu);
4436         if (ret)
4437                 goto out;
4438
4439         intel_svm_check(iommu);
4440
4441         if (dmaru->ignored) {
4442                 /*
4443                  * we always have to disable PMRs or DMA may fail on this device
4444                  */
4445                 if (force_on)
4446                         iommu_disable_protect_mem_regions(iommu);
4447                 return 0;
4448         }
4449
4450         intel_iommu_init_qi(iommu);
4451         iommu_flush_write_buffer(iommu);
4452
4453 #ifdef CONFIG_INTEL_IOMMU_SVM
4454         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4455                 ret = intel_svm_enable_prq(iommu);
4456                 if (ret)
4457                         goto disable_iommu;
4458         }
4459 #endif
4460         ret = dmar_set_interrupt(iommu);
4461         if (ret)
4462                 goto disable_iommu;
4463
4464         iommu_set_root_entry(iommu);
4465         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4466         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4467         iommu_enable_translation(iommu);
4468
4469         iommu_disable_protect_mem_regions(iommu);
4470         return 0;
4471
4472 disable_iommu:
4473         disable_dmar_iommu(iommu);
4474 out:
4475         free_dmar_iommu(iommu);
4476         return ret;
4477 }
4478
4479 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4480 {
4481         int ret = 0;
4482         struct intel_iommu *iommu = dmaru->iommu;
4483
4484         if (!intel_iommu_enabled)
4485                 return 0;
4486         if (iommu == NULL)
4487                 return -EINVAL;
4488
4489         if (insert) {
4490                 ret = intel_iommu_add(dmaru);
4491         } else {
4492                 disable_dmar_iommu(iommu);
4493                 free_dmar_iommu(iommu);
4494         }
4495
4496         return ret;
4497 }
4498
4499 static void intel_iommu_free_dmars(void)
4500 {
4501         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4502         struct dmar_atsr_unit *atsru, *atsr_n;
4503
4504         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4505                 list_del(&rmrru->list);
4506                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4507                 kfree(rmrru);
4508         }
4509
4510         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4511                 list_del(&atsru->list);
4512                 intel_iommu_free_atsr(atsru);
4513         }
4514 }
4515
4516 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4517 {
4518         int i, ret = 1;
4519         struct pci_bus *bus;
4520         struct pci_dev *bridge = NULL;
4521         struct device *tmp;
4522         struct acpi_dmar_atsr *atsr;
4523         struct dmar_atsr_unit *atsru;
4524
4525         dev = pci_physfn(dev);
4526         for (bus = dev->bus; bus; bus = bus->parent) {
4527                 bridge = bus->self;
4528                 /* If it's an integrated device, allow ATS */
4529                 if (!bridge)
4530                         return 1;
4531                 /* Connected via non-PCIe: no ATS */
4532                 if (!pci_is_pcie(bridge) ||
4533                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4534                         return 0;
4535                 /* If we found the root port, look it up in the ATSR */
4536                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4537                         break;
4538         }
4539
4540         rcu_read_lock();
4541         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4542                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4543                 if (atsr->segment != pci_domain_nr(dev->bus))
4544                         continue;
4545
4546                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4547                         if (tmp == &bridge->dev)
4548                                 goto out;
4549
4550                 if (atsru->include_all)
4551                         goto out;
4552         }
4553         ret = 0;
4554 out:
4555         rcu_read_unlock();
4556
4557         return ret;
4558 }
4559
4560 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4561 {
4562         int ret;
4563         struct dmar_rmrr_unit *rmrru;
4564         struct dmar_atsr_unit *atsru;
4565         struct acpi_dmar_atsr *atsr;
4566         struct acpi_dmar_reserved_memory *rmrr;
4567
4568         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4569                 return 0;
4570
4571         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4572                 rmrr = container_of(rmrru->hdr,
4573                                     struct acpi_dmar_reserved_memory, header);
4574                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4575                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4576                                 ((void *)rmrr) + rmrr->header.length,
4577                                 rmrr->segment, rmrru->devices,
4578                                 rmrru->devices_cnt);
4579                         if (ret < 0)
4580                                 return ret;
4581                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4582                         dmar_remove_dev_scope(info, rmrr->segment,
4583                                 rmrru->devices, rmrru->devices_cnt);
4584                 }
4585         }
4586
4587         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4588                 if (atsru->include_all)
4589                         continue;
4590
4591                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4592                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4593                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4594                                         (void *)atsr + atsr->header.length,
4595                                         atsr->segment, atsru->devices,
4596                                         atsru->devices_cnt);
4597                         if (ret > 0)
4598                                 break;
4599                         else if (ret < 0)
4600                                 return ret;
4601                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4602                         if (dmar_remove_dev_scope(info, atsr->segment,
4603                                         atsru->devices, atsru->devices_cnt))
4604                                 break;
4605                 }
4606         }
4607
4608         return 0;
4609 }
4610
4611 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4612                                        unsigned long val, void *v)
4613 {
4614         struct memory_notify *mhp = v;
4615         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4616         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4617                         mhp->nr_pages - 1);
4618
4619         switch (val) {
4620         case MEM_GOING_ONLINE:
4621                 if (iommu_domain_identity_map(si_domain,
4622                                               start_vpfn, last_vpfn)) {
4623                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
4624                                 start_vpfn, last_vpfn);
4625                         return NOTIFY_BAD;
4626                 }
4627                 break;
4628
4629         case MEM_OFFLINE:
4630         case MEM_CANCEL_ONLINE:
4631                 {
4632                         struct dmar_drhd_unit *drhd;
4633                         struct intel_iommu *iommu;
4634                         struct page *freelist;
4635
4636                         freelist = domain_unmap(si_domain,
4637                                                 start_vpfn, last_vpfn);
4638
4639                         rcu_read_lock();
4640                         for_each_active_iommu(iommu, drhd)
4641                                 iommu_flush_iotlb_psi(iommu, si_domain,
4642                                         start_vpfn, mhp->nr_pages,
4643                                         !freelist, 0);
4644                         rcu_read_unlock();
4645                         dma_free_pagelist(freelist);
4646                 }
4647                 break;
4648         }
4649
4650         return NOTIFY_OK;
4651 }
4652
4653 static struct notifier_block intel_iommu_memory_nb = {
4654         .notifier_call = intel_iommu_memory_notifier,
4655         .priority = 0
4656 };
4657
4658 static void free_all_cpu_cached_iovas(unsigned int cpu)
4659 {
4660         int i;
4661
4662         for (i = 0; i < g_num_of_iommus; i++) {
4663                 struct intel_iommu *iommu = g_iommus[i];
4664                 struct dmar_domain *domain;
4665                 int did;
4666
4667                 if (!iommu)
4668                         continue;
4669
4670                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4671                         domain = get_iommu_domain(iommu, (u16)did);
4672
4673                         if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4674                                 continue;
4675
4676                         free_cpu_cached_iovas(cpu, &domain->iovad);
4677                 }
4678         }
4679 }
4680
4681 static int intel_iommu_cpu_dead(unsigned int cpu)
4682 {
4683         free_all_cpu_cached_iovas(cpu);
4684         return 0;
4685 }
4686
4687 static void intel_disable_iommus(void)
4688 {
4689         struct intel_iommu *iommu = NULL;
4690         struct dmar_drhd_unit *drhd;
4691
4692         for_each_iommu(iommu, drhd)
4693                 iommu_disable_translation(iommu);
4694 }
4695
4696 void intel_iommu_shutdown(void)
4697 {
4698         struct dmar_drhd_unit *drhd;
4699         struct intel_iommu *iommu = NULL;
4700
4701         if (no_iommu || dmar_disabled)
4702                 return;
4703
4704         down_write(&dmar_global_lock);
4705
4706         /* Disable PMRs explicitly here. */
4707         for_each_iommu(iommu, drhd)
4708                 iommu_disable_protect_mem_regions(iommu);
4709
4710         /* Make sure the IOMMUs are switched off */
4711         intel_disable_iommus();
4712
4713         up_write(&dmar_global_lock);
4714 }
4715
4716 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4717 {
4718         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4719
4720         return container_of(iommu_dev, struct intel_iommu, iommu);
4721 }
4722
4723 static ssize_t intel_iommu_show_version(struct device *dev,
4724                                         struct device_attribute *attr,
4725                                         char *buf)
4726 {
4727         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4728         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4729         return sprintf(buf, "%d:%d\n",
4730                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4731 }
4732 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4733
4734 static ssize_t intel_iommu_show_address(struct device *dev,
4735                                         struct device_attribute *attr,
4736                                         char *buf)
4737 {
4738         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4739         return sprintf(buf, "%llx\n", iommu->reg_phys);
4740 }
4741 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4742
4743 static ssize_t intel_iommu_show_cap(struct device *dev,
4744                                     struct device_attribute *attr,
4745                                     char *buf)
4746 {
4747         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4748         return sprintf(buf, "%llx\n", iommu->cap);
4749 }
4750 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4751
4752 static ssize_t intel_iommu_show_ecap(struct device *dev,
4753                                     struct device_attribute *attr,
4754                                     char *buf)
4755 {
4756         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4757         return sprintf(buf, "%llx\n", iommu->ecap);
4758 }
4759 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4760
4761 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4762                                       struct device_attribute *attr,
4763                                       char *buf)
4764 {
4765         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4766         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4767 }
4768 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4769
4770 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4771                                            struct device_attribute *attr,
4772                                            char *buf)
4773 {
4774         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4775         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4776                                                   cap_ndoms(iommu->cap)));
4777 }
4778 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4779
4780 static struct attribute *intel_iommu_attrs[] = {
4781         &dev_attr_version.attr,
4782         &dev_attr_address.attr,
4783         &dev_attr_cap.attr,
4784         &dev_attr_ecap.attr,
4785         &dev_attr_domains_supported.attr,
4786         &dev_attr_domains_used.attr,
4787         NULL,
4788 };
4789
4790 static struct attribute_group intel_iommu_group = {
4791         .name = "intel-iommu",
4792         .attrs = intel_iommu_attrs,
4793 };
4794
4795 const struct attribute_group *intel_iommu_groups[] = {
4796         &intel_iommu_group,
4797         NULL,
4798 };
4799
4800 static inline bool has_external_pci(void)
4801 {
4802         struct pci_dev *pdev = NULL;
4803
4804         for_each_pci_dev(pdev)
4805                 if (pdev->external_facing)
4806                         return true;
4807
4808         return false;
4809 }
4810
4811 static int __init platform_optin_force_iommu(void)
4812 {
4813         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4814                 return 0;
4815
4816         if (no_iommu || dmar_disabled)
4817                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4818
4819         /*
4820          * If Intel-IOMMU is disabled by default, we will apply identity
4821          * map for all devices except those marked as being untrusted.
4822          */
4823         if (dmar_disabled)
4824                 iommu_set_default_passthrough(false);
4825
4826         dmar_disabled = 0;
4827         no_iommu = 0;
4828
4829         return 1;
4830 }
4831
4832 static int __init probe_acpi_namespace_devices(void)
4833 {
4834         struct dmar_drhd_unit *drhd;
4835         /* To avoid a -Wunused-but-set-variable warning. */
4836         struct intel_iommu *iommu __maybe_unused;
4837         struct device *dev;
4838         int i, ret = 0;
4839
4840         for_each_active_iommu(iommu, drhd) {
4841                 for_each_active_dev_scope(drhd->devices,
4842                                           drhd->devices_cnt, i, dev) {
4843                         struct acpi_device_physical_node *pn;
4844                         struct iommu_group *group;
4845                         struct acpi_device *adev;
4846
4847                         if (dev->bus != &acpi_bus_type)
4848                                 continue;
4849
4850                         adev = to_acpi_device(dev);
4851                         mutex_lock(&adev->physical_node_lock);
4852                         list_for_each_entry(pn,
4853                                             &adev->physical_node_list, node) {
4854                                 group = iommu_group_get(pn->dev);
4855                                 if (group) {
4856                                         iommu_group_put(group);
4857                                         continue;
4858                                 }
4859
4860                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4861                                 ret = iommu_probe_device(pn->dev);
4862                                 if (ret)
4863                                         break;
4864                         }
4865                         mutex_unlock(&adev->physical_node_lock);
4866
4867                         if (ret)
4868                                 return ret;
4869                 }
4870         }
4871
4872         return 0;
4873 }
4874
4875 int __init intel_iommu_init(void)
4876 {
4877         int ret = -ENODEV;
4878         struct dmar_drhd_unit *drhd;
4879         struct intel_iommu *iommu;
4880
4881         /*
4882          * Intel IOMMU is required for a TXT/tboot launch or platform
4883          * opt in, so enforce that.
4884          */
4885         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4886
4887         if (iommu_init_mempool()) {
4888                 if (force_on)
4889                         panic("tboot: Failed to initialize iommu memory\n");
4890                 return -ENOMEM;
4891         }
4892
4893         down_write(&dmar_global_lock);
4894         if (dmar_table_init()) {
4895                 if (force_on)
4896                         panic("tboot: Failed to initialize DMAR table\n");
4897                 goto out_free_dmar;
4898         }
4899
4900         if (dmar_dev_scope_init() < 0) {
4901                 if (force_on)
4902                         panic("tboot: Failed to initialize DMAR device scope\n");
4903                 goto out_free_dmar;
4904         }
4905
4906         up_write(&dmar_global_lock);
4907
4908         /*
4909          * The bus notifier takes the dmar_global_lock, so lockdep will
4910          * complain later when we register it under the lock.
4911          */
4912         dmar_register_bus_notifier();
4913
4914         down_write(&dmar_global_lock);
4915
4916         if (!no_iommu)
4917                 intel_iommu_debugfs_init();
4918
4919         if (no_iommu || dmar_disabled) {
4920                 /*
4921                  * We exit the function here to ensure IOMMU's remapping and
4922                  * mempool aren't setup, which means that the IOMMU's PMRs
4923                  * won't be disabled via the call to init_dmars(). So disable
4924                  * it explicitly here. The PMRs were setup by tboot prior to
4925                  * calling SENTER, but the kernel is expected to reset/tear
4926                  * down the PMRs.
4927                  */
4928                 if (intel_iommu_tboot_noforce) {
4929                         for_each_iommu(iommu, drhd)
4930                                 iommu_disable_protect_mem_regions(iommu);
4931                 }
4932
4933                 /*
4934                  * Make sure the IOMMUs are switched off, even when we
4935                  * boot into a kexec kernel and the previous kernel left
4936                  * them enabled
4937                  */
4938                 intel_disable_iommus();
4939                 goto out_free_dmar;
4940         }
4941
4942         if (list_empty(&dmar_rmrr_units))
4943                 pr_info("No RMRR found\n");
4944
4945         if (list_empty(&dmar_atsr_units))
4946                 pr_info("No ATSR found\n");
4947
4948         if (dmar_init_reserved_ranges()) {
4949                 if (force_on)
4950                         panic("tboot: Failed to reserve iommu ranges\n");
4951                 goto out_free_reserved_range;
4952         }
4953
4954         if (dmar_map_gfx)
4955                 intel_iommu_gfx_mapped = 1;
4956
4957         init_no_remapping_devices();
4958
4959         ret = init_dmars();
4960         if (ret) {
4961                 if (force_on)
4962                         panic("tboot: Failed to initialize DMARs\n");
4963                 pr_err("Initialization failed\n");
4964                 goto out_free_reserved_range;
4965         }
4966         up_write(&dmar_global_lock);
4967
4968         init_iommu_pm_ops();
4969
4970         down_read(&dmar_global_lock);
4971         for_each_active_iommu(iommu, drhd) {
4972                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4973                                        intel_iommu_groups,
4974                                        "%s", iommu->name);
4975                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4976                 iommu_device_register(&iommu->iommu);
4977         }
4978         up_read(&dmar_global_lock);
4979
4980         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4981         if (si_domain && !hw_pass_through)
4982                 register_memory_notifier(&intel_iommu_memory_nb);
4983         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4984                           intel_iommu_cpu_dead);
4985
4986         down_read(&dmar_global_lock);
4987         if (probe_acpi_namespace_devices())
4988                 pr_warn("ACPI name space devices didn't probe correctly\n");
4989
4990         /* Finally, we enable the DMA remapping hardware. */
4991         for_each_iommu(iommu, drhd) {
4992                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4993                         iommu_enable_translation(iommu);
4994
4995                 iommu_disable_protect_mem_regions(iommu);
4996         }
4997         up_read(&dmar_global_lock);
4998
4999         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5000
5001         intel_iommu_enabled = 1;
5002
5003         return 0;
5004
5005 out_free_reserved_range:
5006         put_iova_domain(&reserved_iova_list);
5007 out_free_dmar:
5008         intel_iommu_free_dmars();
5009         up_write(&dmar_global_lock);
5010         iommu_exit_mempool();
5011         return ret;
5012 }
5013
5014 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5015 {
5016         struct intel_iommu *iommu = opaque;
5017
5018         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5019         return 0;
5020 }
5021
5022 /*
5023  * NB - intel-iommu lacks any sort of reference counting for the users of
5024  * dependent devices.  If multiple endpoints have intersecting dependent
5025  * devices, unbinding the driver from any one of them will possibly leave
5026  * the others unable to operate.
5027  */
5028 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5029 {
5030         if (!iommu || !dev || !dev_is_pci(dev))
5031                 return;
5032
5033         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5034 }
5035
5036 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5037 {
5038         struct dmar_domain *domain;
5039         struct intel_iommu *iommu;
5040         unsigned long flags;
5041
5042         assert_spin_locked(&device_domain_lock);
5043
5044         if (WARN_ON(!info))
5045                 return;
5046
5047         iommu = info->iommu;
5048         domain = info->domain;
5049
5050         if (info->dev) {
5051                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5052                         intel_pasid_tear_down_entry(iommu, info->dev,
5053                                         PASID_RID2PASID, false);
5054
5055                 iommu_disable_dev_iotlb(info);
5056                 if (!dev_is_real_dma_subdevice(info->dev))
5057                         domain_context_clear(iommu, info->dev);
5058                 intel_pasid_free_table(info->dev);
5059         }
5060
5061         unlink_domain_info(info);
5062
5063         spin_lock_irqsave(&iommu->lock, flags);
5064         domain_detach_iommu(domain, iommu);
5065         spin_unlock_irqrestore(&iommu->lock, flags);
5066
5067         free_devinfo_mem(info);
5068 }
5069
5070 static void dmar_remove_one_dev_info(struct device *dev)
5071 {
5072         struct device_domain_info *info;
5073         unsigned long flags;
5074
5075         spin_lock_irqsave(&device_domain_lock, flags);
5076         info = get_domain_info(dev);
5077         if (info)
5078                 __dmar_remove_one_dev_info(info);
5079         spin_unlock_irqrestore(&device_domain_lock, flags);
5080 }
5081
5082 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5083 {
5084         int adjust_width;
5085
5086         /* calculate AGAW */
5087         domain->gaw = guest_width;
5088         adjust_width = guestwidth_to_adjustwidth(guest_width);
5089         domain->agaw = width_to_agaw(adjust_width);
5090
5091         domain->iommu_coherency = 0;
5092         domain->iommu_snooping = 0;
5093         domain->iommu_superpage = 0;
5094         domain->max_addr = 0;
5095
5096         /* always allocate the top pgd */
5097         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5098         if (!domain->pgd)
5099                 return -ENOMEM;
5100         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5101         return 0;
5102 }
5103
5104 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5105 {
5106         init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5107         copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5108
5109         if (!intel_iommu_strict &&
5110             init_iova_flush_queue(&dmar_domain->iovad,
5111                                   iommu_flush_iova, iova_entry_free))
5112                 pr_info("iova flush queue initialization failed\n");
5113 }
5114
5115 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5116 {
5117         struct dmar_domain *dmar_domain;
5118         struct iommu_domain *domain;
5119
5120         switch (type) {
5121         case IOMMU_DOMAIN_DMA:
5122         case IOMMU_DOMAIN_UNMANAGED:
5123                 dmar_domain = alloc_domain(0);
5124                 if (!dmar_domain) {
5125                         pr_err("Can't allocate dmar_domain\n");
5126                         return NULL;
5127                 }
5128                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5129                         pr_err("Domain initialization failed\n");
5130                         domain_exit(dmar_domain);
5131                         return NULL;
5132                 }
5133
5134                 if (type == IOMMU_DOMAIN_DMA)
5135                         intel_init_iova_domain(dmar_domain);
5136
5137                 domain = &dmar_domain->domain;
5138                 domain->geometry.aperture_start = 0;
5139                 domain->geometry.aperture_end   =
5140                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5141                 domain->geometry.force_aperture = true;
5142
5143                 return domain;
5144         case IOMMU_DOMAIN_IDENTITY:
5145                 return &si_domain->domain;
5146         default:
5147                 return NULL;
5148         }
5149
5150         return NULL;
5151 }
5152
5153 static void intel_iommu_domain_free(struct iommu_domain *domain)
5154 {
5155         if (domain != &si_domain->domain)
5156                 domain_exit(to_dmar_domain(domain));
5157 }
5158
5159 /*
5160  * Check whether a @domain could be attached to the @dev through the
5161  * aux-domain attach/detach APIs.
5162  */
5163 static inline bool
5164 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5165 {
5166         struct device_domain_info *info = get_domain_info(dev);
5167
5168         return info && info->auxd_enabled &&
5169                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5170 }
5171
5172 static void auxiliary_link_device(struct dmar_domain *domain,
5173                                   struct device *dev)
5174 {
5175         struct device_domain_info *info = get_domain_info(dev);
5176
5177         assert_spin_locked(&device_domain_lock);
5178         if (WARN_ON(!info))
5179                 return;
5180
5181         domain->auxd_refcnt++;
5182         list_add(&domain->auxd, &info->auxiliary_domains);
5183 }
5184
5185 static void auxiliary_unlink_device(struct dmar_domain *domain,
5186                                     struct device *dev)
5187 {
5188         struct device_domain_info *info = get_domain_info(dev);
5189
5190         assert_spin_locked(&device_domain_lock);
5191         if (WARN_ON(!info))
5192                 return;
5193
5194         list_del(&domain->auxd);
5195         domain->auxd_refcnt--;
5196
5197         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5198                 ioasid_free(domain->default_pasid);
5199 }
5200
5201 static int aux_domain_add_dev(struct dmar_domain *domain,
5202                               struct device *dev)
5203 {
5204         int ret;
5205         unsigned long flags;
5206         struct intel_iommu *iommu;
5207
5208         iommu = device_to_iommu(dev, NULL, NULL);
5209         if (!iommu)
5210                 return -ENODEV;
5211
5212         if (domain->default_pasid <= 0) {
5213                 u32 pasid;
5214
5215                 /* No private data needed for the default pasid */
5216                 pasid = ioasid_alloc(NULL, PASID_MIN,
5217                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5218                                      NULL);
5219                 if (pasid == INVALID_IOASID) {
5220                         pr_err("Can't allocate default pasid\n");
5221                         return -ENODEV;
5222                 }
5223                 domain->default_pasid = pasid;
5224         }
5225
5226         spin_lock_irqsave(&device_domain_lock, flags);
5227         /*
5228          * iommu->lock must be held to attach domain to iommu and setup the
5229          * pasid entry for second level translation.
5230          */
5231         spin_lock(&iommu->lock);
5232         ret = domain_attach_iommu(domain, iommu);
5233         if (ret)
5234                 goto attach_failed;
5235
5236         /* Setup the PASID entry for mediated devices: */
5237         if (domain_use_first_level(domain))
5238                 ret = domain_setup_first_level(iommu, domain, dev,
5239                                                domain->default_pasid);
5240         else
5241                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5242                                                      domain->default_pasid);
5243         if (ret)
5244                 goto table_failed;
5245         spin_unlock(&iommu->lock);
5246
5247         auxiliary_link_device(domain, dev);
5248
5249         spin_unlock_irqrestore(&device_domain_lock, flags);
5250
5251         return 0;
5252
5253 table_failed:
5254         domain_detach_iommu(domain, iommu);
5255 attach_failed:
5256         spin_unlock(&iommu->lock);
5257         spin_unlock_irqrestore(&device_domain_lock, flags);
5258         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5259                 ioasid_free(domain->default_pasid);
5260
5261         return ret;
5262 }
5263
5264 static void aux_domain_remove_dev(struct dmar_domain *domain,
5265                                   struct device *dev)
5266 {
5267         struct device_domain_info *info;
5268         struct intel_iommu *iommu;
5269         unsigned long flags;
5270
5271         if (!is_aux_domain(dev, &domain->domain))
5272                 return;
5273
5274         spin_lock_irqsave(&device_domain_lock, flags);
5275         info = get_domain_info(dev);
5276         iommu = info->iommu;
5277
5278         auxiliary_unlink_device(domain, dev);
5279
5280         spin_lock(&iommu->lock);
5281         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5282         domain_detach_iommu(domain, iommu);
5283         spin_unlock(&iommu->lock);
5284
5285         spin_unlock_irqrestore(&device_domain_lock, flags);
5286 }
5287
5288 static int prepare_domain_attach_device(struct iommu_domain *domain,
5289                                         struct device *dev)
5290 {
5291         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5292         struct intel_iommu *iommu;
5293         int addr_width;
5294
5295         iommu = device_to_iommu(dev, NULL, NULL);
5296         if (!iommu)
5297                 return -ENODEV;
5298
5299         /* check if this iommu agaw is sufficient for max mapped address */
5300         addr_width = agaw_to_width(iommu->agaw);
5301         if (addr_width > cap_mgaw(iommu->cap))
5302                 addr_width = cap_mgaw(iommu->cap);
5303
5304         if (dmar_domain->max_addr > (1LL << addr_width)) {
5305                 dev_err(dev, "%s: iommu width (%d) is not "
5306                         "sufficient for the mapped address (%llx)\n",
5307                         __func__, addr_width, dmar_domain->max_addr);
5308                 return -EFAULT;
5309         }
5310         dmar_domain->gaw = addr_width;
5311
5312         /*
5313          * Knock out extra levels of page tables if necessary
5314          */
5315         while (iommu->agaw < dmar_domain->agaw) {
5316                 struct dma_pte *pte;
5317
5318                 pte = dmar_domain->pgd;
5319                 if (dma_pte_present(pte)) {
5320                         dmar_domain->pgd = (struct dma_pte *)
5321                                 phys_to_virt(dma_pte_addr(pte));
5322                         free_pgtable_page(pte);
5323                 }
5324                 dmar_domain->agaw--;
5325         }
5326
5327         return 0;
5328 }
5329
5330 static int intel_iommu_attach_device(struct iommu_domain *domain,
5331                                      struct device *dev)
5332 {
5333         int ret;
5334
5335         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5336             device_is_rmrr_locked(dev)) {
5337                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5338                 return -EPERM;
5339         }
5340
5341         if (is_aux_domain(dev, domain))
5342                 return -EPERM;
5343
5344         /* normally dev is not mapped */
5345         if (unlikely(domain_context_mapped(dev))) {
5346                 struct dmar_domain *old_domain;
5347
5348                 old_domain = find_domain(dev);
5349                 if (old_domain)
5350                         dmar_remove_one_dev_info(dev);
5351         }
5352
5353         ret = prepare_domain_attach_device(domain, dev);
5354         if (ret)
5355                 return ret;
5356
5357         return domain_add_dev_info(to_dmar_domain(domain), dev);
5358 }
5359
5360 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5361                                          struct device *dev)
5362 {
5363         int ret;
5364
5365         if (!is_aux_domain(dev, domain))
5366                 return -EPERM;
5367
5368         ret = prepare_domain_attach_device(domain, dev);
5369         if (ret)
5370                 return ret;
5371
5372         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5373 }
5374
5375 static void intel_iommu_detach_device(struct iommu_domain *domain,
5376                                       struct device *dev)
5377 {
5378         dmar_remove_one_dev_info(dev);
5379 }
5380
5381 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5382                                           struct device *dev)
5383 {
5384         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5385 }
5386
5387 /*
5388  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5389  * VT-d granularity. Invalidation is typically included in the unmap operation
5390  * as a result of DMA or VFIO unmap. However, for assigned devices guest
5391  * owns the first level page tables. Invalidations of translation caches in the
5392  * guest are trapped and passed down to the host.
5393  *
5394  * vIOMMU in the guest will only expose first level page tables, therefore
5395  * we do not support IOTLB granularity for request without PASID (second level).
5396  *
5397  * For example, to find the VT-d granularity encoding for IOTLB
5398  * type and page selective granularity within PASID:
5399  * X: indexed by iommu cache type
5400  * Y: indexed by enum iommu_inv_granularity
5401  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5402  */
5403
5404 static const int
5405 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5406         /*
5407          * PASID based IOTLB invalidation: PASID selective (per PASID),
5408          * page selective (address granularity)
5409          */
5410         {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5411         /* PASID based dev TLBs */
5412         {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5413         /* PASID cache */
5414         {-EINVAL, -EINVAL, -EINVAL}
5415 };
5416
5417 static inline int to_vtd_granularity(int type, int granu)
5418 {
5419         return inv_type_granu_table[type][granu];
5420 }
5421
5422 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5423 {
5424         u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5425
5426         /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
5427          * IOMMU cache invalidate API passes granu_size in bytes, and number of
5428          * granu size in contiguous memory.
5429          */
5430         return order_base_2(nr_pages);
5431 }
5432
5433 #ifdef CONFIG_INTEL_IOMMU_SVM
5434 static int
5435 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5436                            struct iommu_cache_invalidate_info *inv_info)
5437 {
5438         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5439         struct device_domain_info *info;
5440         struct intel_iommu *iommu;
5441         unsigned long flags;
5442         int cache_type;
5443         u8 bus, devfn;
5444         u16 did, sid;
5445         int ret = 0;
5446         u64 size = 0;
5447
5448         if (!inv_info || !dmar_domain)
5449                 return -EINVAL;
5450
5451         if (!dev || !dev_is_pci(dev))
5452                 return -ENODEV;
5453
5454         iommu = device_to_iommu(dev, &bus, &devfn);
5455         if (!iommu)
5456                 return -ENODEV;
5457
5458         if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5459                 return -EINVAL;
5460
5461         spin_lock_irqsave(&device_domain_lock, flags);
5462         spin_lock(&iommu->lock);
5463         info = get_domain_info(dev);
5464         if (!info) {
5465                 ret = -EINVAL;
5466                 goto out_unlock;
5467         }
5468         did = dmar_domain->iommu_did[iommu->seq_id];
5469         sid = PCI_DEVID(bus, devfn);
5470
5471         /* Size is only valid in address selective invalidation */
5472         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5473                 size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5474                                    inv_info->granu.addr_info.nb_granules);
5475
5476         for_each_set_bit(cache_type,
5477                          (unsigned long *)&inv_info->cache,
5478                          IOMMU_CACHE_INV_TYPE_NR) {
5479                 int granu = 0;
5480                 u64 pasid = 0;
5481                 u64 addr = 0;
5482
5483                 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5484                 if (granu == -EINVAL) {
5485                         pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5486                                            cache_type, inv_info->granularity);
5487                         break;
5488                 }
5489
5490                 /*
5491                  * PASID is stored in different locations based on the
5492                  * granularity.
5493                  */
5494                 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5495                     (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5496                         pasid = inv_info->granu.pasid_info.pasid;
5497                 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5498                          (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5499                         pasid = inv_info->granu.addr_info.pasid;
5500
5501                 switch (BIT(cache_type)) {
5502                 case IOMMU_CACHE_INV_TYPE_IOTLB:
5503                         /* HW will ignore LSB bits based on address mask */
5504                         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5505                             size &&
5506                             (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5507                                 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5508                                                    inv_info->granu.addr_info.addr, size);
5509                         }
5510
5511                         /*
5512                          * If granu is PASID-selective, address is ignored.
5513                          * We use npages = -1 to indicate that.
5514                          */
5515                         qi_flush_piotlb(iommu, did, pasid,
5516                                         mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5517                                         (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5518                                         inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5519
5520                         if (!info->ats_enabled)
5521                                 break;
5522                         /*
5523                          * Always flush device IOTLB if ATS is enabled. vIOMMU
5524                          * in the guest may assume IOTLB flush is inclusive,
5525                          * which is more efficient.
5526                          */
5527                         fallthrough;
5528                 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5529                         /*
5530                          * PASID based device TLB invalidation does not support
5531                          * IOMMU_INV_GRANU_PASID granularity but only supports
5532                          * IOMMU_INV_GRANU_ADDR.
5533                          * The equivalent of that is we set the size to be the
5534                          * entire range of 64 bit. User only provides PASID info
5535                          * without address info. So we set addr to 0.
5536                          */
5537                         if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5538                                 size = 64 - VTD_PAGE_SHIFT;
5539                                 addr = 0;
5540                         } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5541                                 addr = inv_info->granu.addr_info.addr;
5542                         }
5543
5544                         if (info->ats_enabled)
5545                                 qi_flush_dev_iotlb_pasid(iommu, sid,
5546                                                 info->pfsid, pasid,
5547                                                 info->ats_qdep, addr,
5548                                                 size);
5549                         else
5550                                 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5551                         break;
5552                 default:
5553                         dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5554                                             cache_type);
5555                         ret = -EINVAL;
5556                 }
5557         }
5558 out_unlock:
5559         spin_unlock(&iommu->lock);
5560         spin_unlock_irqrestore(&device_domain_lock, flags);
5561
5562         return ret;
5563 }
5564 #endif
5565
5566 static int intel_iommu_map(struct iommu_domain *domain,
5567                            unsigned long iova, phys_addr_t hpa,
5568                            size_t size, int iommu_prot, gfp_t gfp)
5569 {
5570         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5571         u64 max_addr;
5572         int prot = 0;
5573         int ret;
5574
5575         if (iommu_prot & IOMMU_READ)
5576                 prot |= DMA_PTE_READ;
5577         if (iommu_prot & IOMMU_WRITE)
5578                 prot |= DMA_PTE_WRITE;
5579         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5580                 prot |= DMA_PTE_SNP;
5581
5582         max_addr = iova + size;
5583         if (dmar_domain->max_addr < max_addr) {
5584                 u64 end;
5585
5586                 /* check if minimum agaw is sufficient for mapped address */
5587                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5588                 if (end < max_addr) {
5589                         pr_err("%s: iommu width (%d) is not "
5590                                "sufficient for the mapped address (%llx)\n",
5591                                __func__, dmar_domain->gaw, max_addr);
5592                         return -EFAULT;
5593                 }
5594                 dmar_domain->max_addr = max_addr;
5595         }
5596         /* Round up size to next multiple of PAGE_SIZE, if it and
5597            the low bits of hpa would take us onto the next page */
5598         size = aligned_nrpages(hpa, size);
5599         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5600                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5601         return ret;
5602 }
5603
5604 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5605                                 unsigned long iova, size_t size,
5606                                 struct iommu_iotlb_gather *gather)
5607 {
5608         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5609         struct page *freelist = NULL;
5610         unsigned long start_pfn, last_pfn;
5611         unsigned int npages;
5612         int iommu_id, level = 0;
5613
5614         /* Cope with horrid API which requires us to unmap more than the
5615            size argument if it happens to be a large-page mapping. */
5616         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5617
5618         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5619                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5620
5621         start_pfn = iova >> VTD_PAGE_SHIFT;
5622         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5623
5624         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5625
5626         npages = last_pfn - start_pfn + 1;
5627
5628         for_each_domain_iommu(iommu_id, dmar_domain)
5629                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5630                                       start_pfn, npages, !freelist, 0);
5631
5632         dma_free_pagelist(freelist);
5633
5634         if (dmar_domain->max_addr == iova + size)
5635                 dmar_domain->max_addr = iova;
5636
5637         return size;
5638 }
5639
5640 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5641                                             dma_addr_t iova)
5642 {
5643         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5644         struct dma_pte *pte;
5645         int level = 0;
5646         u64 phys = 0;
5647
5648         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5649         if (pte && dma_pte_present(pte))
5650                 phys = dma_pte_addr(pte) +
5651                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5652                                                 VTD_PAGE_SHIFT) - 1));
5653
5654         return phys;
5655 }
5656
5657 static inline bool scalable_mode_support(void)
5658 {
5659         struct dmar_drhd_unit *drhd;
5660         struct intel_iommu *iommu;
5661         bool ret = true;
5662
5663         rcu_read_lock();
5664         for_each_active_iommu(iommu, drhd) {
5665                 if (!sm_supported(iommu)) {
5666                         ret = false;
5667                         break;
5668                 }
5669         }
5670         rcu_read_unlock();
5671
5672         return ret;
5673 }
5674
5675 static inline bool iommu_pasid_support(void)
5676 {
5677         struct dmar_drhd_unit *drhd;
5678         struct intel_iommu *iommu;
5679         bool ret = true;
5680
5681         rcu_read_lock();
5682         for_each_active_iommu(iommu, drhd) {
5683                 if (!pasid_supported(iommu)) {
5684                         ret = false;
5685                         break;
5686                 }
5687         }
5688         rcu_read_unlock();
5689
5690         return ret;
5691 }
5692
5693 static inline bool nested_mode_support(void)
5694 {
5695         struct dmar_drhd_unit *drhd;
5696         struct intel_iommu *iommu;
5697         bool ret = true;
5698
5699         rcu_read_lock();
5700         for_each_active_iommu(iommu, drhd) {
5701                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5702                         ret = false;
5703                         break;
5704                 }
5705         }
5706         rcu_read_unlock();
5707
5708         return ret;
5709 }
5710
5711 static bool intel_iommu_capable(enum iommu_cap cap)
5712 {
5713         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5714                 return domain_update_iommu_snooping(NULL) == 1;
5715         if (cap == IOMMU_CAP_INTR_REMAP)
5716                 return irq_remapping_enabled == 1;
5717
5718         return false;
5719 }
5720
5721 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5722 {
5723         struct intel_iommu *iommu;
5724
5725         iommu = device_to_iommu(dev, NULL, NULL);
5726         if (!iommu)
5727                 return ERR_PTR(-ENODEV);
5728
5729         if (translation_pre_enabled(iommu))
5730                 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5731
5732         return &iommu->iommu;
5733 }
5734
5735 static void intel_iommu_release_device(struct device *dev)
5736 {
5737         struct intel_iommu *iommu;
5738
5739         iommu = device_to_iommu(dev, NULL, NULL);
5740         if (!iommu)
5741                 return;
5742
5743         dmar_remove_one_dev_info(dev);
5744
5745         set_dma_ops(dev, NULL);
5746 }
5747
5748 static void intel_iommu_probe_finalize(struct device *dev)
5749 {
5750         struct iommu_domain *domain;
5751
5752         domain = iommu_get_domain_for_dev(dev);
5753         if (device_needs_bounce(dev))
5754                 set_dma_ops(dev, &bounce_dma_ops);
5755         else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5756                 set_dma_ops(dev, &intel_dma_ops);
5757         else
5758                 set_dma_ops(dev, NULL);
5759 }
5760
5761 static void intel_iommu_get_resv_regions(struct device *device,
5762                                          struct list_head *head)
5763 {
5764         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5765         struct iommu_resv_region *reg;
5766         struct dmar_rmrr_unit *rmrr;
5767         struct device *i_dev;
5768         int i;
5769
5770         down_read(&dmar_global_lock);
5771         for_each_rmrr_units(rmrr) {
5772                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5773                                           i, i_dev) {
5774                         struct iommu_resv_region *resv;
5775                         enum iommu_resv_type type;
5776                         size_t length;
5777
5778                         if (i_dev != device &&
5779                             !is_downstream_to_pci_bridge(device, i_dev))
5780                                 continue;
5781
5782                         length = rmrr->end_address - rmrr->base_address + 1;
5783
5784                         type = device_rmrr_is_relaxable(device) ?
5785                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5786
5787                         resv = iommu_alloc_resv_region(rmrr->base_address,
5788                                                        length, prot, type);
5789                         if (!resv)
5790                                 break;
5791
5792                         list_add_tail(&resv->list, head);
5793                 }
5794         }
5795         up_read(&dmar_global_lock);
5796
5797 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5798         if (dev_is_pci(device)) {
5799                 struct pci_dev *pdev = to_pci_dev(device);
5800
5801                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5802                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5803                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5804                         if (reg)
5805                                 list_add_tail(&reg->list, head);
5806                 }
5807         }
5808 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5809
5810         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5811                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5812                                       0, IOMMU_RESV_MSI);
5813         if (!reg)
5814                 return;
5815         list_add_tail(&reg->list, head);
5816 }
5817
5818 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5819 {
5820         struct device_domain_info *info;
5821         struct context_entry *context;
5822         struct dmar_domain *domain;
5823         unsigned long flags;
5824         u64 ctx_lo;
5825         int ret;
5826
5827         domain = find_domain(dev);
5828         if (!domain)
5829                 return -EINVAL;
5830
5831         spin_lock_irqsave(&device_domain_lock, flags);
5832         spin_lock(&iommu->lock);
5833
5834         ret = -EINVAL;
5835         info = get_domain_info(dev);
5836         if (!info || !info->pasid_supported)
5837                 goto out;
5838
5839         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5840         if (WARN_ON(!context))
5841                 goto out;
5842
5843         ctx_lo = context[0].lo;
5844
5845         if (!(ctx_lo & CONTEXT_PASIDE)) {
5846                 ctx_lo |= CONTEXT_PASIDE;
5847                 context[0].lo = ctx_lo;
5848                 wmb();
5849                 iommu->flush.flush_context(iommu,
5850                                            domain->iommu_did[iommu->seq_id],
5851                                            PCI_DEVID(info->bus, info->devfn),
5852                                            DMA_CCMD_MASK_NOBIT,
5853                                            DMA_CCMD_DEVICE_INVL);
5854         }
5855
5856         /* Enable PASID support in the device, if it wasn't already */
5857         if (!info->pasid_enabled)
5858                 iommu_enable_dev_iotlb(info);
5859
5860         ret = 0;
5861
5862  out:
5863         spin_unlock(&iommu->lock);
5864         spin_unlock_irqrestore(&device_domain_lock, flags);
5865
5866         return ret;
5867 }
5868
5869 static void intel_iommu_apply_resv_region(struct device *dev,
5870                                           struct iommu_domain *domain,
5871                                           struct iommu_resv_region *region)
5872 {
5873         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5874         unsigned long start, end;
5875
5876         start = IOVA_PFN(region->start);
5877         end   = IOVA_PFN(region->start + region->length - 1);
5878
5879         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5880 }
5881
5882 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5883 {
5884         if (dev_is_pci(dev))
5885                 return pci_device_group(dev);
5886         return generic_device_group(dev);
5887 }
5888
5889 static int intel_iommu_enable_auxd(struct device *dev)
5890 {
5891         struct device_domain_info *info;
5892         struct intel_iommu *iommu;
5893         unsigned long flags;
5894         int ret;
5895
5896         iommu = device_to_iommu(dev, NULL, NULL);
5897         if (!iommu || dmar_disabled)
5898                 return -EINVAL;
5899
5900         if (!sm_supported(iommu) || !pasid_supported(iommu))
5901                 return -EINVAL;
5902
5903         ret = intel_iommu_enable_pasid(iommu, dev);
5904         if (ret)
5905                 return -ENODEV;
5906
5907         spin_lock_irqsave(&device_domain_lock, flags);
5908         info = get_domain_info(dev);
5909         info->auxd_enabled = 1;
5910         spin_unlock_irqrestore(&device_domain_lock, flags);
5911
5912         return 0;
5913 }
5914
5915 static int intel_iommu_disable_auxd(struct device *dev)
5916 {
5917         struct device_domain_info *info;
5918         unsigned long flags;
5919
5920         spin_lock_irqsave(&device_domain_lock, flags);
5921         info = get_domain_info(dev);
5922         if (!WARN_ON(!info))
5923                 info->auxd_enabled = 0;
5924         spin_unlock_irqrestore(&device_domain_lock, flags);
5925
5926         return 0;
5927 }
5928
5929 /*
5930  * A PCI express designated vendor specific extended capability is defined
5931  * in the section 3.7 of Intel scalable I/O virtualization technical spec
5932  * for system software and tools to detect endpoint devices supporting the
5933  * Intel scalable IO virtualization without host driver dependency.
5934  *
5935  * Returns the address of the matching extended capability structure within
5936  * the device's PCI configuration space or 0 if the device does not support
5937  * it.
5938  */
5939 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5940 {
5941         int pos;
5942         u16 vendor, id;
5943
5944         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5945         while (pos) {
5946                 pci_read_config_word(pdev, pos + 4, &vendor);
5947                 pci_read_config_word(pdev, pos + 8, &id);
5948                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5949                         return pos;
5950
5951                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5952         }
5953
5954         return 0;
5955 }
5956
5957 static bool
5958 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5959 {
5960         if (feat == IOMMU_DEV_FEAT_AUX) {
5961                 int ret;
5962
5963                 if (!dev_is_pci(dev) || dmar_disabled ||
5964                     !scalable_mode_support() || !iommu_pasid_support())
5965                         return false;
5966
5967                 ret = pci_pasid_features(to_pci_dev(dev));
5968                 if (ret < 0)
5969                         return false;
5970
5971                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5972         }
5973
5974         if (feat == IOMMU_DEV_FEAT_SVA) {
5975                 struct device_domain_info *info = get_domain_info(dev);
5976
5977                 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5978                         info->pasid_supported && info->pri_supported &&
5979                         info->ats_supported;
5980         }
5981
5982         return false;
5983 }
5984
5985 static int
5986 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5987 {
5988         if (feat == IOMMU_DEV_FEAT_AUX)
5989                 return intel_iommu_enable_auxd(dev);
5990
5991         if (feat == IOMMU_DEV_FEAT_SVA) {
5992                 struct device_domain_info *info = get_domain_info(dev);
5993
5994                 if (!info)
5995                         return -EINVAL;
5996
5997                 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5998                         return 0;
5999         }
6000
6001         return -ENODEV;
6002 }
6003
6004 static int
6005 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6006 {
6007         if (feat == IOMMU_DEV_FEAT_AUX)
6008                 return intel_iommu_disable_auxd(dev);
6009
6010         return -ENODEV;
6011 }
6012
6013 static bool
6014 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6015 {
6016         struct device_domain_info *info = get_domain_info(dev);
6017
6018         if (feat == IOMMU_DEV_FEAT_AUX)
6019                 return scalable_mode_support() && info && info->auxd_enabled;
6020
6021         return false;
6022 }
6023
6024 static int
6025 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6026 {
6027         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6028
6029         return dmar_domain->default_pasid > 0 ?
6030                         dmar_domain->default_pasid : -EINVAL;
6031 }
6032
6033 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6034                                            struct device *dev)
6035 {
6036         return attach_deferred(dev);
6037 }
6038
6039 static int
6040 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6041                             enum iommu_attr attr, void *data)
6042 {
6043         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6044         unsigned long flags;
6045         int ret = 0;
6046
6047         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6048                 return -EINVAL;
6049
6050         switch (attr) {
6051         case DOMAIN_ATTR_NESTING:
6052                 spin_lock_irqsave(&device_domain_lock, flags);
6053                 if (nested_mode_support() &&
6054                     list_empty(&dmar_domain->devices)) {
6055                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6056                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6057                 } else {
6058                         ret = -ENODEV;
6059                 }
6060                 spin_unlock_irqrestore(&device_domain_lock, flags);
6061                 break;
6062         default:
6063                 ret = -EINVAL;
6064                 break;
6065         }
6066
6067         return ret;
6068 }
6069
6070 /*
6071  * Check that the device does not live on an external facing PCI port that is
6072  * marked as untrusted. Such devices should not be able to apply quirks and
6073  * thus not be able to bypass the IOMMU restrictions.
6074  */
6075 static bool risky_device(struct pci_dev *pdev)
6076 {
6077         if (pdev->untrusted) {
6078                 pci_info(pdev,
6079                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6080                          pdev->vendor, pdev->device);
6081                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6082                 return true;
6083         }
6084         return false;
6085 }
6086
6087 const struct iommu_ops intel_iommu_ops = {
6088         .capable                = intel_iommu_capable,
6089         .domain_alloc           = intel_iommu_domain_alloc,
6090         .domain_free            = intel_iommu_domain_free,
6091         .domain_set_attr        = intel_iommu_domain_set_attr,
6092         .attach_dev             = intel_iommu_attach_device,
6093         .detach_dev             = intel_iommu_detach_device,
6094         .aux_attach_dev         = intel_iommu_aux_attach_device,
6095         .aux_detach_dev         = intel_iommu_aux_detach_device,
6096         .aux_get_pasid          = intel_iommu_aux_get_pasid,
6097         .map                    = intel_iommu_map,
6098         .unmap                  = intel_iommu_unmap,
6099         .iova_to_phys           = intel_iommu_iova_to_phys,
6100         .probe_device           = intel_iommu_probe_device,
6101         .probe_finalize         = intel_iommu_probe_finalize,
6102         .release_device         = intel_iommu_release_device,
6103         .get_resv_regions       = intel_iommu_get_resv_regions,
6104         .put_resv_regions       = generic_iommu_put_resv_regions,
6105         .apply_resv_region      = intel_iommu_apply_resv_region,
6106         .device_group           = intel_iommu_device_group,
6107         .dev_has_feat           = intel_iommu_dev_has_feat,
6108         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6109         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6110         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6111         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6112         .def_domain_type        = device_def_domain_type,
6113         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6114 #ifdef CONFIG_INTEL_IOMMU_SVM
6115         .cache_invalidate       = intel_iommu_sva_invalidate,
6116         .sva_bind_gpasid        = intel_svm_bind_gpasid,
6117         .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
6118         .sva_bind               = intel_svm_bind,
6119         .sva_unbind             = intel_svm_unbind,
6120         .sva_get_pasid          = intel_svm_get_pasid,
6121         .page_response          = intel_svm_page_response,
6122 #endif
6123 };
6124
6125 static void quirk_iommu_igfx(struct pci_dev *dev)
6126 {
6127         if (risky_device(dev))
6128                 return;
6129
6130         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6131         dmar_map_gfx = 0;
6132 }
6133
6134 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6135 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6136 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6137 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6138 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6139 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6140 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6141 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6142
6143 /* Broadwell igfx malfunctions with dmar */
6144 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6145 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6150 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6151 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6152 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6153 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6154 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6155 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6156 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6157 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6158 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6159 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6160 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6161 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6162 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6163 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6164 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6165 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6166 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6167 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6168
6169 static void quirk_iommu_rwbf(struct pci_dev *dev)
6170 {
6171         if (risky_device(dev))
6172                 return;
6173
6174         /*
6175          * Mobile 4 Series Chipset neglects to set RWBF capability,
6176          * but needs it. Same seems to hold for the desktop versions.
6177          */
6178         pci_info(dev, "Forcing write-buffer flush capability\n");
6179         rwbf_quirk = 1;
6180 }
6181
6182 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6183 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6184 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6185 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6186 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6187 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6188 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6189
6190 #define GGC 0x52
6191 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6192 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6193 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6194 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6195 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6196 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6197 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6198 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6199
6200 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6201 {
6202         unsigned short ggc;
6203
6204         if (risky_device(dev))
6205                 return;
6206
6207         if (pci_read_config_word(dev, GGC, &ggc))
6208                 return;
6209
6210         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6211                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6212                 dmar_map_gfx = 0;
6213         } else if (dmar_map_gfx) {
6214                 /* we have to ensure the gfx device is idle before we flush */
6215                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6216                 intel_iommu_strict = 1;
6217        }
6218 }
6219 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6220 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6221 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6223
6224 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6225 {
6226         unsigned short ver;
6227
6228         if (!IS_GFX_DEVICE(dev))
6229                 return;
6230
6231         ver = (dev->device >> 8) & 0xff;
6232         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6233             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6234             ver != 0x9a)
6235                 return;
6236
6237         if (risky_device(dev))
6238                 return;
6239
6240         pci_info(dev, "Skip IOMMU disabling for graphics\n");
6241         iommu_skip_te_disable = 1;
6242 }
6243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6244
6245 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6246    ISOCH DMAR unit for the Azalia sound device, but not give it any
6247    TLB entries, which causes it to deadlock. Check for that.  We do
6248    this in a function called from init_dmars(), instead of in a PCI
6249    quirk, because we don't want to print the obnoxious "BIOS broken"
6250    message if VT-d is actually disabled.
6251 */
6252 static void __init check_tylersburg_isoch(void)
6253 {
6254         struct pci_dev *pdev;
6255         uint32_t vtisochctrl;
6256
6257         /* If there's no Azalia in the system anyway, forget it. */
6258         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6259         if (!pdev)
6260                 return;
6261
6262         if (risky_device(pdev)) {
6263                 pci_dev_put(pdev);
6264                 return;
6265         }
6266
6267         pci_dev_put(pdev);
6268
6269         /* System Management Registers. Might be hidden, in which case
6270            we can't do the sanity check. But that's OK, because the
6271            known-broken BIOSes _don't_ actually hide it, so far. */
6272         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6273         if (!pdev)
6274                 return;
6275
6276         if (risky_device(pdev)) {
6277                 pci_dev_put(pdev);
6278                 return;
6279         }
6280
6281         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6282                 pci_dev_put(pdev);
6283                 return;
6284         }
6285
6286         pci_dev_put(pdev);
6287
6288         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6289         if (vtisochctrl & 1)
6290                 return;
6291
6292         /* Drop all bits other than the number of TLB entries */
6293         vtisochctrl &= 0x1c;
6294
6295         /* If we have the recommended number of TLB entries (16), fine. */
6296         if (vtisochctrl == 0x10)
6297                 return;
6298
6299         /* Zero TLB entries? You get to ride the short bus to school. */
6300         if (!vtisochctrl) {
6301                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6302                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6303                      dmi_get_system_info(DMI_BIOS_VENDOR),
6304                      dmi_get_system_info(DMI_BIOS_VERSION),
6305                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6306                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6307                 return;
6308         }
6309
6310         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6311                vtisochctrl);
6312 }