iommu: Handle freelists when using deferred flushing in iommu drivers
[linux-2.6-microblaze.git] / drivers/iommu/intel/iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
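/*
 * In the bitmap, a set bit n means a page size of (1 << n) bytes is
 * supported, so ~0xFFFUL advertises every power-of-two size of 4KiB
 * and larger.
 */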
105
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
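/*
 * Worked example for the AGAW helpers above: a 48-bit address width
 * gives width_to_agaw(48) = DIV_ROUND_UP(18, 9) = 2, agaw_to_level(2)
 * = 4 page-table levels, and agaw_to_width(2) = 48 bits again.
 */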
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline u64 level_mask(int level)
132 {
133         return -1ULL << level_to_offset_bits(level);
134 }
135
136 static inline u64 level_size(int level)
137 {
138         return 1ULL << level_to_offset_bits(level);
139 }
140
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
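/*
 * Worked example for the level helpers: at level 2, level_to_offset_bits()
 * is 9, pfn_level_offset() extracts bits 9..17 of the DMA pfn, and
 * level_size()/lvl_to_nr_pages() are 512 4KiB pages, i.e. one 2MiB region.
 */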
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
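/*
 * On x86 with 4KiB pages, PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so these
 * conversions are no-ops; they only matter when the MM page size is
 * larger than the 4KiB VT-d page size.
 */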
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * Set to 1 to panic the kernel if VT-d can't be successfully enabled
179  * (used when the kernel is launched with TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
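/*
 * The translation-type value is normally one of the CONTEXT_TT_* constants
 * (CONTEXT_TT_MULTI_LEVEL, CONTEXT_TT_DEV_IOTLB or CONTEXT_TT_PASS_THROUGH)
 * defined in linux/intel-iommu.h.
 */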
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
289
290 /*
291  * This domain is a statically identity mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
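/*
 * Iterate over the indexes of all IOMMUs that currently hold a reference
 * to @domain, i.e. those with a non-zero domain->iommu_refcnt[] entry.
 */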
299 #define for_each_domain_iommu(idx, domain)                      \
300         for (idx = 0; idx < g_num_of_iommus; idx++)             \
301                 if (domain->iommu_refcnt[idx])
302
303 struct dmar_rmrr_unit {
304         struct list_head list;          /* list of rmrr units   */
305         struct acpi_dmar_header *hdr;   /* ACPI header          */
306         u64     base_address;           /* reserved base address*/
307         u64     end_address;            /* reserved end address */
308         struct dmar_dev_scope *devices; /* target devices */
309         int     devices_cnt;            /* target device count */
310 };
311
312 struct dmar_atsr_unit {
313         struct list_head list;          /* list of ATSR units */
314         struct acpi_dmar_header *hdr;   /* ACPI header */
315         struct dmar_dev_scope *devices; /* target devices */
316         int devices_cnt;                /* target device count */
317         u8 include_all:1;               /* include all ports */
318 };
319
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322
323 #define for_each_rmrr_units(rmrr) \
324         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325
326 /* number of registered IOMMUs; used to size the g_iommus array */
327 static int g_num_of_iommus;
328
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334                                      struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336                                             dma_addr_t iova);
337
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
360
361 #define IDENTMAP_GFX            2
362 #define IDENTMAP_AZALIA         4
363
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366
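/*
 * Sentinel stored in the per-device IOMMU private data for devices whose
 * domain attachment is deferred (typically when translation was already
 * enabled by a previous kernel and the tables were copied).
 * get_domain_info() treats it as "no info yet"; see attach_deferred() below.
 */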
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
369 {
370         struct device_domain_info *info;
371
372         if (!dev)
373                 return NULL;
374
375         info = dev_iommu_priv_get(dev);
376         if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
377                 return NULL;
378
379         return info;
380 }
381
382 DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
384
385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
386                                 to_pci_dev(d)->untrusted)
387
388 /*
389  * Iterate over elements in device_domain_list and call the specified
390  * callback @fn against each element.
391  */
392 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393                                      void *data), void *data)
394 {
395         int ret = 0;
396         unsigned long flags;
397         struct device_domain_info *info;
398
399         spin_lock_irqsave(&device_domain_lock, flags);
400         list_for_each_entry(info, &device_domain_list, global) {
401                 ret = fn(info, data);
402                 if (ret) {
403                         spin_unlock_irqrestore(&device_domain_lock, flags);
404                         return ret;
405                 }
406         }
407         spin_unlock_irqrestore(&device_domain_lock, flags);
408
409         return 0;
410 }
411
412 const struct iommu_ops intel_iommu_ops;
413
414 static bool translation_pre_enabled(struct intel_iommu *iommu)
415 {
416         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 }
418
419 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
420 {
421         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 }
423
424 static void init_translation_status(struct intel_iommu *iommu)
425 {
426         u32 gsts;
427
428         gsts = readl(iommu->reg + DMAR_GSTS_REG);
429         if (gsts & DMA_GSTS_TES)
430                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 }
432
433 static int __init intel_iommu_setup(char *str)
434 {
435         if (!str)
436                 return -EINVAL;
437         while (*str) {
438                 if (!strncmp(str, "on", 2)) {
439                         dmar_disabled = 0;
440                         pr_info("IOMMU enabled\n");
441                 } else if (!strncmp(str, "off", 3)) {
442                         dmar_disabled = 1;
443                         no_platform_optin = 1;
444                         pr_info("IOMMU disabled\n");
445                 } else if (!strncmp(str, "igfx_off", 8)) {
446                         dmar_map_gfx = 0;
447                         pr_info("Disable GFX device mapping\n");
448                 } else if (!strncmp(str, "forcedac", 8)) {
449                         pr_info("Forcing DAC for PCI devices\n");
450                         dmar_forcedac = 1;
451                 } else if (!strncmp(str, "strict", 6)) {
452                         pr_info("Disable batched IOTLB flush\n");
453                         intel_iommu_strict = 1;
454                 } else if (!strncmp(str, "sp_off", 6)) {
455                         pr_info("Disable supported super page\n");
456                         intel_iommu_superpage = 0;
457                 } else if (!strncmp(str, "sm_on", 5)) {
458                         pr_info("Intel-IOMMU: scalable mode supported\n");
459                         intel_iommu_sm = 1;
460                 } else if (!strncmp(str, "tboot_noforce", 13)) {
461                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462                         intel_iommu_tboot_noforce = 1;
463                 } else if (!strncmp(str, "nobounce", 8)) {
464                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
465                         intel_no_bounce = 1;
466                 }
467
468                 str += strcspn(str, ",");
469                 while (*str == ',')
470                         str++;
471         }
472         return 0;
473 }
474 __setup("intel_iommu=", intel_iommu_setup);
475
476 static struct kmem_cache *iommu_domain_cache;
477 static struct kmem_cache *iommu_devinfo_cache;
478
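/*
 * iommu->domains is a two-level lookup table keyed by domain ID: the upper
 * 8 bits of the DID select a lazily allocated page of 256 pointers and the
 * lower 8 bits index into that page.
 */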
479 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
480 {
481         struct dmar_domain **domains;
482         int idx = did >> 8;
483
484         domains = iommu->domains[idx];
485         if (!domains)
486                 return NULL;
487
488         return domains[did & 0xff];
489 }
490
491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492                              struct dmar_domain *domain)
493 {
494         struct dmar_domain **domains;
495         int idx = did >> 8;
496
497         if (!iommu->domains[idx]) {
498                 size_t size = 256 * sizeof(struct dmar_domain *);
499                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500         }
501
502         domains = iommu->domains[idx];
503         if (WARN_ON(!domains))
504                 return;
505         else
506                 domains[did & 0xff] = domain;
507 }
508
509 void *alloc_pgtable_page(int node)
510 {
511         struct page *page;
512         void *vaddr = NULL;
513
514         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515         if (page)
516                 vaddr = page_address(page);
517         return vaddr;
518 }
519
520 void free_pgtable_page(void *vaddr)
521 {
522         free_page((unsigned long)vaddr);
523 }
524
525 static inline void *alloc_domain_mem(void)
526 {
527         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528 }
529
530 static void free_domain_mem(void *vaddr)
531 {
532         kmem_cache_free(iommu_domain_cache, vaddr);
533 }
534
535 static inline void * alloc_devinfo_mem(void)
536 {
537         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538 }
539
540 static inline void free_devinfo_mem(void *vaddr)
541 {
542         kmem_cache_free(iommu_devinfo_cache, vaddr);
543 }
544
545 static inline int domain_type_is_si(struct dmar_domain *domain)
546 {
547         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
548 }
549
550 static inline bool domain_use_first_level(struct dmar_domain *domain)
551 {
552         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
553 }
554
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556                                        unsigned long pfn)
557 {
558         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559
560         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564 {
565         unsigned long sagaw;
566         int agaw = -1;
567
568         sagaw = cap_sagaw(iommu->cap);
569         for (agaw = width_to_agaw(max_gaw);
570              agaw >= 0; agaw--) {
571                 if (test_bit(agaw, &sagaw))
572                         break;
573         }
574
575         return agaw;
576 }
577
578 /*
579  * Calculate max SAGAW for each iommu.
580  */
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582 {
583         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584 }
585
586 /*
587  * calculate agaw for each iommu.
588  * "SAGAW" may be different across iommus, use a default agaw, and
589  * fall back to a smaller supported agaw for iommus that don't support the default agaw.
590  */
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
592 {
593         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594 }
595
596 /* This function only returns a single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598 {
599         int iommu_id;
600
601         /* si_domain and vm domain should not get here. */
602         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603                 return NULL;
604
605         for_each_domain_iommu(iommu_id, domain)
606                 break;
607
608         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609                 return NULL;
610
611         return g_iommus[iommu_id];
612 }
613
614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
615 {
616         return sm_supported(iommu) ?
617                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
618 }
619
620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
621 {
622         struct dmar_drhd_unit *drhd;
623         struct intel_iommu *iommu;
624         bool found = false;
625         int i;
626
627         domain->iommu_coherency = 1;
628
629         for_each_domain_iommu(i, domain) {
630                 found = true;
631                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
632                         domain->iommu_coherency = 0;
633                         break;
634                 }
635         }
636         if (found)
637                 return;
638
639         /* No hardware attached; use lowest common denominator */
640         rcu_read_lock();
641         for_each_active_iommu(iommu, drhd) {
642                 if (!iommu_paging_structure_coherency(iommu)) {
643                         domain->iommu_coherency = 0;
644                         break;
645                 }
646         }
647         rcu_read_unlock();
648 }
649
650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
651 {
652         struct dmar_drhd_unit *drhd;
653         struct intel_iommu *iommu;
654         int ret = 1;
655
656         rcu_read_lock();
657         for_each_active_iommu(iommu, drhd) {
658                 if (iommu != skip) {
659                         if (!ecap_sc_support(iommu->ecap)) {
660                                 ret = 0;
661                                 break;
662                         }
663                 }
664         }
665         rcu_read_unlock();
666
667         return ret;
668 }
669
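/*
 * Returns the number of superpage levels usable by the domain (0 = 4KiB
 * only, 1 = 2MiB, 2 = 1GiB); the result is the largest level that every
 * active IOMMU except @skip supports.
 */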
670 static int domain_update_iommu_superpage(struct dmar_domain *domain,
671                                          struct intel_iommu *skip)
672 {
673         struct dmar_drhd_unit *drhd;
674         struct intel_iommu *iommu;
675         int mask = 0x3;
676
677         if (!intel_iommu_superpage) {
678                 return 0;
679         }
680
681         /* set iommu_superpage to the smallest common denominator */
682         rcu_read_lock();
683         for_each_active_iommu(iommu, drhd) {
684                 if (iommu != skip) {
685                         if (domain && domain_use_first_level(domain)) {
686                                 if (!cap_fl1gp_support(iommu->cap))
687                                         mask = 0x1;
688                         } else {
689                                 mask &= cap_super_page_val(iommu->cap);
690                         }
691
692                         if (!mask)
693                                 break;
694                 }
695         }
696         rcu_read_unlock();
697
698         return fls(mask);
699 }
700
701 static int domain_update_device_node(struct dmar_domain *domain)
702 {
703         struct device_domain_info *info;
704         int nid = NUMA_NO_NODE;
705
706         assert_spin_locked(&device_domain_lock);
707
708         if (list_empty(&domain->devices))
709                 return NUMA_NO_NODE;
710
711         list_for_each_entry(info, &domain->devices, link) {
712                 if (!info->dev)
713                         continue;
714
715                 /*
716                  * There could be multiple device NUMA nodes, as devices within
717                  * the same domain may sit behind different IOMMUs. There is no
718                  * perfect answer in such a situation, so we pick the first node
719                  * we find (first come, first served).
720                  */
721                 nid = dev_to_node(info->dev);
722                 if (nid != NUMA_NO_NODE)
723                         break;
724         }
725
726         return nid;
727 }
728
729 /* Some capabilities may be different across iommus */
730 static void domain_update_iommu_cap(struct dmar_domain *domain)
731 {
732         domain_update_iommu_coherency(domain);
733         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
734         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
735
736         /*
737          * If RHSA is missing, we should default to the device numa domain
738          * as a fallback.
739          */
740         if (domain->nid == NUMA_NO_NODE)
741                 domain->nid = domain_update_device_node(domain);
742 }
743
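/*
 * Return the context entry for @bus/@devfn, allocating the context table if
 * @alloc is set. In scalable mode a root entry holds two context-table
 * pointers (devfn 0-127 in root->lo, 128-255 in root->hi) and context
 * entries are twice the legacy size, hence the devfn adjustment below.
 */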
744 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
745                                          u8 devfn, int alloc)
746 {
747         struct root_entry *root = &iommu->root_entry[bus];
748         struct context_entry *context;
749         u64 *entry;
750
751         entry = &root->lo;
752         if (sm_supported(iommu)) {
753                 if (devfn >= 0x80) {
754                         devfn -= 0x80;
755                         entry = &root->hi;
756                 }
757                 devfn *= 2;
758         }
759         if (*entry & 1)
760                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
761         else {
762                 unsigned long phy_addr;
763                 if (!alloc)
764                         return NULL;
765
766                 context = alloc_pgtable_page(iommu->node);
767                 if (!context)
768                         return NULL;
769
770                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
771                 phy_addr = virt_to_phys((void *)context);
772                 *entry = phy_addr | 1;
773                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
774         }
775         return &context[devfn];
776 }
777
778 static bool attach_deferred(struct device *dev)
779 {
780         return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
781 }
782
783 /**
784  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
785  *                               sub-hierarchy of a candidate PCI-PCI bridge
786  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
787  * @bridge: the candidate PCI-PCI bridge
788  *
789  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
790  */
791 static bool
792 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
793 {
794         struct pci_dev *pdev, *pbridge;
795
796         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
797                 return false;
798
799         pdev = to_pci_dev(dev);
800         pbridge = to_pci_dev(bridge);
801
802         if (pbridge->subordinate &&
803             pbridge->subordinate->number <= pdev->bus->number &&
804             pbridge->subordinate->busn_res.end >= pdev->bus->number)
805                 return true;
806
807         return false;
808 }
809
810 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
811 {
812         struct dmar_drhd_unit *drhd;
813         u32 vtbar;
814         int rc;
815
816         /* We know that this device on this chipset has its own IOMMU.
817          * If we find it under a different IOMMU, then the BIOS is lying
818          * to us. Hope that the IOMMU for this device is actually
819          * disabled, and it needs no translation...
820          */
821         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
822         if (rc) {
823                 /* "can't" happen */
824                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
825                 return false;
826         }
827         vtbar &= 0xffff0000;
828
829         /* we know that this iommu should be at offset 0xa000 from vtbar */
830         drhd = dmar_find_matched_drhd_unit(pdev);
831         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
832                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
833                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
834                 return true;
835         }
836
837         return false;
838 }
839
840 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
841 {
842         if (!iommu || iommu->drhd->ignored)
843                 return true;
844
845         if (dev_is_pci(dev)) {
846                 struct pci_dev *pdev = to_pci_dev(dev);
847
848                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
849                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
850                     quirk_ioat_snb_local_iommu(pdev))
851                         return true;
852         }
853
854         return false;
855 }
856
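/*
 * Find the IOMMU (DRHD unit) that translates @dev and, when @bus/@devfn are
 * non-NULL, return the source-id to use for it. VFs are looked up via their
 * PF, ACPI devices via their companion, and devices behind a listed PCI
 * bridge match that bridge's scope entry.
 */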
857 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
858 {
859         struct dmar_drhd_unit *drhd = NULL;
860         struct pci_dev *pdev = NULL;
861         struct intel_iommu *iommu;
862         struct device *tmp;
863         u16 segment = 0;
864         int i;
865
866         if (!dev)
867                 return NULL;
868
869         if (dev_is_pci(dev)) {
870                 struct pci_dev *pf_pdev;
871
872                 pdev = pci_real_dma_dev(to_pci_dev(dev));
873
874                 /* VFs aren't listed in scope tables; we need to look up
875                  * the PF instead to find the IOMMU. */
876                 pf_pdev = pci_physfn(pdev);
877                 dev = &pf_pdev->dev;
878                 segment = pci_domain_nr(pdev->bus);
879         } else if (has_acpi_companion(dev))
880                 dev = &ACPI_COMPANION(dev)->dev;
881
882         rcu_read_lock();
883         for_each_iommu(iommu, drhd) {
884                 if (pdev && segment != drhd->segment)
885                         continue;
886
887                 for_each_active_dev_scope(drhd->devices,
888                                           drhd->devices_cnt, i, tmp) {
889                         if (tmp == dev) {
890                                 /* For a VF use its original BDF# not that of the PF
891                                  * which we used for the IOMMU lookup. Strictly speaking
892                                  * we could do this for all PCI devices; we only need to
893                                  * get the BDF# from the scope table for ACPI matches. */
894                                 if (pdev && pdev->is_virtfn)
895                                         goto got_pdev;
896
897                                 if (bus && devfn) {
898                                         *bus = drhd->devices[i].bus;
899                                         *devfn = drhd->devices[i].devfn;
900                                 }
901                                 goto out;
902                         }
903
904                         if (is_downstream_to_pci_bridge(dev, tmp))
905                                 goto got_pdev;
906                 }
907
908                 if (pdev && drhd->include_all) {
909                 got_pdev:
910                         if (bus && devfn) {
911                                 *bus = pdev->bus->number;
912                                 *devfn = pdev->devfn;
913                         }
914                         goto out;
915                 }
916         }
917         iommu = NULL;
918  out:
919         if (iommu_is_dummy(iommu, dev))
920                 iommu = NULL;
921
922         rcu_read_unlock();
923
924         return iommu;
925 }
926
927 static void domain_flush_cache(struct dmar_domain *domain,
928                                void *addr, int size)
929 {
930         if (!domain->iommu_coherency)
931                 clflush_cache_range(addr, size);
932 }
933
934 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
935 {
936         struct context_entry *context;
937         int ret = 0;
938         unsigned long flags;
939
940         spin_lock_irqsave(&iommu->lock, flags);
941         context = iommu_context_addr(iommu, bus, devfn, 0);
942         if (context)
943                 ret = context_present(context);
944         spin_unlock_irqrestore(&iommu->lock, flags);
945         return ret;
946 }
947
948 static void free_context_table(struct intel_iommu *iommu)
949 {
950         int i;
951         unsigned long flags;
952         struct context_entry *context;
953
954         spin_lock_irqsave(&iommu->lock, flags);
955         if (!iommu->root_entry) {
956                 goto out;
957         }
958         for (i = 0; i < ROOT_ENTRY_NR; i++) {
959                 context = iommu_context_addr(iommu, i, 0, 0);
960                 if (context)
961                         free_pgtable_page(context);
962
963                 if (!sm_supported(iommu))
964                         continue;
965
966                 context = iommu_context_addr(iommu, i, 0x80, 0);
967                 if (context)
968                         free_pgtable_page(context);
969
970         }
971         free_pgtable_page(iommu->root_entry);
972         iommu->root_entry = NULL;
973 out:
974         spin_unlock_irqrestore(&iommu->lock, flags);
975 }
976
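/*
 * Walk (and, if necessary, build) the page table down to *target_level and
 * return the PTE for @pfn. A *target_level of 0 means "stop at the first
 * superpage or non-present entry", and in that case *target_level is
 * updated to the level actually reached.
 */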
977 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
978                                       unsigned long pfn, int *target_level)
979 {
980         struct dma_pte *parent, *pte;
981         int level = agaw_to_level(domain->agaw);
982         int offset;
983
984         BUG_ON(!domain->pgd);
985
986         if (!domain_pfn_supported(domain, pfn))
987                 /* Address beyond IOMMU's addressing capabilities. */
988                 return NULL;
989
990         parent = domain->pgd;
991
992         while (1) {
993                 void *tmp_page;
994
995                 offset = pfn_level_offset(pfn, level);
996                 pte = &parent[offset];
997                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
998                         break;
999                 if (level == *target_level)
1000                         break;
1001
1002                 if (!dma_pte_present(pte)) {
1003                         uint64_t pteval;
1004
1005                         tmp_page = alloc_pgtable_page(domain->nid);
1006
1007                         if (!tmp_page)
1008                                 return NULL;
1009
1010                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1011                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1012                         if (domain_use_first_level(domain))
1013                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1014                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1015                                 /* Someone else set it while we were thinking; use theirs. */
1016                                 free_pgtable_page(tmp_page);
1017                         else
1018                                 domain_flush_cache(domain, pte, sizeof(*pte));
1019                 }
1020                 if (level == 1)
1021                         break;
1022
1023                 parent = phys_to_virt(dma_pte_addr(pte));
1024                 level--;
1025         }
1026
1027         if (!*target_level)
1028                 *target_level = level;
1029
1030         return pte;
1031 }
1032
1033 /* return address's pte at specific level */
1034 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1035                                          unsigned long pfn,
1036                                          int level, int *large_page)
1037 {
1038         struct dma_pte *parent, *pte;
1039         int total = agaw_to_level(domain->agaw);
1040         int offset;
1041
1042         parent = domain->pgd;
1043         while (level <= total) {
1044                 offset = pfn_level_offset(pfn, total);
1045                 pte = &parent[offset];
1046                 if (level == total)
1047                         return pte;
1048
1049                 if (!dma_pte_present(pte)) {
1050                         *large_page = total;
1051                         break;
1052                 }
1053
1054                 if (dma_pte_superpage(pte)) {
1055                         *large_page = total;
1056                         return pte;
1057                 }
1058
1059                 parent = phys_to_virt(dma_pte_addr(pte));
1060                 total--;
1061         }
1062         return NULL;
1063 }
1064
1065 /* clear last level pte, a tlb flush should follow */
1066 static void dma_pte_clear_range(struct dmar_domain *domain,
1067                                 unsigned long start_pfn,
1068                                 unsigned long last_pfn)
1069 {
1070         unsigned int large_page;
1071         struct dma_pte *first_pte, *pte;
1072
1073         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1074         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1075         BUG_ON(start_pfn > last_pfn);
1076
1077         /* we don't need lock here; nobody else touches the iova range */
1078         do {
1079                 large_page = 1;
1080                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1081                 if (!pte) {
1082                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1083                         continue;
1084                 }
1085                 do {
1086                         dma_clear_pte(pte);
1087                         start_pfn += lvl_to_nr_pages(large_page);
1088                         pte++;
1089                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1090
1091                 domain_flush_cache(domain, first_pte,
1092                                    (void *)pte - (void *)first_pte);
1093
1094         } while (start_pfn && start_pfn <= last_pfn);
1095 }
1096
1097 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1098                                int retain_level, struct dma_pte *pte,
1099                                unsigned long pfn, unsigned long start_pfn,
1100                                unsigned long last_pfn)
1101 {
1102         pfn = max(start_pfn, pfn);
1103         pte = &pte[pfn_level_offset(pfn, level)];
1104
1105         do {
1106                 unsigned long level_pfn;
1107                 struct dma_pte *level_pte;
1108
1109                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1110                         goto next;
1111
1112                 level_pfn = pfn & level_mask(level);
1113                 level_pte = phys_to_virt(dma_pte_addr(pte));
1114
1115                 if (level > 2) {
1116                         dma_pte_free_level(domain, level - 1, retain_level,
1117                                            level_pte, level_pfn, start_pfn,
1118                                            last_pfn);
1119                 }
1120
1121                 /*
1122                  * Free the page table if we're below the level we want to
1123                  * retain and the range covers the entire table.
1124                  */
1125                 if (level < retain_level && !(start_pfn > level_pfn ||
1126                       last_pfn < level_pfn + level_size(level) - 1)) {
1127                         dma_clear_pte(pte);
1128                         domain_flush_cache(domain, pte, sizeof(*pte));
1129                         free_pgtable_page(level_pte);
1130                 }
1131 next:
1132                 pfn += level_size(level);
1133         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1134 }
1135
1136 /*
1137  * clear last level (leaf) ptes and free page table pages below the
1138  * level we wish to keep intact.
1139  */
1140 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1141                                    unsigned long start_pfn,
1142                                    unsigned long last_pfn,
1143                                    int retain_level)
1144 {
1145         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1146         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1147         BUG_ON(start_pfn > last_pfn);
1148
1149         dma_pte_clear_range(domain, start_pfn, last_pfn);
1150
1151         /* We don't need lock here; nobody else touches the iova range */
1152         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1153                            domain->pgd, 0, start_pfn, last_pfn);
1154
1155         /* free pgd */
1156         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1157                 free_pgtable_page(domain->pgd);
1158                 domain->pgd = NULL;
1159         }
1160 }
1161
1162 /* When a page at a given level is being unlinked from its parent, we don't
1163    need to *modify* it at all. All we need to do is make a list of all the
1164    pages which can be freed just as soon as we've flushed the IOTLB and we
1165    know the hardware page-walk will no longer touch them.
1166    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1167    be freed. */
1168 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1169                                             int level, struct dma_pte *pte,
1170                                             struct page *freelist)
1171 {
1172         struct page *pg;
1173
1174         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1175         pg->freelist = freelist;
1176         freelist = pg;
1177
1178         if (level == 1)
1179                 return freelist;
1180
1181         pte = page_address(pg);
1182         do {
1183                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1184                         freelist = dma_pte_list_pagetables(domain, level - 1,
1185                                                            pte, freelist);
1186                 pte++;
1187         } while (!first_pte_in_page(pte));
1188
1189         return freelist;
1190 }
1191
1192 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1193                                         struct dma_pte *pte, unsigned long pfn,
1194                                         unsigned long start_pfn,
1195                                         unsigned long last_pfn,
1196                                         struct page *freelist)
1197 {
1198         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1199
1200         pfn = max(start_pfn, pfn);
1201         pte = &pte[pfn_level_offset(pfn, level)];
1202
1203         do {
1204                 unsigned long level_pfn;
1205
1206                 if (!dma_pte_present(pte))
1207                         goto next;
1208
1209                 level_pfn = pfn & level_mask(level);
1210
1211                 /* If range covers entire pagetable, free it */
1212                 if (start_pfn <= level_pfn &&
1213                     last_pfn >= level_pfn + level_size(level) - 1) {
1214                         /* These subordinate page tables are going away entirely. Don't
1215                            bother to clear them; we're just going to *free* them. */
1216                         if (level > 1 && !dma_pte_superpage(pte))
1217                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1218
1219                         dma_clear_pte(pte);
1220                         if (!first_pte)
1221                                 first_pte = pte;
1222                         last_pte = pte;
1223                 } else if (level > 1) {
1224                         /* Recurse down into a level that isn't *entirely* obsolete */
1225                         freelist = dma_pte_clear_level(domain, level - 1,
1226                                                        phys_to_virt(dma_pte_addr(pte)),
1227                                                        level_pfn, start_pfn, last_pfn,
1228                                                        freelist);
1229                 }
1230 next:
1231                 pfn += level_size(level);
1232         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1233
1234         if (first_pte)
1235                 domain_flush_cache(domain, first_pte,
1236                                    (void *)++last_pte - (void *)first_pte);
1237
1238         return freelist;
1239 }
1240
1241 /* We can't just free the pages because the IOMMU may still be walking
1242    the page tables, and may have cached the intermediate levels. The
1243    pages can only be freed after the IOTLB flush has been done. */
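/*
 * The unlinked page-table pages are chained onto @freelist via
 * page->freelist and the new list head is returned, so that callers can
 * defer dma_free_pagelist() until after the IOTLB flush.
 */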
1244 static struct page *domain_unmap(struct dmar_domain *domain,
1245                                  unsigned long start_pfn,
1246                                  unsigned long last_pfn,
1247                                  struct page *freelist)
1248 {
1249         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1250         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1251         BUG_ON(start_pfn > last_pfn);
1252
1253         /* we don't need lock here; nobody else touches the iova range */
1254         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1255                                        domain->pgd, 0, start_pfn, last_pfn,
1256                                        freelist);
1257
1258         /* free pgd */
1259         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1260                 struct page *pgd_page = virt_to_page(domain->pgd);
1261                 pgd_page->freelist = freelist;
1262                 freelist = pgd_page;
1263
1264                 domain->pgd = NULL;
1265         }
1266
1267         return freelist;
1268 }
1269
1270 static void dma_free_pagelist(struct page *freelist)
1271 {
1272         struct page *pg;
1273
1274         while ((pg = freelist)) {
1275                 freelist = pg->freelist;
1276                 free_pgtable_page(page_address(pg));
1277         }
1278 }
1279
1280 static void iova_entry_free(unsigned long data)
1281 {
1282         struct page *freelist = (struct page *)data;
1283
1284         dma_free_pagelist(freelist);
1285 }
1286
1287 /* iommu handling */
1288 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1289 {
1290         struct root_entry *root;
1291         unsigned long flags;
1292
1293         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1294         if (!root) {
1295                 pr_err("Allocating root entry for %s failed\n",
1296                         iommu->name);
1297                 return -ENOMEM;
1298         }
1299
1300         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1301
1302         spin_lock_irqsave(&iommu->lock, flags);
1303         iommu->root_entry = root;
1304         spin_unlock_irqrestore(&iommu->lock, flags);
1305
1306         return 0;
1307 }
1308
1309 static void iommu_set_root_entry(struct intel_iommu *iommu)
1310 {
1311         u64 addr;
1312         u32 sts;
1313         unsigned long flag;
1314
1315         addr = virt_to_phys(iommu->root_entry);
1316         if (sm_supported(iommu))
1317                 addr |= DMA_RTADDR_SMT;
1318
1319         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1320         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1321
1322         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1323
1324         /* Make sure hardware completes it */
1325         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1326                       readl, (sts & DMA_GSTS_RTPS), sts);
1327
1328         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1329 }
1330
1331 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1332 {
1333         u32 val;
1334         unsigned long flag;
1335
1336         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1337                 return;
1338
1339         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1341
1342         /* Make sure hardware completes it */
1343         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1344                       readl, (!(val & DMA_GSTS_WBFS)), val);
1345
1346         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1347 }
1348
1349 /* return value determines if we need a write buffer flush */
1350 static void __iommu_flush_context(struct intel_iommu *iommu,
1351                                   u16 did, u16 source_id, u8 function_mask,
1352                                   u64 type)
1353 {
1354         u64 val = 0;
1355         unsigned long flag;
1356
1357         switch (type) {
1358         case DMA_CCMD_GLOBAL_INVL:
1359                 val = DMA_CCMD_GLOBAL_INVL;
1360                 break;
1361         case DMA_CCMD_DOMAIN_INVL:
1362                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1363                 break;
1364         case DMA_CCMD_DEVICE_INVL:
1365                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1366                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1367                 break;
1368         default:
1369                 BUG();
1370         }
1371         val |= DMA_CCMD_ICC;
1372
1373         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1374         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1375
1376         /* Make sure hardware completes it */
1377         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1378                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1379
1380         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1381 }
1382
1383 /* return value determines if we need a write buffer flush */
1384 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1385                                 u64 addr, unsigned int size_order, u64 type)
1386 {
1387         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1388         u64 val = 0, val_iva = 0;
1389         unsigned long flag;
1390
1391         switch (type) {
1392         case DMA_TLB_GLOBAL_FLUSH:
1393                 /* global flush doesn't need to set IVA_REG */
1394                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1395                 break;
1396         case DMA_TLB_DSI_FLUSH:
1397                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1398                 break;
1399         case DMA_TLB_PSI_FLUSH:
1400                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1401                 /* IH bit is passed in as part of address */
1402                 val_iva = size_order | addr;
1403                 break;
1404         default:
1405                 BUG();
1406         }
1407         /* Note: set drain read/write */
1408 #if 0
1409         /*
1410          * This is probably just to be extra safe. It looks like we can
1411          * ignore it without any impact.
1412          */
1413         if (cap_read_drain(iommu->cap))
1414                 val |= DMA_TLB_READ_DRAIN;
1415 #endif
1416         if (cap_write_drain(iommu->cap))
1417                 val |= DMA_TLB_WRITE_DRAIN;
1418
1419         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1420         /* Note: Only uses first TLB reg currently */
1421         if (val_iva)
1422                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1423         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1424
1425         /* Make sure hardware completes it */
1426         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1427                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1428
1429         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1430
1431         /* check IOTLB invalidation granularity */
1432         if (DMA_TLB_IAIG(val) == 0)
1433                 pr_err("Flush IOTLB failed\n");
1434         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1435                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1436                         (unsigned long long)DMA_TLB_IIRG(type),
1437                         (unsigned long long)DMA_TLB_IAIG(val));
1438 }
1439
1440 static struct device_domain_info *
1441 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1442                          u8 bus, u8 devfn)
1443 {
1444         struct device_domain_info *info;
1445
1446         assert_spin_locked(&device_domain_lock);
1447
1448         if (!iommu->qi)
1449                 return NULL;
1450
1451         list_for_each_entry(info, &domain->devices, link)
1452                 if (info->iommu == iommu && info->bus == bus &&
1453                     info->devfn == devfn) {
1454                         if (info->ats_supported && info->dev)
1455                                 return info;
1456                         break;
1457                 }
1458
1459         return NULL;
1460 }
1461
1462 static void domain_update_iotlb(struct dmar_domain *domain)
1463 {
1464         struct device_domain_info *info;
1465         bool has_iotlb_device = false;
1466
1467         assert_spin_locked(&device_domain_lock);
1468
1469         list_for_each_entry(info, &domain->devices, link) {
1470                 struct pci_dev *pdev;
1471
1472                 if (!info->dev || !dev_is_pci(info->dev))
1473                         continue;
1474
1475                 pdev = to_pci_dev(info->dev);
1476                 if (pdev->ats_enabled) {
1477                         has_iotlb_device = true;
1478                         break;
1479                 }
1480         }
1481
1482         domain->has_iotlb_device = has_iotlb_device;
1483 }
1484
1485 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1486 {
1487         struct pci_dev *pdev;
1488
1489         assert_spin_locked(&device_domain_lock);
1490
1491         if (!info || !dev_is_pci(info->dev))
1492                 return;
1493
1494         pdev = to_pci_dev(info->dev);
1495         /* For IOMMUs that support device IOTLB throttling (DIT), we assign the
1496          * PFSID to the invalidation descriptors of a VF so that the IOMMU HW
1497          * can gauge queue depth at the PF level. If DIT is not set, PFSID is
1498          * treated as reserved and should be set to 0.
1499          */
1500         if (!ecap_dit(info->iommu->ecap))
1501                 info->pfsid = 0;
1502         else {
1503                 struct pci_dev *pf_pdev;
1504
1505                 /* pdev will be returned if device is not a vf */
1506                 pf_pdev = pci_physfn(pdev);
1507                 info->pfsid = pci_dev_id(pf_pdev);
1508         }
1509
1510 #ifdef CONFIG_INTEL_IOMMU_SVM
1511         /* The PCIe spec, in its wisdom, declares that the behaviour of
1512            the device if you enable PASID support after ATS support is
1513            undefined. So always enable PASID support on devices which
1514            have it, even if we can't yet know if we're ever going to
1515            use it. */
1516         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1517                 info->pasid_enabled = 1;
1518
1519         if (info->pri_supported &&
1520             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1521             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1522                 info->pri_enabled = 1;
1523 #endif
1524         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1525             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1526                 info->ats_enabled = 1;
1527                 domain_update_iotlb(info->domain);
1528                 info->ats_qdep = pci_ats_queue_depth(pdev);
1529         }
1530 }
1531
1532 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1533 {
1534         struct pci_dev *pdev;
1535
1536         assert_spin_locked(&device_domain_lock);
1537
1538         if (!dev_is_pci(info->dev))
1539                 return;
1540
1541         pdev = to_pci_dev(info->dev);
1542
1543         if (info->ats_enabled) {
1544                 pci_disable_ats(pdev);
1545                 info->ats_enabled = 0;
1546                 domain_update_iotlb(info->domain);
1547         }
1548 #ifdef CONFIG_INTEL_IOMMU_SVM
1549         if (info->pri_enabled) {
1550                 pci_disable_pri(pdev);
1551                 info->pri_enabled = 0;
1552         }
1553         if (info->pasid_enabled) {
1554                 pci_disable_pasid(pdev);
1555                 info->pasid_enabled = 0;
1556         }
1557 #endif
1558 }
1559
1560 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1561                                   u64 addr, unsigned mask)
1562 {
1563         u16 sid, qdep;
1564         unsigned long flags;
1565         struct device_domain_info *info;
1566
1567         if (!domain->has_iotlb_device)
1568                 return;
1569
1570         spin_lock_irqsave(&device_domain_lock, flags);
1571         list_for_each_entry(info, &domain->devices, link) {
1572                 if (!info->ats_enabled)
1573                         continue;
1574
1575                 sid = info->bus << 8 | info->devfn;
1576                 qdep = info->ats_qdep;
1577                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1578                                 qdep, addr, mask);
1579         }
1580         spin_unlock_irqrestore(&device_domain_lock, flags);
1581 }
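/*
 * Editorial example (hypothetical device): for an ATS-enabled endpoint at
 * bus 0x3b, devfn PCI_DEVFN(2, 0) == 0x10, the source-id above works out to
 * sid = 0x3b << 8 | 0x10 = 0x3b10, and qdep is whatever pci_ats_queue_depth()
 * reported when ATS was enabled; both are handed to qi_flush_dev_iotlb() to
 * build the device-TLB invalidation descriptor.
 */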
1582
1583 static void domain_flush_piotlb(struct intel_iommu *iommu,
1584                                 struct dmar_domain *domain,
1585                                 u64 addr, unsigned long npages, bool ih)
1586 {
1587         u16 did = domain->iommu_did[iommu->seq_id];
1588
1589         if (domain->default_pasid)
1590                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1591                                 addr, npages, ih);
1592
1593         if (!list_empty(&domain->devices))
1594                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1595 }
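/*
 * Editorial note: callers that want a PASID's mappings dropped wholesale,
 * such as iommu_flush_iova() below, pass addr = 0 and npages = -1, which
 * qi_flush_piotlb() treats as a PASID-wide invalidation; page-range callers
 * like iommu_flush_iotlb_psi() pass the VT-d page address and page count
 * they computed.
 */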
1596
1597 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1598                                   struct dmar_domain *domain,
1599                                   unsigned long pfn, unsigned int pages,
1600                                   int ih, int map)
1601 {
1602         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1603         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1604         u16 did = domain->iommu_did[iommu->seq_id];
1605
1606         BUG_ON(pages == 0);
1607
1608         if (ih)
1609                 ih = 1 << 6;
1610
1611         if (domain_use_first_level(domain)) {
1612                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1613         } else {
1614                 /*
1615                  * Fall back to domain-selective flush if there is no PSI
1616                  * support or the size is too big. PSI requires the page size
1617                  * to be 2^x, with the base address naturally aligned to that size.
1618                  */
1619                 if (!cap_pgsel_inv(iommu->cap) ||
1620                     mask > cap_max_amask_val(iommu->cap))
1621                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1622                                                         DMA_TLB_DSI_FLUSH);
1623                 else
1624                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1625                                                         DMA_TLB_PSI_FLUSH);
1626         }
1627
1628         /*
1629          * In caching mode, changing pages from non-present to present requires
1630          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1631          */
1632         if (!cap_caching_mode(iommu->cap) || !map)
1633                 iommu_flush_dev_iotlb(domain, addr, mask);
1634 }
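/*
 * Worked example (editorial): flushing pages = 9 rounds up to 16, so
 * mask = ilog2(16) = 4 and the PSI covers a naturally aligned window of
 * 16 VT-d pages (64KiB with 4KiB pages) starting at addr; when ih is set,
 * bit 6 (the invalidation hint) is OR-ed into the address, as done with
 * "addr | ih" above.
 */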
1635
1636 /* Notification for newly created mappings */
1637 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1638                                         struct dmar_domain *domain,
1639                                         unsigned long pfn, unsigned int pages)
1640 {
1641         /*
1642          * It's a non-present to present mapping. Only flush if in caching
1643          * mode and using second-level translation.
1644          */
1645         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1646                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1647         else
1648                 iommu_flush_write_buffer(iommu);
1649 }
1650
1651 static void iommu_flush_iova(struct iova_domain *iovad)
1652 {
1653         struct dmar_domain *domain;
1654         int idx;
1655
1656         domain = container_of(iovad, struct dmar_domain, iovad);
1657
1658         for_each_domain_iommu(idx, domain) {
1659                 struct intel_iommu *iommu = g_iommus[idx];
1660                 u16 did = domain->iommu_did[iommu->seq_id];
1661
1662                 if (domain_use_first_level(domain))
1663                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1664                 else
1665                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1666                                                  DMA_TLB_DSI_FLUSH);
1667
1668                 if (!cap_caching_mode(iommu->cap))
1669                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1670                                               0, MAX_AGAW_PFN_WIDTH);
1671         }
1672 }
1673
1674 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1675 {
1676         u32 pmen;
1677         unsigned long flags;
1678
1679         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1680                 return;
1681
1682         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1683         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1684         pmen &= ~DMA_PMEN_EPM;
1685         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1686
1687         /* wait for the protected region status bit to clear */
1688         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1689                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1690
1691         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1692 }
1693
1694 static void iommu_enable_translation(struct intel_iommu *iommu)
1695 {
1696         u32 sts;
1697         unsigned long flags;
1698
1699         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1700         iommu->gcmd |= DMA_GCMD_TE;
1701         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1702
1703         /* Make sure hardware completes it */
1704         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1705                       readl, (sts & DMA_GSTS_TES), sts);
1706
1707         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1708 }
1709
1710 static void iommu_disable_translation(struct intel_iommu *iommu)
1711 {
1712         u32 sts;
1713         unsigned long flag;
1714
1715         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1716             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1717                 return;
1718
1719         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1720         iommu->gcmd &= ~DMA_GCMD_TE;
1721         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1722
1723         /* Make sure hardware completes it */
1724         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1725                       readl, (!(sts & DMA_GSTS_TES)), sts);
1726
1727         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1728 }
1729
1730 static int iommu_init_domains(struct intel_iommu *iommu)
1731 {
1732         u32 ndomains, nlongs;
1733         size_t size;
1734
1735         ndomains = cap_ndoms(iommu->cap);
1736         pr_debug("%s: Number of Domains supported <%d>\n",
1737                  iommu->name, ndomains);
1738         nlongs = BITS_TO_LONGS(ndomains);
1739
1740         spin_lock_init(&iommu->lock);
1741
1742         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1743         if (!iommu->domain_ids) {
1744                 pr_err("%s: Allocating domain id array failed\n",
1745                        iommu->name);
1746                 return -ENOMEM;
1747         }
1748
1749         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1750         iommu->domains = kzalloc(size, GFP_KERNEL);
1751
1752         if (iommu->domains) {
1753                 size = 256 * sizeof(struct dmar_domain *);
1754                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1755         }
1756
1757         if (!iommu->domains || !iommu->domains[0]) {
1758                 pr_err("%s: Allocating domain array failed\n",
1759                        iommu->name);
1760                 kfree(iommu->domain_ids);
1761                 kfree(iommu->domains);
1762                 iommu->domain_ids = NULL;
1763                 iommu->domains    = NULL;
1764                 return -ENOMEM;
1765         }
1766
1767         /*
1768          * If Caching mode is set, then invalid translations are tagged
1769          * with domain-id 0, hence we need to pre-allocate it. We also
1770          * use domain-id 0 as a marker for non-allocated domain-id, so
1771          * make sure it is not used for a real domain.
1772          */
1773         set_bit(0, iommu->domain_ids);
1774
1775         /*
1776          * The VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1777          * entry for first-level or pass-through translation modes be
1778          * programmed with a domain id different from those used for
1779          * second-level or nested translation. We reserve a domain id for
1780          * this purpose.
1781          */
1781          */
1782         if (sm_supported(iommu))
1783                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1784
1785         return 0;
1786 }
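/*
 * Sizing sketch (editorial, assuming a 64-bit kernel): with cap_ndoms()
 * reporting 65536 domain ids, the bitmap needs BITS_TO_LONGS(65536) = 1024
 * longs, and iommu->domains becomes a two-level table of
 * ALIGN(65536, 256) >> 8 = 256 top-level pointers, each lazily pointing at
 * a 256-entry array of struct dmar_domain pointers; only index 0 is
 * allocated up front above.
 */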
1787
1788 static void disable_dmar_iommu(struct intel_iommu *iommu)
1789 {
1790         struct device_domain_info *info, *tmp;
1791         unsigned long flags;
1792
1793         if (!iommu->domains || !iommu->domain_ids)
1794                 return;
1795
1796         spin_lock_irqsave(&device_domain_lock, flags);
1797         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1798                 if (info->iommu != iommu)
1799                         continue;
1800
1801                 if (!info->dev || !info->domain)
1802                         continue;
1803
1804                 __dmar_remove_one_dev_info(info);
1805         }
1806         spin_unlock_irqrestore(&device_domain_lock, flags);
1807
1808         if (iommu->gcmd & DMA_GCMD_TE)
1809                 iommu_disable_translation(iommu);
1810 }
1811
1812 static void free_dmar_iommu(struct intel_iommu *iommu)
1813 {
1814         if ((iommu->domains) && (iommu->domain_ids)) {
1815                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1816                 int i;
1817
1818                 for (i = 0; i < elems; i++)
1819                         kfree(iommu->domains[i]);
1820                 kfree(iommu->domains);
1821                 kfree(iommu->domain_ids);
1822                 iommu->domains = NULL;
1823                 iommu->domain_ids = NULL;
1824         }
1825
1826         g_iommus[iommu->seq_id] = NULL;
1827
1828         /* free context mapping */
1829         free_context_table(iommu);
1830
1831 #ifdef CONFIG_INTEL_IOMMU_SVM
1832         if (pasid_supported(iommu)) {
1833                 if (ecap_prs(iommu->ecap))
1834                         intel_svm_finish_prq(iommu);
1835         }
1836         if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1837                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1838
1839 #endif
1840 }
1841
1842 /*
1843  * Check and return whether first-level page tables are used by
1844  * default for DMA translation.
1845  */
1846 static bool first_level_by_default(void)
1847 {
1848         struct dmar_drhd_unit *drhd;
1849         struct intel_iommu *iommu;
1850         static int first_level_support = -1;
1851
1852         if (likely(first_level_support != -1))
1853                 return first_level_support;
1854
1855         first_level_support = 1;
1856
1857         rcu_read_lock();
1858         for_each_active_iommu(iommu, drhd) {
1859                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1860                         first_level_support = 0;
1861                         break;
1862                 }
1863         }
1864         rcu_read_unlock();
1865
1866         return first_level_support;
1867 }
1868
1869 static struct dmar_domain *alloc_domain(int flags)
1870 {
1871         struct dmar_domain *domain;
1872
1873         domain = alloc_domain_mem();
1874         if (!domain)
1875                 return NULL;
1876
1877         memset(domain, 0, sizeof(*domain));
1878         domain->nid = NUMA_NO_NODE;
1879         domain->flags = flags;
1880         if (first_level_by_default())
1881                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1882         domain->has_iotlb_device = false;
1883         INIT_LIST_HEAD(&domain->devices);
1884
1885         return domain;
1886 }
1887
1888 /* Must be called with device_domain_lock and iommu->lock held */
1889 static int domain_attach_iommu(struct dmar_domain *domain,
1890                                struct intel_iommu *iommu)
1891 {
1892         unsigned long ndomains;
1893         int num;
1894
1895         assert_spin_locked(&device_domain_lock);
1896         assert_spin_locked(&iommu->lock);
1897
1898         domain->iommu_refcnt[iommu->seq_id] += 1;
1899         domain->iommu_count += 1;
1900         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1901                 ndomains = cap_ndoms(iommu->cap);
1902                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1903
1904                 if (num >= ndomains) {
1905                         pr_err("%s: No free domain ids\n", iommu->name);
1906                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1907                         domain->iommu_count -= 1;
1908                         return -ENOSPC;
1909                 }
1910
1911                 set_bit(num, iommu->domain_ids);
1912                 set_iommu_domain(iommu, num, domain);
1913
1914                 domain->iommu_did[iommu->seq_id] = num;
1915                 domain->nid                      = iommu->node;
1916
1917                 domain_update_iommu_cap(domain);
1918         }
1919
1920         return 0;
1921 }
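/*
 * Editorial note on id allocation: find_first_zero_bit() above skips
 * domain-id 0 (reserved in iommu_init_domains() as the caching-mode and
 * "not allocated" marker) and, on scalable-mode hardware, FLPT_DEFAULT_DID,
 * so the first domain attached to an IOMMU gets the lowest id still clear
 * in iommu->domain_ids; the id is released in domain_detach_iommu() once
 * the last device on that IOMMU detaches.
 */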
1922
1923 static int domain_detach_iommu(struct dmar_domain *domain,
1924                                struct intel_iommu *iommu)
1925 {
1926         int num, count;
1927
1928         assert_spin_locked(&device_domain_lock);
1929         assert_spin_locked(&iommu->lock);
1930
1931         domain->iommu_refcnt[iommu->seq_id] -= 1;
1932         count = --domain->iommu_count;
1933         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1934                 num = domain->iommu_did[iommu->seq_id];
1935                 clear_bit(num, iommu->domain_ids);
1936                 set_iommu_domain(iommu, num, NULL);
1937
1938                 domain_update_iommu_cap(domain);
1939                 domain->iommu_did[iommu->seq_id] = 0;
1940         }
1941
1942         return count;
1943 }
1944
1945 static struct iova_domain reserved_iova_list;
1946 static struct lock_class_key reserved_rbtree_key;
1947
1948 static int dmar_init_reserved_ranges(void)
1949 {
1950         struct pci_dev *pdev = NULL;
1951         struct iova *iova;
1952         int i;
1953
1954         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1955
1956         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1957                 &reserved_rbtree_key);
1958
1959         /* IOAPIC ranges shouldn't be accessed by DMA */
1960         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1961                 IOVA_PFN(IOAPIC_RANGE_END));
1962         if (!iova) {
1963                 pr_err("Reserve IOAPIC range failed\n");
1964                 return -ENODEV;
1965         }
1966
1967         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1968         for_each_pci_dev(pdev) {
1969                 struct resource *r;
1970
1971                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1972                         r = &pdev->resource[i];
1973                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1974                                 continue;
1975                         iova = reserve_iova(&reserved_iova_list,
1976                                             IOVA_PFN(r->start),
1977                                             IOVA_PFN(r->end));
1978                         if (!iova) {
1979                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1980                                 return -ENODEV;
1981                         }
1982                 }
1983         }
1984         return 0;
1985 }
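/*
 * Editorial example (hypothetical BAR): a 64KiB MMIO resource spanning
 * 0xfebd0000-0xfebdffff makes reserve_iova() mark IOVA PFNs 0xfebd0-0xfebdf
 * as unusable, so no DMA mapping handed out later can alias a peer device's
 * MMIO window.
 */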
1986
1987 static inline int guestwidth_to_adjustwidth(int gaw)
1988 {
1989         int agaw;
1990         int r = (gaw - 12) % 9;
1991
1992         if (r == 0)
1993                 agaw = gaw;
1994         else
1995                 agaw = gaw + 9 - r;
1996         if (agaw > 64)
1997                 agaw = 64;
1998         return agaw;
1999 }
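/*
 * Worked example (editorial): gaw = 48 gives r = (48 - 12) % 9 = 0, so agaw
 * stays 48; gaw = 42 gives r = 3, so agaw = 42 + 9 - 3 = 48. The guest width
 * is rounded up to the next width the page-table levels can express, capped
 * at 64.
 */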
2000
2001 static void domain_exit(struct dmar_domain *domain)
2002 {
2003
2004         /* Remove associated devices and clear attached or cached domains */
2005         domain_remove_dev_info(domain);
2006
2007         /* destroy iovas */
2008         if (domain->domain.type == IOMMU_DOMAIN_DMA)
2009                 put_iova_domain(&domain->iovad);
2010
2011         if (domain->pgd) {
2012                 struct page *freelist;
2013
2014                 freelist = domain_unmap(domain, 0,
2015                                         DOMAIN_MAX_PFN(domain->gaw), NULL);
2016                 dma_free_pagelist(freelist);
2017         }
2018
2019         free_domain_mem(domain);
2020 }
2021
2022 /*
2023  * Get the PASID directory size for scalable mode context entry.
2024  * Value of X in the PDTS field of a scalable mode context entry
2025  * indicates PASID directory with 2^(X + 7) entries.
2026  */
2027 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2028 {
2029         int pds, max_pde;
2030
2031         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2032         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2033         if (pds < 7)
2034                 return 0;
2035
2036         return pds - 7;
2037 }
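/*
 * Worked example (editorial): if table->max_pasid >> PASID_PDE_SHIFT comes
 * out to 16384 (1 << 14), find_first_bit() returns 14, pds = 14 - 7 = 7, and
 * the PDTS field encodes a PASID directory of 2^(7 + 7) = 16384 entries.
 */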
2038
2039 /*
2040  * Set the RID_PASID field of a scalable mode context entry. The
2041  * IOMMU hardware will use the PASID value set in this field for
2042  * DMA translations of DMA requests without PASID.
2043  */
2044 static inline void
2045 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2046 {
2047         context->hi |= pasid & ((1 << 20) - 1);
2048 }
2049
2050 /*
2051  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2052  * entry.
2053  */
2054 static inline void context_set_sm_dte(struct context_entry *context)
2055 {
2056         context->lo |= (1 << 2);
2057 }
2058
2059 /*
2060  * Set the PRE(Page Request Enable) field of a scalable mode context
2061  * entry.
2062  */
2063 static inline void context_set_sm_pre(struct context_entry *context)
2064 {
2065         context->lo |= (1 << 4);
2066 }
2067
2068 /* Convert value to context PASID directory size field coding. */
2069 #define context_pdts(pds)       (((pds) & 0x7) << 9)
2070
2071 static int domain_context_mapping_one(struct dmar_domain *domain,
2072                                       struct intel_iommu *iommu,
2073                                       struct pasid_table *table,
2074                                       u8 bus, u8 devfn)
2075 {
2076         u16 did = domain->iommu_did[iommu->seq_id];
2077         int translation = CONTEXT_TT_MULTI_LEVEL;
2078         struct device_domain_info *info = NULL;
2079         struct context_entry *context;
2080         unsigned long flags;
2081         int ret;
2082
2083         WARN_ON(did == 0);
2084
2085         if (hw_pass_through && domain_type_is_si(domain))
2086                 translation = CONTEXT_TT_PASS_THROUGH;
2087
2088         pr_debug("Set context mapping for %02x:%02x.%d\n",
2089                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2090
2091         BUG_ON(!domain->pgd);
2092
2093         spin_lock_irqsave(&device_domain_lock, flags);
2094         spin_lock(&iommu->lock);
2095
2096         ret = -ENOMEM;
2097         context = iommu_context_addr(iommu, bus, devfn, 1);
2098         if (!context)
2099                 goto out_unlock;
2100
2101         ret = 0;
2102         if (context_present(context))
2103                 goto out_unlock;
2104
2105         /*
2106          * For kdump cases, old valid entries may be cached due to the
2107          * in-flight DMA and copied pgtable, but there is no unmapping
2108          * behaviour for them, thus we need an explicit cache flush for
2109          * the newly-mapped device. For kdump, at this point, the device
2110          * is supposed to have finished its reset during driver probe, so
2111          * no in-flight DMA will exist, and we don't need to worry about
2112          * it hereafter.
2113          */
2114         if (context_copied(context)) {
2115                 u16 did_old = context_domain_id(context);
2116
2117                 if (did_old < cap_ndoms(iommu->cap)) {
2118                         iommu->flush.flush_context(iommu, did_old,
2119                                                    (((u16)bus) << 8) | devfn,
2120                                                    DMA_CCMD_MASK_NOBIT,
2121                                                    DMA_CCMD_DEVICE_INVL);
2122                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2123                                                  DMA_TLB_DSI_FLUSH);
2124                 }
2125         }
2126
2127         context_clear_entry(context);
2128
2129         if (sm_supported(iommu)) {
2130                 unsigned long pds;
2131
2132                 WARN_ON(!table);
2133
2134                 /* Setup the PASID DIR pointer: */
2135                 pds = context_get_sm_pds(table);
2136                 context->lo = (u64)virt_to_phys(table->table) |
2137                                 context_pdts(pds);
2138
2139                 /* Setup the RID_PASID field: */
2140                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2141
2142                 /*
2143                  * Setup the Device-TLB enable bit and Page request
2144                  * Enable bit:
2145                  */
2146                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2147                 if (info && info->ats_supported)
2148                         context_set_sm_dte(context);
2149                 if (info && info->pri_supported)
2150                         context_set_sm_pre(context);
2151         } else {
2152                 struct dma_pte *pgd = domain->pgd;
2153                 int agaw;
2154
2155                 context_set_domain_id(context, did);
2156
2157                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2158                         /*
2159                          * Skip top levels of the page tables for an iommu whose
2160                          * agaw is less than the domain's. Unnecessary for PT mode.
2161                          */
2162                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2163                                 ret = -ENOMEM;
2164                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2165                                 if (!dma_pte_present(pgd))
2166                                         goto out_unlock;
2167                         }
2168
2169                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2170                         if (info && info->ats_supported)
2171                                 translation = CONTEXT_TT_DEV_IOTLB;
2172                         else
2173                                 translation = CONTEXT_TT_MULTI_LEVEL;
2174
2175                         context_set_address_root(context, virt_to_phys(pgd));
2176                         context_set_address_width(context, agaw);
2177                 } else {
2178                         /*
2179                          * In pass through mode, AW must be programmed to
2180                          * indicate the largest AGAW value supported by
2181                          * hardware. And ASR is ignored by hardware.
2182                          */
2183                         context_set_address_width(context, iommu->msagaw);
2184                 }
2185
2186                 context_set_translation_type(context, translation);
2187         }
2188
2189         context_set_fault_enable(context);
2190         context_set_present(context);
2191         if (!ecap_coherent(iommu->ecap))
2192                 clflush_cache_range(context, sizeof(*context));
2193
2194         /*
2195          * It's a non-present to present mapping. If hardware doesn't cache
2196          * non-present entries we only need to flush the write-buffer. If it
2197          * _does_ cache non-present entries, then it does so in the special
2198          * domain #0, which we have to flush:
2199          */
2200         if (cap_caching_mode(iommu->cap)) {
2201                 iommu->flush.flush_context(iommu, 0,
2202                                            (((u16)bus) << 8) | devfn,
2203                                            DMA_CCMD_MASK_NOBIT,
2204                                            DMA_CCMD_DEVICE_INVL);
2205                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2206         } else {
2207                 iommu_flush_write_buffer(iommu);
2208         }
2209         iommu_enable_dev_iotlb(info);
2210
2211         ret = 0;
2212
2213 out_unlock:
2214         spin_unlock(&iommu->lock);
2215         spin_unlock_irqrestore(&device_domain_lock, flags);
2216
2217         return ret;
2218 }
2219
2220 struct domain_context_mapping_data {
2221         struct dmar_domain *domain;
2222         struct intel_iommu *iommu;
2223         struct pasid_table *table;
2224 };
2225
2226 static int domain_context_mapping_cb(struct pci_dev *pdev,
2227                                      u16 alias, void *opaque)
2228 {
2229         struct domain_context_mapping_data *data = opaque;
2230
2231         return domain_context_mapping_one(data->domain, data->iommu,
2232                                           data->table, PCI_BUS_NUM(alias),
2233                                           alias & 0xff);
2234 }
2235
2236 static int
2237 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2238 {
2239         struct domain_context_mapping_data data;
2240         struct pasid_table *table;
2241         struct intel_iommu *iommu;
2242         u8 bus, devfn;
2243
2244         iommu = device_to_iommu(dev, &bus, &devfn);
2245         if (!iommu)
2246                 return -ENODEV;
2247
2248         table = intel_pasid_get_table(dev);
2249
2250         if (!dev_is_pci(dev))
2251                 return domain_context_mapping_one(domain, iommu, table,
2252                                                   bus, devfn);
2253
2254         data.domain = domain;
2255         data.iommu = iommu;
2256         data.table = table;
2257
2258         return pci_for_each_dma_alias(to_pci_dev(dev),
2259                                       &domain_context_mapping_cb, &data);
2260 }
2261
2262 static int domain_context_mapped_cb(struct pci_dev *pdev,
2263                                     u16 alias, void *opaque)
2264 {
2265         struct intel_iommu *iommu = opaque;
2266
2267         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2268 }
2269
2270 static int domain_context_mapped(struct device *dev)
2271 {
2272         struct intel_iommu *iommu;
2273         u8 bus, devfn;
2274
2275         iommu = device_to_iommu(dev, &bus, &devfn);
2276         if (!iommu)
2277                 return -ENODEV;
2278
2279         if (!dev_is_pci(dev))
2280                 return device_context_mapped(iommu, bus, devfn);
2281
2282         return !pci_for_each_dma_alias(to_pci_dev(dev),
2283                                        domain_context_mapped_cb, iommu);
2284 }
2285
2286 /* Returns a number of VTD pages, but aligned to MM page size */
2287 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2288                                             size_t size)
2289 {
2290         host_addr &= ~PAGE_MASK;
2291         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2292 }
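/*
 * Worked example (editorial, assuming 4KiB pages on both the MM and VT-d
 * sides): host_addr = 0x1a00 and size = 0x1000 give an in-page offset of
 * 0xa00, PAGE_ALIGN(0xa00 + 0x1000) = 0x2000, so the mapping spans
 * 0x2000 >> 12 = 2 VT-d pages even though the size alone is a single page.
 */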
2293
2294 /* Return largest possible superpage level for a given mapping */
2295 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2296                                           unsigned long iov_pfn,
2297                                           unsigned long phy_pfn,
2298                                           unsigned long pages)
2299 {
2300         int support, level = 1;
2301         unsigned long pfnmerge;
2302
2303         support = domain->iommu_superpage;
2304
2305         /* To use a large page, the virtual *and* physical addresses
2306            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2307            of them will mean we have to use smaller pages. So just
2308            merge them and check both at once. */
2309         pfnmerge = iov_pfn | phy_pfn;
2310
2311         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2312                 pages >>= VTD_STRIDE_SHIFT;
2313                 if (!pages)
2314                         break;
2315                 pfnmerge >>= VTD_STRIDE_SHIFT;
2316                 level++;
2317                 support--;
2318         }
2319         return level;
2320 }
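/*
 * Editorial example: with domain->iommu_superpage = 2, iov_pfn = 0x200 and
 * phy_pfn = 0x400 (both 2MiB-aligned for a 4KiB base page) and pages = 512,
 * the merged pfn has its low nine bits clear, so one pass through the loop
 * promotes level to 2 (a 2MiB superpage); the shifted merge is no longer
 * stride-aligned (and there aren't 1GiB worth of pages anyway), so the loop
 * stops there.
 */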
2321
2322 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2323                             struct scatterlist *sg, unsigned long phys_pfn,
2324                             unsigned long nr_pages, int prot)
2325 {
2326         struct dma_pte *first_pte = NULL, *pte = NULL;
2327         phys_addr_t pteval;
2328         unsigned long sg_res = 0;
2329         unsigned int largepage_lvl = 0;
2330         unsigned long lvl_pages = 0;
2331         u64 attr;
2332
2333         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2334
2335         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2336                 return -EINVAL;
2337
2338         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2339         if (domain_use_first_level(domain))
2340                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2341
2342         if (!sg) {
2343                 sg_res = nr_pages;
2344                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2345         }
2346
2347         while (nr_pages > 0) {
2348                 uint64_t tmp;
2349
2350                 if (!sg_res) {
2351                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2352
2353                         sg_res = aligned_nrpages(sg->offset, sg->length);
2354                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2355                         sg->dma_length = sg->length;
2356                         pteval = (sg_phys(sg) - pgoff) | attr;
2357                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2358                 }
2359
2360                 if (!pte) {
2361                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2362
2363                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2364                         if (!pte)
2365                                 return -ENOMEM;
2366                         /* It is a large page */
2367                         if (largepage_lvl > 1) {
2368                                 unsigned long nr_superpages, end_pfn;
2369
2370                                 pteval |= DMA_PTE_LARGE_PAGE;
2371                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2372
2373                                 nr_superpages = sg_res / lvl_pages;
2374                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2375
2376                                 /*
2377                                  * Ensure that old small page tables are
2378                                  * removed to make room for superpage(s).
2379                                  * We're adding new large pages, so make sure
2380                                  * we don't remove their parent tables.
2381                                  */
2382                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2383                                                        largepage_lvl + 1);
2384                         } else {
2385                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2386                         }
2387
2388                 }
2389                 /* We don't need a lock here; nobody else
2390                  * touches this iova range.
2391                  */
2392                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2393                 if (tmp) {
2394                         static int dumps = 5;
2395                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2396                                 iov_pfn, tmp, (unsigned long long)pteval);
2397                         if (dumps) {
2398                                 dumps--;
2399                                 debug_dma_dump_mappings(NULL);
2400                         }
2401                         WARN_ON(1);
2402                 }
2403
2404                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2405
2406                 BUG_ON(nr_pages < lvl_pages);
2407                 BUG_ON(sg_res < lvl_pages);
2408
2409                 nr_pages -= lvl_pages;
2410                 iov_pfn += lvl_pages;
2411                 phys_pfn += lvl_pages;
2412                 pteval += lvl_pages * VTD_PAGE_SIZE;
2413                 sg_res -= lvl_pages;
2414
2415                 /* If the next PTE would be the first in a new page, then we
2416                    need to flush the cache on the entries we've just written.
2417                    And then we'll need to recalculate 'pte', so clear it and
2418                    let it get set again in the if (!pte) block above.
2419
2420                    If we're done (!nr_pages) we need to flush the cache too.
2421
2422                    Also if we've been setting superpages, we may need to
2423                    recalculate 'pte' and switch back to smaller pages for the
2424                    end of the mapping, if the trailing size is not enough to
2425                    use another superpage (i.e. sg_res < lvl_pages). */
2426                 pte++;
2427                 if (!nr_pages || first_pte_in_page(pte) ||
2428                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2429                         domain_flush_cache(domain, first_pte,
2430                                            (void *)pte - (void *)first_pte);
2431                         pte = NULL;
2432                 }
2433
2434                 if (!sg_res && nr_pages)
2435                         sg = sg_next(sg);
2436         }
2437         return 0;
2438 }
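/*
 * Editorial walk-through (hypothetical call, no scatterlist): mapping
 * nr_pages = 1050 at a 2MiB-aligned iov_pfn/phys_pfn on hardware with 2MiB
 * superpage support writes two DMA_PTE_LARGE_PAGE entries of 512 pages each;
 * the "sg_res < lvl_pages" check above then forces pte to be recalculated,
 * and the remaining 26 pages are filled in as ordinary 4KiB PTEs.
 */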
2439
2440 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2441                           struct scatterlist *sg, unsigned long phys_pfn,
2442                           unsigned long nr_pages, int prot)
2443 {
2444         int iommu_id, ret;
2445         struct intel_iommu *iommu;
2446
2447         /* Do the real mapping first */
2448         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2449         if (ret)
2450                 return ret;
2451
2452         for_each_domain_iommu(iommu_id, domain) {
2453                 iommu = g_iommus[iommu_id];
2454                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2455         }
2456
2457         return 0;
2458 }
2459
2460 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2461                                     struct scatterlist *sg, unsigned long nr_pages,
2462                                     int prot)
2463 {
2464         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2465 }
2466
2467 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2468                                      unsigned long phys_pfn, unsigned long nr_pages,
2469                                      int prot)
2470 {
2471         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2472 }
2473
2474 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2475 {
2476         unsigned long flags;
2477         struct context_entry *context;
2478         u16 did_old;
2479
2480         if (!iommu)
2481                 return;
2482
2483         spin_lock_irqsave(&iommu->lock, flags);
2484         context = iommu_context_addr(iommu, bus, devfn, 0);
2485         if (!context) {
2486                 spin_unlock_irqrestore(&iommu->lock, flags);
2487                 return;
2488         }
2489         did_old = context_domain_id(context);
2490         context_clear_entry(context);
2491         __iommu_flush_cache(iommu, context, sizeof(*context));
2492         spin_unlock_irqrestore(&iommu->lock, flags);
2493         iommu->flush.flush_context(iommu,
2494                                    did_old,
2495                                    (((u16)bus) << 8) | devfn,
2496                                    DMA_CCMD_MASK_NOBIT,
2497                                    DMA_CCMD_DEVICE_INVL);
2498         iommu->flush.flush_iotlb(iommu,
2499                                  did_old,
2500                                  0,
2501                                  0,
2502                                  DMA_TLB_DSI_FLUSH);
2503 }
2504
2505 static inline void unlink_domain_info(struct device_domain_info *info)
2506 {
2507         assert_spin_locked(&device_domain_lock);
2508         list_del(&info->link);
2509         list_del(&info->global);
2510         if (info->dev)
2511                 dev_iommu_priv_set(info->dev, NULL);
2512 }
2513
2514 static void domain_remove_dev_info(struct dmar_domain *domain)
2515 {
2516         struct device_domain_info *info, *tmp;
2517         unsigned long flags;
2518
2519         spin_lock_irqsave(&device_domain_lock, flags);
2520         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2521                 __dmar_remove_one_dev_info(info);
2522         spin_unlock_irqrestore(&device_domain_lock, flags);
2523 }
2524
2525 struct dmar_domain *find_domain(struct device *dev)
2526 {
2527         struct device_domain_info *info;
2528
2529         if (unlikely(!dev || !dev->iommu))
2530                 return NULL;
2531
2532         if (unlikely(attach_deferred(dev)))
2533                 return NULL;
2534
2535         /* No lock here, assumes no domain exit in normal case */
2536         info = get_domain_info(dev);
2537         if (likely(info))
2538                 return info->domain;
2539
2540         return NULL;
2541 }
2542
2543 static void do_deferred_attach(struct device *dev)
2544 {
2545         struct iommu_domain *domain;
2546
2547         dev_iommu_priv_set(dev, NULL);
2548         domain = iommu_get_domain_for_dev(dev);
2549         if (domain)
2550                 intel_iommu_attach_device(domain, dev);
2551 }
2552
2553 static inline struct device_domain_info *
2554 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2555 {
2556         struct device_domain_info *info;
2557
2558         list_for_each_entry(info, &device_domain_list, global)
2559                 if (info->segment == segment && info->bus == bus &&
2560                     info->devfn == devfn)
2561                         return info;
2562
2563         return NULL;
2564 }
2565
2566 static int domain_setup_first_level(struct intel_iommu *iommu,
2567                                     struct dmar_domain *domain,
2568                                     struct device *dev,
2569                                     u32 pasid)
2570 {
2571         int flags = PASID_FLAG_SUPERVISOR_MODE;
2572         struct dma_pte *pgd = domain->pgd;
2573         int agaw, level;
2574
2575         /*
2576          * Skip top levels of the page tables for an iommu whose
2577          * agaw is less than the domain's. Unnecessary for PT mode.
2578          */
2579         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2580                 pgd = phys_to_virt(dma_pte_addr(pgd));
2581                 if (!dma_pte_present(pgd))
2582                         return -ENOMEM;
2583         }
2584
2585         level = agaw_to_level(agaw);
2586         if (level != 4 && level != 5)
2587                 return -EINVAL;
2588
2589         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2590
2591         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2592                                              domain->iommu_did[iommu->seq_id],
2593                                              flags);
2594 }
2595
2596 static bool dev_is_real_dma_subdevice(struct device *dev)
2597 {
2598         return dev && dev_is_pci(dev) &&
2599                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2600 }
2601
2602 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2603                                                     int bus, int devfn,
2604                                                     struct device *dev,
2605                                                     struct dmar_domain *domain)
2606 {
2607         struct dmar_domain *found = NULL;
2608         struct device_domain_info *info;
2609         unsigned long flags;
2610         int ret;
2611
2612         info = alloc_devinfo_mem();
2613         if (!info)
2614                 return NULL;
2615
2616         if (!dev_is_real_dma_subdevice(dev)) {
2617                 info->bus = bus;
2618                 info->devfn = devfn;
2619                 info->segment = iommu->segment;
2620         } else {
2621                 struct pci_dev *pdev = to_pci_dev(dev);
2622
2623                 info->bus = pdev->bus->number;
2624                 info->devfn = pdev->devfn;
2625                 info->segment = pci_domain_nr(pdev->bus);
2626         }
2627
2628         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2629         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2630         info->ats_qdep = 0;
2631         info->dev = dev;
2632         info->domain = domain;
2633         info->iommu = iommu;
2634         info->pasid_table = NULL;
2635         info->auxd_enabled = 0;
2636         INIT_LIST_HEAD(&info->auxiliary_domains);
2637
2638         if (dev && dev_is_pci(dev)) {
2639                 struct pci_dev *pdev = to_pci_dev(info->dev);
2640
2641                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2642                     pci_ats_supported(pdev) &&
2643                     dmar_find_matched_atsr_unit(pdev))
2644                         info->ats_supported = 1;
2645
2646                 if (sm_supported(iommu)) {
2647                         if (pasid_supported(iommu)) {
2648                                 int features = pci_pasid_features(pdev);
2649                                 if (features >= 0)
2650                                         info->pasid_supported = features | 1;
2651                         }
2652
2653                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2654                             pci_pri_supported(pdev))
2655                                 info->pri_supported = 1;
2656                 }
2657         }
2658
2659         spin_lock_irqsave(&device_domain_lock, flags);
2660         if (dev)
2661                 found = find_domain(dev);
2662
2663         if (!found) {
2664                 struct device_domain_info *info2;
2665                 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2666                                                        info->devfn);
2667                 if (info2) {
2668                         found      = info2->domain;
2669                         info2->dev = dev;
2670                 }
2671         }
2672
2673         if (found) {
2674                 spin_unlock_irqrestore(&device_domain_lock, flags);
2675                 free_devinfo_mem(info);
2676                 /* Caller must free the original domain */
2677                 return found;
2678         }
2679
2680         spin_lock(&iommu->lock);
2681         ret = domain_attach_iommu(domain, iommu);
2682         spin_unlock(&iommu->lock);
2683
2684         if (ret) {
2685                 spin_unlock_irqrestore(&device_domain_lock, flags);
2686                 free_devinfo_mem(info);
2687                 return NULL;
2688         }
2689
2690         list_add(&info->link, &domain->devices);
2691         list_add(&info->global, &device_domain_list);
2692         if (dev)
2693                 dev_iommu_priv_set(dev, info);
2694         spin_unlock_irqrestore(&device_domain_lock, flags);
2695
2696         /* PASID table is mandatory for a PCI device in scalable mode. */
2697         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2698                 ret = intel_pasid_alloc_table(dev);
2699                 if (ret) {
2700                         dev_err(dev, "PASID table allocation failed\n");
2701                         dmar_remove_one_dev_info(dev);
2702                         return NULL;
2703                 }
2704
2705                 /* Setup the PASID entry for requests without PASID: */
2706                 spin_lock_irqsave(&iommu->lock, flags);
2707                 if (hw_pass_through && domain_type_is_si(domain))
2708                         ret = intel_pasid_setup_pass_through(iommu, domain,
2709                                         dev, PASID_RID2PASID);
2710                 else if (domain_use_first_level(domain))
2711                         ret = domain_setup_first_level(iommu, domain, dev,
2712                                         PASID_RID2PASID);
2713                 else
2714                         ret = intel_pasid_setup_second_level(iommu, domain,
2715                                         dev, PASID_RID2PASID);
2716                 spin_unlock_irqrestore(&iommu->lock, flags);
2717                 if (ret) {
2718                         dev_err(dev, "Setup RID2PASID failed\n");
2719                         dmar_remove_one_dev_info(dev);
2720                         return NULL;
2721                 }
2722         }
2723
2724         if (dev && domain_context_mapping(domain, dev)) {
2725                 dev_err(dev, "Domain context map failed\n");
2726                 dmar_remove_one_dev_info(dev);
2727                 return NULL;
2728         }
2729
2730         return domain;
2731 }
2732
2733 static int iommu_domain_identity_map(struct dmar_domain *domain,
2734                                      unsigned long first_vpfn,
2735                                      unsigned long last_vpfn)
2736 {
2737         /*
2738          * The RMRR range might overlap an existing mapping of the
2739          * physical memory range, so clear it first.
2740          */
2741         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2742
2743         return __domain_mapping(domain, first_vpfn, NULL,
2744                                 first_vpfn, last_vpfn - first_vpfn + 1,
2745                                 DMA_PTE_READ|DMA_PTE_WRITE);
2746 }
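/*
 * Editorial example (hypothetical range): first_vpfn = 0x800 and
 * last_vpfn = 0x9ff clear and then map 512 pages so that IOVA
 * 0x800000-0x9fffff translates one-to-one onto the same physical range with
 * read/write permission, which is what the si_domain and RMRR setup below
 * rely on.
 */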
2747
2748 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2749
2750 static int __init si_domain_init(int hw)
2751 {
2752         struct dmar_rmrr_unit *rmrr;
2753         struct device *dev;
2754         int i, nid, ret;
2755
2756         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2757         if (!si_domain)
2758                 return -EFAULT;
2759
2760         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2761                 domain_exit(si_domain);
2762                 return -EFAULT;
2763         }
2764
2765         if (hw)
2766                 return 0;
2767
2768         for_each_online_node(nid) {
2769                 unsigned long start_pfn, end_pfn;
2770                 int i;
2771
2772                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2773                         ret = iommu_domain_identity_map(si_domain,
2774                                         mm_to_dma_pfn(start_pfn),
2775                                         mm_to_dma_pfn(end_pfn));
2776                         if (ret)
2777                                 return ret;
2778                 }
2779         }
2780
2781         /*
2782          * Identity-map the RMRRs so that devices with RMRRs can also use
2783          * the si_domain.
2784          */
2785         for_each_rmrr_units(rmrr) {
2786                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2787                                           i, dev) {
2788                         unsigned long long start = rmrr->base_address;
2789                         unsigned long long end = rmrr->end_address;
2790
2791                         if (WARN_ON(end < start ||
2792                                     end >> agaw_to_width(si_domain->agaw)))
2793                                 continue;
2794
2795                         ret = iommu_domain_identity_map(si_domain,
2796                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2797                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2798                         if (ret)
2799                                 return ret;
2800                 }
2801         }
2802
2803         return 0;
2804 }
2805
2806 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2807 {
2808         struct dmar_domain *ndomain;
2809         struct intel_iommu *iommu;
2810         u8 bus, devfn;
2811
2812         iommu = device_to_iommu(dev, &bus, &devfn);
2813         if (!iommu)
2814                 return -ENODEV;
2815
2816         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2817         if (ndomain != domain)
2818                 return -EBUSY;
2819
2820         return 0;
2821 }
2822
2823 static bool device_has_rmrr(struct device *dev)
2824 {
2825         struct dmar_rmrr_unit *rmrr;
2826         struct device *tmp;
2827         int i;
2828
2829         rcu_read_lock();
2830         for_each_rmrr_units(rmrr) {
2831                 /*
2832                  * Return TRUE if this RMRR contains the device that
2833                  * is passed in.
2834                  */
2835                 for_each_active_dev_scope(rmrr->devices,
2836                                           rmrr->devices_cnt, i, tmp)
2837                         if (tmp == dev ||
2838                             is_downstream_to_pci_bridge(dev, tmp)) {
2839                                 rcu_read_unlock();
2840                                 return true;
2841                         }
2842         }
2843         rcu_read_unlock();
2844         return false;
2845 }
2846
2847 /**
2848  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2849  * is relaxable (i.e. it is allowed to be left unenforced under some conditions)
2850  * @dev: device handle
2851  *
2852  * We assume that PCI USB devices with RMRRs have them largely
2853  * for historical reasons and that the RMRR space is not actively used post
2854  * boot.  This exclusion may change if vendors begin to abuse it.
2855  *
2856  * The same exception is made for graphics devices, with the requirement that
2857  * any use of the RMRR regions will be torn down before assigning the device
2858  * to a guest.
2859  *
2860  * Return: true if the RMRR is relaxable, false otherwise
2861  */
2862 static bool device_rmrr_is_relaxable(struct device *dev)
2863 {
2864         struct pci_dev *pdev;
2865
2866         if (!dev_is_pci(dev))
2867                 return false;
2868
2869         pdev = to_pci_dev(dev);
2870         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2871                 return true;
2872         else
2873                 return false;
2874 }
2875
2876 /*
2877  * There are a couple of cases where we need to restrict the functionality of
2878  * devices associated with RMRRs.  The first is when evaluating a device for
2879  * identity mapping because problems exist when devices are moved in and out
2880  * of domains and their respective RMRR information is lost.  This means that
2881  * a device with associated RMRRs will never be in a "passthrough" domain.
2882  * The second is use of the device through the IOMMU API.  This interface
2883  * expects to have full control of the IOVA space for the device.  We cannot
2884  * satisfy both the requirement that RMRR access is maintained and have an
2885  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2886  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2887  * We therefore prevent devices associated with an RMRR from participating in
2888  * the IOMMU API, which eliminates them from device assignment.
2889  *
2890  * In both cases, devices which have relaxable RMRRs are not concerned by this
2891  * restriction. See device_rmrr_is_relaxable comment.
2892  */
2893 static bool device_is_rmrr_locked(struct device *dev)
2894 {
2895         if (!device_has_rmrr(dev))
2896                 return false;
2897
2898         if (device_rmrr_is_relaxable(dev))
2899                 return false;
2900
2901         return true;
2902 }
2903
2904 /*
2905  * Return the required default domain type for a specific device.
2906  *
2907  * @dev: the device in question
2909  *
2910  * Returns:
2911  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2912  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2913  *  - 0: both identity and dynamic domains work for this device
2914  */
2915 static int device_def_domain_type(struct device *dev)
2916 {
2917         if (dev_is_pci(dev)) {
2918                 struct pci_dev *pdev = to_pci_dev(dev);
2919
2920                 /*
2921                  * Prevent any device marked as untrusted from getting
2922                  * placed into the static identity mapping domain.
2923                  */
2924                 if (pdev->untrusted)
2925                         return IOMMU_DOMAIN_DMA;
2926
2927                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2928                         return IOMMU_DOMAIN_IDENTITY;
2929
2930                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2931                         return IOMMU_DOMAIN_IDENTITY;
2932         }
2933
2934         return 0;
2935 }
2936
2937 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2938 {
2939         /*
2940          * Start from a sane iommu hardware state.
2941          * If queued invalidation was already initialized by us
2942          * (for example, while enabling interrupt remapping) then
2943          * things are already rolling from a sane state.
2944          */
2945         if (!iommu->qi) {
2946                 /*
2947                  * Clear any previous faults.
2948                  */
2949                 dmar_fault(-1, iommu);
2950                 /*
2951                  * Disable queued invalidation if supported and already enabled
2952                  * before OS handover.
2953                  */
2954                 dmar_disable_qi(iommu);
2955         }
2956
2957         if (dmar_enable_qi(iommu)) {
2958                 /*
2959                  * Queued invalidation is not enabled, so use register-based invalidation
2960                  */
2961                 iommu->flush.flush_context = __iommu_flush_context;
2962                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2963                 pr_info("%s: Using Register based invalidation\n",
2964                         iommu->name);
2965         } else {
2966                 iommu->flush.flush_context = qi_flush_context;
2967                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2968                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2969         }
2970 }
2971
2972 static int copy_context_table(struct intel_iommu *iommu,
2973                               struct root_entry *old_re,
2974                               struct context_entry **tbl,
2975                               int bus, bool ext)
2976 {
2977         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2978         struct context_entry *new_ce = NULL, ce;
2979         struct context_entry *old_ce = NULL;
2980         struct root_entry re;
2981         phys_addr_t old_ce_phys;
2982
2983         tbl_idx = ext ? bus * 2 : bus;
2984         memcpy(&re, old_re, sizeof(re));
2985
2986         for (devfn = 0; devfn < 256; devfn++) {
2987                 /* First calculate the correct index */
2988                 idx = (ext ? devfn * 2 : devfn) % 256;
2989
2990                 if (idx == 0) {
2991                         /* First save what we may have and clean up */
2992                         if (new_ce) {
2993                                 tbl[tbl_idx] = new_ce;
2994                                 __iommu_flush_cache(iommu, new_ce,
2995                                                     VTD_PAGE_SIZE);
2996                                 pos = 1;
2997                         }
2998
2999                         if (old_ce)
3000                                 memunmap(old_ce);
3001
3002                         ret = 0;
3003                         if (devfn < 0x80)
3004                                 old_ce_phys = root_entry_lctp(&re);
3005                         else
3006                                 old_ce_phys = root_entry_uctp(&re);
3007
3008                         if (!old_ce_phys) {
3009                                 if (ext && devfn == 0) {
3010                                         /* No LCTP, try UCTP */
3011                                         devfn = 0x7f;
3012                                         continue;
3013                                 } else {
3014                                         goto out;
3015                                 }
3016                         }
3017
3018                         ret = -ENOMEM;
3019                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3020                                         MEMREMAP_WB);
3021                         if (!old_ce)
3022                                 goto out;
3023
3024                         new_ce = alloc_pgtable_page(iommu->node);
3025                         if (!new_ce)
3026                                 goto out_unmap;
3027
3028                         ret = 0;
3029                 }
3030
3031                 /* Now copy the context entry */
3032                 memcpy(&ce, old_ce + idx, sizeof(ce));
3033
3034                 if (!__context_present(&ce))
3035                         continue;
3036
3037                 did = context_domain_id(&ce);
3038                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3039                         set_bit(did, iommu->domain_ids);
3040
3041                 /*
3042                  * We need a marker for copied context entries. This
3043                  * marker needs to work for the old format as well as
3044                  * for extended context entries.
3045                  *
3046                  * Bit 67 of the context entry is used. In the old
3047                  * format this bit is available to software, in the
3048                  * extended format it is the PGE bit, but PGE is ignored
3049                  * by HW if PASIDs are disabled (and thus still
3050                  * available).
3051                  *
3052                  * So disable PASIDs first and then mark the entry
3053                  * copied. This means that we don't copy PASID
3054                  * translations from the old kernel, but this is fine as
3055                  * faults there are not fatal.
3056                  */
3057                 context_clear_pasid_enable(&ce);
3058                 context_set_copied(&ce);
3059
3060                 new_ce[idx] = ce;
3061         }
3062
3063         tbl[tbl_idx + pos] = new_ce;
3064
3065         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3066
3067 out_unmap:
3068         memunmap(old_ce);
3069
3070 out:
3071         return ret;
3072 }
3073
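     /*
      * Copy the DMA translation structures left behind by the previous
      * kernel: read the old root table address from DMAR_RTADDR_REG, copy
      * each per-bus context table and then install the copies in this
      * kernel's root entry table.
      */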
3074 static int copy_translation_tables(struct intel_iommu *iommu)
3075 {
3076         struct context_entry **ctxt_tbls;
3077         struct root_entry *old_rt;
3078         phys_addr_t old_rt_phys;
3079         int ctxt_table_entries;
3080         unsigned long flags;
3081         u64 rtaddr_reg;
3082         int bus, ret;
3083         bool new_ext, ext;
3084
3085         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3086         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3087         new_ext    = !!ecap_ecs(iommu->ecap);
3088
3089         /*
3090          * The RTT bit can only be changed when translation is disabled,
3091          * but disabling translation would open a window for data
3092          * corruption. So bail out and don't copy anything if we would
3093          * have to change the bit.
3094          */
3095         if (new_ext != ext)
3096                 return -EINVAL;
3097
3098         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3099         if (!old_rt_phys)
3100                 return -EINVAL;
3101
3102         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3103         if (!old_rt)
3104                 return -ENOMEM;
3105
3106         /* This is too big for the stack - allocate it from slab */
3107         ctxt_table_entries = ext ? 512 : 256;
3108         ret = -ENOMEM;
3109         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3110         if (!ctxt_tbls)
3111                 goto out_unmap;
3112
3113         for (bus = 0; bus < 256; bus++) {
3114                 ret = copy_context_table(iommu, &old_rt[bus],
3115                                          ctxt_tbls, bus, ext);
3116                 if (ret) {
3117                         pr_err("%s: Failed to copy context table for bus %d\n",
3118                                 iommu->name, bus);
3119                         continue;
3120                 }
3121         }
3122
3123         spin_lock_irqsave(&iommu->lock, flags);
3124
3125         /* Context tables are copied, now write them to the root_entry table */
3126         for (bus = 0; bus < 256; bus++) {
3127                 int idx = ext ? bus * 2 : bus;
3128                 u64 val;
3129
3130                 if (ctxt_tbls[idx]) {
3131                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3132                         iommu->root_entry[bus].lo = val;
3133                 }
3134
3135                 if (!ext || !ctxt_tbls[idx + 1])
3136                         continue;
3137
3138                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3139                 iommu->root_entry[bus].hi = val;
3140         }
3141
3142         spin_unlock_irqrestore(&iommu->lock, flags);
3143
3144         kfree(ctxt_tbls);
3145
3146         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3147
3148         ret = 0;
3149
3150 out_unmap:
3151         memunmap(old_rt);
3152
3153         return ret;
3154 }
3155
3156 #ifdef CONFIG_INTEL_IOMMU_SVM
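     /*
      * Custom IOASID (PASID) allocator backed by the VT-d virtual command
      * interface. It is only registered when this IOMMU is a vIOMMU in a
      * guest (see register_pasid_allocator() below), so that guest PASIDs
      * are ultimately allocated by the host.
      */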
3157 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3158 {
3159         struct intel_iommu *iommu = data;
3160         ioasid_t ioasid;
3161
3162         if (!iommu)
3163                 return INVALID_IOASID;
3164         /*
3165          * The VT-d virtual command interface always uses the full 20-bit
3166          * PASID range. The host can partition the guest PASID range based
3167          * on policies, but this is out of the guest's control.
3168          */
3169         if (min < PASID_MIN || max > intel_pasid_max_id)
3170                 return INVALID_IOASID;
3171
3172         if (vcmd_alloc_pasid(iommu, &ioasid))
3173                 return INVALID_IOASID;
3174
3175         return ioasid;
3176 }
3177
3178 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3179 {
3180         struct intel_iommu *iommu = data;
3181
3182         if (!iommu)
3183                 return;
3184         /*
3185          * Sanity checking of the ioasid owner is done at the upper layer,
3186          * e.g. VFIO. We can only free the PASID when all devices are unbound.
3187          */
3188         if (ioasid_find(NULL, ioasid, NULL)) {
3189                 pr_alert("Cannot free active IOASID %d\n", ioasid);
3190                 return;
3191         }
3192         vcmd_free_pasid(iommu, ioasid);
3193 }
3194
3195 static void register_pasid_allocator(struct intel_iommu *iommu)
3196 {
3197         /*
3198          * If we are running in the host, there is no need for a custom
3199          * allocator since PASIDs are allocated system-wide by the host.
3200          */
3201         if (!cap_caching_mode(iommu->cap))
3202                 return;
3203
3204         if (!sm_supported(iommu)) {
3205                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3206                 return;
3207         }
3208
3209         /*
3210          * Register a custom PASID allocator if we are running in a guest;
3211          * guest PASIDs must be obtained via the virtual command interface.
3212          * There can be multiple vIOMMUs in each guest but only one allocator
3213          * is active. All vIOMMU allocators will eventually call the same
3214          * host allocator.
3215          */
3216         if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3217                 return;
3218
3219         pr_info("Register custom PASID allocator\n");
3220         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3221         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3222         iommu->pasid_allocator.pdata = (void *)iommu;
3223         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3224                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3225                 /*
3226                  * Disable scalable mode on this IOMMU if there
3227                  * is no custom allocator. Mixing SM-capable and
3228                  * non-SM vIOMMUs is not supported.
3229                  */
3230                 intel_iommu_sm = 0;
3231         }
3232 }
3233 #endif
3234
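     /*
      * One-time initialization of all DMAR units at boot: allocate the
      * global IOMMU array, set up queued invalidation, domain ID tracking
      * and root entries (copying translation tables from the previous
      * kernel where applicable), initialize the static identity domain and
      * finally enable fault reporting for each unit.
      */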
3235 static int __init init_dmars(void)
3236 {
3237         struct dmar_drhd_unit *drhd;
3238         struct intel_iommu *iommu;
3239         int ret;
3240
3241         /*
3242          * for each drhd
3243          *    allocate root
3244          *    initialize and program root entry to not present
3245          * endfor
3246          */
3247         for_each_drhd_unit(drhd) {
3248                 /*
3249                  * No lock is needed as this is only incremented in the
3250                  * single-threaded kernel __init code path; all other
3251                  * accesses are read-only.
3252                  */
3253                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3254                         g_num_of_iommus++;
3255                         continue;
3256                 }
3257                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3258         }
3259
3260         /* Preallocate enough resources for IOMMU hot-addition */
3261         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3262                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3263
3264         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3265                         GFP_KERNEL);
3266         if (!g_iommus) {
3267                 pr_err("Allocating global iommu array failed\n");
3268                 ret = -ENOMEM;
3269                 goto error;
3270         }
3271
3272         for_each_iommu(iommu, drhd) {
3273                 if (drhd->ignored) {
3274                         iommu_disable_translation(iommu);
3275                         continue;
3276                 }
3277
3278                 /*
3279                  * Find the smallest maximum PASID size among all IOMMUs
3280                  * in the system. The system-wide PASID table must be no
3281                  * bigger than the smallest size any IOMMU supports.
3282                  */
3283                 if (pasid_supported(iommu)) {
3284                         u32 temp = 2 << ecap_pss(iommu->ecap);
3285
3286                         intel_pasid_max_id = min_t(u32, temp,
3287                                                    intel_pasid_max_id);
3288                 }
3289
3290                 g_iommus[iommu->seq_id] = iommu;
3291
3292                 intel_iommu_init_qi(iommu);
3293
3294                 ret = iommu_init_domains(iommu);
3295                 if (ret)
3296                         goto free_iommu;
3297
3298                 init_translation_status(iommu);
3299
3300                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3301                         iommu_disable_translation(iommu);
3302                         clear_translation_pre_enabled(iommu);
3303                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3304                                 iommu->name);
3305                 }
3306
3307                 /*
3308                  * TBD:
3309                  * we could share the same root & context tables
3310                  * among all IOMMUs. This needs to be split up later.
3311                  */
3312                 ret = iommu_alloc_root_entry(iommu);
3313                 if (ret)
3314                         goto free_iommu;
3315
3316                 if (translation_pre_enabled(iommu)) {
3317                         pr_info("Translation already enabled - trying to copy translation structures\n");
3318
3319                         ret = copy_translation_tables(iommu);
3320                         if (ret) {
3321                                 /*
3322                                  * We found the IOMMU with translation
3323                                  * enabled - but failed to copy over the
3324                                  * old root-entry table. Try to proceed
3325                                  * by disabling translation now and
3326                                  * allocating a clean root-entry table.
3327                                  * This might cause DMAR faults, but
3328                                  * probably the dump will still succeed.
3329                                  */
3330                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3331                                        iommu->name);
3332                                 iommu_disable_translation(iommu);
3333                                 clear_translation_pre_enabled(iommu);
3334                         } else {
3335                                 pr_info("Copied translation tables from previous kernel for %s\n",
3336                                         iommu->name);
3337                         }
3338                 }
3339
3340                 if (!ecap_pass_through(iommu->ecap))
3341                         hw_pass_through = 0;
3342                 intel_svm_check(iommu);
3343         }
3344
3345         /*
3346          * Now that qi is enabled on all iommus, set the root entry and flush
3347          * caches. This is required on some Intel X58 chipsets, otherwise the
3348          * flush_context function will loop forever and the boot hangs.
3349          */
3350         for_each_active_iommu(iommu, drhd) {
3351                 iommu_flush_write_buffer(iommu);
3352 #ifdef CONFIG_INTEL_IOMMU_SVM
3353                 register_pasid_allocator(iommu);
3354 #endif
3355                 iommu_set_root_entry(iommu);
3356                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3357                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3358         }
3359
3360 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3361         dmar_map_gfx = 0;
3362 #endif
3363
3364         if (!dmar_map_gfx)
3365                 iommu_identity_mapping |= IDENTMAP_GFX;
3366
3367         check_tylersburg_isoch();
3368
3369         ret = si_domain_init(hw_pass_through);
3370         if (ret)
3371                 goto free_iommu;
3372
3373         /*
3374          * for each drhd
3375          *   enable fault log
3376          *   global invalidate context cache
3377          *   global invalidate iotlb
3378          *   enable translation
3379          */
3380         for_each_iommu(iommu, drhd) {
3381                 if (drhd->ignored) {
3382                         /*
3383                          * we always have to disable PMRs or DMA may fail on
3384                          * this device
3385                          */
3386                         if (force_on)
3387                                 iommu_disable_protect_mem_regions(iommu);
3388                         continue;
3389                 }
3390
3391                 iommu_flush_write_buffer(iommu);
3392
3393 #ifdef CONFIG_INTEL_IOMMU_SVM
3394                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3395                         /*
3396                          * Calling dmar_alloc_hwirq() with dmar_global_lock
3397                          * held could cause a lock race, so drop the lock here.
3398                          */
3399                         up_write(&dmar_global_lock);
3400                         ret = intel_svm_enable_prq(iommu);
3401                         down_write(&dmar_global_lock);
3402                         if (ret)
3403                                 goto free_iommu;
3404                 }
3405 #endif
3406                 ret = dmar_set_interrupt(iommu);
3407                 if (ret)
3408                         goto free_iommu;
3409         }
3410
3411         return 0;
3412
3413 free_iommu:
3414         for_each_active_iommu(iommu, drhd) {
3415                 disable_dmar_iommu(iommu);
3416                 free_dmar_iommu(iommu);
3417         }
3418
3419         kfree(g_iommus);
3420
3421 error:
3422         return ret;
3423 }
3424
3425 /* This takes a number of _MM_ pages, not VTD pages */
3426 static unsigned long intel_alloc_iova(struct device *dev,
3427                                      struct dmar_domain *domain,
3428                                      unsigned long nrpages, uint64_t dma_mask)
3429 {
3430         unsigned long iova_pfn;
3431
3432         /*
3433          * Restrict dma_mask to the width that the iommu can handle.
3434          * First-level translation restricts the input-address to a
3435          * canonical address (i.e., address bits 63:N have the same
3436          * value as address bit [N-1], where N is 48-bits with 4-level
3437          * paging and 57-bits with 5-level paging). Hence, skip bit
3438          * [N-1].
3439          */
3440         if (domain_use_first_level(domain))
3441                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3442                                  dma_mask);
3443         else
3444                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3445                                  dma_mask);
3446
3447         /* Ensure we reserve the whole size-aligned region */
3448         nrpages = __roundup_pow_of_two(nrpages);
3449
3450         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3451                 /*
3452                  * First try to allocate an io virtual address in
3453                  * DMA_BIT_MASK(32) and if that fails then try allocating
3454                  * from higher range
3455                  */
3456                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3457                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3458                 if (iova_pfn)
3459                         return iova_pfn;
3460         }
3461         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3462                                    IOVA_PFN(dma_mask), true);
3463         if (unlikely(!iova_pfn)) {
3464                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3465                              nrpages);
3466                 return 0;
3467         }
3468
3469         return iova_pfn;
3470 }
3471
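     /*
      * Map a physically contiguous buffer for DMA: allocate an IOVA range
      * large enough for the (page-aligned) buffer, create the IOMMU page
      * table entries with permissions derived from the DMA direction, and
      * return the resulting bus address. Drivers reach this through the
      * generic DMA API rather than calling it directly, e.g.:
      *
      *	dma_addr_t dma = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
      *	if (dma_mapping_error(dev, dma))
      *		return -ENOMEM;
      */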
3472 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3473                                      size_t size, int dir, u64 dma_mask)
3474 {
3475         struct dmar_domain *domain;
3476         phys_addr_t start_paddr;
3477         unsigned long iova_pfn;
3478         int prot = 0;
3479         int ret;
3480         struct intel_iommu *iommu;
3481         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3482
3483         BUG_ON(dir == DMA_NONE);
3484
3485         if (unlikely(attach_deferred(dev)))
3486                 do_deferred_attach(dev);
3487
3488         domain = find_domain(dev);
3489         if (!domain)
3490                 return DMA_MAPPING_ERROR;
3491
3492         iommu = domain_get_iommu(domain);
3493         size = aligned_nrpages(paddr, size);
3494
3495         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3496         if (!iova_pfn)
3497                 goto error;
3498
3499         /*
3500          * Check if DMAR supports zero-length reads on write-only
3501          * mappings.
3502          */
3503         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3504                         !cap_zlr(iommu->cap))
3505                 prot |= DMA_PTE_READ;
3506         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3507                 prot |= DMA_PTE_WRITE;
3508         /*
3509          * paddr .. (paddr + size) might span a partial page, so map the
3510          * whole page.  Note: if two parts of one page are mapped separately,
3511          * we might end up with two guest addresses mapping to the same host
3512          * paddr, but this is not a big problem.
3513          */
3514         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3515                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3516         if (ret)
3517                 goto error;
3518
3519         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3520         start_paddr += paddr & ~PAGE_MASK;
3521
3522         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3523
3524         return start_paddr;
3525
3526 error:
3527         if (iova_pfn)
3528                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3529         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3530                 size, (unsigned long long)paddr, dir);
3531         return DMA_MAPPING_ERROR;
3532 }
3533
3534 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3535                                  unsigned long offset, size_t size,
3536                                  enum dma_data_direction dir,
3537                                  unsigned long attrs)
3538 {
3539         return __intel_map_single(dev, page_to_phys(page) + offset,
3540                                   size, dir, *dev->dma_mask);
3541 }
3542
3543 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3544                                      size_t size, enum dma_data_direction dir,
3545                                      unsigned long attrs)
3546 {
3547         return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3548 }
3549
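     /*
      * Tear down a DMA mapping. domain_unmap() returns the list of page
      * table pages that were freed; with strict invalidation, for untrusted
      * devices, or when no flush queue is available, the IOTLB is flushed
      * synchronously and the freelist is released right here. Otherwise
      * both the IOVA range and the freelist are put on the flush queue and
      * released after a deferred flush.
      */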
3550 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3551 {
3552         struct dmar_domain *domain;
3553         unsigned long start_pfn, last_pfn;
3554         unsigned long nrpages;
3555         unsigned long iova_pfn;
3556         struct intel_iommu *iommu;
3557         struct page *freelist;
3558         struct pci_dev *pdev = NULL;
3559
3560         domain = find_domain(dev);
3561         BUG_ON(!domain);
3562
3563         iommu = domain_get_iommu(domain);
3564
3565         iova_pfn = IOVA_PFN(dev_addr);
3566
3567         nrpages = aligned_nrpages(dev_addr, size);
3568         start_pfn = mm_to_dma_pfn(iova_pfn);
3569         last_pfn = start_pfn + nrpages - 1;
3570
3571         if (dev_is_pci(dev))
3572                 pdev = to_pci_dev(dev);
3573
3574         freelist = domain_unmap(domain, start_pfn, last_pfn, NULL);
3575         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3576                         !has_iova_flush_queue(&domain->iovad)) {
3577                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3578                                       nrpages, !freelist, 0);
3579                 /* free iova */
3580                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3581                 dma_free_pagelist(freelist);
3582         } else {
3583                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3584                            (unsigned long)freelist);
3585                 /*
3586                  * Queue up the release of the unmap to save roughly 1/6th of
3587                  * the CPU time consumed by the IOTLB flush operation.
3588                  */
3589         }
3590
3591         trace_unmap_single(dev, dev_addr, size);
3592 }
3593
3594 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3595                              size_t size, enum dma_data_direction dir,
3596                              unsigned long attrs)
3597 {
3598         intel_unmap(dev, dev_addr, size);
3599 }
3600
3601 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3602                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3603 {
3604         intel_unmap(dev, dev_addr, size);
3605 }
3606
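     /*
      * Coherent allocation: get zeroed pages (preferring CMA for larger,
      * blocking allocations) and map them bidirectionally through
      * __intel_map_single() using the device's coherent DMA mask.
      */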
3607 static void *intel_alloc_coherent(struct device *dev, size_t size,
3608                                   dma_addr_t *dma_handle, gfp_t flags,
3609                                   unsigned long attrs)
3610 {
3611         struct page *page = NULL;
3612         int order;
3613
3614         if (unlikely(attach_deferred(dev)))
3615                 do_deferred_attach(dev);
3616
3617         size = PAGE_ALIGN(size);
3618         order = get_order(size);
3619
3620         if (gfpflags_allow_blocking(flags)) {
3621                 unsigned int count = size >> PAGE_SHIFT;
3622
3623                 page = dma_alloc_from_contiguous(dev, count, order,
3624                                                  flags & __GFP_NOWARN);
3625         }
3626
3627         if (!page)
3628                 page = alloc_pages(flags, order);
3629         if (!page)
3630                 return NULL;
3631         memset(page_address(page), 0, size);
3632
3633         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3634                                          DMA_BIDIRECTIONAL,
3635                                          dev->coherent_dma_mask);
3636         if (*dma_handle != DMA_MAPPING_ERROR)
3637                 return page_address(page);
3638         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3639                 __free_pages(page, order);
3640
3641         return NULL;
3642 }
3643
3644 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3645                                 dma_addr_t dma_handle, unsigned long attrs)
3646 {
3647         int order;
3648         struct page *page = virt_to_page(vaddr);
3649
3650         size = PAGE_ALIGN(size);
3651         order = get_order(size);
3652
3653         intel_unmap(dev, dma_handle, size);
3654         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3655                 __free_pages(page, order);
3656 }
3657
3658 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3659                            int nelems, enum dma_data_direction dir,
3660                            unsigned long attrs)
3661 {
3662         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3663         unsigned long nrpages = 0;
3664         struct scatterlist *sg;
3665         int i;
3666
3667         for_each_sg(sglist, sg, nelems, i) {
3668                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3669         }
3670
3671         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3672
3673         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3674 }
3675
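     /*
      * Map a scatterlist: a single IOVA range covering the total size is
      * allocated and the scatterlist entries are mapped contiguously into
      * it by domain_sg_mapping(). On failure the partially built page
      * tables and the IOVA range are released and 0 is returned.
      */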
3676 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3677                         enum dma_data_direction dir, unsigned long attrs)
3678 {
3679         int i;
3680         struct dmar_domain *domain;
3681         size_t size = 0;
3682         int prot = 0;
3683         unsigned long iova_pfn;
3684         int ret;
3685         struct scatterlist *sg;
3686         unsigned long start_vpfn;
3687         struct intel_iommu *iommu;
3688
3689         BUG_ON(dir == DMA_NONE);
3690
3691         if (unlikely(attach_deferred(dev)))
3692                 do_deferred_attach(dev);
3693
3694         domain = find_domain(dev);
3695         if (!domain)
3696                 return 0;
3697
3698         iommu = domain_get_iommu(domain);
3699
3700         for_each_sg(sglist, sg, nelems, i)
3701                 size += aligned_nrpages(sg->offset, sg->length);
3702
3703         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3704                                 *dev->dma_mask);
3705         if (!iova_pfn) {
3706                 sglist->dma_length = 0;
3707                 return 0;
3708         }
3709
3710         /*
3711          * Check if DMAR supports zero-length reads on write-only
3712          * mappings.
3713          */
3714         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3715                         !cap_zlr(iommu->cap))
3716                 prot |= DMA_PTE_READ;
3717         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3718                 prot |= DMA_PTE_WRITE;
3719
3720         start_vpfn = mm_to_dma_pfn(iova_pfn);
3721
3722         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3723         if (unlikely(ret)) {
3724                 dma_pte_free_pagetable(domain, start_vpfn,
3725                                        start_vpfn + size - 1,
3726                                        agaw_to_level(domain->agaw) + 1);
3727                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3728                 return 0;
3729         }
3730
3731         for_each_sg(sglist, sg, nelems, i)
3732                 trace_map_sg(dev, i + 1, nelems, sg);
3733
3734         return nelems;
3735 }
3736
3737 static u64 intel_get_required_mask(struct device *dev)
3738 {
3739         return DMA_BIT_MASK(32);
3740 }
3741
3742 static const struct dma_map_ops intel_dma_ops = {
3743         .alloc = intel_alloc_coherent,
3744         .free = intel_free_coherent,
3745         .map_sg = intel_map_sg,
3746         .unmap_sg = intel_unmap_sg,
3747         .map_page = intel_map_page,
3748         .unmap_page = intel_unmap_page,
3749         .map_resource = intel_map_resource,
3750         .unmap_resource = intel_unmap_resource,
3751         .dma_supported = dma_direct_supported,
3752         .mmap = dma_common_mmap,
3753         .get_sgtable = dma_common_get_sgtable,
3754         .alloc_pages = dma_common_alloc_pages,
3755         .free_pages = dma_common_free_pages,
3756         .get_required_mask = intel_get_required_mask,
3757 };
3758
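     /*
      * The bounce_* DMA ops below are used when bounce buffering is
      * required, typically for untrusted devices behind the IOMMU. Buffers
      * that are not aligned to VTD_PAGE_SIZE are bounced through swiotlb so
      * that the device cannot see unrelated data sharing the same IOMMU
      * page as its mapping.
      */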
3759 static void
3760 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3761                    enum dma_data_direction dir, enum dma_sync_target target)
3762 {
3763         struct dmar_domain *domain;
3764         phys_addr_t tlb_addr;
3765
3766         domain = find_domain(dev);
3767         if (WARN_ON(!domain))
3768                 return;
3769
3770         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3771         if (is_swiotlb_buffer(tlb_addr))
3772                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3773 }
3774
3775 static dma_addr_t
3776 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3777                   enum dma_data_direction dir, unsigned long attrs,
3778                   u64 dma_mask)
3779 {
3780         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3781         struct dmar_domain *domain;
3782         struct intel_iommu *iommu;
3783         unsigned long iova_pfn;
3784         unsigned long nrpages;
3785         phys_addr_t tlb_addr;
3786         int prot = 0;
3787         int ret;
3788
3789         if (unlikely(attach_deferred(dev)))
3790                 do_deferred_attach(dev);
3791
3792         domain = find_domain(dev);
3793
3794         if (WARN_ON(dir == DMA_NONE || !domain))
3795                 return DMA_MAPPING_ERROR;
3796
3797         iommu = domain_get_iommu(domain);
3798         if (WARN_ON(!iommu))
3799                 return DMA_MAPPING_ERROR;
3800
3801         nrpages = aligned_nrpages(0, size);
3802         iova_pfn = intel_alloc_iova(dev, domain,
3803                                     dma_to_mm_pfn(nrpages), dma_mask);
3804         if (!iova_pfn)
3805                 return DMA_MAPPING_ERROR;
3806
3807         /*
3808          * Check if DMAR supports zero-length reads on write-only
3809          * mappings.
3810          */
3811         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3812                         !cap_zlr(iommu->cap))
3813                 prot |= DMA_PTE_READ;
3814         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3815                 prot |= DMA_PTE_WRITE;
3816
3817         /*
3818          * If both the physical buffer start address and size are
3819          * page aligned, we don't need to use a bounce page.
3820          */
3821         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3822                 tlb_addr = swiotlb_tbl_map_single(dev, paddr, size,
3823                                 aligned_size, dir, attrs);
3824                 if (tlb_addr == DMA_MAPPING_ERROR) {
3825                         goto swiotlb_error;
3826                 } else {
3827                         /* Cleanup the padding area. */
3828                         void *padding_start = phys_to_virt(tlb_addr);
3829                         size_t padding_size = aligned_size;
3830
3831                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3832                             (dir == DMA_TO_DEVICE ||
3833                              dir == DMA_BIDIRECTIONAL)) {
3834                                 padding_start += size;
3835                                 padding_size -= size;
3836                         }
3837
3838                         memset(padding_start, 0, padding_size);
3839                 }
3840         } else {
3841                 tlb_addr = paddr;
3842         }
3843
3844         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3845                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3846         if (ret)
3847                 goto mapping_error;
3848
3849         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3850
3851         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3852
3853 mapping_error:
3854         if (is_swiotlb_buffer(tlb_addr))
3855                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3856                                          aligned_size, dir, attrs);
3857 swiotlb_error:
3858         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3859         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3860                 size, (unsigned long long)paddr, dir);
3861
3862         return DMA_MAPPING_ERROR;
3863 }
3864
3865 static void
3866 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3867                     enum dma_data_direction dir, unsigned long attrs)
3868 {
3869         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3870         struct dmar_domain *domain;
3871         phys_addr_t tlb_addr;
3872
3873         domain = find_domain(dev);
3874         if (WARN_ON(!domain))
3875                 return;
3876
3877         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3878         if (WARN_ON(!tlb_addr))
3879                 return;
3880
3881         intel_unmap(dev, dev_addr, size);
3882         if (is_swiotlb_buffer(tlb_addr))
3883                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3884                                          aligned_size, dir, attrs);
3885
3886         trace_bounce_unmap_single(dev, dev_addr, size);
3887 }
3888
3889 static dma_addr_t
3890 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3891                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3892 {
3893         return bounce_map_single(dev, page_to_phys(page) + offset,
3894                                  size, dir, attrs, *dev->dma_mask);
3895 }
3896
3897 static dma_addr_t
3898 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3899                     enum dma_data_direction dir, unsigned long attrs)
3900 {
3901         return bounce_map_single(dev, phys_addr, size,
3902                                  dir, attrs, *dev->dma_mask);
3903 }
3904
3905 static void
3906 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3907                   enum dma_data_direction dir, unsigned long attrs)
3908 {
3909         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3910 }
3911
3912 static void
3913 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3914                       enum dma_data_direction dir, unsigned long attrs)
3915 {
3916         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3917 }
3918
3919 static void
3920 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3921                 enum dma_data_direction dir, unsigned long attrs)
3922 {
3923         struct scatterlist *sg;
3924         int i;
3925
3926         for_each_sg(sglist, sg, nelems, i)
3927                 bounce_unmap_page(dev, sg->dma_address,
3928                                   sg_dma_len(sg), dir, attrs);
3929 }
3930
3931 static int
3932 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3933               enum dma_data_direction dir, unsigned long attrs)
3934 {
3935         int i;
3936         struct scatterlist *sg;
3937
3938         for_each_sg(sglist, sg, nelems, i) {
3939                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3940                                                   sg->offset, sg->length,
3941                                                   dir, attrs);
3942                 if (sg->dma_address == DMA_MAPPING_ERROR)
3943                         goto out_unmap;
3944                 sg_dma_len(sg) = sg->length;
3945         }
3946
3947         for_each_sg(sglist, sg, nelems, i)
3948                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3949
3950         return nelems;
3951
3952 out_unmap:
3953         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3954         return 0;
3955 }
3956
3957 static void
3958 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3959                            size_t size, enum dma_data_direction dir)
3960 {
3961         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3962 }
3963
3964 static void
3965 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3966                               size_t size, enum dma_data_direction dir)
3967 {
3968         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3969 }
3970
3971 static void
3972 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3973                        int nelems, enum dma_data_direction dir)
3974 {
3975         struct scatterlist *sg;
3976         int i;
3977
3978         for_each_sg(sglist, sg, nelems, i)
3979                 bounce_sync_single(dev, sg_dma_address(sg),
3980                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
3981 }
3982
3983 static void
3984 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3985                           int nelems, enum dma_data_direction dir)
3986 {
3987         struct scatterlist *sg;
3988         int i;
3989
3990         for_each_sg(sglist, sg, nelems, i)
3991                 bounce_sync_single(dev, sg_dma_address(sg),
3992                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3993 }
3994
3995 static const struct dma_map_ops bounce_dma_ops = {
3996         .alloc                  = intel_alloc_coherent,
3997         .free                   = intel_free_coherent,
3998         .map_sg                 = bounce_map_sg,
3999         .unmap_sg               = bounce_unmap_sg,
4000         .map_page               = bounce_map_page,
4001         .unmap_page             = bounce_unmap_page,
4002         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
4003         .sync_single_for_device = bounce_sync_single_for_device,
4004         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4005         .sync_sg_for_device     = bounce_sync_sg_for_device,
4006         .map_resource           = bounce_map_resource,
4007         .unmap_resource         = bounce_unmap_resource,
4008         .alloc_pages            = dma_common_alloc_pages,
4009         .free_pages             = dma_common_free_pages,
4010         .dma_supported          = dma_direct_supported,
4011 };
4012
4013 static inline int iommu_domain_cache_init(void)
4014 {
4015         int ret = 0;
4016
4017         iommu_domain_cache = kmem_cache_create("iommu_domain",
4018                                          sizeof(struct dmar_domain),
4019                                          0,
4020                                          SLAB_HWCACHE_ALIGN,
4022                                          NULL);
4023         if (!iommu_domain_cache) {
4024                 pr_err("Couldn't create iommu_domain cache\n");
4025                 ret = -ENOMEM;
4026         }
4027
4028         return ret;
4029 }
4030
4031 static inline int iommu_devinfo_cache_init(void)
4032 {
4033         int ret = 0;
4034
4035         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4036                                          sizeof(struct device_domain_info),
4037                                          0,
4038                                          SLAB_HWCACHE_ALIGN,
4039                                          NULL);
4040         if (!iommu_devinfo_cache) {
4041                 pr_err("Couldn't create devinfo cache\n");
4042                 ret = -ENOMEM;
4043         }
4044
4045         return ret;
4046 }
4047
4048 static int __init iommu_init_mempool(void)
4049 {
4050         int ret;
4051         ret = iova_cache_get();
4052         if (ret)
4053                 return ret;
4054
4055         ret = iommu_domain_cache_init();
4056         if (ret)
4057                 goto domain_error;
4058
4059         ret = iommu_devinfo_cache_init();
4060         if (!ret)
4061                 return ret;
4062
4063         kmem_cache_destroy(iommu_domain_cache);
4064 domain_error:
4065         iova_cache_put();
4066
4067         return -ENOMEM;
4068 }
4069
4070 static void __init iommu_exit_mempool(void)
4071 {
4072         kmem_cache_destroy(iommu_devinfo_cache);
4073         kmem_cache_destroy(iommu_domain_cache);
4074         iova_cache_put();
4075 }
4076
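     /*
      * Mark DMAR units that can be skipped: units whose device scope
      * contains no devices at all, and units that cover only graphics
      * devices when dmar_map_gfx is disabled. Ignored units are left out
      * of DMA remapping setup.
      */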
4077 static void __init init_no_remapping_devices(void)
4078 {
4079         struct dmar_drhd_unit *drhd;
4080         struct device *dev;
4081         int i;
4082
4083         for_each_drhd_unit(drhd) {
4084                 if (!drhd->include_all) {
4085                         for_each_active_dev_scope(drhd->devices,
4086                                                   drhd->devices_cnt, i, dev)
4087                                 break;
4088                         /* ignore DMAR unit if no devices exist */
4089                         if (i == drhd->devices_cnt)
4090                                 drhd->ignored = 1;
4091                 }
4092         }
4093
4094         for_each_active_drhd_unit(drhd) {
4095                 if (drhd->include_all)
4096                         continue;
4097
4098                 for_each_active_dev_scope(drhd->devices,
4099                                           drhd->devices_cnt, i, dev)
4100                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4101                                 break;
4102                 if (i < drhd->devices_cnt)
4103                         continue;
4104
4105                 /* This IOMMU has *only* gfx devices. Either bypass it or
4106                    set the gfx_dedicated flag, as appropriate */
4107                 drhd->gfx_dedicated = 1;
4108                 if (!dmar_map_gfx)
4109                         drhd->ignored = 1;
4110         }
4111 }
4112
4113 #ifdef CONFIG_SUSPEND
4114 static int init_iommu_hw(void)
4115 {
4116         struct dmar_drhd_unit *drhd;
4117         struct intel_iommu *iommu = NULL;
4118
4119         for_each_active_iommu(iommu, drhd)
4120                 if (iommu->qi)
4121                         dmar_reenable_qi(iommu);
4122
4123         for_each_iommu(iommu, drhd) {
4124                 if (drhd->ignored) {
4125                         /*
4126                          * we always have to disable PMRs or DMA may fail on
4127                          * this device
4128                          */
4129                         if (force_on)
4130                                 iommu_disable_protect_mem_regions(iommu);
4131                         continue;
4132                 }
4133
4134                 iommu_flush_write_buffer(iommu);
4135
4136                 iommu_set_root_entry(iommu);
4137
4138                 iommu->flush.flush_context(iommu, 0, 0, 0,
4139                                            DMA_CCMD_GLOBAL_INVL);
4140                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4141                 iommu_enable_translation(iommu);
4142                 iommu_disable_protect_mem_regions(iommu);
4143         }
4144
4145         return 0;
4146 }
4147
4148 static void iommu_flush_all(void)
4149 {
4150         struct dmar_drhd_unit *drhd;
4151         struct intel_iommu *iommu;
4152
4153         for_each_active_iommu(iommu, drhd) {
4154                 iommu->flush.flush_context(iommu, 0, 0, 0,
4155                                            DMA_CCMD_GLOBAL_INVL);
4156                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4157                                          DMA_TLB_GLOBAL_FLUSH);
4158         }
4159 }
4160
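     /*
      * System suspend: flush all caches, disable translation and save the
      * fault-event registers of each active IOMMU so that iommu_resume()
      * can restore them.
      */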
4161 static int iommu_suspend(void)
4162 {
4163         struct dmar_drhd_unit *drhd;
4164         struct intel_iommu *iommu = NULL;
4165         unsigned long flag;
4166
4167         for_each_active_iommu(iommu, drhd) {
4168                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4169                                                  GFP_ATOMIC);
4170                 if (!iommu->iommu_state)
4171                         goto nomem;
4172         }
4173
4174         iommu_flush_all();
4175
4176         for_each_active_iommu(iommu, drhd) {
4177                 iommu_disable_translation(iommu);
4178
4179                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4180
4181                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4182                         readl(iommu->reg + DMAR_FECTL_REG);
4183                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4184                         readl(iommu->reg + DMAR_FEDATA_REG);
4185                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4186                         readl(iommu->reg + DMAR_FEADDR_REG);
4187                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4188                         readl(iommu->reg + DMAR_FEUADDR_REG);
4189
4190                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4191         }
4192         return 0;
4193
4194 nomem:
4195         for_each_active_iommu(iommu, drhd)
4196                 kfree(iommu->iommu_state);
4197
4198         return -ENOMEM;
4199 }
4200
4201 static void iommu_resume(void)
4202 {
4203         struct dmar_drhd_unit *drhd;
4204         struct intel_iommu *iommu = NULL;
4205         unsigned long flag;
4206
4207         if (init_iommu_hw()) {
4208                 if (force_on)
4209                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4210                 else
4211                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4212                 return;
4213         }
4214
4215         for_each_active_iommu(iommu, drhd) {
4216
4217                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4218
4219                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4220                         iommu->reg + DMAR_FECTL_REG);
4221                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4222                         iommu->reg + DMAR_FEDATA_REG);
4223                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4224                         iommu->reg + DMAR_FEADDR_REG);
4225                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4226                         iommu->reg + DMAR_FEUADDR_REG);
4227
4228                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4229         }
4230
4231         for_each_active_iommu(iommu, drhd)
4232                 kfree(iommu->iommu_state);
4233 }
4234
4235 static struct syscore_ops iommu_syscore_ops = {
4236         .resume         = iommu_resume,
4237         .suspend        = iommu_suspend,
4238 };
4239
4240 static void __init init_iommu_pm_ops(void)
4241 {
4242         register_syscore_ops(&iommu_syscore_ops);
4243 }
4244
4245 #else
4246 static inline void init_iommu_pm_ops(void) {}
4247 #endif  /* CONFIG_SUSPEND */
4248
4249 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4250 {
4251         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4252             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4253             rmrr->end_address <= rmrr->base_address ||
4254             arch_rmrr_sanity_check(rmrr))
4255                 return -EINVAL;
4256
4257         return 0;
4258 }
4259
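     /*
      * Parse one RMRR entry from the DMAR table. A malformed RMRR only
      * triggers a firmware warning and taint; the entry is still recorded
      * on dmar_rmrr_units together with its device scope.
      */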
4260 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4261 {
4262         struct acpi_dmar_reserved_memory *rmrr;
4263         struct dmar_rmrr_unit *rmrru;
4264
4265         rmrr = (struct acpi_dmar_reserved_memory *)header;
4266         if (rmrr_sanity_check(rmrr)) {
4267                 pr_warn(FW_BUG
4268                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4269                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4270                            rmrr->base_address, rmrr->end_address,
4271                            dmi_get_system_info(DMI_BIOS_VENDOR),
4272                            dmi_get_system_info(DMI_BIOS_VERSION),
4273                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4274                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4275         }
4276
4277         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4278         if (!rmrru)
4279                 goto out;
4280
4281         rmrru->hdr = header;
4282
4283         rmrru->base_address = rmrr->base_address;
4284         rmrru->end_address = rmrr->end_address;
4285
4286         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4287                                 ((void *)rmrr) + rmrr->header.length,
4288                                 &rmrru->devices_cnt);
4289         if (rmrru->devices_cnt && rmrru->devices == NULL)
4290                 goto free_rmrru;
4291
4292         list_add(&rmrru->list, &dmar_rmrr_units);
4293
4294         return 0;
4295 free_rmrru:
4296         kfree(rmrru);
4297 out:
4298         return -ENOMEM;
4299 }
4300
4301 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4302 {
4303         struct dmar_atsr_unit *atsru;
4304         struct acpi_dmar_atsr *tmp;
4305
4306         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4307                                 dmar_rcu_check()) {
4308                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4309                 if (atsr->segment != tmp->segment)
4310                         continue;
4311                 if (atsr->header.length != tmp->header.length)
4312                         continue;
4313                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4314                         return atsru;
4315         }
4316
4317         return NULL;
4318 }
4319
4320 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4321 {
4322         struct acpi_dmar_atsr *atsr;
4323         struct dmar_atsr_unit *atsru;
4324
4325         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4326                 return 0;
4327
4328         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4329         atsru = dmar_find_atsr(atsr);
4330         if (atsru)
4331                 return 0;
4332
4333         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4334         if (!atsru)
4335                 return -ENOMEM;
4336
4337         /*
4338          * If memory is allocated from slab by ACPI _DSM method, we need to
4339          * copy the memory content because the memory buffer will be freed
4340          * on return.
4341          */
4342         atsru->hdr = (void *)(atsru + 1);
4343         memcpy(atsru->hdr, hdr, hdr->length);
4344         atsru->include_all = atsr->flags & 0x1;
4345         if (!atsru->include_all) {
4346                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4347                                 (void *)atsr + atsr->header.length,
4348                                 &atsru->devices_cnt);
4349                 if (atsru->devices_cnt && atsru->devices == NULL) {
4350                         kfree(atsru);
4351                         return -ENOMEM;
4352                 }
4353         }
4354
4355         list_add_rcu(&atsru->list, &dmar_atsr_units);
4356
4357         return 0;
4358 }
4359
4360 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4361 {
4362         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4363         kfree(atsru);
4364 }
4365
4366 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4367 {
4368         struct acpi_dmar_atsr *atsr;
4369         struct dmar_atsr_unit *atsru;
4370
4371         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4372         atsru = dmar_find_atsr(atsr);
4373         if (atsru) {
4374                 list_del_rcu(&atsru->list);
4375                 synchronize_rcu();
4376                 intel_iommu_free_atsr(atsru);
4377         }
4378
4379         return 0;
4380 }
4381
4382 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4383 {
4384         int i;
4385         struct device *dev;
4386         struct acpi_dmar_atsr *atsr;
4387         struct dmar_atsr_unit *atsru;
4388
4389         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4390         atsru = dmar_find_atsr(atsr);
4391         if (!atsru)
4392                 return 0;
4393
4394         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4395                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4396                                           i, dev)
4397                         return -EBUSY;
4398         }
4399
4400         return 0;
4401 }
4402
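     /*
      * Bring up a hot-added DMAR unit: verify that its capabilities are
      * compatible with the features already in use (pass-through, snooping,
      * superpages), allocate domain IDs and a root entry, enable queued
      * invalidation and the fault interrupt, and finally turn translation
      * on for the new unit.
      */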
4403 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4404 {
4405         int sp, ret;
4406         struct intel_iommu *iommu = dmaru->iommu;
4407
4408         if (g_iommus[iommu->seq_id])
4409                 return 0;
4410
4411         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4412                 pr_warn("%s: Doesn't support hardware pass through.\n",
4413                         iommu->name);
4414                 return -ENXIO;
4415         }
4416         if (!ecap_sc_support(iommu->ecap) &&
4417             domain_update_iommu_snooping(iommu)) {
4418                 pr_warn("%s: Doesn't support snooping.\n",
4419                         iommu->name);
4420                 return -ENXIO;
4421         }
4422         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4423         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4424                 pr_warn("%s: Doesn't support large page.\n",
4425                         iommu->name);
4426                 return -ENXIO;
4427         }
4428
4429         /*
4430          * Disable translation if already enabled prior to OS handover.
4431          */
4432         if (iommu->gcmd & DMA_GCMD_TE)
4433                 iommu_disable_translation(iommu);
4434
4435         g_iommus[iommu->seq_id] = iommu;
4436         ret = iommu_init_domains(iommu);
4437         if (ret == 0)
4438                 ret = iommu_alloc_root_entry(iommu);
4439         if (ret)
4440                 goto out;
4441
4442         intel_svm_check(iommu);
4443
4444         if (dmaru->ignored) {
4445                 /*
4446                  * we always have to disable PMRs or DMA may fail on this device
4447                  */
4448                 if (force_on)
4449                         iommu_disable_protect_mem_regions(iommu);
4450                 return 0;
4451         }
4452
4453         intel_iommu_init_qi(iommu);
4454         iommu_flush_write_buffer(iommu);
4455
4456 #ifdef CONFIG_INTEL_IOMMU_SVM
4457         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4458                 ret = intel_svm_enable_prq(iommu);
4459                 if (ret)
4460                         goto disable_iommu;
4461         }
4462 #endif
4463         ret = dmar_set_interrupt(iommu);
4464         if (ret)
4465                 goto disable_iommu;
4466
4467         iommu_set_root_entry(iommu);
4468         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4469         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4470         iommu_enable_translation(iommu);
4471
4472         iommu_disable_protect_mem_regions(iommu);
4473         return 0;
4474
4475 disable_iommu:
4476         disable_dmar_iommu(iommu);
4477 out:
4478         free_dmar_iommu(iommu);
4479         return ret;
4480 }
4481
4482 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4483 {
4484         int ret = 0;
4485         struct intel_iommu *iommu = dmaru->iommu;
4486
4487         if (!intel_iommu_enabled)
4488                 return 0;
4489         if (iommu == NULL)
4490                 return -EINVAL;
4491
4492         if (insert) {
4493                 ret = intel_iommu_add(dmaru);
4494         } else {
4495                 disable_dmar_iommu(iommu);
4496                 free_dmar_iommu(iommu);
4497         }
4498
4499         return ret;
4500 }
4501
4502 static void intel_iommu_free_dmars(void)
4503 {
4504         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4505         struct dmar_atsr_unit *atsru, *atsr_n;
4506
4507         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4508                 list_del(&rmrru->list);
4509                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4510                 kfree(rmrru);
4511         }
4512
4513         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4514                 list_del(&atsru->list);
4515                 intel_iommu_free_atsr(atsru);
4516         }
4517 }
4518
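/*
 * Walk up from @dev to its root port and decide whether ATS may be used:
 * returns 1 for integrated devices and for devices whose root port is
 * covered by an ATSR unit (or by an include_all ATSR on the same segment),
 * and 0 otherwise.
 */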
4519 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4520 {
4521         int i, ret = 1;
4522         struct pci_bus *bus;
4523         struct pci_dev *bridge = NULL;
4524         struct device *tmp;
4525         struct acpi_dmar_atsr *atsr;
4526         struct dmar_atsr_unit *atsru;
4527
4528         dev = pci_physfn(dev);
4529         for (bus = dev->bus; bus; bus = bus->parent) {
4530                 bridge = bus->self;
4531                 /* If it's an integrated device, allow ATS */
4532                 if (!bridge)
4533                         return 1;
4534                 /* Connected via non-PCIe: no ATS */
4535                 if (!pci_is_pcie(bridge) ||
4536                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4537                         return 0;
4538                 /* If we found the root port, look it up in the ATSR */
4539                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4540                         break;
4541         }
4542
4543         rcu_read_lock();
4544         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4545                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4546                 if (atsr->segment != pci_domain_nr(dev->bus))
4547                         continue;
4548
4549                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4550                         if (tmp == &bridge->dev)
4551                                 goto out;
4552
4553                 if (atsru->include_all)
4554                         goto out;
4555         }
4556         ret = 0;
4557 out:
4558         rcu_read_unlock();
4559
4560         return ret;
4561 }
4562
4563 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4564 {
4565         int ret;
4566         struct dmar_rmrr_unit *rmrru;
4567         struct dmar_atsr_unit *atsru;
4568         struct acpi_dmar_atsr *atsr;
4569         struct acpi_dmar_reserved_memory *rmrr;
4570
4571         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4572                 return 0;
4573
4574         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4575                 rmrr = container_of(rmrru->hdr,
4576                                     struct acpi_dmar_reserved_memory, header);
4577                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4578                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4579                                 ((void *)rmrr) + rmrr->header.length,
4580                                 rmrr->segment, rmrru->devices,
4581                                 rmrru->devices_cnt);
4582                         if (ret < 0)
4583                                 return ret;
4584                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4585                         dmar_remove_dev_scope(info, rmrr->segment,
4586                                 rmrru->devices, rmrru->devices_cnt);
4587                 }
4588         }
4589
4590         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4591                 if (atsru->include_all)
4592                         continue;
4593
4594                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4595                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4596                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4597                                         (void *)atsr + atsr->header.length,
4598                                         atsr->segment, atsru->devices,
4599                                         atsru->devices_cnt);
4600                         if (ret > 0)
4601                                 break;
4602                         else if (ret < 0)
4603                                 return ret;
4604                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4605                         if (dmar_remove_dev_scope(info, atsr->segment,
4606                                         atsru->devices, atsru->devices_cnt))
4607                                 break;
4608                 }
4609         }
4610
4611         return 0;
4612 }
4613
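/*
 * Keep the static identity domain in sync with memory hotplug: build
 * identity mappings for memory that is about to come online, and unmap,
 * flush the IOTLBs and free the page-table pages for memory that goes
 * offline (or whose onlining is cancelled).
 */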
4614 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4615                                        unsigned long val, void *v)
4616 {
4617         struct memory_notify *mhp = v;
4618         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4619         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4620                         mhp->nr_pages - 1);
4621
4622         switch (val) {
4623         case MEM_GOING_ONLINE:
4624                 if (iommu_domain_identity_map(si_domain,
4625                                               start_vpfn, last_vpfn)) {
4626                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
4627                                 start_vpfn, last_vpfn);
4628                         return NOTIFY_BAD;
4629                 }
4630                 break;
4631
4632         case MEM_OFFLINE:
4633         case MEM_CANCEL_ONLINE:
4634                 {
4635                         struct dmar_drhd_unit *drhd;
4636                         struct intel_iommu *iommu;
4637                         struct page *freelist;
4638
4639                         freelist = domain_unmap(si_domain,
4640                                                 start_vpfn, last_vpfn,
4641                                                 NULL);
4642
4643                         rcu_read_lock();
4644                         for_each_active_iommu(iommu, drhd)
4645                                 iommu_flush_iotlb_psi(iommu, si_domain,
4646                                         start_vpfn, mhp->nr_pages,
4647                                         !freelist, 0);
4648                         rcu_read_unlock();
4649                         dma_free_pagelist(freelist);
4650                 }
4651                 break;
4652         }
4653
4654         return NOTIFY_OK;
4655 }
4656
4657 static struct notifier_block intel_iommu_memory_nb = {
4658         .notifier_call = intel_iommu_memory_notifier,
4659         .priority = 0
4660 };
4661
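/*
 * Called when a CPU goes offline: release that CPU's cached IOVAs in
 * every DMA-API domain so the ranges it was holding become available
 * again.
 */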
4662 static void free_all_cpu_cached_iovas(unsigned int cpu)
4663 {
4664         int i;
4665
4666         for (i = 0; i < g_num_of_iommus; i++) {
4667                 struct intel_iommu *iommu = g_iommus[i];
4668                 struct dmar_domain *domain;
4669                 int did;
4670
4671                 if (!iommu)
4672                         continue;
4673
4674                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4675                         domain = get_iommu_domain(iommu, (u16)did);
4676
4677                         if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4678                                 continue;
4679
4680                         free_cpu_cached_iovas(cpu, &domain->iovad);
4681                 }
4682         }
4683 }
4684
4685 static int intel_iommu_cpu_dead(unsigned int cpu)
4686 {
4687         free_all_cpu_cached_iovas(cpu);
4688         return 0;
4689 }
4690
4691 static void intel_disable_iommus(void)
4692 {
4693         struct intel_iommu *iommu = NULL;
4694         struct dmar_drhd_unit *drhd;
4695
4696         for_each_iommu(iommu, drhd)
4697                 iommu_disable_translation(iommu);
4698 }
4699
4700 void intel_iommu_shutdown(void)
4701 {
4702         struct dmar_drhd_unit *drhd;
4703         struct intel_iommu *iommu = NULL;
4704
4705         if (no_iommu || dmar_disabled)
4706                 return;
4707
4708         down_write(&dmar_global_lock);
4709
4710         /* Disable PMRs explicitly here. */
4711         for_each_iommu(iommu, drhd)
4712                 iommu_disable_protect_mem_regions(iommu);
4713
4714         /* Make sure the IOMMUs are switched off */
4715         intel_disable_iommus();
4716
4717         up_write(&dmar_global_lock);
4718 }
4719
4720 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4721 {
4722         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4723
4724         return container_of(iommu_dev, struct intel_iommu, iommu);
4725 }
4726
4727 static ssize_t intel_iommu_show_version(struct device *dev,
4728                                         struct device_attribute *attr,
4729                                         char *buf)
4730 {
4731         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4732         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4733         return sprintf(buf, "%d:%d\n",
4734                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4735 }
4736 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4737
4738 static ssize_t intel_iommu_show_address(struct device *dev,
4739                                         struct device_attribute *attr,
4740                                         char *buf)
4741 {
4742         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4743         return sprintf(buf, "%llx\n", iommu->reg_phys);
4744 }
4745 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4746
4747 static ssize_t intel_iommu_show_cap(struct device *dev,
4748                                     struct device_attribute *attr,
4749                                     char *buf)
4750 {
4751         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4752         return sprintf(buf, "%llx\n", iommu->cap);
4753 }
4754 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4755
4756 static ssize_t intel_iommu_show_ecap(struct device *dev,
4757                                     struct device_attribute *attr,
4758                                     char *buf)
4759 {
4760         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4761         return sprintf(buf, "%llx\n", iommu->ecap);
4762 }
4763 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4764
4765 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4766                                       struct device_attribute *attr,
4767                                       char *buf)
4768 {
4769         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4770         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4771 }
4772 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4773
4774 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4775                                            struct device_attribute *attr,
4776                                            char *buf)
4777 {
4778         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4779         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4780                                                   cap_ndoms(iommu->cap)));
4781 }
4782 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4783
4784 static struct attribute *intel_iommu_attrs[] = {
4785         &dev_attr_version.attr,
4786         &dev_attr_address.attr,
4787         &dev_attr_cap.attr,
4788         &dev_attr_ecap.attr,
4789         &dev_attr_domains_supported.attr,
4790         &dev_attr_domains_used.attr,
4791         NULL,
4792 };
4793
4794 static struct attribute_group intel_iommu_group = {
4795         .name = "intel-iommu",
4796         .attrs = intel_iommu_attrs,
4797 };
4798
4799 const struct attribute_group *intel_iommu_groups[] = {
4800         &intel_iommu_group,
4801         NULL,
4802 };
4803
4804 static inline bool has_external_pci(void)
4805 {
4806         struct pci_dev *pdev = NULL;
4807
4808         for_each_pci_dev(pdev)
4809                 if (pdev->external_facing)
4810                         return true;
4811
4812         return false;
4813 }
4814
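/*
 * If the platform has opted in to DMA protection (DMAR opt-in) and an
 * external-facing PCI device is present, force the IOMMU on even when it
 * was disabled on the command line; a kernel that had it disabled falls
 * back to passthrough as the default domain type.
 */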
4815 static int __init platform_optin_force_iommu(void)
4816 {
4817         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4818                 return 0;
4819
4820         if (no_iommu || dmar_disabled)
4821                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4822
4823         /*
4824          * If Intel-IOMMU is disabled by default, we will apply identity
4825          * map for all devices except those marked as being untrusted.
4826          */
4827         if (dmar_disabled)
4828                 iommu_set_default_passthrough(false);
4829
4830         dmar_disabled = 0;
4831         no_iommu = 0;
4832
4833         return 1;
4834 }
4835
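/*
 * Probe the ACPI namespace devices listed in the DRHD device scopes:
 * each physical node of such an ACPI device that is not yet in an IOMMU
 * group is handed to iommu_probe_device() with the Intel IOMMU ops.
 */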
4836 static int __init probe_acpi_namespace_devices(void)
4837 {
4838         struct dmar_drhd_unit *drhd;
4839         /* To avoid a -Wunused-but-set-variable warning. */
4840         struct intel_iommu *iommu __maybe_unused;
4841         struct device *dev;
4842         int i, ret = 0;
4843
4844         for_each_active_iommu(iommu, drhd) {
4845                 for_each_active_dev_scope(drhd->devices,
4846                                           drhd->devices_cnt, i, dev) {
4847                         struct acpi_device_physical_node *pn;
4848                         struct iommu_group *group;
4849                         struct acpi_device *adev;
4850
4851                         if (dev->bus != &acpi_bus_type)
4852                                 continue;
4853
4854                         adev = to_acpi_device(dev);
4855                         mutex_lock(&adev->physical_node_lock);
4856                         list_for_each_entry(pn,
4857                                             &adev->physical_node_list, node) {
4858                                 group = iommu_group_get(pn->dev);
4859                                 if (group) {
4860                                         iommu_group_put(group);
4861                                         continue;
4862                                 }
4863
4864                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4865                                 ret = iommu_probe_device(pn->dev);
4866                                 if (ret)
4867                                         break;
4868                         }
4869                         mutex_unlock(&adev->physical_node_lock);
4870
4871                         if (ret)
4872                                 return ret;
4873                 }
4874         }
4875
4876         return 0;
4877 }
4878
4879 int __init intel_iommu_init(void)
4880 {
4881         int ret = -ENODEV;
4882         struct dmar_drhd_unit *drhd;
4883         struct intel_iommu *iommu;
4884
4885         /*
4886          * Intel IOMMU is required for a TXT/tboot launch or platform
4887          * opt in, so enforce that.
4888          */
4889         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4890
4891         if (iommu_init_mempool()) {
4892                 if (force_on)
4893                         panic("tboot: Failed to initialize iommu memory\n");
4894                 return -ENOMEM;
4895         }
4896
4897         down_write(&dmar_global_lock);
4898         if (dmar_table_init()) {
4899                 if (force_on)
4900                         panic("tboot: Failed to initialize DMAR table\n");
4901                 goto out_free_dmar;
4902         }
4903
4904         if (dmar_dev_scope_init() < 0) {
4905                 if (force_on)
4906                         panic("tboot: Failed to initialize DMAR device scope\n");
4907                 goto out_free_dmar;
4908         }
4909
4910         up_write(&dmar_global_lock);
4911
4912         /*
4913          * The bus notifier takes the dmar_global_lock, so lockdep will
4914          * complain later when we register it under the lock.
4915          */
4916         dmar_register_bus_notifier();
4917
4918         down_write(&dmar_global_lock);
4919
4920         if (!no_iommu)
4921                 intel_iommu_debugfs_init();
4922
4923         if (no_iommu || dmar_disabled) {
4924                 /*
4925                  * We exit the function here to ensure IOMMU's remapping and
4926                  * mempool aren't set up, which means that the IOMMU's PMRs
4927                  * won't be disabled via the call to init_dmars(). So disable
4928                  * them explicitly here. The PMRs were set up by tboot prior to
4929                  * calling SENTER, but the kernel is expected to reset/tear
4930                  * down the PMRs.
4931                  */
4932                 if (intel_iommu_tboot_noforce) {
4933                         for_each_iommu(iommu, drhd)
4934                                 iommu_disable_protect_mem_regions(iommu);
4935                 }
4936
4937                 /*
4938                  * Make sure the IOMMUs are switched off, even when we
4939                  * boot into a kexec kernel and the previous kernel left
4940                  * them enabled
4941                  */
4942                 intel_disable_iommus();
4943                 goto out_free_dmar;
4944         }
4945
4946         if (list_empty(&dmar_rmrr_units))
4947                 pr_info("No RMRR found\n");
4948
4949         if (list_empty(&dmar_atsr_units))
4950                 pr_info("No ATSR found\n");
4951
4952         if (dmar_init_reserved_ranges()) {
4953                 if (force_on)
4954                         panic("tboot: Failed to reserve iommu ranges\n");
4955                 goto out_free_reserved_range;
4956         }
4957
4958         if (dmar_map_gfx)
4959                 intel_iommu_gfx_mapped = 1;
4960
4961         init_no_remapping_devices();
4962
4963         ret = init_dmars();
4964         if (ret) {
4965                 if (force_on)
4966                         panic("tboot: Failed to initialize DMARs\n");
4967                 pr_err("Initialization failed\n");
4968                 goto out_free_reserved_range;
4969         }
4970         up_write(&dmar_global_lock);
4971
4972         init_iommu_pm_ops();
4973
4974         down_read(&dmar_global_lock);
4975         for_each_active_iommu(iommu, drhd) {
4976                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4977                                        intel_iommu_groups,
4978                                        "%s", iommu->name);
4979                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4980                 iommu_device_register(&iommu->iommu);
4981         }
4982         up_read(&dmar_global_lock);
4983
4984         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4985         if (si_domain && !hw_pass_through)
4986                 register_memory_notifier(&intel_iommu_memory_nb);
4987         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4988                           intel_iommu_cpu_dead);
4989
4990         down_read(&dmar_global_lock);
4991         if (probe_acpi_namespace_devices())
4992                 pr_warn("ACPI name space devices didn't probe correctly\n");
4993
4994         /* Finally, we enable the DMA remapping hardware. */
4995         for_each_iommu(iommu, drhd) {
4996                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4997                         iommu_enable_translation(iommu);
4998
4999                 iommu_disable_protect_mem_regions(iommu);
5000         }
5001         up_read(&dmar_global_lock);
5002
5003         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5004
5005         intel_iommu_enabled = 1;
5006
5007         return 0;
5008
5009 out_free_reserved_range:
5010         put_iova_domain(&reserved_iova_list);
5011 out_free_dmar:
5012         intel_iommu_free_dmars();
5013         up_write(&dmar_global_lock);
5014         iommu_exit_mempool();
5015         return ret;
5016 }
5017
5018 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5019 {
5020         struct intel_iommu *iommu = opaque;
5021
5022         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5023         return 0;
5024 }
5025
5026 /*
5027  * NB - intel-iommu lacks any sort of reference counting for the users of
5028  * dependent devices.  If multiple endpoints have intersecting dependent
5029  * devices, unbinding the driver from any one of them will possibly leave
5030  * the others unable to operate.
5031  */
5032 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5033 {
5034         if (!iommu || !dev || !dev_is_pci(dev))
5035                 return;
5036
5037         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5038 }
5039
5040 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5041 {
5042         struct dmar_domain *domain;
5043         struct intel_iommu *iommu;
5044         unsigned long flags;
5045
5046         assert_spin_locked(&device_domain_lock);
5047
5048         if (WARN_ON(!info))
5049                 return;
5050
5051         iommu = info->iommu;
5052         domain = info->domain;
5053
5054         if (info->dev) {
5055                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5056                         intel_pasid_tear_down_entry(iommu, info->dev,
5057                                         PASID_RID2PASID, false);
5058
5059                 iommu_disable_dev_iotlb(info);
5060                 if (!dev_is_real_dma_subdevice(info->dev))
5061                         domain_context_clear(iommu, info->dev);
5062                 intel_pasid_free_table(info->dev);
5063         }
5064
5065         unlink_domain_info(info);
5066
5067         spin_lock_irqsave(&iommu->lock, flags);
5068         domain_detach_iommu(domain, iommu);
5069         spin_unlock_irqrestore(&iommu->lock, flags);
5070
5071         free_devinfo_mem(info);
5072 }
5073
5074 static void dmar_remove_one_dev_info(struct device *dev)
5075 {
5076         struct device_domain_info *info;
5077         unsigned long flags;
5078
5079         spin_lock_irqsave(&device_domain_lock, flags);
5080         info = get_domain_info(dev);
5081         if (info)
5082                 __dmar_remove_one_dev_info(info);
5083         spin_unlock_irqrestore(&device_domain_lock, flags);
5084 }
5085
5086 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5087 {
5088         int adjust_width;
5089
5090         /* calculate AGAW */
5091         domain->gaw = guest_width;
5092         adjust_width = guestwidth_to_adjustwidth(guest_width);
5093         domain->agaw = width_to_agaw(adjust_width);
5094
5095         domain->iommu_coherency = 0;
5096         domain->iommu_snooping = 0;
5097         domain->iommu_superpage = 0;
5098         domain->max_addr = 0;
5099
5100         /* always allocate the top pgd */
5101         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5102         if (!domain->pgd)
5103                 return -ENOMEM;
5104         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5105         return 0;
5106 }
5107
5108 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5109 {
5110         init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5111         copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5112
5113         if (!intel_iommu_strict &&
5114             init_iova_flush_queue(&dmar_domain->iovad,
5115                                   iommu_flush_iova, iova_entry_free))
5116                 pr_info("iova flush queue initialization failed\n");
5117 }
5118
5119 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5120 {
5121         struct dmar_domain *dmar_domain;
5122         struct iommu_domain *domain;
5123
5124         switch (type) {
5125         case IOMMU_DOMAIN_DMA:
5126         case IOMMU_DOMAIN_UNMANAGED:
5127                 dmar_domain = alloc_domain(0);
5128                 if (!dmar_domain) {
5129                         pr_err("Can't allocate dmar_domain\n");
5130                         return NULL;
5131                 }
5132                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5133                         pr_err("Domain initialization failed\n");
5134                         domain_exit(dmar_domain);
5135                         return NULL;
5136                 }
5137
5138                 if (type == IOMMU_DOMAIN_DMA)
5139                         intel_init_iova_domain(dmar_domain);
5140
5141                 domain = &dmar_domain->domain;
5142                 domain->geometry.aperture_start = 0;
5143                 domain->geometry.aperture_end   =
5144                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5145                 domain->geometry.force_aperture = true;
5146
5147                 return domain;
5148         case IOMMU_DOMAIN_IDENTITY:
5149                 return &si_domain->domain;
5150         default:
5151                 return NULL;
5152         }
5153
5154         return NULL;
5155 }
5156
5157 static void intel_iommu_domain_free(struct iommu_domain *domain)
5158 {
5159         if (domain != &si_domain->domain)
5160                 domain_exit(to_dmar_domain(domain));
5161 }
5162
5163 /*
5164  * Check whether a @domain could be attached to the @dev through the
5165  * aux-domain attach/detach APIs.
5166  */
5167 static inline bool
5168 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5169 {
5170         struct device_domain_info *info = get_domain_info(dev);
5171
5172         return info && info->auxd_enabled &&
5173                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5174 }
5175
5176 static void auxiliary_link_device(struct dmar_domain *domain,
5177                                   struct device *dev)
5178 {
5179         struct device_domain_info *info = get_domain_info(dev);
5180
5181         assert_spin_locked(&device_domain_lock);
5182         if (WARN_ON(!info))
5183                 return;
5184
5185         domain->auxd_refcnt++;
5186         list_add(&domain->auxd, &info->auxiliary_domains);
5187 }
5188
5189 static void auxiliary_unlink_device(struct dmar_domain *domain,
5190                                     struct device *dev)
5191 {
5192         struct device_domain_info *info = get_domain_info(dev);
5193
5194         assert_spin_locked(&device_domain_lock);
5195         if (WARN_ON(!info))
5196                 return;
5197
5198         list_del(&domain->auxd);
5199         domain->auxd_refcnt--;
5200
5201         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5202                 ioasid_free(domain->default_pasid);
5203 }
5204
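/*
 * Attach @domain to @dev as an auxiliary domain: allocate a default PASID
 * for the domain if it does not have one yet, attach the domain to the
 * device's IOMMU, program a first- or second-level PASID-table entry for
 * that PASID, and link the device to the domain's auxiliary list.
 */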
5205 static int aux_domain_add_dev(struct dmar_domain *domain,
5206                               struct device *dev)
5207 {
5208         int ret;
5209         unsigned long flags;
5210         struct intel_iommu *iommu;
5211
5212         iommu = device_to_iommu(dev, NULL, NULL);
5213         if (!iommu)
5214                 return -ENODEV;
5215
5216         if (domain->default_pasid <= 0) {
5217                 u32 pasid;
5218
5219                 /* No private data needed for the default pasid */
5220                 pasid = ioasid_alloc(NULL, PASID_MIN,
5221                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5222                                      NULL);
5223                 if (pasid == INVALID_IOASID) {
5224                         pr_err("Can't allocate default pasid\n");
5225                         return -ENODEV;
5226                 }
5227                 domain->default_pasid = pasid;
5228         }
5229
5230         spin_lock_irqsave(&device_domain_lock, flags);
5231         /*
5232          * iommu->lock must be held to attach the domain to the iommu and set up
5233          * the PASID entry for second-level translation.
5234          */
5235         spin_lock(&iommu->lock);
5236         ret = domain_attach_iommu(domain, iommu);
5237         if (ret)
5238                 goto attach_failed;
5239
5240         /* Set up the PASID entry for mediated devices: */
5241         if (domain_use_first_level(domain))
5242                 ret = domain_setup_first_level(iommu, domain, dev,
5243                                                domain->default_pasid);
5244         else
5245                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5246                                                      domain->default_pasid);
5247         if (ret)
5248                 goto table_failed;
5249         spin_unlock(&iommu->lock);
5250
5251         auxiliary_link_device(domain, dev);
5252
5253         spin_unlock_irqrestore(&device_domain_lock, flags);
5254
5255         return 0;
5256
5257 table_failed:
5258         domain_detach_iommu(domain, iommu);
5259 attach_failed:
5260         spin_unlock(&iommu->lock);
5261         spin_unlock_irqrestore(&device_domain_lock, flags);
5262         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5263                 ioasid_free(domain->default_pasid);
5264
5265         return ret;
5266 }
5267
5268 static void aux_domain_remove_dev(struct dmar_domain *domain,
5269                                   struct device *dev)
5270 {
5271         struct device_domain_info *info;
5272         struct intel_iommu *iommu;
5273         unsigned long flags;
5274
5275         if (!is_aux_domain(dev, &domain->domain))
5276                 return;
5277
5278         spin_lock_irqsave(&device_domain_lock, flags);
5279         info = get_domain_info(dev);
5280         iommu = info->iommu;
5281
5282         auxiliary_unlink_device(domain, dev);
5283
5284         spin_lock(&iommu->lock);
5285         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5286         domain_detach_iommu(domain, iommu);
5287         spin_unlock(&iommu->lock);
5288
5289         spin_unlock_irqrestore(&device_domain_lock, flags);
5290 }
5291
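/*
 * Validate that the IOMMU serving @dev can address everything already
 * mapped in the domain, record the usable guest address width, and strip
 * any page-table levels the hardware AGAW cannot use.
 */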
5292 static int prepare_domain_attach_device(struct iommu_domain *domain,
5293                                         struct device *dev)
5294 {
5295         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5296         struct intel_iommu *iommu;
5297         int addr_width;
5298
5299         iommu = device_to_iommu(dev, NULL, NULL);
5300         if (!iommu)
5301                 return -ENODEV;
5302
5303         /* check if this iommu agaw is sufficient for max mapped address */
5304         addr_width = agaw_to_width(iommu->agaw);
5305         if (addr_width > cap_mgaw(iommu->cap))
5306                 addr_width = cap_mgaw(iommu->cap);
5307
5308         if (dmar_domain->max_addr > (1LL << addr_width)) {
5309                 dev_err(dev, "%s: iommu width (%d) is not "
5310                         "sufficient for the mapped address (%llx)\n",
5311                         __func__, addr_width, dmar_domain->max_addr);
5312                 return -EFAULT;
5313         }
5314         dmar_domain->gaw = addr_width;
5315
5316         /*
5317          * Knock out extra levels of page tables if necessary
5318          */
5319         while (iommu->agaw < dmar_domain->agaw) {
5320                 struct dma_pte *pte;
5321
5322                 pte = dmar_domain->pgd;
5323                 if (dma_pte_present(pte)) {
5324                         dmar_domain->pgd = (struct dma_pte *)
5325                                 phys_to_virt(dma_pte_addr(pte));
5326                         free_pgtable_page(pte);
5327                 }
5328                 dmar_domain->agaw--;
5329         }
5330
5331         return 0;
5332 }
5333
5334 static int intel_iommu_attach_device(struct iommu_domain *domain,
5335                                      struct device *dev)
5336 {
5337         int ret;
5338
5339         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5340             device_is_rmrr_locked(dev)) {
5341                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5342                 return -EPERM;
5343         }
5344
5345         if (is_aux_domain(dev, domain))
5346                 return -EPERM;
5347
5348         /* normally dev is not mapped */
5349         if (unlikely(domain_context_mapped(dev))) {
5350                 struct dmar_domain *old_domain;
5351
5352                 old_domain = find_domain(dev);
5353                 if (old_domain)
5354                         dmar_remove_one_dev_info(dev);
5355         }
5356
5357         ret = prepare_domain_attach_device(domain, dev);
5358         if (ret)
5359                 return ret;
5360
5361         return domain_add_dev_info(to_dmar_domain(domain), dev);
5362 }
5363
5364 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5365                                          struct device *dev)
5366 {
5367         int ret;
5368
5369         if (!is_aux_domain(dev, domain))
5370                 return -EPERM;
5371
5372         ret = prepare_domain_attach_device(domain, dev);
5373         if (ret)
5374                 return ret;
5375
5376         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5377 }
5378
5379 static void intel_iommu_detach_device(struct iommu_domain *domain,
5380                                       struct device *dev)
5381 {
5382         dmar_remove_one_dev_info(dev);
5383 }
5384
5385 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5386                                           struct device *dev)
5387 {
5388         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5389 }
5390
5391 #ifdef CONFIG_INTEL_IOMMU_SVM
5392 /*
5393  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5394  * VT-d granularity. Invalidation is typically included in the unmap operation
5395  * as a result of DMA or VFIO unmap. However, for assigned devices guest
5396  * owns the first level page tables. Invalidations of translation caches in the
5397  * guest are trapped and passed down to the host.
5398  *
5399  * The vIOMMU in the guest will only expose first-level page tables, therefore
5400  * we do not support IOTLB granularity for requests without PASID (second level).
5401  *
5402  * For example, to find the VT-d granularity encoding for IOTLB
5403  * type and page selective granularity within PASID:
5404  * X: indexed by iommu cache type
5405  * Y: indexed by enum iommu_inv_granularity
5406  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5407  */
5408
5409 static const int
5410 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5411         /*
5412          * PASID based IOTLB invalidation: PASID selective (per PASID),
5413          * page selective (address granularity)
5414          */
5415         {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5416         /* PASID based dev TLBs */
5417         {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5418         /* PASID cache */
5419         {-EINVAL, -EINVAL, -EINVAL}
5420 };
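/*
 * Entries set to -EINVAL mark cache-type/granularity combinations that
 * have no VT-d encoding; to_vtd_granularity() returns them unchanged so
 * intel_iommu_sva_invalidate() can detect and skip unsupported requests.
 */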
5421
5422 static inline int to_vtd_granularity(int type, int granu)
5423 {
5424         return inv_type_granu_table[type][granu];
5425 }
5426
5427 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5428 {
5429         u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5430
5431         /* VT-d encodes the invalidation size as 2^size 4KiB pages: 0 for 4KiB,
5432          * 9 for 2MiB, etc. The IOMMU cache invalidate API passes granu_size in
5433          * bytes and the number of granules of that size that are contiguous.
5434          */
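        /*
         * Example: granu_size = 4 KiB and nr_granules = 512 give
         * nr_pages = 512, so the function returns order_base_2(512) = 9,
         * i.e. a 2 MiB invalidation range.
         */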
5435         return order_base_2(nr_pages);
5436 }
5437
5438 static int
5439 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5440                            struct iommu_cache_invalidate_info *inv_info)
5441 {
5442         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5443         struct device_domain_info *info;
5444         struct intel_iommu *iommu;
5445         unsigned long flags;
5446         int cache_type;
5447         u8 bus, devfn;
5448         u16 did, sid;
5449         int ret = 0;
5450         u64 size = 0;
5451
5452         if (!inv_info || !dmar_domain)
5453                 return -EINVAL;
5454
5455         if (!dev || !dev_is_pci(dev))
5456                 return -ENODEV;
5457
5458         iommu = device_to_iommu(dev, &bus, &devfn);
5459         if (!iommu)
5460                 return -ENODEV;
5461
5462         if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5463                 return -EINVAL;
5464
5465         spin_lock_irqsave(&device_domain_lock, flags);
5466         spin_lock(&iommu->lock);
5467         info = get_domain_info(dev);
5468         if (!info) {
5469                 ret = -EINVAL;
5470                 goto out_unlock;
5471         }
5472         did = dmar_domain->iommu_did[iommu->seq_id];
5473         sid = PCI_DEVID(bus, devfn);
5474
5475         /* Size is only valid in address selective invalidation */
5476         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5477                 size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5478                                    inv_info->granu.addr_info.nb_granules);
5479
5480         for_each_set_bit(cache_type,
5481                          (unsigned long *)&inv_info->cache,
5482                          IOMMU_CACHE_INV_TYPE_NR) {
5483                 int granu = 0;
5484                 u64 pasid = 0;
5485                 u64 addr = 0;
5486
5487                 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5488                 if (granu == -EINVAL) {
5489                         pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5490                                            cache_type, inv_info->granularity);
5491                         break;
5492                 }
5493
5494                 /*
5495                  * PASID is stored in different locations based on the
5496                  * granularity.
5497                  */
5498                 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5499                     (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5500                         pasid = inv_info->granu.pasid_info.pasid;
5501                 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5502                          (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5503                         pasid = inv_info->granu.addr_info.pasid;
5504
5505                 switch (BIT(cache_type)) {
5506                 case IOMMU_CACHE_INV_TYPE_IOTLB:
5507                         /* HW ignores the low-order address bits covered by the address mask */
5508                         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5509                             size &&
5510                             (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5511                                 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5512                                                    inv_info->granu.addr_info.addr, size);
5513                         }
5514
5515                         /*
5516                          * If granu is PASID-selective, address is ignored.
5517                          * We use npages = -1 to indicate that.
5518                          */
5519                         qi_flush_piotlb(iommu, did, pasid,
5520                                         mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5521                                         (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5522                                         inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5523
5524                         if (!info->ats_enabled)
5525                                 break;
5526                         /*
5527                          * Always flush device IOTLB if ATS is enabled. vIOMMU
5528                          * in the guest may assume IOTLB flush is inclusive,
5529                          * which is more efficient.
5530                          */
5531                         fallthrough;
5532                 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5533                         /*
5534                          * PASID-based device TLB invalidation does not support
5535                          * IOMMU_INV_GRANU_PASID granularity; it only supports
5536                          * IOMMU_INV_GRANU_ADDR. The equivalent is to set the size
5537                          * to cover the entire 64-bit address range. The user only
5538                          * provides PASID info without address info, so we set
5539                          * addr to 0.
5540                          */
5541                         if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5542                                 size = 64 - VTD_PAGE_SHIFT;
5543                                 addr = 0;
5544                         } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5545                                 addr = inv_info->granu.addr_info.addr;
5546                         }
5547
5548                         if (info->ats_enabled)
5549                                 qi_flush_dev_iotlb_pasid(iommu, sid,
5550                                                 info->pfsid, pasid,
5551                                                 info->ats_qdep, addr,
5552                                                 size);
5553                         else
5554                                 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5555                         break;
5556                 default:
5557                         dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5558                                             cache_type);
5559                         ret = -EINVAL;
5560                 }
5561         }
5562 out_unlock:
5563         spin_unlock(&iommu->lock);
5564         spin_unlock_irqrestore(&device_domain_lock, flags);
5565
5566         return ret;
5567 }
5568 #endif
5569
5570 static int intel_iommu_map(struct iommu_domain *domain,
5571                            unsigned long iova, phys_addr_t hpa,
5572                            size_t size, int iommu_prot, gfp_t gfp)
5573 {
5574         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5575         u64 max_addr;
5576         int prot = 0;
5577         int ret;
5578
5579         if (iommu_prot & IOMMU_READ)
5580                 prot |= DMA_PTE_READ;
5581         if (iommu_prot & IOMMU_WRITE)
5582                 prot |= DMA_PTE_WRITE;
5583         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5584                 prot |= DMA_PTE_SNP;
5585
5586         max_addr = iova + size;
5587         if (dmar_domain->max_addr < max_addr) {
5588                 u64 end;
5589
5590                 /* check if minimum agaw is sufficient for mapped address */
5591                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5592                 if (end < max_addr) {
5593                         pr_err("%s: iommu width (%d) is not "
5594                                "sufficient for the mapped address (%llx)\n",
5595                                __func__, dmar_domain->gaw, max_addr);
5596                         return -EFAULT;
5597                 }
5598                 dmar_domain->max_addr = max_addr;
5599         }
5600         /* Round up size to next multiple of PAGE_SIZE, if it and
5601            the low bits of hpa would take us onto the next page */
5602         size = aligned_nrpages(hpa, size);
5603         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5604                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5605         return ret;
5606 }
5607
5608 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5609                                 unsigned long iova, size_t size,
5610                                 struct iommu_iotlb_gather *gather)
5611 {
5612         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5613         unsigned long start_pfn, last_pfn;
5614         int level = 0;
5615
5616         /* Cope with horrid API which requires us to unmap more than the
5617            size argument if it happens to be a large-page mapping. */
5618         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5619
5620         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5621                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5622
5623         start_pfn = iova >> VTD_PAGE_SHIFT;
5624         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5625
5626         gather->freelist = domain_unmap(dmar_domain, start_pfn,
5627                                         last_pfn, gather->freelist);
5628
5629         if (dmar_domain->max_addr == iova + size)
5630                 dmar_domain->max_addr = iova;
5631
5632         iommu_iotlb_gather_add_page(domain, gather, iova, size);
5633
5634         return size;
5635 }
5636
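/*
 * Page-table pages queued on gather->freelist by intel_iommu_unmap() are
 * only released here, after the IOTLB flush for every IOMMU in the domain
 * has completed, so the hardware can never walk a freed page table.
 */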
5637 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5638                                  struct iommu_iotlb_gather *gather)
5639 {
5640         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5641         unsigned long iova_pfn = IOVA_PFN(gather->start);
5642         size_t size = gather->end - gather->start;
5643         unsigned long start_pfn, last_pfn;
5644         unsigned long nrpages;
5645         int iommu_id;
5646
5647         nrpages = aligned_nrpages(gather->start, size);
5648         start_pfn = mm_to_dma_pfn(iova_pfn);
5649         last_pfn = start_pfn + nrpages - 1;
5650
5651         for_each_domain_iommu(iommu_id, dmar_domain)
5652                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5653                                       start_pfn, nrpages, !gather->freelist, 0);
5654
5655         dma_free_pagelist(gather->freelist);
5656 }
5657
5658 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5659                                             dma_addr_t iova)
5660 {
5661         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5662         struct dma_pte *pte;
5663         int level = 0;
5664         u64 phys = 0;
5665
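        /*
         * Look up the leaf PTE for this IOVA; the level it was found at
         * determines how many low IOVA bits form the page offset (12 for
         * a 4KiB page, 21 for a 2MiB superpage, 30 for a 1GiB superpage).
         */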
5666         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5667         if (pte && dma_pte_present(pte))
5668                 phys = dma_pte_addr(pte) +
5669                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5670                                                 VTD_PAGE_SHIFT) - 1));
5671
5672         return phys;
5673 }
5674
5675 static inline bool scalable_mode_support(void)
5676 {
5677         struct dmar_drhd_unit *drhd;
5678         struct intel_iommu *iommu;
5679         bool ret = true;
5680
5681         rcu_read_lock();
5682         for_each_active_iommu(iommu, drhd) {
5683                 if (!sm_supported(iommu)) {
5684                         ret = false;
5685                         break;
5686                 }
5687         }
5688         rcu_read_unlock();
5689
5690         return ret;
5691 }
5692
5693 static inline bool iommu_pasid_support(void)
5694 {
5695         struct dmar_drhd_unit *drhd;
5696         struct intel_iommu *iommu;
5697         bool ret = true;
5698
5699         rcu_read_lock();
5700         for_each_active_iommu(iommu, drhd) {
5701                 if (!pasid_supported(iommu)) {
5702                         ret = false;
5703                         break;
5704                 }
5705         }
5706         rcu_read_unlock();
5707
5708         return ret;
5709 }
5710
5711 static inline bool nested_mode_support(void)
5712 {
5713         struct dmar_drhd_unit *drhd;
5714         struct intel_iommu *iommu;
5715         bool ret = true;
5716
5717         rcu_read_lock();
5718         for_each_active_iommu(iommu, drhd) {
5719                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5720                         ret = false;
5721                         break;
5722                 }
5723         }
5724         rcu_read_unlock();
5725
5726         return ret;
5727 }
5728
5729 static bool intel_iommu_capable(enum iommu_cap cap)
5730 {
5731         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5732                 return domain_update_iommu_snooping(NULL) == 1;
5733         if (cap == IOMMU_CAP_INTR_REMAP)
5734                 return irq_remapping_enabled == 1;
5735
5736         return false;
5737 }
5738
5739 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5740 {
5741         struct intel_iommu *iommu;
5742
5743         iommu = device_to_iommu(dev, NULL, NULL);
5744         if (!iommu)
5745                 return ERR_PTR(-ENODEV);
5746
5747         if (translation_pre_enabled(iommu))
5748                 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5749
5750         return &iommu->iommu;
5751 }
5752
5753 static void intel_iommu_release_device(struct device *dev)
5754 {
5755         struct intel_iommu *iommu;
5756
5757         iommu = device_to_iommu(dev, NULL, NULL);
5758         if (!iommu)
5759                 return;
5760
5761         dmar_remove_one_dev_info(dev);
5762
5763         set_dma_ops(dev, NULL);
5764 }
5765
5766 static void intel_iommu_probe_finalize(struct device *dev)
5767 {
5768         struct iommu_domain *domain;
5769
5770         domain = iommu_get_domain_for_dev(dev);
5771         if (device_needs_bounce(dev))
5772                 set_dma_ops(dev, &bounce_dma_ops);
5773         else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5774                 set_dma_ops(dev, &intel_dma_ops);
5775         else
5776                 set_dma_ops(dev, NULL);
5777 }
5778
5779 static void intel_iommu_get_resv_regions(struct device *device,
5780                                          struct list_head *head)
5781 {
5782         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5783         struct iommu_resv_region *reg;
5784         struct dmar_rmrr_unit *rmrr;
5785         struct device *i_dev;
5786         int i;
5787
5788         down_read(&dmar_global_lock);
5789         for_each_rmrr_units(rmrr) {
5790                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5791                                           i, i_dev) {
5792                         struct iommu_resv_region *resv;
5793                         enum iommu_resv_type type;
5794                         size_t length;
5795
5796                         if (i_dev != device &&
5797                             !is_downstream_to_pci_bridge(device, i_dev))
5798                                 continue;
5799
5800                         length = rmrr->end_address - rmrr->base_address + 1;
5801
5802                         type = device_rmrr_is_relaxable(device) ?
5803                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5804
5805                         resv = iommu_alloc_resv_region(rmrr->base_address,
5806                                                        length, prot, type);
5807                         if (!resv)
5808                                 break;
5809
5810                         list_add_tail(&resv->list, head);
5811                 }
5812         }
5813         up_read(&dmar_global_lock);
5814
5815 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5816         if (dev_is_pci(device)) {
5817                 struct pci_dev *pdev = to_pci_dev(device);
5818
5819                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5820                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5821                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5822                         if (reg)
5823                                 list_add_tail(&reg->list, head);
5824                 }
5825         }
5826 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5827
5828         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5829                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5830                                       0, IOMMU_RESV_MSI);
5831         if (!reg)
5832                 return;
5833         list_add_tail(&reg->list, head);
5834 }
5835
5836 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5837 {
5838         struct device_domain_info *info;
5839         struct context_entry *context;
5840         struct dmar_domain *domain;
5841         unsigned long flags;
5842         u64 ctx_lo;
5843         int ret;
5844
5845         domain = find_domain(dev);
5846         if (!domain)
5847                 return -EINVAL;
5848
5849         spin_lock_irqsave(&device_domain_lock, flags);
5850         spin_lock(&iommu->lock);
5851
5852         ret = -EINVAL;
5853         info = get_domain_info(dev);
5854         if (!info || !info->pasid_supported)
5855                 goto out;
5856
5857         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5858         if (WARN_ON(!context))
5859                 goto out;
5860
5861         ctx_lo = context[0].lo;
5862
5863         if (!(ctx_lo & CONTEXT_PASIDE)) {
5864                 ctx_lo |= CONTEXT_PASIDE;
5865                 context[0].lo = ctx_lo;
5866                 wmb();
5867                 iommu->flush.flush_context(iommu,
5868                                            domain->iommu_did[iommu->seq_id],
5869                                            PCI_DEVID(info->bus, info->devfn),
5870                                            DMA_CCMD_MASK_NOBIT,
5871                                            DMA_CCMD_DEVICE_INVL);
5872         }
5873
5874         /* Enable PASID support in the device, if it wasn't already */
5875         if (!info->pasid_enabled)
5876                 iommu_enable_dev_iotlb(info);
5877
5878         ret = 0;
5879
5880  out:
5881         spin_unlock(&iommu->lock);
5882         spin_unlock_irqrestore(&device_domain_lock, flags);
5883
5884         return ret;
5885 }
5886
5887 static void intel_iommu_apply_resv_region(struct device *dev,
5888                                           struct iommu_domain *domain,
5889                                           struct iommu_resv_region *region)
5890 {
5891         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5892         unsigned long start, end;
5893
5894         start = IOVA_PFN(region->start);
5895         end   = IOVA_PFN(region->start + region->length - 1);
5896
5897         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5898 }
5899
5900 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5901 {
5902         if (dev_is_pci(dev))
5903                 return pci_device_group(dev);
5904         return generic_device_group(dev);
5905 }
5906
5907 static int intel_iommu_enable_auxd(struct device *dev)
5908 {
5909         struct device_domain_info *info;
5910         struct intel_iommu *iommu;
5911         unsigned long flags;
5912         int ret;
5913
5914         iommu = device_to_iommu(dev, NULL, NULL);
5915         if (!iommu || dmar_disabled)
5916                 return -EINVAL;
5917
5918         if (!sm_supported(iommu) || !pasid_supported(iommu))
5919                 return -EINVAL;
5920
5921         ret = intel_iommu_enable_pasid(iommu, dev);
5922         if (ret)
5923                 return -ENODEV;
5924
5925         spin_lock_irqsave(&device_domain_lock, flags);
5926         info = get_domain_info(dev);
5927         info->auxd_enabled = 1;
5928         spin_unlock_irqrestore(&device_domain_lock, flags);
5929
5930         return 0;
5931 }
5932
5933 static int intel_iommu_disable_auxd(struct device *dev)
5934 {
5935         struct device_domain_info *info;
5936         unsigned long flags;
5937
5938         spin_lock_irqsave(&device_domain_lock, flags);
5939         info = get_domain_info(dev);
5940         if (!WARN_ON(!info))
5941                 info->auxd_enabled = 0;
5942         spin_unlock_irqrestore(&device_domain_lock, flags);
5943
5944         return 0;
5945 }
5946
5947 /*
5948  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5949  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5950  * spec so that system software and tools can detect endpoint devices that
5951  * support Intel Scalable I/O Virtualization without depending on the host
5952  * driver.
5953  *
5954  * Returns the config space offset of the matching extended capability
5955  * structure, or 0 if the device does not support it.
5956  */
5957 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5958 {
5959         int pos;
5960         u16 vendor, id;
5961
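             /*
              * 0x23 is the extended capability ID for a DVSEC. Within the
              * DVSEC header the vendor ID sits at offset 4 and the DVSEC
              * ID at offset 8; DVSEC ID 5 from Intel identifies the
              * Scalable IOV capability described in the spec above.
              */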
5962         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5963         while (pos) {
5964                 pci_read_config_word(pdev, pos + 4, &vendor);
5965                 pci_read_config_word(pdev, pos + 8, &id);
5966                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5967                         return pos;
5968
5969                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5970         }
5971
5972         return 0;
5973 }
5974
5975 static bool
5976 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5977 {
5978         if (feat == IOMMU_DEV_FEAT_AUX) {
5979                 int ret;
5980
5981                 if (!dev_is_pci(dev) || dmar_disabled ||
5982                     !scalable_mode_support() || !iommu_pasid_support())
5983                         return false;
5984
5985                 ret = pci_pasid_features(to_pci_dev(dev));
5986                 if (ret < 0)
5987                         return false;
5988
5989                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5990         }
5991
5992         if (feat == IOMMU_DEV_FEAT_SVA) {
5993                 struct device_domain_info *info = get_domain_info(dev);
5994
5995                 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5996                         info->pasid_supported && info->pri_supported &&
5997                         info->ats_supported;
5998         }
5999
6000         return false;
6001 }
6002
6003 static int
6004 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6005 {
6006         if (feat == IOMMU_DEV_FEAT_AUX)
6007                 return intel_iommu_enable_auxd(dev);
6008
6009         if (feat == IOMMU_DEV_FEAT_SVA) {
6010                 struct device_domain_info *info = get_domain_info(dev);
6011
6012                 if (!info)
6013                         return -EINVAL;
6014
6015                 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
6016                         return 0;
6017         }
6018
6019         return -ENODEV;
6020 }
6021
6022 static int
6023 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6024 {
6025         if (feat == IOMMU_DEV_FEAT_AUX)
6026                 return intel_iommu_disable_auxd(dev);
6027
6028         return -ENODEV;
6029 }
6030
6031 static bool
6032 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6033 {
6034         struct device_domain_info *info = get_domain_info(dev);
6035
6036         if (feat == IOMMU_DEV_FEAT_AUX)
6037                 return scalable_mode_support() && info && info->auxd_enabled;
6038
6039         return false;
6040 }
6041
6042 static int
6043 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6044 {
6045         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6046
6047         return dmar_domain->default_pasid > 0 ?
6048                         dmar_domain->default_pasid : -EINVAL;
6049 }
6050
6051 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6052                                            struct device *dev)
6053 {
6054         return attach_deferred(dev);
6055 }
6056
6057 static int
6058 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6059                             enum iommu_attr attr, void *data)
6060 {
6061         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6062         unsigned long flags;
6063         int ret = 0;
6064
6065         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6066                 return -EINVAL;
6067
6068         switch (attr) {
6069         case DOMAIN_ATTR_NESTING:
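                     /*
                      * Nested translation can only be turned on while the
                      * domain has no devices attached, and it is mutually
                      * exclusive with first-level page table use.
                      */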
6070                 spin_lock_irqsave(&device_domain_lock, flags);
6071                 if (nested_mode_support() &&
6072                     list_empty(&dmar_domain->devices)) {
6073                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6074                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6075                 } else {
6076                         ret = -ENODEV;
6077                 }
6078                 spin_unlock_irqrestore(&device_domain_lock, flags);
6079                 break;
6080         default:
6081                 ret = -EINVAL;
6082                 break;
6083         }
6084
6085         return ret;
6086 }
6087
6088 /*
6089  * Check that the device does not live on an external facing PCI port that is
6090  * marked as untrusted. Such devices should not be able to apply quirks and
6091  * thus not be able to bypass the IOMMU restrictions.
6092  */
6093 static bool risky_device(struct pci_dev *pdev)
6094 {
6095         if (pdev->untrusted) {
6096                 pci_info(pdev,
6097                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6098                          pdev->vendor, pdev->device);
6099                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6100                 return true;
6101         }
6102         return false;
6103 }
6104
6105 const struct iommu_ops intel_iommu_ops = {
6106         .capable                = intel_iommu_capable,
6107         .domain_alloc           = intel_iommu_domain_alloc,
6108         .domain_free            = intel_iommu_domain_free,
6109         .domain_set_attr        = intel_iommu_domain_set_attr,
6110         .attach_dev             = intel_iommu_attach_device,
6111         .detach_dev             = intel_iommu_detach_device,
6112         .aux_attach_dev         = intel_iommu_aux_attach_device,
6113         .aux_detach_dev         = intel_iommu_aux_detach_device,
6114         .aux_get_pasid          = intel_iommu_aux_get_pasid,
6115         .map                    = intel_iommu_map,
6116         .unmap                  = intel_iommu_unmap,
6117         .iotlb_sync             = intel_iommu_tlb_sync,
6118         .iova_to_phys           = intel_iommu_iova_to_phys,
6119         .probe_device           = intel_iommu_probe_device,
6120         .probe_finalize         = intel_iommu_probe_finalize,
6121         .release_device         = intel_iommu_release_device,
6122         .get_resv_regions       = intel_iommu_get_resv_regions,
6123         .put_resv_regions       = generic_iommu_put_resv_regions,
6124         .apply_resv_region      = intel_iommu_apply_resv_region,
6125         .device_group           = intel_iommu_device_group,
6126         .dev_has_feat           = intel_iommu_dev_has_feat,
6127         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6128         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6129         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6130         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6131         .def_domain_type        = device_def_domain_type,
6132         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6133 #ifdef CONFIG_INTEL_IOMMU_SVM
6134         .cache_invalidate       = intel_iommu_sva_invalidate,
6135         .sva_bind_gpasid        = intel_svm_bind_gpasid,
6136         .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
6137         .sva_bind               = intel_svm_bind,
6138         .sva_unbind             = intel_svm_unbind,
6139         .sva_get_pasid          = intel_svm_get_pasid,
6140         .page_response          = intel_svm_page_response,
6141 #endif
6142 };
6143
6144 static void quirk_iommu_igfx(struct pci_dev *dev)
6145 {
6146         if (risky_device(dev))
6147                 return;
6148
6149         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6150         dmar_map_gfx = 0;
6151 }
6152
6153 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6154 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6155 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6156 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6157 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6158 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6159 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6160 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6161
6162 /* Broadwell igfx malfunctions with dmar */
6163 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6164 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6165 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6166 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6167 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6168 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6169 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6170 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6171 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6172 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6173 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6174 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6175 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6176 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6177 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6178 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6179 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6180 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6181 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6182 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6183 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6184 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6185 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6186 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6187
6188 static void quirk_iommu_rwbf(struct pci_dev *dev)
6189 {
6190         if (risky_device(dev))
6191                 return;
6192
6193         /*
6194          * Mobile 4 Series Chipset neglects to set RWBF capability,
6195          * but needs it. Same seems to hold for the desktop versions.
6196          */
6197         pci_info(dev, "Forcing write-buffer flush capability\n");
6198         rwbf_quirk = 1;
6199 }
6200
6201 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6202 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6203 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6204 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6205 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6206 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6207 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6208
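     /*
      * GGC is the graphics control register (config offset 0x52) on the
      * chipsets handled below; its MEMORY_SIZE field (bits 11:8) encodes
      * how much stolen memory the BIOS reserved for the GTT and whether
      * any of it was set aside for VT-d use.
      */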
6209 #define GGC 0x52
6210 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6211 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6212 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6213 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6214 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6215 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6216 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6217 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6218
6219 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6220 {
6221         unsigned short ggc;
6222
6223         if (risky_device(dev))
6224                 return;
6225
6226         if (pci_read_config_word(dev, GGC, &ggc))
6227                 return;
6228
6229         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6230                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6231                 dmar_map_gfx = 0;
6232         } else if (dmar_map_gfx) {
6233                 /* we have to ensure the gfx device is idle before we flush */
6234                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6235                 intel_iommu_strict = 1;
6236         }
6237 }
6238 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6240 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6241 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6242
6243 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6244 {
6245         unsigned short ver;
6246
6247         if (!IS_GFX_DEVICE(dev))
6248                 return;
6249
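             /*
              * The upper byte of the PCI device ID selects the graphics
              * generation; only the generations listed here need the
              * translation-enable bit left alone on their dedicated DMAR
              * unit.
              */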
6250         ver = (dev->device >> 8) & 0xff;
6251         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6252             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6253             ver != 0x9a)
6254                 return;
6255
6256         if (risky_device(dev))
6257                 return;
6258
6259         pci_info(dev, "Skip IOMMU disabling for graphics\n");
6260         iommu_skip_te_disable = 1;
6261 }
6262 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6263
6264 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6265    ISOCH DMAR unit for the Azalia sound device, but not give it any
6266    TLB entries, which causes it to deadlock. Check for that.  We do
6267    this in a function called from init_dmars(), instead of in a PCI
6268    quirk, because we don't want to print the obnoxious "BIOS broken"
6269    message if VT-d is actually disabled.
6270 */
6271 static void __init check_tylersburg_isoch(void)
6272 {
6273         struct pci_dev *pdev;
6274         uint32_t vtisochctrl;
6275
6276         /* If there's no Azalia in the system anyway, forget it. */
6277         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6278         if (!pdev)
6279                 return;
6280
6281         if (risky_device(pdev)) {
6282                 pci_dev_put(pdev);
6283                 return;
6284         }
6285
6286         pci_dev_put(pdev);
6287
6288         /* System Management Registers. Might be hidden, in which case
6289            we can't do the sanity check. But that's OK, because the
6290            known-broken BIOSes _don't_ actually hide it, so far. */
6291         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6292         if (!pdev)
6293                 return;
6294
6295         if (risky_device(pdev)) {
6296                 pci_dev_put(pdev);
6297                 return;
6298         }
6299
6300         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6301                 pci_dev_put(pdev);
6302                 return;
6303         }
6304
6305         pci_dev_put(pdev);
6306
6307         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6308         if (vtisochctrl & 1)
6309                 return;
6310
6311         /* Drop all bits other than the number of TLB entries */
6312         vtisochctrl &= 0x1c;
6313
6314         /* If we have the recommended number of TLB entries (16), fine. */
6315         if (vtisochctrl == 0x10)
6316                 return;
6317
6318         /* Zero TLB entries? You get to ride the short bus to school. */
6319         if (!vtisochctrl) {
6320                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6321                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6322                      dmi_get_system_info(DMI_BIOS_VENDOR),
6323                      dmi_get_system_info(DMI_BIOS_VERSION),
6324                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6325                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6326                 return;
6327         }
6328
6329         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6330                 vtisochctrl);
6331 }