drivers/iommu/intel/iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
105
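/*
 * AGAW helpers: each step of the VT-d SAGAW encoding adds one page-table
 * level (LEVEL_STRIDE == 9 address bits) on top of a 30-bit, 2-level base.
 * For example, agaw 1 covers 39 bits with 3 levels, agaw 2 covers 48 bits
 * with 4 levels and agaw 3 covers 57 bits with 5 levels.
 */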
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline u64 level_mask(int level)
132 {
133         return -1ULL << level_to_offset_bits(level);
134 }
135
136 static inline u64 level_size(int level)
137 {
138         return 1ULL << level_to_offset_bits(level);
139 }
140
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150
151 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
179  * (used when the kernel is launched with TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
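/* One root entry per PCI bus number: a 4KiB root table of 16-byte entries */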
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
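/*
 * The helpers below poke individual fields of a context entry: the present
 * and fault-processing-disable bits, the translation type and the
 * second-level table pointer live in the low 64 bits, while the address
 * width and domain ID live in the high 64 bits.  The "copied" bit is a
 * software-available bit used to mark entries inherited from a previous
 * kernel across kexec/kdump.
 */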
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
289
290 /*
291  * This domain is a static identity mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
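/*
 * Iterate over the iommu indexes that have at least one device of @domain
 * attached, i.e. a non-zero reference count.
 */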
299 #define for_each_domain_iommu(idx, domain)                      \
300         for (idx = 0; idx < g_num_of_iommus; idx++)             \
301                 if (domain->iommu_refcnt[idx])
302
303 struct dmar_rmrr_unit {
304         struct list_head list;          /* list of rmrr units   */
305         struct acpi_dmar_header *hdr;   /* ACPI header          */
306         u64     base_address;           /* reserved base address*/
307         u64     end_address;            /* reserved end address */
308         struct dmar_dev_scope *devices; /* target devices */
309         int     devices_cnt;            /* target device count */
310 };
311
312 struct dmar_atsr_unit {
313         struct list_head list;          /* list of ATSR units */
314         struct acpi_dmar_header *hdr;   /* ACPI header */
315         struct dmar_dev_scope *devices; /* target devices */
316         int devices_cnt;                /* target device count */
317         u8 include_all:1;               /* include all ports */
318 };
319
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322
323 #define for_each_rmrr_units(rmrr) \
324         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325
326 /* number of IOMMUs; bounds g_iommus and the for_each_domain_iommu() walk */
327 static int g_num_of_iommus;
328
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334                                      struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336                                             dma_addr_t iova);
337
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
360
361 #define IDENTMAP_GFX            2
362 #define IDENTMAP_AZALIA         4
363
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
369 {
370         struct device_domain_info *info;
371
372         if (!dev)
373                 return NULL;
374
375         info = dev_iommu_priv_get(dev);
376         if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
377                 return NULL;
378
379         return info;
380 }
381
382 DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
384
385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
386                                 to_pci_dev(d)->untrusted)
387
388 /*
389  * Iterate over elements in device_domain_list and call the specified
390  * callback @fn against each element.
391  */
392 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393                                      void *data), void *data)
394 {
395         int ret = 0;
396         unsigned long flags;
397         struct device_domain_info *info;
398
399         spin_lock_irqsave(&device_domain_lock, flags);
400         list_for_each_entry(info, &device_domain_list, global) {
401                 ret = fn(info, data);
402                 if (ret) {
403                         spin_unlock_irqrestore(&device_domain_lock, flags);
404                         return ret;
405                 }
406         }
407         spin_unlock_irqrestore(&device_domain_lock, flags);
408
409         return 0;
410 }
411
412 const struct iommu_ops intel_iommu_ops;
413
414 static bool translation_pre_enabled(struct intel_iommu *iommu)
415 {
416         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 }
418
419 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
420 {
421         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 }
423
424 static void init_translation_status(struct intel_iommu *iommu)
425 {
426         u32 gsts;
427
428         gsts = readl(iommu->reg + DMAR_GSTS_REG);
429         if (gsts & DMA_GSTS_TES)
430                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 }
432
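/*
 * Parse the "intel_iommu=" kernel parameter.  Options are comma separated:
 * on, off, igfx_off, forcedac, strict, sp_off, sm_on, tboot_noforce and
 * nobounce.
 */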
433 static int __init intel_iommu_setup(char *str)
434 {
435         if (!str)
436                 return -EINVAL;
437         while (*str) {
438                 if (!strncmp(str, "on", 2)) {
439                         dmar_disabled = 0;
440                         pr_info("IOMMU enabled\n");
441                 } else if (!strncmp(str, "off", 3)) {
442                         dmar_disabled = 1;
443                         no_platform_optin = 1;
444                         pr_info("IOMMU disabled\n");
445                 } else if (!strncmp(str, "igfx_off", 8)) {
446                         dmar_map_gfx = 0;
447                         pr_info("Disable GFX device mapping\n");
448                 } else if (!strncmp(str, "forcedac", 8)) {
449                         pr_info("Forcing DAC for PCI devices\n");
450                         dmar_forcedac = 1;
451                 } else if (!strncmp(str, "strict", 6)) {
452                         pr_info("Disable batched IOTLB flush\n");
453                         intel_iommu_strict = 1;
454                 } else if (!strncmp(str, "sp_off", 6)) {
455                         pr_info("Disable supported super page\n");
456                         intel_iommu_superpage = 0;
457                 } else if (!strncmp(str, "sm_on", 5)) {
458                         pr_info("Intel-IOMMU: scalable mode supported\n");
459                         intel_iommu_sm = 1;
460                 } else if (!strncmp(str, "tboot_noforce", 13)) {
461                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462                         intel_iommu_tboot_noforce = 1;
463                 } else if (!strncmp(str, "nobounce", 8)) {
464                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
465                         intel_no_bounce = 1;
466                 }
467
468                 str += strcspn(str, ",");
469                 while (*str == ',')
470                         str++;
471         }
472         return 0;
473 }
474 __setup("intel_iommu=", intel_iommu_setup);
475
476 static struct kmem_cache *iommu_domain_cache;
477 static struct kmem_cache *iommu_devinfo_cache;
478
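/*
 * The per-iommu domain array is a two-level table indexed by domain ID:
 * the high byte of the DID selects a lazily allocated page of 256
 * dmar_domain pointers and the low byte selects the entry within it.
 */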
479 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
480 {
481         struct dmar_domain **domains;
482         int idx = did >> 8;
483
484         domains = iommu->domains[idx];
485         if (!domains)
486                 return NULL;
487
488         return domains[did & 0xff];
489 }
490
491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492                              struct dmar_domain *domain)
493 {
494         struct dmar_domain **domains;
495         int idx = did >> 8;
496
497         if (!iommu->domains[idx]) {
498                 size_t size = 256 * sizeof(struct dmar_domain *);
499                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500         }
501
502         domains = iommu->domains[idx];
503         if (WARN_ON(!domains))
504                 return;
505         else
506                 domains[did & 0xff] = domain;
507 }
508
509 void *alloc_pgtable_page(int node)
510 {
511         struct page *page;
512         void *vaddr = NULL;
513
514         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515         if (page)
516                 vaddr = page_address(page);
517         return vaddr;
518 }
519
520 void free_pgtable_page(void *vaddr)
521 {
522         free_page((unsigned long)vaddr);
523 }
524
525 static inline void *alloc_domain_mem(void)
526 {
527         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528 }
529
530 static void free_domain_mem(void *vaddr)
531 {
532         kmem_cache_free(iommu_domain_cache, vaddr);
533 }
534
535 static inline void * alloc_devinfo_mem(void)
536 {
537         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538 }
539
540 static inline void free_devinfo_mem(void *vaddr)
541 {
542         kmem_cache_free(iommu_devinfo_cache, vaddr);
543 }
544
545 static inline int domain_type_is_si(struct dmar_domain *domain)
546 {
547         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
548 }
549
550 static inline bool domain_use_first_level(struct dmar_domain *domain)
551 {
552         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
553 }
554
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556                                        unsigned long pfn)
557 {
558         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559
560         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564 {
565         unsigned long sagaw;
566         int agaw = -1;
567
568         sagaw = cap_sagaw(iommu->cap);
569         for (agaw = width_to_agaw(max_gaw);
570              agaw >= 0; agaw--) {
571                 if (test_bit(agaw, &sagaw))
572                         break;
573         }
574
575         return agaw;
576 }
577
578 /*
579  * Calculate max SAGAW for each iommu.
580  */
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582 {
583         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584 }
585
586 /*
587  * Calculate the agaw for each iommu.
588  * "SAGAW" may differ across iommus, so use a default agaw and fall back
589  * to a smaller supported agaw for iommus that don't support the default.
590  */
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
592 {
593         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594 }
595
596 /* This function only returns a single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598 {
599         int iommu_id;
600
601         /* si_domain and vm domain should not get here. */
602         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603                 return NULL;
604
605         for_each_domain_iommu(iommu_id, domain)
606                 break;
607
608         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609                 return NULL;
610
611         return g_iommus[iommu_id];
612 }
613
614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
615 {
616         return sm_supported(iommu) ?
617                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
618 }
619
620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
621 {
622         struct dmar_drhd_unit *drhd;
623         struct intel_iommu *iommu;
624         bool found = false;
625         int i;
626
627         domain->iommu_coherency = 1;
628
629         for_each_domain_iommu(i, domain) {
630                 found = true;
631                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
632                         domain->iommu_coherency = 0;
633                         break;
634                 }
635         }
636         if (found)
637                 return;
638
639         /* No hardware attached; use lowest common denominator */
640         rcu_read_lock();
641         for_each_active_iommu(iommu, drhd) {
642                 if (!iommu_paging_structure_coherency(iommu)) {
643                         domain->iommu_coherency = 0;
644                         break;
645                 }
646         }
647         rcu_read_unlock();
648 }
649
650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
651 {
652         struct dmar_drhd_unit *drhd;
653         struct intel_iommu *iommu;
654         int ret = 1;
655
656         rcu_read_lock();
657         for_each_active_iommu(iommu, drhd) {
658                 if (iommu != skip) {
659                         if (!ecap_sc_support(iommu->ecap)) {
660                                 ret = 0;
661                                 break;
662                         }
663                 }
664         }
665         rcu_read_unlock();
666
667         return ret;
668 }
669
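/*
 * Return the largest superpage level supported by all active iommus
 * (0 = 4KiB only, 1 = 2MiB, 2 = 1GiB), optionally skipping @skip.
 */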
670 static int domain_update_iommu_superpage(struct dmar_domain *domain,
671                                          struct intel_iommu *skip)
672 {
673         struct dmar_drhd_unit *drhd;
674         struct intel_iommu *iommu;
675         int mask = 0x3;
676
677         if (!intel_iommu_superpage) {
678                 return 0;
679         }
680
681         /* set iommu_superpage to the lowest common denominator */
682         rcu_read_lock();
683         for_each_active_iommu(iommu, drhd) {
684                 if (iommu != skip) {
685                         if (domain && domain_use_first_level(domain)) {
686                                 if (!cap_fl1gp_support(iommu->cap))
687                                         mask = 0x1;
688                         } else {
689                                 mask &= cap_super_page_val(iommu->cap);
690                         }
691
692                         if (!mask)
693                                 break;
694                 }
695         }
696         rcu_read_unlock();
697
698         return fls(mask);
699 }
700
701 static int domain_update_device_node(struct dmar_domain *domain)
702 {
703         struct device_domain_info *info;
704         int nid = NUMA_NO_NODE;
705
706         assert_spin_locked(&device_domain_lock);
707
708         if (list_empty(&domain->devices))
709                 return NUMA_NO_NODE;
710
711         list_for_each_entry(info, &domain->devices, link) {
712                 if (!info->dev)
713                         continue;
714
715                 /*
716                  * There could be multiple device NUMA nodes, as devices within
717                  * the same domain may sit behind different IOMMUs. There is no
718                  * perfect answer in such a situation, so pick the first node
719                  * found (first come, first served).
720                  */
721                 nid = dev_to_node(info->dev);
722                 if (nid != NUMA_NO_NODE)
723                         break;
724         }
725
726         return nid;
727 }
728
729 /* Some capabilities may be different across iommus */
730 static void domain_update_iommu_cap(struct dmar_domain *domain)
731 {
732         domain_update_iommu_coherency(domain);
733         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
734         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
735
736         /*
737          * If RHSA is missing, we should default to the device's NUMA node
738          * as a fallback.
739          */
740         if (domain->nid == NUMA_NO_NODE)
741                 domain->nid = domain_update_device_node(domain);
742 }
743
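/*
 * Return the context entry for @bus/@devfn, allocating the context table
 * if @alloc is set.  In scalable mode a context entry is twice as large,
 * so the root entry is split in two: the low half covers devfn 0-127 and
 * the high half covers devfn 128-255.
 */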
744 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
745                                          u8 devfn, int alloc)
746 {
747         struct root_entry *root = &iommu->root_entry[bus];
748         struct context_entry *context;
749         u64 *entry;
750
751         entry = &root->lo;
752         if (sm_supported(iommu)) {
753                 if (devfn >= 0x80) {
754                         devfn -= 0x80;
755                         entry = &root->hi;
756                 }
757                 devfn *= 2;
758         }
759         if (*entry & 1)
760                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
761         else {
762                 unsigned long phy_addr;
763                 if (!alloc)
764                         return NULL;
765
766                 context = alloc_pgtable_page(iommu->node);
767                 if (!context)
768                         return NULL;
769
770                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
771                 phy_addr = virt_to_phys((void *)context);
772                 *entry = phy_addr | 1;
773                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
774         }
775         return &context[devfn];
776 }
777
778 static bool attach_deferred(struct device *dev)
779 {
780         return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
781 }
782
783 /**
784  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
785  *                               sub-hierarchy of a candidate PCI-PCI bridge
786  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
787  * @bridge: the candidate PCI-PCI bridge
788  *
789  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
790  */
791 static bool
792 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
793 {
794         struct pci_dev *pdev, *pbridge;
795
796         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
797                 return false;
798
799         pdev = to_pci_dev(dev);
800         pbridge = to_pci_dev(bridge);
801
802         if (pbridge->subordinate &&
803             pbridge->subordinate->number <= pdev->bus->number &&
804             pbridge->subordinate->busn_res.end >= pdev->bus->number)
805                 return true;
806
807         return false;
808 }
809
810 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
811 {
812         struct dmar_drhd_unit *drhd;
813         u32 vtbar;
814         int rc;
815
816         /* We know that this device on this chipset has its own IOMMU.
817          * If we find it under a different IOMMU, then the BIOS is lying
818          * to us. Hope that the IOMMU for this device is actually
819          * disabled, and it needs no translation...
820          */
821         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
822         if (rc) {
823                 /* "can't" happen */
824                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
825                 return false;
826         }
827         vtbar &= 0xffff0000;
828
829         /* we know that this iommu should be at offset 0xa000 from vtbar */
830         drhd = dmar_find_matched_drhd_unit(pdev);
831         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
832                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
833                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
834                 return true;
835         }
836
837         return false;
838 }
839
840 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
841 {
842         if (!iommu || iommu->drhd->ignored)
843                 return true;
844
845         if (dev_is_pci(dev)) {
846                 struct pci_dev *pdev = to_pci_dev(dev);
847
848                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
849                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
850                     quirk_ioat_snb_local_iommu(pdev))
851                         return true;
852         }
853
854         return false;
855 }
856
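/*
 * Walk the DMAR scope tables to find the iommu that covers @dev and report
 * the bus/devfn to use when programming its context entry.  VFs are looked
 * up through their PF, and devices behind a listed PCI bridge match that
 * bridge's scope entry.
 */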
857 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
858 {
859         struct dmar_drhd_unit *drhd = NULL;
860         struct pci_dev *pdev = NULL;
861         struct intel_iommu *iommu;
862         struct device *tmp;
863         u16 segment = 0;
864         int i;
865
866         if (!dev)
867                 return NULL;
868
869         if (dev_is_pci(dev)) {
870                 struct pci_dev *pf_pdev;
871
872                 pdev = pci_real_dma_dev(to_pci_dev(dev));
873
874                 /* VFs aren't listed in scope tables; we need to look up
875                  * the PF instead to find the IOMMU. */
876                 pf_pdev = pci_physfn(pdev);
877                 dev = &pf_pdev->dev;
878                 segment = pci_domain_nr(pdev->bus);
879         } else if (has_acpi_companion(dev))
880                 dev = &ACPI_COMPANION(dev)->dev;
881
882         rcu_read_lock();
883         for_each_iommu(iommu, drhd) {
884                 if (pdev && segment != drhd->segment)
885                         continue;
886
887                 for_each_active_dev_scope(drhd->devices,
888                                           drhd->devices_cnt, i, tmp) {
889                         if (tmp == dev) {
890                                 /* For a VF use its original BDF# not that of the PF
891                                  * which we used for the IOMMU lookup. Strictly speaking
892                                  * we could do this for all PCI devices; we only need to
893                                  * get the BDF# from the scope table for ACPI matches. */
894                                 if (pdev && pdev->is_virtfn)
895                                         goto got_pdev;
896
897                                 if (bus && devfn) {
898                                         *bus = drhd->devices[i].bus;
899                                         *devfn = drhd->devices[i].devfn;
900                                 }
901                                 goto out;
902                         }
903
904                         if (is_downstream_to_pci_bridge(dev, tmp))
905                                 goto got_pdev;
906                 }
907
908                 if (pdev && drhd->include_all) {
909                 got_pdev:
910                         if (bus && devfn) {
911                                 *bus = pdev->bus->number;
912                                 *devfn = pdev->devfn;
913                         }
914                         goto out;
915                 }
916         }
917         iommu = NULL;
918  out:
919         if (iommu_is_dummy(iommu, dev))
920                 iommu = NULL;
921
922         rcu_read_unlock();
923
924         return iommu;
925 }
926
927 static void domain_flush_cache(struct dmar_domain *domain,
928                                void *addr, int size)
929 {
930         if (!domain->iommu_coherency)
931                 clflush_cache_range(addr, size);
932 }
933
934 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
935 {
936         struct context_entry *context;
937         int ret = 0;
938         unsigned long flags;
939
940         spin_lock_irqsave(&iommu->lock, flags);
941         context = iommu_context_addr(iommu, bus, devfn, 0);
942         if (context)
943                 ret = context_present(context);
944         spin_unlock_irqrestore(&iommu->lock, flags);
945         return ret;
946 }
947
948 static void free_context_table(struct intel_iommu *iommu)
949 {
950         int i;
951         unsigned long flags;
952         struct context_entry *context;
953
954         spin_lock_irqsave(&iommu->lock, flags);
955         if (!iommu->root_entry) {
956                 goto out;
957         }
958         for (i = 0; i < ROOT_ENTRY_NR; i++) {
959                 context = iommu_context_addr(iommu, i, 0, 0);
960                 if (context)
961                         free_pgtable_page(context);
962
963                 if (!sm_supported(iommu))
964                         continue;
965
966                 context = iommu_context_addr(iommu, i, 0x80, 0);
967                 if (context)
968                         free_pgtable_page(context);
969
970         }
971         free_pgtable_page(iommu->root_entry);
972         iommu->root_entry = NULL;
973 out:
974         spin_unlock_irqrestore(&iommu->lock, flags);
975 }
976
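/*
 * Walk (and, if needed, allocate) the page table for @pfn down to
 * *target_level.  A *target_level of 0 means "find the existing leaf":
 * the walk stops at the first superpage or non-present entry and the
 * level reached is returned through *target_level.
 */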
977 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
978                                       unsigned long pfn, int *target_level)
979 {
980         struct dma_pte *parent, *pte;
981         int level = agaw_to_level(domain->agaw);
982         int offset;
983
984         BUG_ON(!domain->pgd);
985
986         if (!domain_pfn_supported(domain, pfn))
987                 /* Address beyond IOMMU's addressing capabilities. */
988                 return NULL;
989
990         parent = domain->pgd;
991
992         while (1) {
993                 void *tmp_page;
994
995                 offset = pfn_level_offset(pfn, level);
996                 pte = &parent[offset];
997                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
998                         break;
999                 if (level == *target_level)
1000                         break;
1001
1002                 if (!dma_pte_present(pte)) {
1003                         uint64_t pteval;
1004
1005                         tmp_page = alloc_pgtable_page(domain->nid);
1006
1007                         if (!tmp_page)
1008                                 return NULL;
1009
1010                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1011                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1012                         if (domain_use_first_level(domain))
1013                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1014                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1015                                 /* Someone else set it while we were thinking; use theirs. */
1016                                 free_pgtable_page(tmp_page);
1017                         else
1018                                 domain_flush_cache(domain, pte, sizeof(*pte));
1019                 }
1020                 if (level == 1)
1021                         break;
1022
1023                 parent = phys_to_virt(dma_pte_addr(pte));
1024                 level--;
1025         }
1026
1027         if (!*target_level)
1028                 *target_level = level;
1029
1030         return pte;
1031 }
1032
1033 /* return address's pte at specific level */
1034 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1035                                          unsigned long pfn,
1036                                          int level, int *large_page)
1037 {
1038         struct dma_pte *parent, *pte;
1039         int total = agaw_to_level(domain->agaw);
1040         int offset;
1041
1042         parent = domain->pgd;
1043         while (level <= total) {
1044                 offset = pfn_level_offset(pfn, total);
1045                 pte = &parent[offset];
1046                 if (level == total)
1047                         return pte;
1048
1049                 if (!dma_pte_present(pte)) {
1050                         *large_page = total;
1051                         break;
1052                 }
1053
1054                 if (dma_pte_superpage(pte)) {
1055                         *large_page = total;
1056                         return pte;
1057                 }
1058
1059                 parent = phys_to_virt(dma_pte_addr(pte));
1060                 total--;
1061         }
1062         return NULL;
1063 }
1064
1065 /* clear last-level (leaf) ptes; a TLB flush should follow */
1066 static void dma_pte_clear_range(struct dmar_domain *domain,
1067                                 unsigned long start_pfn,
1068                                 unsigned long last_pfn)
1069 {
1070         unsigned int large_page;
1071         struct dma_pte *first_pte, *pte;
1072
1073         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1074         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1075         BUG_ON(start_pfn > last_pfn);
1076
1077         /* we don't need lock here; nobody else touches the iova range */
1078         do {
1079                 large_page = 1;
1080                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1081                 if (!pte) {
1082                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1083                         continue;
1084                 }
1085                 do {
1086                         dma_clear_pte(pte);
1087                         start_pfn += lvl_to_nr_pages(large_page);
1088                         pte++;
1089                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1090
1091                 domain_flush_cache(domain, first_pte,
1092                                    (void *)pte - (void *)first_pte);
1093
1094         } while (start_pfn && start_pfn <= last_pfn);
1095 }
1096
1097 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1098                                int retain_level, struct dma_pte *pte,
1099                                unsigned long pfn, unsigned long start_pfn,
1100                                unsigned long last_pfn)
1101 {
1102         pfn = max(start_pfn, pfn);
1103         pte = &pte[pfn_level_offset(pfn, level)];
1104
1105         do {
1106                 unsigned long level_pfn;
1107                 struct dma_pte *level_pte;
1108
1109                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1110                         goto next;
1111
1112                 level_pfn = pfn & level_mask(level);
1113                 level_pte = phys_to_virt(dma_pte_addr(pte));
1114
1115                 if (level > 2) {
1116                         dma_pte_free_level(domain, level - 1, retain_level,
1117                                            level_pte, level_pfn, start_pfn,
1118                                            last_pfn);
1119                 }
1120
1121                 /*
1122                  * Free the page table if we're below the level we want to
1123                  * retain and the range covers the entire table.
1124                  */
1125                 if (level < retain_level && !(start_pfn > level_pfn ||
1126                       last_pfn < level_pfn + level_size(level) - 1)) {
1127                         dma_clear_pte(pte);
1128                         domain_flush_cache(domain, pte, sizeof(*pte));
1129                         free_pgtable_page(level_pte);
1130                 }
1131 next:
1132                 pfn += level_size(level);
1133         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1134 }
1135
1136 /*
1137  * clear last level (leaf) ptes and free page table pages below the
1138  * level we wish to keep intact.
1139  */
1140 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1141                                    unsigned long start_pfn,
1142                                    unsigned long last_pfn,
1143                                    int retain_level)
1144 {
1145         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1146         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1147         BUG_ON(start_pfn > last_pfn);
1148
1149         dma_pte_clear_range(domain, start_pfn, last_pfn);
1150
1151         /* We don't need lock here; nobody else touches the iova range */
1152         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1153                            domain->pgd, 0, start_pfn, last_pfn);
1154
1155         /* free pgd */
1156         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1157                 free_pgtable_page(domain->pgd);
1158                 domain->pgd = NULL;
1159         }
1160 }
1161
1162 /* When a page at a given level is being unlinked from its parent, we don't
1163    need to *modify* it at all. All we need to do is make a list of all the
1164    pages which can be freed just as soon as we've flushed the IOTLB and we
1165    know the hardware page-walk will no longer touch them.
1166    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1167    be freed. */
1168 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1169                                             int level, struct dma_pte *pte,
1170                                             struct page *freelist)
1171 {
1172         struct page *pg;
1173
1174         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1175         pg->freelist = freelist;
1176         freelist = pg;
1177
1178         if (level == 1)
1179                 return freelist;
1180
1181         pte = page_address(pg);
1182         do {
1183                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1184                         freelist = dma_pte_list_pagetables(domain, level - 1,
1185                                                            pte, freelist);
1186                 pte++;
1187         } while (!first_pte_in_page(pte));
1188
1189         return freelist;
1190 }
1191
1192 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1193                                         struct dma_pte *pte, unsigned long pfn,
1194                                         unsigned long start_pfn,
1195                                         unsigned long last_pfn,
1196                                         struct page *freelist)
1197 {
1198         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1199
1200         pfn = max(start_pfn, pfn);
1201         pte = &pte[pfn_level_offset(pfn, level)];
1202
1203         do {
1204                 unsigned long level_pfn;
1205
1206                 if (!dma_pte_present(pte))
1207                         goto next;
1208
1209                 level_pfn = pfn & level_mask(level);
1210
1211                 /* If range covers entire pagetable, free it */
1212                 if (start_pfn <= level_pfn &&
1213                     last_pfn >= level_pfn + level_size(level) - 1) {
1214                         /* These subordinate page tables are going away entirely. Don't
1215                            bother to clear them; we're just going to *free* them. */
1216                         if (level > 1 && !dma_pte_superpage(pte))
1217                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1218
1219                         dma_clear_pte(pte);
1220                         if (!first_pte)
1221                                 first_pte = pte;
1222                         last_pte = pte;
1223                 } else if (level > 1) {
1224                         /* Recurse down into a level that isn't *entirely* obsolete */
1225                         freelist = dma_pte_clear_level(domain, level - 1,
1226                                                        phys_to_virt(dma_pte_addr(pte)),
1227                                                        level_pfn, start_pfn, last_pfn,
1228                                                        freelist);
1229                 }
1230 next:
1231                 pfn += level_size(level);
1232         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1233
1234         if (first_pte)
1235                 domain_flush_cache(domain, first_pte,
1236                                    (void *)++last_pte - (void *)first_pte);
1237
1238         return freelist;
1239 }
1240
1241 /* We can't just free the pages because the IOMMU may still be walking
1242    the page tables, and may have cached the intermediate levels. The
1243    pages can only be freed after the IOTLB flush has been done. */
1244 static struct page *domain_unmap(struct dmar_domain *domain,
1245                                  unsigned long start_pfn,
1246                                  unsigned long last_pfn)
1247 {
1248         struct page *freelist;
1249
1250         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1251         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1252         BUG_ON(start_pfn > last_pfn);
1253
1254         /* we don't need lock here; nobody else touches the iova range */
1255         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1256                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1257
1258         /* free pgd */
1259         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1260                 struct page *pgd_page = virt_to_page(domain->pgd);
1261                 pgd_page->freelist = freelist;
1262                 freelist = pgd_page;
1263
1264                 domain->pgd = NULL;
1265         }
1266
1267         return freelist;
1268 }
1269
1270 static void dma_free_pagelist(struct page *freelist)
1271 {
1272         struct page *pg;
1273
1274         while ((pg = freelist)) {
1275                 freelist = pg->freelist;
1276                 free_pgtable_page(page_address(pg));
1277         }
1278 }
1279
1280 static void iova_entry_free(unsigned long data)
1281 {
1282         struct page *freelist = (struct page *)data;
1283
1284         dma_free_pagelist(freelist);
1285 }
1286
1287 /* iommu handling */
1288 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1289 {
1290         struct root_entry *root;
1291         unsigned long flags;
1292
1293         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1294         if (!root) {
1295                 pr_err("Allocating root entry for %s failed\n",
1296                         iommu->name);
1297                 return -ENOMEM;
1298         }
1299
1300         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1301
1302         spin_lock_irqsave(&iommu->lock, flags);
1303         iommu->root_entry = root;
1304         spin_unlock_irqrestore(&iommu->lock, flags);
1305
1306         return 0;
1307 }
1308
1309 static void iommu_set_root_entry(struct intel_iommu *iommu)
1310 {
1311         u64 addr;
1312         u32 sts;
1313         unsigned long flag;
1314
1315         addr = virt_to_phys(iommu->root_entry);
1316         if (sm_supported(iommu))
1317                 addr |= DMA_RTADDR_SMT;
1318
1319         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1320         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1321
1322         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1323
1324         /* Make sure hardware complete it */
1325         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1326                       readl, (sts & DMA_GSTS_RTPS), sts);
1327
1328         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1329 }
1330
1331 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1332 {
1333         u32 val;
1334         unsigned long flag;
1335
1336         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1337                 return;
1338
1339         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1341
1342         /* Make sure hardware complete it */
1343         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1344                       readl, (!(val & DMA_GSTS_WBFS)), val);
1345
1346         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1347 }
1348
1349 /* Invalidate context-cache entries and wait for the hardware to complete it */
1350 static void __iommu_flush_context(struct intel_iommu *iommu,
1351                                   u16 did, u16 source_id, u8 function_mask,
1352                                   u64 type)
1353 {
1354         u64 val = 0;
1355         unsigned long flag;
1356
1357         switch (type) {
1358         case DMA_CCMD_GLOBAL_INVL:
1359                 val = DMA_CCMD_GLOBAL_INVL;
1360                 break;
1361         case DMA_CCMD_DOMAIN_INVL:
1362                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1363                 break;
1364         case DMA_CCMD_DEVICE_INVL:
1365                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1366                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1367                 break;
1368         default:
1369                 BUG();
1370         }
1371         val |= DMA_CCMD_ICC;
1372
1373         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1374         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1375
1376         /* Make sure hardware complete it */
1377         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1378                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1379
1380         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1381 }
1382
1383 /* Invalidate IOTLB entries and wait for the hardware to complete it */
1384 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1385                                 u64 addr, unsigned int size_order, u64 type)
1386 {
1387         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1388         u64 val = 0, val_iva = 0;
1389         unsigned long flag;
1390
1391         switch (type) {
1392         case DMA_TLB_GLOBAL_FLUSH:
1393                 /* global flush doesn't need set IVA_REG */
1394                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1395                 break;
1396         case DMA_TLB_DSI_FLUSH:
1397                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1398                 break;
1399         case DMA_TLB_PSI_FLUSH:
1400                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1401                 /* IH bit is passed in as part of address */
1402                 val_iva = size_order | addr;
1403                 break;
1404         default:
1405                 BUG();
1406         }
1407         /* Note: set drain read/write */
1408 #if 0
1409         /*
1410          * This is probably only here to be extra safe. It looks like we
1411          * can skip it without any impact.
1412          */
1413         if (cap_read_drain(iommu->cap))
1414                 val |= DMA_TLB_READ_DRAIN;
1415 #endif
1416         if (cap_write_drain(iommu->cap))
1417                 val |= DMA_TLB_WRITE_DRAIN;
1418
1419         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1420         /* Note: Only uses first TLB reg currently */
1421         if (val_iva)
1422                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1423         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1424
1425         /* Make sure hardware complete it */
1426         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1427                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1428
1429         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1430
1431         /* check IOTLB invalidation granularity */
1432         if (DMA_TLB_IAIG(val) == 0)
1433                 pr_err("Flush IOTLB failed\n");
1434         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1435                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1436                         (unsigned long long)DMA_TLB_IIRG(type),
1437                         (unsigned long long)DMA_TLB_IAIG(val));
1438 }
1439
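/*
 * Find the device_domain_info for @bus/@devfn on @iommu if that device
 * supports ATS, so callers know whether a device-IOTLB invalidation can
 * be issued.  Requires queued invalidation (iommu->qi) to be available.
 */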
1440 static struct device_domain_info *
1441 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1442                          u8 bus, u8 devfn)
1443 {
1444         struct device_domain_info *info;
1445
1446         assert_spin_locked(&device_domain_lock);
1447
1448         if (!iommu->qi)
1449                 return NULL;
1450
1451         list_for_each_entry(info, &domain->devices, link)
1452                 if (info->iommu == iommu && info->bus == bus &&
1453                     info->devfn == devfn) {
1454                         if (info->ats_supported && info->dev)
1455                                 return info;
1456                         break;
1457                 }
1458
1459         return NULL;
1460 }
1461
1462 static void domain_update_iotlb(struct dmar_domain *domain)
1463 {
1464         struct device_domain_info *info;
1465         bool has_iotlb_device = false;
1466
1467         assert_spin_locked(&device_domain_lock);
1468
1469         list_for_each_entry(info, &domain->devices, link) {
1470                 struct pci_dev *pdev;
1471
1472                 if (!info->dev || !dev_is_pci(info->dev))
1473                         continue;
1474
1475                 pdev = to_pci_dev(info->dev);
1476                 if (pdev->ats_enabled) {
1477                         has_iotlb_device = true;
1478                         break;
1479                 }
1480         }
1481
1482         domain->has_iotlb_device = has_iotlb_device;
1483 }
1484
1485 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1486 {
1487         struct pci_dev *pdev;
1488
1489         assert_spin_locked(&device_domain_lock);
1490
1491         if (!info || !dev_is_pci(info->dev))
1492                 return;
1493
1494         pdev = to_pci_dev(info->dev);
1495         /* For IOMMUs that support device IOTLB throttling (DIT), we assign a
1496          * PFSID to the invalidation descriptors of a VF so that the IOMMU HW can
1497          * gauge queue depth at the PF level. If DIT is not set, PFSID is treated
1498          * as reserved and should be set to 0.
1499          */
1500         if (!ecap_dit(info->iommu->ecap))
1501                 info->pfsid = 0;
1502         else {
1503                 struct pci_dev *pf_pdev;
1504
1505                 /* pdev will be returned if device is not a vf */
1506                 pf_pdev = pci_physfn(pdev);
1507                 info->pfsid = pci_dev_id(pf_pdev);
1508         }
1509
1510 #ifdef CONFIG_INTEL_IOMMU_SVM
1511         /* The PCIe spec, in its wisdom, declares that the behaviour of
1512            the device if you enable PASID support after ATS support is
1513            undefined. So always enable PASID support on devices which
1514            have it, even if we can't yet know if we're ever going to
1515            use it. */
1516         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1517                 info->pasid_enabled = 1;
1518
1519         if (info->pri_supported &&
1520             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1521             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1522                 info->pri_enabled = 1;
1523 #endif
1524         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1525             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1526                 info->ats_enabled = 1;
1527                 domain_update_iotlb(info->domain);
1528                 info->ats_qdep = pci_ats_queue_depth(pdev);
1529         }
1530 }
1531
1532 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1533 {
1534         struct pci_dev *pdev;
1535
1536         assert_spin_locked(&device_domain_lock);
1537
1538         if (!dev_is_pci(info->dev))
1539                 return;
1540
1541         pdev = to_pci_dev(info->dev);
1542
1543         if (info->ats_enabled) {
1544                 pci_disable_ats(pdev);
1545                 info->ats_enabled = 0;
1546                 domain_update_iotlb(info->domain);
1547         }
1548 #ifdef CONFIG_INTEL_IOMMU_SVM
1549         if (info->pri_enabled) {
1550                 pci_disable_pri(pdev);
1551                 info->pri_enabled = 0;
1552         }
1553         if (info->pasid_enabled) {
1554                 pci_disable_pasid(pdev);
1555                 info->pasid_enabled = 0;
1556         }
1557 #endif
1558 }
1559
1560 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1561                                   u64 addr, unsigned mask)
1562 {
1563         u16 sid, qdep;
1564         unsigned long flags;
1565         struct device_domain_info *info;
1566
1567         if (!domain->has_iotlb_device)
1568                 return;
1569
1570         spin_lock_irqsave(&device_domain_lock, flags);
1571         list_for_each_entry(info, &domain->devices, link) {
1572                 if (!info->ats_enabled)
1573                         continue;
1574
1575                 sid = info->bus << 8 | info->devfn;
1576                 qdep = info->ats_qdep;
1577                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1578                                 qdep, addr, mask);
1579         }
1580         spin_unlock_irqrestore(&device_domain_lock, flags);
1581 }
1582
1583 static void domain_flush_piotlb(struct intel_iommu *iommu,
1584                                 struct dmar_domain *domain,
1585                                 u64 addr, unsigned long npages, bool ih)
1586 {
1587         u16 did = domain->iommu_did[iommu->seq_id];
1588
1589         if (domain->default_pasid)
1590                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1591                                 addr, npages, ih);
1592
1593         if (!list_empty(&domain->devices))
1594                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1595 }
1596
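/*
 * Descriptive note (added): flush the IOTLB for @pages pages starting at
 * @pfn in @domain on one iommu.  First-level domains use PASID-based
 * invalidation; otherwise a page-selective flush is used where supported,
 * falling back to a domain-selective flush.  Device IOTLBs are flushed too,
 * except for caching-mode map operations.
 */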
1597 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1598                                   struct dmar_domain *domain,
1599                                   unsigned long pfn, unsigned int pages,
1600                                   int ih, int map)
1601 {
1602         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1603         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1604         u16 did = domain->iommu_did[iommu->seq_id];
1605
1606         BUG_ON(pages == 0);
1607
1608         if (ih)
1609                 ih = 1 << 6;
1610
1611         if (domain_use_first_level(domain)) {
1612                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1613         } else {
1614                 /*
1615                  * Fallback to domain selective flush if no PSI support or
1616                  * the size is too big. PSI requires page size to be 2 ^ x,
1617                  * and the base address is naturally aligned to the size.
1618                  */
1619                 if (!cap_pgsel_inv(iommu->cap) ||
1620                     mask > cap_max_amask_val(iommu->cap))
1621                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1622                                                         DMA_TLB_DSI_FLUSH);
1623                 else
1624                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1625                                                         DMA_TLB_PSI_FLUSH);
1626         }
1627
1628         /*
1629          * In caching mode, changes of pages from non-present to present require
1630          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1631          */
1632         if (!cap_caching_mode(iommu->cap) || !map)
1633                 iommu_flush_dev_iotlb(domain, addr, mask);
1634 }
1635
1636 /* Notification for newly created mappings */
1637 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1638                                         struct dmar_domain *domain,
1639                                         unsigned long pfn, unsigned int pages)
1640 {
1641         /*
1642          * It's a non-present to present mapping. Only flush if caching mode
1643          * is in use and second-level translation is used.
1644          */
1645         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1646                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1647         else
1648                 iommu_flush_write_buffer(iommu);
1649 }
1650
1651 static void iommu_flush_iova(struct iova_domain *iovad)
1652 {
1653         struct dmar_domain *domain;
1654         int idx;
1655
1656         domain = container_of(iovad, struct dmar_domain, iovad);
1657
1658         for_each_domain_iommu(idx, domain) {
1659                 struct intel_iommu *iommu = g_iommus[idx];
1660                 u16 did = domain->iommu_did[iommu->seq_id];
1661
1662                 if (domain_use_first_level(domain))
1663                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1664                 else
1665                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1666                                                  DMA_TLB_DSI_FLUSH);
1667
1668                 if (!cap_caching_mode(iommu->cap))
1669                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1670                                               0, MAX_AGAW_PFN_WIDTH);
1671         }
1672 }
1673
1674 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1675 {
1676         u32 pmen;
1677         unsigned long flags;
1678
1679         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1680                 return;
1681
1682         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1683         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1684         pmen &= ~DMA_PMEN_EPM;
1685         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1686
1687         /* wait for the protected region status bit to clear */
1688         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1689                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1690
1691         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1692 }
1693
1694 static void iommu_enable_translation(struct intel_iommu *iommu)
1695 {
1696         u32 sts;
1697         unsigned long flags;
1698
1699         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1700         iommu->gcmd |= DMA_GCMD_TE;
1701         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1702
1703         /* Make sure hardware completes it */
1704         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1705                       readl, (sts & DMA_GSTS_TES), sts);
1706
1707         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1708 }
1709
1710 static void iommu_disable_translation(struct intel_iommu *iommu)
1711 {
1712         u32 sts;
1713         unsigned long flag;
1714
1715         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1716             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1717                 return;
1718
1719         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1720         iommu->gcmd &= ~DMA_GCMD_TE;
1721         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1722
1723         /* Make sure hardware completes it */
1724         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1725                       readl, (!(sts & DMA_GSTS_TES)), sts);
1726
1727         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1728 }
1729
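/*
 * Descriptive note (added): allocate the per-iommu domain-id bitmap and the
 * domain pointer array.  iommu->domains is a two-level array: one pointer per
 * group of 256 domain ids, each pointing to a 256-entry array of dmar_domain
 * pointers; only the first group is pre-allocated here.
 */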
1730 static int iommu_init_domains(struct intel_iommu *iommu)
1731 {
1732         u32 ndomains, nlongs;
1733         size_t size;
1734
1735         ndomains = cap_ndoms(iommu->cap);
1736         pr_debug("%s: Number of Domains supported <%d>\n",
1737                  iommu->name, ndomains);
1738         nlongs = BITS_TO_LONGS(ndomains);
1739
1740         spin_lock_init(&iommu->lock);
1741
1742         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1743         if (!iommu->domain_ids) {
1744                 pr_err("%s: Allocating domain id array failed\n",
1745                        iommu->name);
1746                 return -ENOMEM;
1747         }
1748
1749         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1750         iommu->domains = kzalloc(size, GFP_KERNEL);
1751
1752         if (iommu->domains) {
1753                 size = 256 * sizeof(struct dmar_domain *);
1754                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1755         }
1756
1757         if (!iommu->domains || !iommu->domains[0]) {
1758                 pr_err("%s: Allocating domain array failed\n",
1759                        iommu->name);
1760                 kfree(iommu->domain_ids);
1761                 kfree(iommu->domains);
1762                 iommu->domain_ids = NULL;
1763                 iommu->domains    = NULL;
1764                 return -ENOMEM;
1765         }
1766
1767         /*
1768          * If Caching mode is set, then invalid translations are tagged
1769          * with domain-id 0, hence we need to pre-allocate it. We also
1770          * use domain-id 0 as a marker for non-allocated domain-id, so
1771          * make sure it is not used for a real domain.
1772          */
1773         set_bit(0, iommu->domain_ids);
1774
1775         /*
1776          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1777          * entry for first-level or pass-through translation modes be
1778          * be programmed with a domain id different from those used for
1779          * second-level or nested translation. We reserve a domain id for
1780          * this purpose.
1781          */
1782         if (sm_supported(iommu))
1783                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1784
1785         return 0;
1786 }
1787
1788 static void disable_dmar_iommu(struct intel_iommu *iommu)
1789 {
1790         struct device_domain_info *info, *tmp;
1791         unsigned long flags;
1792
1793         if (!iommu->domains || !iommu->domain_ids)
1794                 return;
1795
1796         spin_lock_irqsave(&device_domain_lock, flags);
1797         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1798                 if (info->iommu != iommu)
1799                         continue;
1800
1801                 if (!info->dev || !info->domain)
1802                         continue;
1803
1804                 __dmar_remove_one_dev_info(info);
1805         }
1806         spin_unlock_irqrestore(&device_domain_lock, flags);
1807
1808         if (iommu->gcmd & DMA_GCMD_TE)
1809                 iommu_disable_translation(iommu);
1810 }
1811
1812 static void free_dmar_iommu(struct intel_iommu *iommu)
1813 {
1814         if ((iommu->domains) && (iommu->domain_ids)) {
1815                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1816                 int i;
1817
1818                 for (i = 0; i < elems; i++)
1819                         kfree(iommu->domains[i]);
1820                 kfree(iommu->domains);
1821                 kfree(iommu->domain_ids);
1822                 iommu->domains = NULL;
1823                 iommu->domain_ids = NULL;
1824         }
1825
1826         g_iommus[iommu->seq_id] = NULL;
1827
1828         /* free context mapping */
1829         free_context_table(iommu);
1830
1831 #ifdef CONFIG_INTEL_IOMMU_SVM
1832         if (pasid_supported(iommu)) {
1833                 if (ecap_prs(iommu->ecap))
1834                         intel_svm_finish_prq(iommu);
1835         }
1836         if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1837                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1838
1839 #endif
1840 }
1841
1842 /*
1843  * Check and return whether first-level translation is used by
1844  * default for DMA.
1845  */
1846 static bool first_level_by_default(void)
1847 {
1848         struct dmar_drhd_unit *drhd;
1849         struct intel_iommu *iommu;
1850         static int first_level_support = -1;
1851
1852         if (likely(first_level_support != -1))
1853                 return first_level_support;
1854
1855         first_level_support = 1;
1856
1857         rcu_read_lock();
1858         for_each_active_iommu(iommu, drhd) {
1859                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1860                         first_level_support = 0;
1861                         break;
1862                 }
1863         }
1864         rcu_read_unlock();
1865
1866         return first_level_support;
1867 }
1868
1869 static struct dmar_domain *alloc_domain(int flags)
1870 {
1871         struct dmar_domain *domain;
1872
1873         domain = alloc_domain_mem();
1874         if (!domain)
1875                 return NULL;
1876
1877         memset(domain, 0, sizeof(*domain));
1878         domain->nid = NUMA_NO_NODE;
1879         domain->flags = flags;
1880         if (first_level_by_default())
1881                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1882         domain->has_iotlb_device = false;
1883         INIT_LIST_HEAD(&domain->devices);
1884
1885         return domain;
1886 }
1887
1888 /* Must be called with iommu->lock */
1889 static int domain_attach_iommu(struct dmar_domain *domain,
1890                                struct intel_iommu *iommu)
1891 {
1892         unsigned long ndomains;
1893         int num;
1894
1895         assert_spin_locked(&device_domain_lock);
1896         assert_spin_locked(&iommu->lock);
1897
1898         domain->iommu_refcnt[iommu->seq_id] += 1;
1899         domain->iommu_count += 1;
1900         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1901                 ndomains = cap_ndoms(iommu->cap);
1902                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1903
1904                 if (num >= ndomains) {
1905                         pr_err("%s: No free domain ids\n", iommu->name);
1906                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1907                         domain->iommu_count -= 1;
1908                         return -ENOSPC;
1909                 }
1910
1911                 set_bit(num, iommu->domain_ids);
1912                 set_iommu_domain(iommu, num, domain);
1913
1914                 domain->iommu_did[iommu->seq_id] = num;
1915                 domain->nid                      = iommu->node;
1916
1917                 domain_update_iommu_cap(domain);
1918         }
1919
1920         return 0;
1921 }
1922
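/*
 * Descriptive note (added): must be called with device_domain_lock and
 * iommu->lock held.  Drops the domain's reference on this iommu, releasing
 * its domain id when the last reference goes away, and returns the remaining
 * overall reference count.
 */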
1923 static int domain_detach_iommu(struct dmar_domain *domain,
1924                                struct intel_iommu *iommu)
1925 {
1926         int num, count;
1927
1928         assert_spin_locked(&device_domain_lock);
1929         assert_spin_locked(&iommu->lock);
1930
1931         domain->iommu_refcnt[iommu->seq_id] -= 1;
1932         count = --domain->iommu_count;
1933         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1934                 num = domain->iommu_did[iommu->seq_id];
1935                 clear_bit(num, iommu->domain_ids);
1936                 set_iommu_domain(iommu, num, NULL);
1937
1938                 domain_update_iommu_cap(domain);
1939                 domain->iommu_did[iommu->seq_id] = 0;
1940         }
1941
1942         return count;
1943 }
1944
1945 static struct iova_domain reserved_iova_list;
1946 static struct lock_class_key reserved_rbtree_key;
1947
1948 static int dmar_init_reserved_ranges(void)
1949 {
1950         struct pci_dev *pdev = NULL;
1951         struct iova *iova;
1952         int i;
1953
1954         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1955
1956         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1957                 &reserved_rbtree_key);
1958
1959         /* IOAPIC ranges shouldn't be accessed by DMA */
1960         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1961                 IOVA_PFN(IOAPIC_RANGE_END));
1962         if (!iova) {
1963                 pr_err("Reserve IOAPIC range failed\n");
1964                 return -ENODEV;
1965         }
1966
1967         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1968         for_each_pci_dev(pdev) {
1969                 struct resource *r;
1970
1971                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1972                         r = &pdev->resource[i];
1973                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1974                                 continue;
1975                         iova = reserve_iova(&reserved_iova_list,
1976                                             IOVA_PFN(r->start),
1977                                             IOVA_PFN(r->end));
1978                         if (!iova) {
1979                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1980                                 return -ENODEV;
1981                         }
1982                 }
1983         }
1984         return 0;
1985 }
1986
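/*
 * Descriptive note (added): round a guest address width up to the next
 * adjusted guest address width supported by the page-table layout (12 plus
 * a multiple of 9 bits), capped at 64.  For example, gaw 36 becomes 39 and
 * gaw 48 stays 48.
 */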
1987 static inline int guestwidth_to_adjustwidth(int gaw)
1988 {
1989         int agaw;
1990         int r = (gaw - 12) % 9;
1991
1992         if (r == 0)
1993                 agaw = gaw;
1994         else
1995                 agaw = gaw + 9 - r;
1996         if (agaw > 64)
1997                 agaw = 64;
1998         return agaw;
1999 }
2000
2001 static void domain_exit(struct dmar_domain *domain)
2002 {
2003
2004         /* Remove associated devices and clear attached or cached domains */
2005         domain_remove_dev_info(domain);
2006
2007         /* destroy iovas */
2008         if (domain->domain.type == IOMMU_DOMAIN_DMA)
2009                 put_iova_domain(&domain->iovad);
2010
2011         if (domain->pgd) {
2012                 struct page *freelist;
2013
2014                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2015                 dma_free_pagelist(freelist);
2016         }
2017
2018         free_domain_mem(domain);
2019 }
2020
2021 /*
2022  * Get the PASID directory size for a scalable-mode context entry.
2023  * A value of X in the PDTS field of a scalable-mode context entry
2024  * indicates a PASID directory with 2^(X + 7) entries.
2025  */
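/*
 * Worked example (added): a table whose max_pasid allows 2^14 directory
 * entries yields find_first_bit() == 14 below, so the field is encoded as
 * 14 - 7 = 7, i.e. 2^(7 + 7) = 2^14 entries.
 */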
2026 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2027 {
2028         int pds, max_pde;
2029
2030         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2031         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2032         if (pds < 7)
2033                 return 0;
2034
2035         return pds - 7;
2036 }
2037
2038 /*
2039  * Set the RID_PASID field of a scalable mode context entry. The
2040  * IOMMU hardware will use the PASID value set in this field for
2041  * DMA translations of DMA requests without PASID.
2042  */
2043 static inline void
2044 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2045 {
2046         context->hi |= pasid & ((1 << 20) - 1);
2047 }
2048
2049 /*
2050  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2051  * entry.
2052  */
2053 static inline void context_set_sm_dte(struct context_entry *context)
2054 {
2055         context->lo |= (1 << 2);
2056 }
2057
2058 /*
2059  * Set the PRE(Page Request Enable) field of a scalable mode context
2060  * entry.
2061  */
2062 static inline void context_set_sm_pre(struct context_entry *context)
2063 {
2064         context->lo |= (1 << 4);
2065 }
2066
2067 /* Convert value to context PASID directory size field coding. */
2068 #define context_pdts(pds)       (((pds) & 0x7) << 9)
2069
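/*
 * Descriptive note (added): install the context entry for (bus, devfn) on
 * @iommu.  In scalable mode the entry points at the PASID directory and
 * carries the DTE/PRE bits; in legacy mode it carries the page-table root
 * (or pass-through) and the domain id.  Caches are flushed as required for
 * caching-mode hardware, and the device-TLB related features are finally
 * enabled on the device where supported.
 */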
2070 static int domain_context_mapping_one(struct dmar_domain *domain,
2071                                       struct intel_iommu *iommu,
2072                                       struct pasid_table *table,
2073                                       u8 bus, u8 devfn)
2074 {
2075         u16 did = domain->iommu_did[iommu->seq_id];
2076         int translation = CONTEXT_TT_MULTI_LEVEL;
2077         struct device_domain_info *info = NULL;
2078         struct context_entry *context;
2079         unsigned long flags;
2080         int ret;
2081
2082         WARN_ON(did == 0);
2083
2084         if (hw_pass_through && domain_type_is_si(domain))
2085                 translation = CONTEXT_TT_PASS_THROUGH;
2086
2087         pr_debug("Set context mapping for %02x:%02x.%d\n",
2088                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2089
2090         BUG_ON(!domain->pgd);
2091
2092         spin_lock_irqsave(&device_domain_lock, flags);
2093         spin_lock(&iommu->lock);
2094
2095         ret = -ENOMEM;
2096         context = iommu_context_addr(iommu, bus, devfn, 1);
2097         if (!context)
2098                 goto out_unlock;
2099
2100         ret = 0;
2101         if (context_present(context))
2102                 goto out_unlock;
2103
2104         /*
2105          * For kdump cases, old valid entries may be cached due to
2106          * in-flight DMA and the copied page tables, but there is no
2107          * unmapping behaviour for them, so we need an explicit cache
2108          * flush for the newly-mapped device. For kdump, at this point
2109          * the device is supposed to have finished reset at its driver
2110          * probe stage, so no in-flight DMA will exist and we don't need
2111          * to worry about it hereafter.
2112          */
2113         if (context_copied(context)) {
2114                 u16 did_old = context_domain_id(context);
2115
2116                 if (did_old < cap_ndoms(iommu->cap)) {
2117                         iommu->flush.flush_context(iommu, did_old,
2118                                                    (((u16)bus) << 8) | devfn,
2119                                                    DMA_CCMD_MASK_NOBIT,
2120                                                    DMA_CCMD_DEVICE_INVL);
2121                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2122                                                  DMA_TLB_DSI_FLUSH);
2123                 }
2124         }
2125
2126         context_clear_entry(context);
2127
2128         if (sm_supported(iommu)) {
2129                 unsigned long pds;
2130
2131                 WARN_ON(!table);
2132
2133                 /* Setup the PASID DIR pointer: */
2134                 pds = context_get_sm_pds(table);
2135                 context->lo = (u64)virt_to_phys(table->table) |
2136                                 context_pdts(pds);
2137
2138                 /* Setup the RID_PASID field: */
2139                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2140
2141                 /*
2142                  * Setup the Device-TLB enable bit and Page request
2143                  * Enable bit:
2144                  */
2145                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2146                 if (info && info->ats_supported)
2147                         context_set_sm_dte(context);
2148                 if (info && info->pri_supported)
2149                         context_set_sm_pre(context);
2150         } else {
2151                 struct dma_pte *pgd = domain->pgd;
2152                 int agaw;
2153
2154                 context_set_domain_id(context, did);
2155
2156                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2157                         /*
2158                          * Skip top levels of page tables for an iommu whose
2159                          * agaw is smaller than the default. Unnecessary for PT mode.
2160                          */
2161                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2162                                 ret = -ENOMEM;
2163                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2164                                 if (!dma_pte_present(pgd))
2165                                         goto out_unlock;
2166                         }
2167
2168                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2169                         if (info && info->ats_supported)
2170                                 translation = CONTEXT_TT_DEV_IOTLB;
2171                         else
2172                                 translation = CONTEXT_TT_MULTI_LEVEL;
2173
2174                         context_set_address_root(context, virt_to_phys(pgd));
2175                         context_set_address_width(context, agaw);
2176                 } else {
2177                         /*
2178                          * In pass-through mode, AW must be programmed to
2179                          * indicate the largest AGAW value supported by
2180                          * hardware, and ASR is ignored by hardware.
2181                          */
2182                         context_set_address_width(context, iommu->msagaw);
2183                 }
2184
2185                 context_set_translation_type(context, translation);
2186         }
2187
2188         context_set_fault_enable(context);
2189         context_set_present(context);
2190         if (!ecap_coherent(iommu->ecap))
2191                 clflush_cache_range(context, sizeof(*context));
2192
2193         /*
2194          * It's a non-present to present mapping. If hardware doesn't cache
2195          * non-present entries we only need to flush the write-buffer. If it
2196          * _does_ cache non-present entries, then it does so in the special
2197          * domain #0, which we have to flush:
2198          */
2199         if (cap_caching_mode(iommu->cap)) {
2200                 iommu->flush.flush_context(iommu, 0,
2201                                            (((u16)bus) << 8) | devfn,
2202                                            DMA_CCMD_MASK_NOBIT,
2203                                            DMA_CCMD_DEVICE_INVL);
2204                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2205         } else {
2206                 iommu_flush_write_buffer(iommu);
2207         }
2208         iommu_enable_dev_iotlb(info);
2209
2210         ret = 0;
2211
2212 out_unlock:
2213         spin_unlock(&iommu->lock);
2214         spin_unlock_irqrestore(&device_domain_lock, flags);
2215
2216         return ret;
2217 }
2218
2219 struct domain_context_mapping_data {
2220         struct dmar_domain *domain;
2221         struct intel_iommu *iommu;
2222         struct pasid_table *table;
2223 };
2224
2225 static int domain_context_mapping_cb(struct pci_dev *pdev,
2226                                      u16 alias, void *opaque)
2227 {
2228         struct domain_context_mapping_data *data = opaque;
2229
2230         return domain_context_mapping_one(data->domain, data->iommu,
2231                                           data->table, PCI_BUS_NUM(alias),
2232                                           alias & 0xff);
2233 }
2234
2235 static int
2236 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2237 {
2238         struct domain_context_mapping_data data;
2239         struct pasid_table *table;
2240         struct intel_iommu *iommu;
2241         u8 bus, devfn;
2242
2243         iommu = device_to_iommu(dev, &bus, &devfn);
2244         if (!iommu)
2245                 return -ENODEV;
2246
2247         table = intel_pasid_get_table(dev);
2248
2249         if (!dev_is_pci(dev))
2250                 return domain_context_mapping_one(domain, iommu, table,
2251                                                   bus, devfn);
2252
2253         data.domain = domain;
2254         data.iommu = iommu;
2255         data.table = table;
2256
2257         return pci_for_each_dma_alias(to_pci_dev(dev),
2258                                       &domain_context_mapping_cb, &data);
2259 }
2260
2261 static int domain_context_mapped_cb(struct pci_dev *pdev,
2262                                     u16 alias, void *opaque)
2263 {
2264         struct intel_iommu *iommu = opaque;
2265
2266         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2267 }
2268
2269 static int domain_context_mapped(struct device *dev)
2270 {
2271         struct intel_iommu *iommu;
2272         u8 bus, devfn;
2273
2274         iommu = device_to_iommu(dev, &bus, &devfn);
2275         if (!iommu)
2276                 return -ENODEV;
2277
2278         if (!dev_is_pci(dev))
2279                 return device_context_mapped(iommu, bus, devfn);
2280
2281         return !pci_for_each_dma_alias(to_pci_dev(dev),
2282                                        domain_context_mapped_cb, iommu);
2283 }
2284
2285 /* Returns the number of VT-d pages, but aligned to the MM page size */
2286 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2287                                             size_t size)
2288 {
2289         host_addr &= ~PAGE_MASK;
2290         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2291 }
2292
2293 /* Return largest possible superpage level for a given mapping */
2294 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2295                                           unsigned long iov_pfn,
2296                                           unsigned long phy_pfn,
2297                                           unsigned long pages)
2298 {
2299         int support, level = 1;
2300         unsigned long pfnmerge;
2301
2302         support = domain->iommu_superpage;
2303
2304         /* To use a large page, the virtual *and* physical addresses
2305            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2306            of them will mean we have to use smaller pages. So just
2307            merge them and check both at once. */
2308         pfnmerge = iov_pfn | phy_pfn;
2309
2310         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2311                 pages >>= VTD_STRIDE_SHIFT;
2312                 if (!pages)
2313                         break;
2314                 pfnmerge >>= VTD_STRIDE_SHIFT;
2315                 level++;
2316                 support--;
2317         }
2318         return level;
2319 }
2320
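/*
 * Descriptive note (added): fill in the page-table entries for @nr_pages
 * pages starting at @iov_pfn, taking the physical pages either from @sg or
 * from the contiguous range starting at @phys_pfn.  Superpages are used
 * whenever the alignment and remaining size allow, and the written PTEs are
 * flushed via domain_flush_cache() a page of PTEs at a time.
 */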
2321 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2322                             struct scatterlist *sg, unsigned long phys_pfn,
2323                             unsigned long nr_pages, int prot)
2324 {
2325         struct dma_pte *first_pte = NULL, *pte = NULL;
2326         phys_addr_t pteval;
2327         unsigned long sg_res = 0;
2328         unsigned int largepage_lvl = 0;
2329         unsigned long lvl_pages = 0;
2330         u64 attr;
2331
2332         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2333
2334         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2335                 return -EINVAL;
2336
2337         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2338         if (domain_use_first_level(domain))
2339                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2340
2341         if (!sg) {
2342                 sg_res = nr_pages;
2343                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2344         }
2345
2346         while (nr_pages > 0) {
2347                 uint64_t tmp;
2348
2349                 if (!sg_res) {
2350                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2351
2352                         sg_res = aligned_nrpages(sg->offset, sg->length);
2353                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2354                         sg->dma_length = sg->length;
2355                         pteval = (sg_phys(sg) - pgoff) | attr;
2356                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2357                 }
2358
2359                 if (!pte) {
2360                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2361
2362                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2363                         if (!pte)
2364                                 return -ENOMEM;
2365                         /* It is a large page */
2366                         if (largepage_lvl > 1) {
2367                                 unsigned long nr_superpages, end_pfn;
2368
2369                                 pteval |= DMA_PTE_LARGE_PAGE;
2370                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2371
2372                                 nr_superpages = sg_res / lvl_pages;
2373                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2374
2375                                 /*
2376                                  * Ensure that old small page tables are
2377                                  * removed to make room for superpage(s).
2378                                  * We're adding new large pages, so make sure
2379                                  * we don't remove their parent tables.
2380                                  */
2381                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2382                                                        largepage_lvl + 1);
2383                         } else {
2384                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2385                         }
2386
2387                 }
2388                 /* We don't need a lock here; nobody else
2389                  * touches this IOVA range.
2390                  */
2391                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2392                 if (tmp) {
2393                         static int dumps = 5;
2394                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2395                                 iov_pfn, tmp, (unsigned long long)pteval);
2396                         if (dumps) {
2397                                 dumps--;
2398                                 debug_dma_dump_mappings(NULL);
2399                         }
2400                         WARN_ON(1);
2401                 }
2402
2403                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2404
2405                 BUG_ON(nr_pages < lvl_pages);
2406                 BUG_ON(sg_res < lvl_pages);
2407
2408                 nr_pages -= lvl_pages;
2409                 iov_pfn += lvl_pages;
2410                 phys_pfn += lvl_pages;
2411                 pteval += lvl_pages * VTD_PAGE_SIZE;
2412                 sg_res -= lvl_pages;
2413
2414                 /* If the next PTE would be the first in a new page, then we
2415                    need to flush the cache on the entries we've just written.
2416                    And then we'll need to recalculate 'pte', so clear it and
2417                    let it get set again in the if (!pte) block above.
2418
2419                    If we're done (!nr_pages) we need to flush the cache too.
2420
2421                    Also if we've been setting superpages, we may need to
2422                    recalculate 'pte' and switch back to smaller pages for the
2423                    end of the mapping, if the trailing size is not enough to
2424                    use another superpage (i.e. sg_res < lvl_pages). */
2425                 pte++;
2426                 if (!nr_pages || first_pte_in_page(pte) ||
2427                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2428                         domain_flush_cache(domain, first_pte,
2429                                            (void *)pte - (void *)first_pte);
2430                         pte = NULL;
2431                 }
2432
2433                 if (!sg_res && nr_pages)
2434                         sg = sg_next(sg);
2435         }
2436         return 0;
2437 }
2438
2439 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2440                           struct scatterlist *sg, unsigned long phys_pfn,
2441                           unsigned long nr_pages, int prot)
2442 {
2443         int iommu_id, ret;
2444         struct intel_iommu *iommu;
2445
2446         /* Do the real mapping first */
2447         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2448         if (ret)
2449                 return ret;
2450
2451         for_each_domain_iommu(iommu_id, domain) {
2452                 iommu = g_iommus[iommu_id];
2453                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2454         }
2455
2456         return 0;
2457 }
2458
2459 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2460                                     struct scatterlist *sg, unsigned long nr_pages,
2461                                     int prot)
2462 {
2463         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2464 }
2465
2466 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2467                                      unsigned long phys_pfn, unsigned long nr_pages,
2468                                      int prot)
2469 {
2470         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2471 }
2472
2473 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2474 {
2475         unsigned long flags;
2476         struct context_entry *context;
2477         u16 did_old;
2478
2479         if (!iommu)
2480                 return;
2481
2482         spin_lock_irqsave(&iommu->lock, flags);
2483         context = iommu_context_addr(iommu, bus, devfn, 0);
2484         if (!context) {
2485                 spin_unlock_irqrestore(&iommu->lock, flags);
2486                 return;
2487         }
2488         did_old = context_domain_id(context);
2489         context_clear_entry(context);
2490         __iommu_flush_cache(iommu, context, sizeof(*context));
2491         spin_unlock_irqrestore(&iommu->lock, flags);
2492         iommu->flush.flush_context(iommu,
2493                                    did_old,
2494                                    (((u16)bus) << 8) | devfn,
2495                                    DMA_CCMD_MASK_NOBIT,
2496                                    DMA_CCMD_DEVICE_INVL);
2497         iommu->flush.flush_iotlb(iommu,
2498                                  did_old,
2499                                  0,
2500                                  0,
2501                                  DMA_TLB_DSI_FLUSH);
2502 }
2503
2504 static inline void unlink_domain_info(struct device_domain_info *info)
2505 {
2506         assert_spin_locked(&device_domain_lock);
2507         list_del(&info->link);
2508         list_del(&info->global);
2509         if (info->dev)
2510                 dev_iommu_priv_set(info->dev, NULL);
2511 }
2512
2513 static void domain_remove_dev_info(struct dmar_domain *domain)
2514 {
2515         struct device_domain_info *info, *tmp;
2516         unsigned long flags;
2517
2518         spin_lock_irqsave(&device_domain_lock, flags);
2519         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2520                 __dmar_remove_one_dev_info(info);
2521         spin_unlock_irqrestore(&device_domain_lock, flags);
2522 }
2523
2524 struct dmar_domain *find_domain(struct device *dev)
2525 {
2526         struct device_domain_info *info;
2527
2528         if (unlikely(!dev || !dev->iommu))
2529                 return NULL;
2530
2531         if (unlikely(attach_deferred(dev)))
2532                 return NULL;
2533
2534         /* No lock here, assumes no domain exit in normal case */
2535         info = get_domain_info(dev);
2536         if (likely(info))
2537                 return info->domain;
2538
2539         return NULL;
2540 }
2541
2542 static void do_deferred_attach(struct device *dev)
2543 {
2544         struct iommu_domain *domain;
2545
2546         dev_iommu_priv_set(dev, NULL);
2547         domain = iommu_get_domain_for_dev(dev);
2548         if (domain)
2549                 intel_iommu_attach_device(domain, dev);
2550 }
2551
2552 static inline struct device_domain_info *
2553 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2554 {
2555         struct device_domain_info *info;
2556
2557         list_for_each_entry(info, &device_domain_list, global)
2558                 if (info->segment == segment && info->bus == bus &&
2559                     info->devfn == devfn)
2560                         return info;
2561
2562         return NULL;
2563 }
2564
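/*
 * Descriptive note (added): set up a first-level translation entry in the
 * PASID table for @dev, skipping page-table levels when the iommu supports
 * a smaller agaw than the domain and selecting 4- or 5-level paging
 * accordingly.
 */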
2565 static int domain_setup_first_level(struct intel_iommu *iommu,
2566                                     struct dmar_domain *domain,
2567                                     struct device *dev,
2568                                     u32 pasid)
2569 {
2570         int flags = PASID_FLAG_SUPERVISOR_MODE;
2571         struct dma_pte *pgd = domain->pgd;
2572         int agaw, level;
2573
2574         /*
2575          * Skip top levels of page tables for an iommu whose
2576          * agaw is smaller than the default. Unnecessary for PT mode.
2577          */
2578         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2579                 pgd = phys_to_virt(dma_pte_addr(pgd));
2580                 if (!dma_pte_present(pgd))
2581                         return -ENOMEM;
2582         }
2583
2584         level = agaw_to_level(agaw);
2585         if (level != 4 && level != 5)
2586                 return -EINVAL;
2587
2588         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2589
2590         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2591                                              domain->iommu_did[iommu->seq_id],
2592                                              flags);
2593 }
2594
2595 static bool dev_is_real_dma_subdevice(struct device *dev)
2596 {
2597         return dev && dev_is_pci(dev) &&
2598                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2599 }
2600
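/*
 * Descriptive note (added): allocate a device_domain_info for @dev, probe
 * its ATS/PASID/PRI capabilities, attach @domain to @iommu, and link the new
 * info into the domain and global lists.  In scalable mode the PASID table
 * and the RID2PASID entry are set up, and finally the context mapping is
 * installed.  Returns the domain actually in use (which may be an
 * already-existing one) or NULL on failure.
 */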
2601 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2602                                                     int bus, int devfn,
2603                                                     struct device *dev,
2604                                                     struct dmar_domain *domain)
2605 {
2606         struct dmar_domain *found = NULL;
2607         struct device_domain_info *info;
2608         unsigned long flags;
2609         int ret;
2610
2611         info = alloc_devinfo_mem();
2612         if (!info)
2613                 return NULL;
2614
2615         if (!dev_is_real_dma_subdevice(dev)) {
2616                 info->bus = bus;
2617                 info->devfn = devfn;
2618                 info->segment = iommu->segment;
2619         } else {
2620                 struct pci_dev *pdev = to_pci_dev(dev);
2621
2622                 info->bus = pdev->bus->number;
2623                 info->devfn = pdev->devfn;
2624                 info->segment = pci_domain_nr(pdev->bus);
2625         }
2626
2627         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2628         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2629         info->ats_qdep = 0;
2630         info->dev = dev;
2631         info->domain = domain;
2632         info->iommu = iommu;
2633         info->pasid_table = NULL;
2634         info->auxd_enabled = 0;
2635         INIT_LIST_HEAD(&info->auxiliary_domains);
2636
2637         if (dev && dev_is_pci(dev)) {
2638                 struct pci_dev *pdev = to_pci_dev(info->dev);
2639
2640                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2641                     pci_ats_supported(pdev) &&
2642                     dmar_find_matched_atsr_unit(pdev))
2643                         info->ats_supported = 1;
2644
2645                 if (sm_supported(iommu)) {
2646                         if (pasid_supported(iommu)) {
2647                                 int features = pci_pasid_features(pdev);
2648                                 if (features >= 0)
2649                                         info->pasid_supported = features | 1;
2650                         }
2651
2652                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2653                             pci_pri_supported(pdev))
2654                                 info->pri_supported = 1;
2655                 }
2656         }
2657
2658         spin_lock_irqsave(&device_domain_lock, flags);
2659         if (dev)
2660                 found = find_domain(dev);
2661
2662         if (!found) {
2663                 struct device_domain_info *info2;
2664                 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2665                                                        info->devfn);
2666                 if (info2) {
2667                         found      = info2->domain;
2668                         info2->dev = dev;
2669                 }
2670         }
2671
2672         if (found) {
2673                 spin_unlock_irqrestore(&device_domain_lock, flags);
2674                 free_devinfo_mem(info);
2675                 /* Caller must free the original domain */
2676                 return found;
2677         }
2678
2679         spin_lock(&iommu->lock);
2680         ret = domain_attach_iommu(domain, iommu);
2681         spin_unlock(&iommu->lock);
2682
2683         if (ret) {
2684                 spin_unlock_irqrestore(&device_domain_lock, flags);
2685                 free_devinfo_mem(info);
2686                 return NULL;
2687         }
2688
2689         list_add(&info->link, &domain->devices);
2690         list_add(&info->global, &device_domain_list);
2691         if (dev)
2692                 dev_iommu_priv_set(dev, info);
2693         spin_unlock_irqrestore(&device_domain_lock, flags);
2694
2695         /* PASID table is mandatory for a PCI device in scalable mode. */
2696         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2697                 ret = intel_pasid_alloc_table(dev);
2698                 if (ret) {
2699                         dev_err(dev, "PASID table allocation failed\n");
2700                         dmar_remove_one_dev_info(dev);
2701                         return NULL;
2702                 }
2703
2704                 /* Setup the PASID entry for requests without PASID: */
2705                 spin_lock_irqsave(&iommu->lock, flags);
2706                 if (hw_pass_through && domain_type_is_si(domain))
2707                         ret = intel_pasid_setup_pass_through(iommu, domain,
2708                                         dev, PASID_RID2PASID);
2709                 else if (domain_use_first_level(domain))
2710                         ret = domain_setup_first_level(iommu, domain, dev,
2711                                         PASID_RID2PASID);
2712                 else
2713                         ret = intel_pasid_setup_second_level(iommu, domain,
2714                                         dev, PASID_RID2PASID);
2715                 spin_unlock_irqrestore(&iommu->lock, flags);
2716                 if (ret) {
2717                         dev_err(dev, "Setup RID2PASID failed\n");
2718                         dmar_remove_one_dev_info(dev);
2719                         return NULL;
2720                 }
2721         }
2722
2723         if (dev && domain_context_mapping(domain, dev)) {
2724                 dev_err(dev, "Domain context map failed\n");
2725                 dmar_remove_one_dev_info(dev);
2726                 return NULL;
2727         }
2728
2729         return domain;
2730 }
2731
2732 static int iommu_domain_identity_map(struct dmar_domain *domain,
2733                                      unsigned long first_vpfn,
2734                                      unsigned long last_vpfn)
2735 {
2736         /*
2737          * The RMRR range might overlap with a physical memory range,
2738          * so clear it first.
2739          */
2740         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2741
2742         return __domain_mapping(domain, first_vpfn, NULL,
2743                                 first_vpfn, last_vpfn - first_vpfn + 1,
2744                                 DMA_PTE_READ|DMA_PTE_WRITE);
2745 }
2746
2747 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2748
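/*
 * Descriptive note (added): build the static identity (si) domain.  Unless
 * hardware pass-through is used (@hw), all usable physical memory ranges
 * and all RMRR regions are identity mapped into it.
 */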
2749 static int __init si_domain_init(int hw)
2750 {
2751         struct dmar_rmrr_unit *rmrr;
2752         struct device *dev;
2753         int i, nid, ret;
2754
2755         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2756         if (!si_domain)
2757                 return -EFAULT;
2758
2759         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2760                 domain_exit(si_domain);
2761                 return -EFAULT;
2762         }
2763
2764         if (hw)
2765                 return 0;
2766
2767         for_each_online_node(nid) {
2768                 unsigned long start_pfn, end_pfn;
2769                 int i;
2770
2771                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2772                         ret = iommu_domain_identity_map(si_domain,
2773                                         mm_to_dma_pfn(start_pfn),
2774                                         mm_to_dma_pfn(end_pfn));
2775                         if (ret)
2776                                 return ret;
2777                 }
2778         }
2779
2780         /*
2781          * Identity map the RMRRs so that devices with RMRRs can also use
2782          * the si_domain.
2783          */
2784         for_each_rmrr_units(rmrr) {
2785                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2786                                           i, dev) {
2787                         unsigned long long start = rmrr->base_address;
2788                         unsigned long long end = rmrr->end_address;
2789
2790                         if (WARN_ON(end < start ||
2791                                     end >> agaw_to_width(si_domain->agaw)))
2792                                 continue;
2793
2794                         ret = iommu_domain_identity_map(si_domain,
2795                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2796                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2797                         if (ret)
2798                                 return ret;
2799                 }
2800         }
2801
2802         return 0;
2803 }
2804
2805 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2806 {
2807         struct dmar_domain *ndomain;
2808         struct intel_iommu *iommu;
2809         u8 bus, devfn;
2810
2811         iommu = device_to_iommu(dev, &bus, &devfn);
2812         if (!iommu)
2813                 return -ENODEV;
2814
2815         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2816         if (ndomain != domain)
2817                 return -EBUSY;
2818
2819         return 0;
2820 }
2821
2822 static bool device_has_rmrr(struct device *dev)
2823 {
2824         struct dmar_rmrr_unit *rmrr;
2825         struct device *tmp;
2826         int i;
2827
2828         rcu_read_lock();
2829         for_each_rmrr_units(rmrr) {
2830                 /*
2831                  * Return TRUE if this RMRR contains the device that
2832                  * is passed in.
2833                  */
2834                 for_each_active_dev_scope(rmrr->devices,
2835                                           rmrr->devices_cnt, i, tmp)
2836                         if (tmp == dev ||
2837                             is_downstream_to_pci_bridge(dev, tmp)) {
2838                                 rcu_read_unlock();
2839                                 return true;
2840                         }
2841         }
2842         rcu_read_unlock();
2843         return false;
2844 }
2845
2846 /**
2847  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2848  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2849  * @dev: device handle
2850  *
2851  * We assume that PCI USB devices with RMRRs have them largely
2852  * for historical reasons and that the RMRR space is not actively used post
2853  * boot.  This exclusion may change if vendors begin to abuse it.
2854  *
2855  * The same exception is made for graphics devices, with the requirement that
2856  * any use of the RMRR regions will be torn down before assigning the device
2857  * to a guest.
2858  *
2859  * Return: true if the RMRR is relaxable, false otherwise
2860  */
2861 static bool device_rmrr_is_relaxable(struct device *dev)
2862 {
2863         struct pci_dev *pdev;
2864
2865         if (!dev_is_pci(dev))
2866                 return false;
2867
2868         pdev = to_pci_dev(dev);
2869         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2870                 return true;
2871         else
2872                 return false;
2873 }
2874
2875 /*
2876  * There are a couple of cases where we need to restrict the functionality of
2877  * devices associated with RMRRs.  The first is when evaluating a device for
2878  * identity mapping because problems exist when devices are moved in and out
2879  * of domains and their respective RMRR information is lost.  This means that
2880  * a device with associated RMRRs will never be in a "passthrough" domain.
2881  * The second is use of the device through the IOMMU API.  This interface
2882  * expects to have full control of the IOVA space for the device.  We cannot
2883  * satisfy both the requirement that RMRR access is maintained and have an
2884  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2885  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2886  * We therefore prevent devices associated with an RMRR from participating in
2887  * the IOMMU API, which eliminates them from device assignment.
2888  *
2889  * In both cases, devices which have relaxable RMRRs are not affected by this
2890  * restriction. See device_rmrr_is_relaxable comment.
2891  */
2892 static bool device_is_rmrr_locked(struct device *dev)
2893 {
2894         if (!device_has_rmrr(dev))
2895                 return false;
2896
2897         if (device_rmrr_is_relaxable(dev))
2898                 return false;
2899
2900         return true;
2901 }
2902
2903 /*
2904  * Return the required default domain type for a specific device.
2905  *
2906  * @dev: the device in question
2908  *
2909  * Returns:
2910  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2911  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2912  *  - 0: both identity and dynamic domains work for this device
2913  */
2914 static int device_def_domain_type(struct device *dev)
2915 {
2916         if (dev_is_pci(dev)) {
2917                 struct pci_dev *pdev = to_pci_dev(dev);
2918
2919                 /*
2920                  * Prevent any device marked as untrusted from getting
2921                  * placed into the statically identity mapping domain.
2922                  */
2923                 if (pdev->untrusted)
2924                         return IOMMU_DOMAIN_DMA;
2925
2926                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2927                         return IOMMU_DOMAIN_IDENTITY;
2928
2929                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2930                         return IOMMU_DOMAIN_IDENTITY;
2931         }
2932
2933         return 0;
2934 }
2935
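/*
 * Descriptive note (added): choose the invalidation interface for @iommu —
 * prefer queued invalidation, falling back to register-based invalidation
 * when QI cannot be enabled.
 */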
2936 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2937 {
2938         /*
2939          * Start from a sane iommu hardware state.
2940          * If queued invalidation was already initialized by us
2941          * (for example, while enabling interrupt-remapping), then
2942          * things are already rolling from a sane state.
2943          */
2944         if (!iommu->qi) {
2945                 /*
2946                  * Clear any previous faults.
2947                  */
2948                 dmar_fault(-1, iommu);
2949                 /*
2950                  * Disable queued invalidation if supported and already enabled
2951                  * before OS handover.
2952                  */
2953                 dmar_disable_qi(iommu);
2954         }
2955
2956         if (dmar_enable_qi(iommu)) {
2957                 /*
2958                  * Queued invalidation is not enabled, use register-based invalidation
2959                  */
2960                 iommu->flush.flush_context = __iommu_flush_context;
2961                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2962                 pr_info("%s: Using Register based invalidation\n",
2963                         iommu->name);
2964         } else {
2965                 iommu->flush.flush_context = qi_flush_context;
2966                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2967                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2968         }
2969 }
2970
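/*
 * Descriptive note (added): for kdump, copy one bus's context table from the
 * old kernel's root entry into a freshly allocated table, reserving the
 * domain ids found there and marking each entry as copied (with PASIDs
 * disabled) so that it can be recognised later.
 */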
2971 static int copy_context_table(struct intel_iommu *iommu,
2972                               struct root_entry *old_re,
2973                               struct context_entry **tbl,
2974                               int bus, bool ext)
2975 {
2976         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2977         struct context_entry *new_ce = NULL, ce;
2978         struct context_entry *old_ce = NULL;
2979         struct root_entry re;
2980         phys_addr_t old_ce_phys;
2981
2982         tbl_idx = ext ? bus * 2 : bus;
2983         memcpy(&re, old_re, sizeof(re));
2984
2985         for (devfn = 0; devfn < 256; devfn++) {
2986                 /* First calculate the correct index */
2987                 idx = (ext ? devfn * 2 : devfn) % 256;
2988
2989                 if (idx == 0) {
2990                         /* First save what we may have and clean up */
2991                         if (new_ce) {
2992                                 tbl[tbl_idx] = new_ce;
2993                                 __iommu_flush_cache(iommu, new_ce,
2994                                                     VTD_PAGE_SIZE);
2995                                 pos = 1;
2996                         }
2997
2998                         if (old_ce)
2999                                 memunmap(old_ce);
3000
3001                         ret = 0;
3002                         if (devfn < 0x80)
3003                                 old_ce_phys = root_entry_lctp(&re);
3004                         else
3005                                 old_ce_phys = root_entry_uctp(&re);
3006
3007                         if (!old_ce_phys) {
3008                                 if (ext && devfn == 0) {
3009                                         /* No LCTP, try UCTP */
3010                                         devfn = 0x7f;
3011                                         continue;
3012                                 } else {
3013                                         goto out;
3014                                 }
3015                         }
3016
3017                         ret = -ENOMEM;
3018                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3019                                         MEMREMAP_WB);
3020                         if (!old_ce)
3021                                 goto out;
3022
3023                         new_ce = alloc_pgtable_page(iommu->node);
3024                         if (!new_ce)
3025                                 goto out_unmap;
3026
3027                         ret = 0;
3028                 }
3029
3030                 /* Now copy the context entry */
3031                 memcpy(&ce, old_ce + idx, sizeof(ce));
3032
3033                 if (!__context_present(&ce))
3034                         continue;
3035
3036                 did = context_domain_id(&ce);
3037                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3038                         set_bit(did, iommu->domain_ids);
3039
3040                 /*
3041                  * We need a marker for copied context entries. This
3042                  * marker needs to work for the old format as well as
3043                  * for extended context entries.
3044                  *
3045                  * Bit 67 of the context entry is used. In the old
3046                  * format this bit is available to software, in the
3047                  * extended format it is the PGE bit, but PGE is ignored
3048                  * by HW if PASIDs are disabled (and thus still
3049                  * available).
3050                  *
3051                  * So disable PASIDs first and then mark the entry
3052                  * copied. This means that we don't copy PASID
3053                  * translations from the old kernel, but this is fine as
3054                  * faults there are not fatal.
3055                  */
3056                 context_clear_pasid_enable(&ce);
3057                 context_set_copied(&ce);
3058
3059                 new_ce[idx] = ce;
3060         }
3061
3062         tbl[tbl_idx + pos] = new_ce;
3063
3064         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3065
3066 out_unmap:
3067         memunmap(old_ce);
3068
3069 out:
3070         return ret;
3071 }
3072
3073 static int copy_translation_tables(struct intel_iommu *iommu)
3074 {
3075         struct context_entry **ctxt_tbls;
3076         struct root_entry *old_rt;
3077         phys_addr_t old_rt_phys;
3078         int ctxt_table_entries;
3079         unsigned long flags;
3080         u64 rtaddr_reg;
3081         int bus, ret;
3082         bool new_ext, ext;
3083
3084         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3085         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3086         new_ext    = !!ecap_ecs(iommu->ecap);
3087
3088         /*
3089          * The RTT bit can only be changed when translation is disabled,
3090          * but disabling translation would open a window for data
3091          * corruption. So bail out and don't copy anything if we would
3092          * have to change the bit.
3093          */
3094         if (new_ext != ext)
3095                 return -EINVAL;
3096
3097         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3098         if (!old_rt_phys)
3099                 return -EINVAL;
3100
3101         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3102         if (!old_rt)
3103                 return -ENOMEM;
3104
3105         /* This is too big for the stack - allocate it from slab */
3106         ctxt_table_entries = ext ? 512 : 256;
3107         ret = -ENOMEM;
3108         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3109         if (!ctxt_tbls)
3110                 goto out_unmap;
3111
3112         for (bus = 0; bus < 256; bus++) {
3113                 ret = copy_context_table(iommu, &old_rt[bus],
3114                                          ctxt_tbls, bus, ext);
3115                 if (ret) {
3116                         pr_err("%s: Failed to copy context table for bus %d\n",
3117                                 iommu->name, bus);
3118                         continue;
3119                 }
3120         }
3121
3122         spin_lock_irqsave(&iommu->lock, flags);
3123
3124         /* Context tables are copied, now write them to the root_entry table */
3125         for (bus = 0; bus < 256; bus++) {
3126                 int idx = ext ? bus * 2 : bus;
3127                 u64 val;
3128
3129                 if (ctxt_tbls[idx]) {
3130                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3131                         iommu->root_entry[bus].lo = val;
3132                 }
3133
3134                 if (!ext || !ctxt_tbls[idx + 1])
3135                         continue;
3136
3137                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3138                 iommu->root_entry[bus].hi = val;
3139         }
3140
3141         spin_unlock_irqrestore(&iommu->lock, flags);
3142
3143         kfree(ctxt_tbls);
3144
3145         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3146
3147         ret = 0;
3148
3149 out_unmap:
3150         memunmap(old_rt);
3151
3152         return ret;
3153 }
3154
3155 #ifdef CONFIG_INTEL_IOMMU_SVM
3156 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3157 {
3158         struct intel_iommu *iommu = data;
3159         ioasid_t ioasid;
3160
3161         if (!iommu)
3162                 return INVALID_IOASID;
3163         /*
3164          * The VT-d virtual command interface always uses the full 20-bit
3165          * PASID range. The host can partition the guest PASID range based
3166          * on policy, but that is outside the guest's control.
3167          */
3168         if (min < PASID_MIN || max > intel_pasid_max_id)
3169                 return INVALID_IOASID;
3170
3171         if (vcmd_alloc_pasid(iommu, &ioasid))
3172                 return INVALID_IOASID;
3173
3174         return ioasid;
3175 }
3176
3177 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3178 {
3179         struct intel_iommu *iommu = data;
3180
3181         if (!iommu)
3182                 return;
3183         /*
3184          * The sanity check of the ioasid owner is done at an upper layer,
3185          * e.g. VFIO. We can only free the PASID when all devices are unbound.
3186          */
3187         if (ioasid_find(NULL, ioasid, NULL)) {
3188                 pr_alert("Cannot free active IOASID %d\n", ioasid);
3189                 return;
3190         }
3191         vcmd_free_pasid(iommu, ioasid);
3192 }
3193
3194 static void register_pasid_allocator(struct intel_iommu *iommu)
3195 {
3196         /*
3197          * If we are running in the host, there is no need for a custom
3198          * allocator, since PASIDs are allocated system-wide by the host.
3199          */
3200         if (!cap_caching_mode(iommu->cap))
3201                 return;
3202
3203         if (!sm_supported(iommu)) {
3204                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3205                 return;
3206         }
3207
3208         /*
3209          * Register a custom PASID allocator if we are running in a guest,
3210          * where guest PASIDs must be obtained via the virtual command
3211          * interface. There can be multiple vIOMMUs in each guest but only
3212          * one allocator is active. All vIOMMU allocators eventually call
3213          * the same host allocator.
3214          */
3215         if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3216                 return;
3217
3218         pr_info("Register custom PASID allocator\n");
3219         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3220         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3221         iommu->pasid_allocator.pdata = (void *)iommu;
3222         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3223                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3224                 /*
3225                  * Disable scalable mode on this IOMMU if there is no
3226                  * custom allocator. Mixing SM-capable and non-SM vIOMMUs
3227                  * is not supported.
3228                  */
3229                 intel_iommu_sm = 0;
3230         }
3231 }
3232 #endif
3233
3234 static int __init init_dmars(void)
3235 {
3236         struct dmar_drhd_unit *drhd;
3237         struct intel_iommu *iommu;
3238         int ret;
3239
3240         /*
3241          * for each drhd
3242          *    allocate root
3243          *    initialize and program root entry to not present
3244          * endfor
3245          */
3246         for_each_drhd_unit(drhd) {
3247                 /*
3248                  * No lock needed: this is only incremented in the single-
3249                  * threaded kernel __init code path; all other accesses are
3250                  * read-only.
3251                  */
3252                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3253                         g_num_of_iommus++;
3254                         continue;
3255                 }
3256                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3257         }
3258
3259         /* Preallocate enough resources for IOMMU hot-addition */
3260         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3261                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3262
3263         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3264                         GFP_KERNEL);
3265         if (!g_iommus) {
3266                 pr_err("Allocating global iommu array failed\n");
3267                 ret = -ENOMEM;
3268                 goto error;
3269         }
3270
3271         for_each_iommu(iommu, drhd) {
3272                 if (drhd->ignored) {
3273                         iommu_disable_translation(iommu);
3274                         continue;
3275                 }
3276
3277                 /*
3278                  * Find the smallest max PASID size supported by any IOMMU
3279                  * in the system. We need to ensure the system PASID table
3280                  * is no bigger than the smallest supported.
3281                  */
3282                 if (pasid_supported(iommu)) {
3283                         u32 temp = 2 << ecap_pss(iommu->ecap);
3284
3285                         intel_pasid_max_id = min_t(u32, temp,
3286                                                    intel_pasid_max_id);
3287                 }
3288
3289                 g_iommus[iommu->seq_id] = iommu;
3290
3291                 intel_iommu_init_qi(iommu);
3292
3293                 ret = iommu_init_domains(iommu);
3294                 if (ret)
3295                         goto free_iommu;
3296
3297                 init_translation_status(iommu);
3298
3299                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3300                         iommu_disable_translation(iommu);
3301                         clear_translation_pre_enabled(iommu);
3302                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3303                                 iommu->name);
3304                 }
3305
3306                 /*
3307                  * TBD:
3308                  * We could share the same root & context tables
3309                  * among all IOMMUs; this needs to be split out later.
3310                  */
3311                 ret = iommu_alloc_root_entry(iommu);
3312                 if (ret)
3313                         goto free_iommu;
3314
3315                 if (translation_pre_enabled(iommu)) {
3316                         pr_info("Translation already enabled - trying to copy translation structures\n");
3317
3318                         ret = copy_translation_tables(iommu);
3319                         if (ret) {
3320                                 /*
3321                                  * We found the IOMMU with translation
3322                                  * enabled - but failed to copy over the
3323                                  * old root-entry table. Try to proceed
3324                                  * by disabling translation now and
3325                                  * allocating a clean root-entry table.
3326                                  * This might cause DMAR faults, but
3327                                  * probably the dump will still succeed.
3328                                  */
3329                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3330                                        iommu->name);
3331                                 iommu_disable_translation(iommu);
3332                                 clear_translation_pre_enabled(iommu);
3333                         } else {
3334                                 pr_info("Copied translation tables from previous kernel for %s\n",
3335                                         iommu->name);
3336                         }
3337                 }
3338
3339                 if (!ecap_pass_through(iommu->ecap))
3340                         hw_pass_through = 0;
3341                 intel_svm_check(iommu);
3342         }
3343
3344         /*
3345          * Now that QI is enabled on all IOMMUs, set the root entry and flush
3346          * caches. This is required on some Intel X58 chipsets, otherwise the
3347          * flush_context function will loop forever and the boot hangs.
3348          */
3349         for_each_active_iommu(iommu, drhd) {
3350                 iommu_flush_write_buffer(iommu);
3351 #ifdef CONFIG_INTEL_IOMMU_SVM
3352                 register_pasid_allocator(iommu);
3353 #endif
3354                 iommu_set_root_entry(iommu);
3355                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3356                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3357         }
3358
3359 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3360         dmar_map_gfx = 0;
3361 #endif
3362
3363         if (!dmar_map_gfx)
3364                 iommu_identity_mapping |= IDENTMAP_GFX;
3365
3366         check_tylersburg_isoch();
3367
3368         ret = si_domain_init(hw_pass_through);
3369         if (ret)
3370                 goto free_iommu;
3371
3372         /*
3373          * for each drhd
3374          *   enable fault log
3375          *   global invalidate context cache
3376          *   global invalidate iotlb
3377          *   enable translation
3378          */
3379         for_each_iommu(iommu, drhd) {
3380                 if (drhd->ignored) {
3381                         /*
3382                          * we always have to disable PMRs or DMA may fail on
3383                          * this device
3384                          */
3385                         if (force_on)
3386                                 iommu_disable_protect_mem_regions(iommu);
3387                         continue;
3388                 }
3389
3390                 iommu_flush_write_buffer(iommu);
3391
3392 #ifdef CONFIG_INTEL_IOMMU_SVM
3393                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3394                         /*
3395                          * Calling dmar_alloc_hwirq() with dmar_global_lock
3396                          * held could cause a lock race condition.
3397                          */
3398                         up_write(&dmar_global_lock);
3399                         ret = intel_svm_enable_prq(iommu);
3400                         down_write(&dmar_global_lock);
3401                         if (ret)
3402                                 goto free_iommu;
3403                 }
3404 #endif
3405                 ret = dmar_set_interrupt(iommu);
3406                 if (ret)
3407                         goto free_iommu;
3408         }
3409
3410         return 0;
3411
3412 free_iommu:
3413         for_each_active_iommu(iommu, drhd) {
3414                 disable_dmar_iommu(iommu);
3415                 free_dmar_iommu(iommu);
3416         }
3417
3418         kfree(g_iommus);
3419
3420 error:
3421         return ret;
3422 }
3423
3424 /* This takes a number of _MM_ pages, not VTD pages */
3425 static unsigned long intel_alloc_iova(struct device *dev,
3426                                      struct dmar_domain *domain,
3427                                      unsigned long nrpages, uint64_t dma_mask)
3428 {
3429         unsigned long iova_pfn;
3430
3431         /*
3432          * Restrict dma_mask to the width that the iommu can handle.
3433          * First-level translation restricts the input-address to a
3434          * canonical address (i.e., address bits 63:N have the same
3435          * value as address bit [N-1], where N is 48-bits with 4-level
3436          * paging and 57-bits with 5-level paging). Hence, skip bit
3437          * [N-1].
3438          */
3439         if (domain_use_first_level(domain))
3440                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3441                                  dma_mask);
3442         else
3443                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3444                                  dma_mask);
3445
3446         /* Ensure we reserve the whole size-aligned region */
3447         nrpages = __roundup_pow_of_two(nrpages);
3448
3449         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3450                 /*
3451                  * First try to allocate an IO virtual address within
3452                  * DMA_BIT_MASK(32); if that fails, try allocating from
3453                  * the higher range.
3454                  */
3455                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3456                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3457                 if (iova_pfn)
3458                         return iova_pfn;
3459         }
3460         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3461                                    IOVA_PFN(dma_mask), true);
3462         if (unlikely(!iova_pfn)) {
3463                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3464                              nrpages);
3465                 return 0;
3466         }
3467
3468         return iova_pfn;
3469 }
3470
3471 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3472                                      size_t size, int dir, u64 dma_mask)
3473 {
3474         struct dmar_domain *domain;
3475         phys_addr_t start_paddr;
3476         unsigned long iova_pfn;
3477         int prot = 0;
3478         int ret;
3479         struct intel_iommu *iommu;
3480         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3481
3482         BUG_ON(dir == DMA_NONE);
3483
3484         if (unlikely(attach_deferred(dev)))
3485                 do_deferred_attach(dev);
3486
3487         domain = find_domain(dev);
3488         if (!domain)
3489                 return DMA_MAPPING_ERROR;
3490
3491         iommu = domain_get_iommu(domain);
3492         size = aligned_nrpages(paddr, size);
3493
3494         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3495         if (!iova_pfn)
3496                 goto error;
3497
3498         /*
3499          * Check if DMAR supports zero-length reads on write-only
3500          * mappings.
3501          */
3502         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3503                         !cap_zlr(iommu->cap))
3504                 prot |= DMA_PTE_READ;
3505         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3506                 prot |= DMA_PTE_WRITE;
3507         /*
3508          * paddr to (paddr + size) might span partial pages, so we should map
3509          * whole pages.  Note: if two parts of one page are mapped separately,
3510          * we might have two guest addresses mapping to the same host paddr,
3511          * but this is not a big problem.
3512          */
3513         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3514                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3515         if (ret)
3516                 goto error;
3517
3518         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3519         start_paddr += paddr & ~PAGE_MASK;
3520
3521         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3522
3523         return start_paddr;
3524
3525 error:
3526         if (iova_pfn)
3527                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3528         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3529                 size, (unsigned long long)paddr, dir);
3530         return DMA_MAPPING_ERROR;
3531 }
3532
3533 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3534                                  unsigned long offset, size_t size,
3535                                  enum dma_data_direction dir,
3536                                  unsigned long attrs)
3537 {
3538         return __intel_map_single(dev, page_to_phys(page) + offset,
3539                                   size, dir, *dev->dma_mask);
3540 }
3541
3542 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3543                                      size_t size, enum dma_data_direction dir,
3544                                      unsigned long attrs)
3545 {
3546         return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3547 }
3548
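/*
 * Tear down a DMA mapping: clear the page-table range, then either flush the
 * IOTLB and free the IOVA immediately (strict mode, untrusted device, or no
 * flush queue) or defer the release to the IOVA flush queue.
 */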
3549 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3550 {
3551         struct dmar_domain *domain;
3552         unsigned long start_pfn, last_pfn;
3553         unsigned long nrpages;
3554         unsigned long iova_pfn;
3555         struct intel_iommu *iommu;
3556         struct page *freelist;
3557         struct pci_dev *pdev = NULL;
3558
3559         domain = find_domain(dev);
3560         BUG_ON(!domain);
3561
3562         iommu = domain_get_iommu(domain);
3563
3564         iova_pfn = IOVA_PFN(dev_addr);
3565
3566         nrpages = aligned_nrpages(dev_addr, size);
3567         start_pfn = mm_to_dma_pfn(iova_pfn);
3568         last_pfn = start_pfn + nrpages - 1;
3569
3570         if (dev_is_pci(dev))
3571                 pdev = to_pci_dev(dev);
3572
3573         freelist = domain_unmap(domain, start_pfn, last_pfn);
3574         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3575                         !has_iova_flush_queue(&domain->iovad)) {
3576                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3577                                       nrpages, !freelist, 0);
3578                 /* free iova */
3579                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3580                 dma_free_pagelist(freelist);
3581         } else {
3582                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3583                            (unsigned long)freelist);
3584                 /*
3585                  * Queue up the release of the unmap to save the roughly
3586                  * 1/6th of the CPU time consumed by the IOTLB flush.
3587                  */
3588         }
3589
3590         trace_unmap_single(dev, dev_addr, size);
3591 }
3592
3593 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3594                              size_t size, enum dma_data_direction dir,
3595                              unsigned long attrs)
3596 {
3597         intel_unmap(dev, dev_addr, size);
3598 }
3599
3600 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3601                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3602 {
3603         intel_unmap(dev, dev_addr, size);
3604 }
3605
3606 static void *intel_alloc_coherent(struct device *dev, size_t size,
3607                                   dma_addr_t *dma_handle, gfp_t flags,
3608                                   unsigned long attrs)
3609 {
3610         struct page *page = NULL;
3611         int order;
3612
3613         if (unlikely(attach_deferred(dev)))
3614                 do_deferred_attach(dev);
3615
3616         size = PAGE_ALIGN(size);
3617         order = get_order(size);
3618
3619         if (gfpflags_allow_blocking(flags)) {
3620                 unsigned int count = size >> PAGE_SHIFT;
3621
3622                 page = dma_alloc_from_contiguous(dev, count, order,
3623                                                  flags & __GFP_NOWARN);
3624         }
3625
3626         if (!page)
3627                 page = alloc_pages(flags, order);
3628         if (!page)
3629                 return NULL;
3630         memset(page_address(page), 0, size);
3631
3632         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3633                                          DMA_BIDIRECTIONAL,
3634                                          dev->coherent_dma_mask);
3635         if (*dma_handle != DMA_MAPPING_ERROR)
3636                 return page_address(page);
3637         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3638                 __free_pages(page, order);
3639
3640         return NULL;
3641 }
3642
3643 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3644                                 dma_addr_t dma_handle, unsigned long attrs)
3645 {
3646         int order;
3647         struct page *page = virt_to_page(vaddr);
3648
3649         size = PAGE_ALIGN(size);
3650         order = get_order(size);
3651
3652         intel_unmap(dev, dma_handle, size);
3653         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3654                 __free_pages(page, order);
3655 }
3656
3657 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3658                            int nelems, enum dma_data_direction dir,
3659                            unsigned long attrs)
3660 {
3661         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3662         unsigned long nrpages = 0;
3663         struct scatterlist *sg;
3664         int i;
3665
3666         for_each_sg(sglist, sg, nelems, i) {
3667                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3668         }
3669
3670         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3671
3672         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3673 }
3674
3675 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3676                         enum dma_data_direction dir, unsigned long attrs)
3677 {
3678         int i;
3679         struct dmar_domain *domain;
3680         size_t size = 0;
3681         int prot = 0;
3682         unsigned long iova_pfn;
3683         int ret;
3684         struct scatterlist *sg;
3685         unsigned long start_vpfn;
3686         struct intel_iommu *iommu;
3687
3688         BUG_ON(dir == DMA_NONE);
3689
3690         if (unlikely(attach_deferred(dev)))
3691                 do_deferred_attach(dev);
3692
3693         domain = find_domain(dev);
3694         if (!domain)
3695                 return 0;
3696
3697         iommu = domain_get_iommu(domain);
3698
3699         for_each_sg(sglist, sg, nelems, i)
3700                 size += aligned_nrpages(sg->offset, sg->length);
3701
3702         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3703                                 *dev->dma_mask);
3704         if (!iova_pfn) {
3705                 sglist->dma_length = 0;
3706                 return 0;
3707         }
3708
3709         /*
3710          * Check if DMAR supports zero-length reads on write-only
3711          * mappings.
3712          */
3713         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3714                         !cap_zlr(iommu->cap))
3715                 prot |= DMA_PTE_READ;
3716         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3717                 prot |= DMA_PTE_WRITE;
3718
3719         start_vpfn = mm_to_dma_pfn(iova_pfn);
3720
3721         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3722         if (unlikely(ret)) {
3723                 dma_pte_free_pagetable(domain, start_vpfn,
3724                                        start_vpfn + size - 1,
3725                                        agaw_to_level(domain->agaw) + 1);
3726                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3727                 return 0;
3728         }
3729
3730         for_each_sg(sglist, sg, nelems, i)
3731                 trace_map_sg(dev, i + 1, nelems, sg);
3732
3733         return nelems;
3734 }
3735
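/*
 * DMA addresses are IOVAs here, and intel_alloc_iova() prefers to allocate
 * them below 4GB, so a 32-bit mask is all a device is required to support.
 */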
3736 static u64 intel_get_required_mask(struct device *dev)
3737 {
3738         return DMA_BIT_MASK(32);
3739 }
3740
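/*
 * DMA API implementation backed by the IOMMU page tables of the device's
 * DMA domain.
 */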
3741 static const struct dma_map_ops intel_dma_ops = {
3742         .alloc = intel_alloc_coherent,
3743         .free = intel_free_coherent,
3744         .map_sg = intel_map_sg,
3745         .unmap_sg = intel_unmap_sg,
3746         .map_page = intel_map_page,
3747         .unmap_page = intel_unmap_page,
3748         .map_resource = intel_map_resource,
3749         .unmap_resource = intel_unmap_resource,
3750         .dma_supported = dma_direct_supported,
3751         .mmap = dma_common_mmap,
3752         .get_sgtable = dma_common_get_sgtable,
3753         .alloc_pages = dma_common_alloc_pages,
3754         .free_pages = dma_common_free_pages,
3755         .get_required_mask = intel_get_required_mask,
3756 };
3757
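/*
 * If the IOVA resolves to a swiotlb bounce slot, sync that slot with the
 * original buffer; otherwise the DMA hit the buffer directly and nothing
 * needs to be done.
 */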
3758 static void
3759 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3760                    enum dma_data_direction dir, enum dma_sync_target target)
3761 {
3762         struct dmar_domain *domain;
3763         phys_addr_t tlb_addr;
3764
3765         domain = find_domain(dev);
3766         if (WARN_ON(!domain))
3767                 return;
3768
3769         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3770         if (is_swiotlb_buffer(tlb_addr))
3771                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3772 }
3773
3774 static dma_addr_t
3775 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3776                   enum dma_data_direction dir, unsigned long attrs,
3777                   u64 dma_mask)
3778 {
3779         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3780         struct dmar_domain *domain;
3781         struct intel_iommu *iommu;
3782         unsigned long iova_pfn;
3783         unsigned long nrpages;
3784         phys_addr_t tlb_addr;
3785         int prot = 0;
3786         int ret;
3787
3788         if (unlikely(attach_deferred(dev)))
3789                 do_deferred_attach(dev);
3790
3791         domain = find_domain(dev);
3792
3793         if (WARN_ON(dir == DMA_NONE || !domain))
3794                 return DMA_MAPPING_ERROR;
3795
3796         iommu = domain_get_iommu(domain);
3797         if (WARN_ON(!iommu))
3798                 return DMA_MAPPING_ERROR;
3799
3800         nrpages = aligned_nrpages(0, size);
3801         iova_pfn = intel_alloc_iova(dev, domain,
3802                                     dma_to_mm_pfn(nrpages), dma_mask);
3803         if (!iova_pfn)
3804                 return DMA_MAPPING_ERROR;
3805
3806         /*
3807          * Check if DMAR supports zero-length reads on write-only
3808          * mappings.
3809          */
3810         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3811                         !cap_zlr(iommu->cap))
3812                 prot |= DMA_PTE_READ;
3813         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3814                 prot |= DMA_PTE_WRITE;
3815
3816         /*
3817          * If both the physical buffer start address and size are
3818          * page aligned, we don't need to use a bounce page.
3819          */
3820         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3821                 tlb_addr = swiotlb_tbl_map_single(dev,
3822                                 phys_to_dma_unencrypted(dev, io_tlb_start),
3823                                 paddr, size, aligned_size, dir, attrs);
3824                 if (tlb_addr == DMA_MAPPING_ERROR) {
3825                         goto swiotlb_error;
3826                 } else {
3827                         /* Clean up the padding area. */
3828                         void *padding_start = phys_to_virt(tlb_addr);
3829                         size_t padding_size = aligned_size;
3830
3831                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3832                             (dir == DMA_TO_DEVICE ||
3833                              dir == DMA_BIDIRECTIONAL)) {
3834                                 padding_start += size;
3835                                 padding_size -= size;
3836                         }
3837
3838                         memset(padding_start, 0, padding_size);
3839                 }
3840         } else {
3841                 tlb_addr = paddr;
3842         }
3843
3844         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3845                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3846         if (ret)
3847                 goto mapping_error;
3848
3849         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3850
3851         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3852
3853 mapping_error:
3854         if (is_swiotlb_buffer(tlb_addr))
3855                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3856                                          aligned_size, dir, attrs);
3857 swiotlb_error:
3858         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3859         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3860                 size, (unsigned long long)paddr, dir);
3861
3862         return DMA_MAPPING_ERROR;
3863 }
3864
3865 static void
3866 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3867                     enum dma_data_direction dir, unsigned long attrs)
3868 {
3869         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3870         struct dmar_domain *domain;
3871         phys_addr_t tlb_addr;
3872
3873         domain = find_domain(dev);
3874         if (WARN_ON(!domain))
3875                 return;
3876
3877         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3878         if (WARN_ON(!tlb_addr))
3879                 return;
3880
3881         intel_unmap(dev, dev_addr, size);
3882         if (is_swiotlb_buffer(tlb_addr))
3883                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3884                                          aligned_size, dir, attrs);
3885
3886         trace_bounce_unmap_single(dev, dev_addr, size);
3887 }
3888
3889 static dma_addr_t
3890 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3891                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3892 {
3893         return bounce_map_single(dev, page_to_phys(page) + offset,
3894                                  size, dir, attrs, *dev->dma_mask);
3895 }
3896
3897 static dma_addr_t
3898 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3899                     enum dma_data_direction dir, unsigned long attrs)
3900 {
3901         return bounce_map_single(dev, phys_addr, size,
3902                                  dir, attrs, *dev->dma_mask);
3903 }
3904
3905 static void
3906 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3907                   enum dma_data_direction dir, unsigned long attrs)
3908 {
3909         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3910 }
3911
3912 static void
3913 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3914                       enum dma_data_direction dir, unsigned long attrs)
3915 {
3916         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3917 }
3918
3919 static void
3920 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3921                 enum dma_data_direction dir, unsigned long attrs)
3922 {
3923         struct scatterlist *sg;
3924         int i;
3925
3926         for_each_sg(sglist, sg, nelems, i)
3927                 bounce_unmap_page(dev, sg->dma_address,
3928                                   sg_dma_len(sg), dir, attrs);
3929 }
3930
3931 static int
3932 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3933               enum dma_data_direction dir, unsigned long attrs)
3934 {
3935         int i;
3936         struct scatterlist *sg;
3937
3938         for_each_sg(sglist, sg, nelems, i) {
3939                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3940                                                   sg->offset, sg->length,
3941                                                   dir, attrs);
3942                 if (sg->dma_address == DMA_MAPPING_ERROR)
3943                         goto out_unmap;
3944                 sg_dma_len(sg) = sg->length;
3945         }
3946
3947         for_each_sg(sglist, sg, nelems, i)
3948                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3949
3950         return nelems;
3951
3952 out_unmap:
3953         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3954         return 0;
3955 }
3956
3957 static void
3958 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3959                            size_t size, enum dma_data_direction dir)
3960 {
3961         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3962 }
3963
3964 static void
3965 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3966                               size_t size, enum dma_data_direction dir)
3967 {
3968         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3969 }
3970
3971 static void
3972 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3973                        int nelems, enum dma_data_direction dir)
3974 {
3975         struct scatterlist *sg;
3976         int i;
3977
3978         for_each_sg(sglist, sg, nelems, i)
3979                 bounce_sync_single(dev, sg_dma_address(sg),
3980                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
3981 }
3982
3983 static void
3984 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3985                           int nelems, enum dma_data_direction dir)
3986 {
3987         struct scatterlist *sg;
3988         int i;
3989
3990         for_each_sg(sglist, sg, nelems, i)
3991                 bounce_sync_single(dev, sg_dma_address(sg),
3992                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3993 }
3994
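/*
 * Bounce-buffer DMA ops: buffers that are not aligned to the VT-d page size
 * are bounced through swiotlb and the padding is zeroed, so a device can
 * never see data beyond the buffer it was handed (used to isolate untrusted
 * devices).
 */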
3995 static const struct dma_map_ops bounce_dma_ops = {
3996         .alloc                  = intel_alloc_coherent,
3997         .free                   = intel_free_coherent,
3998         .map_sg                 = bounce_map_sg,
3999         .unmap_sg               = bounce_unmap_sg,
4000         .map_page               = bounce_map_page,
4001         .unmap_page             = bounce_unmap_page,
4002         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
4003         .sync_single_for_device = bounce_sync_single_for_device,
4004         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4005         .sync_sg_for_device     = bounce_sync_sg_for_device,
4006         .map_resource           = bounce_map_resource,
4007         .unmap_resource         = bounce_unmap_resource,
4008         .alloc_pages            = dma_common_alloc_pages,
4009         .free_pages             = dma_common_free_pages,
4010         .dma_supported          = dma_direct_supported,
4011 };
4012
4013 static inline int iommu_domain_cache_init(void)
4014 {
4015         int ret = 0;
4016
4017         iommu_domain_cache = kmem_cache_create("iommu_domain",
4018                                          sizeof(struct dmar_domain),
4019                                          0,
4020                                          SLAB_HWCACHE_ALIGN,
4022                                          NULL);
4023         if (!iommu_domain_cache) {
4024                 pr_err("Couldn't create iommu_domain cache\n");
4025                 ret = -ENOMEM;
4026         }
4027
4028         return ret;
4029 }
4030
4031 static inline int iommu_devinfo_cache_init(void)
4032 {
4033         int ret = 0;
4034
4035         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4036                                          sizeof(struct device_domain_info),
4037                                          0,
4038                                          SLAB_HWCACHE_ALIGN,
4039                                          NULL);
4040         if (!iommu_devinfo_cache) {
4041                 pr_err("Couldn't create devinfo cache\n");
4042                 ret = -ENOMEM;
4043         }
4044
4045         return ret;
4046 }
4047
4048 static int __init iommu_init_mempool(void)
4049 {
4050         int ret;
4051         ret = iova_cache_get();
4052         if (ret)
4053                 return ret;
4054
4055         ret = iommu_domain_cache_init();
4056         if (ret)
4057                 goto domain_error;
4058
4059         ret = iommu_devinfo_cache_init();
4060         if (!ret)
4061                 return ret;
4062
4063         kmem_cache_destroy(iommu_domain_cache);
4064 domain_error:
4065         iova_cache_put();
4066
4067         return -ENOMEM;
4068 }
4069
4070 static void __init iommu_exit_mempool(void)
4071 {
4072         kmem_cache_destroy(iommu_devinfo_cache);
4073         kmem_cache_destroy(iommu_domain_cache);
4074         iova_cache_put();
4075 }
4076
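/*
 * Mark DMAR units as ignored when they have no devices in their scope, or
 * when they cover only graphics devices and gfx mapping is disabled.
 */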
4077 static void __init init_no_remapping_devices(void)
4078 {
4079         struct dmar_drhd_unit *drhd;
4080         struct device *dev;
4081         int i;
4082
4083         for_each_drhd_unit(drhd) {
4084                 if (!drhd->include_all) {
4085                         for_each_active_dev_scope(drhd->devices,
4086                                                   drhd->devices_cnt, i, dev)
4087                                 break;
4088                         /* ignore DMAR unit if no devices exist */
4089                         if (i == drhd->devices_cnt)
4090                                 drhd->ignored = 1;
4091                 }
4092         }
4093
4094         for_each_active_drhd_unit(drhd) {
4095                 if (drhd->include_all)
4096                         continue;
4097
4098                 for_each_active_dev_scope(drhd->devices,
4099                                           drhd->devices_cnt, i, dev)
4100                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4101                                 break;
4102                 if (i < drhd->devices_cnt)
4103                         continue;
4104
4105                 /* This IOMMU has *only* gfx devices. Either bypass it or
4106                    mark it as gfx-dedicated, as appropriate */
4107                 drhd->gfx_dedicated = 1;
4108                 if (!dmar_map_gfx)
4109                         drhd->ignored = 1;
4110         }
4111 }
4112
4113 #ifdef CONFIG_SUSPEND
4114 static int init_iommu_hw(void)
4115 {
4116         struct dmar_drhd_unit *drhd;
4117         struct intel_iommu *iommu = NULL;
4118
4119         for_each_active_iommu(iommu, drhd)
4120                 if (iommu->qi)
4121                         dmar_reenable_qi(iommu);
4122
4123         for_each_iommu(iommu, drhd) {
4124                 if (drhd->ignored) {
4125                         /*
4126                          * we always have to disable PMRs or DMA may fail on
4127                          * this device
4128                          */
4129                         if (force_on)
4130                                 iommu_disable_protect_mem_regions(iommu);
4131                         continue;
4132                 }
4133
4134                 iommu_flush_write_buffer(iommu);
4135
4136                 iommu_set_root_entry(iommu);
4137
4138                 iommu->flush.flush_context(iommu, 0, 0, 0,
4139                                            DMA_CCMD_GLOBAL_INVL);
4140                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4141                 iommu_enable_translation(iommu);
4142                 iommu_disable_protect_mem_regions(iommu);
4143         }
4144
4145         return 0;
4146 }
4147
4148 static void iommu_flush_all(void)
4149 {
4150         struct dmar_drhd_unit *drhd;
4151         struct intel_iommu *iommu;
4152
4153         for_each_active_iommu(iommu, drhd) {
4154                 iommu->flush.flush_context(iommu, 0, 0, 0,
4155                                            DMA_CCMD_GLOBAL_INVL);
4156                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4157                                          DMA_TLB_GLOBAL_FLUSH);
4158         }
4159 }
4160
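/* Flush all caches, save the fault-event registers and disable translation. */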
4161 static int iommu_suspend(void)
4162 {
4163         struct dmar_drhd_unit *drhd;
4164         struct intel_iommu *iommu = NULL;
4165         unsigned long flag;
4166
4167         for_each_active_iommu(iommu, drhd) {
4168                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4169                                                  GFP_ATOMIC);
4170                 if (!iommu->iommu_state)
4171                         goto nomem;
4172         }
4173
4174         iommu_flush_all();
4175
4176         for_each_active_iommu(iommu, drhd) {
4177                 iommu_disable_translation(iommu);
4178
4179                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4180
4181                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4182                         readl(iommu->reg + DMAR_FECTL_REG);
4183                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4184                         readl(iommu->reg + DMAR_FEDATA_REG);
4185                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4186                         readl(iommu->reg + DMAR_FEADDR_REG);
4187                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4188                         readl(iommu->reg + DMAR_FEUADDR_REG);
4189
4190                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4191         }
4192         return 0;
4193
4194 nomem:
4195         for_each_active_iommu(iommu, drhd)
4196                 kfree(iommu->iommu_state);
4197
4198         return -ENOMEM;
4199 }
4200
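/* Re-initialize the hardware and restore the saved fault-event registers. */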
4201 static void iommu_resume(void)
4202 {
4203         struct dmar_drhd_unit *drhd;
4204         struct intel_iommu *iommu = NULL;
4205         unsigned long flag;
4206
4207         if (init_iommu_hw()) {
4208                 if (force_on)
4209                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4210                 else
4211                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4212                 return;
4213         }
4214
4215         for_each_active_iommu(iommu, drhd) {
4216
4217                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4218
4219                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4220                         iommu->reg + DMAR_FECTL_REG);
4221                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4222                         iommu->reg + DMAR_FEDATA_REG);
4223                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4224                         iommu->reg + DMAR_FEADDR_REG);
4225                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4226                         iommu->reg + DMAR_FEUADDR_REG);
4227
4228                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4229         }
4230
4231         for_each_active_iommu(iommu, drhd)
4232                 kfree(iommu->iommu_state);
4233 }
4234
4235 static struct syscore_ops iommu_syscore_ops = {
4236         .resume         = iommu_resume,
4237         .suspend        = iommu_suspend,
4238 };
4239
4240 static void __init init_iommu_pm_ops(void)
4241 {
4242         register_syscore_ops(&iommu_syscore_ops);
4243 }
4244
4245 #else
4246 static inline void init_iommu_pm_ops(void) {}
4247 #endif  /* CONFIG_SUSPEND */
4248
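/* An RMRR must be page aligned, non-empty and pass the arch-specific check. */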
4249 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4250 {
4251         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4252             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4253             rmrr->end_address <= rmrr->base_address ||
4254             arch_rmrr_sanity_check(rmrr))
4255                 return -EINVAL;
4256
4257         return 0;
4258 }
4259
4260 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4261 {
4262         struct acpi_dmar_reserved_memory *rmrr;
4263         struct dmar_rmrr_unit *rmrru;
4264
4265         rmrr = (struct acpi_dmar_reserved_memory *)header;
4266         if (rmrr_sanity_check(rmrr)) {
4267                 pr_warn(FW_BUG
4268                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4269                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4270                            rmrr->base_address, rmrr->end_address,
4271                            dmi_get_system_info(DMI_BIOS_VENDOR),
4272                            dmi_get_system_info(DMI_BIOS_VERSION),
4273                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4274                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4275         }
4276
4277         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4278         if (!rmrru)
4279                 goto out;
4280
4281         rmrru->hdr = header;
4282
4283         rmrru->base_address = rmrr->base_address;
4284         rmrru->end_address = rmrr->end_address;
4285
4286         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4287                                 ((void *)rmrr) + rmrr->header.length,
4288                                 &rmrru->devices_cnt);
4289         if (rmrru->devices_cnt && rmrru->devices == NULL)
4290                 goto free_rmrru;
4291
4292         list_add(&rmrru->list, &dmar_rmrr_units);
4293
4294         return 0;
4295 free_rmrru:
4296         kfree(rmrru);
4297 out:
4298         return -ENOMEM;
4299 }
4300
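/* Look up an already-registered ATSR unit matching the given ACPI ATSR. */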
4301 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4302 {
4303         struct dmar_atsr_unit *atsru;
4304         struct acpi_dmar_atsr *tmp;
4305
4306         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4307                                 dmar_rcu_check()) {
4308                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4309                 if (atsr->segment != tmp->segment)
4310                         continue;
4311                 if (atsr->header.length != tmp->header.length)
4312                         continue;
4313                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4314                         return atsru;
4315         }
4316
4317         return NULL;
4318 }
4319
4320 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4321 {
4322         struct acpi_dmar_atsr *atsr;
4323         struct dmar_atsr_unit *atsru;
4324
4325         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4326                 return 0;
4327
4328         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4329         atsru = dmar_find_atsr(atsr);
4330         if (atsru)
4331                 return 0;
4332
4333         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4334         if (!atsru)
4335                 return -ENOMEM;
4336
4337         /*
4338          * If the memory was allocated from the slab by an ACPI _DSM method,
4339          * we need to copy its content because the buffer will be freed
4340          * on return.
4341          */
4342         atsru->hdr = (void *)(atsru + 1);
4343         memcpy(atsru->hdr, hdr, hdr->length);
4344         atsru->include_all = atsr->flags & 0x1;
4345         if (!atsru->include_all) {
4346                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4347                                 (void *)atsr + atsr->header.length,
4348                                 &atsru->devices_cnt);
4349                 if (atsru->devices_cnt && atsru->devices == NULL) {
4350                         kfree(atsru);
4351                         return -ENOMEM;
4352                 }
4353         }
4354
4355         list_add_rcu(&atsru->list, &dmar_atsr_units);
4356
4357         return 0;
4358 }
4359
4360 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4361 {
4362         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4363         kfree(atsru);
4364 }
4365
4366 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4367 {
4368         struct acpi_dmar_atsr *atsr;
4369         struct dmar_atsr_unit *atsru;
4370
4371         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4372         atsru = dmar_find_atsr(atsr);
4373         if (atsru) {
4374                 list_del_rcu(&atsru->list);
4375                 synchronize_rcu();
4376                 intel_iommu_free_atsr(atsru);
4377         }
4378
4379         return 0;
4380 }
4381
4382 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4383 {
4384         int i;
4385         struct device *dev;
4386         struct acpi_dmar_atsr *atsr;
4387         struct dmar_atsr_unit *atsru;
4388
4389         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4390         atsru = dmar_find_atsr(atsr);
4391         if (!atsru)
4392                 return 0;
4393
4394         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4395                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4396                                           i, dev)
4397                         return -EBUSY;
4398         }
4399
4400         return 0;
4401 }
4402
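/*
 * Bring a hot-added DMAR unit into service: verify it is compatible with the
 * running configuration, set up domains, root entry, QI and interrupts, then
 * enable translation.
 */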
4403 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4404 {
4405         int sp, ret;
4406         struct intel_iommu *iommu = dmaru->iommu;
4407
4408         if (g_iommus[iommu->seq_id])
4409                 return 0;
4410
4411         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4412                 pr_warn("%s: Doesn't support hardware pass through.\n",
4413                         iommu->name);
4414                 return -ENXIO;
4415         }
4416         if (!ecap_sc_support(iommu->ecap) &&
4417             domain_update_iommu_snooping(iommu)) {
4418                 pr_warn("%s: Doesn't support snooping.\n",
4419                         iommu->name);
4420                 return -ENXIO;
4421         }
4422         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4423         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4424                 pr_warn("%s: Doesn't support large page.\n",
4425                         iommu->name);
4426                 return -ENXIO;
4427         }
4428
4429         /*
4430          * Disable translation if already enabled prior to OS handover.
4431          */
4432         if (iommu->gcmd & DMA_GCMD_TE)
4433                 iommu_disable_translation(iommu);
4434
4435         g_iommus[iommu->seq_id] = iommu;
4436         ret = iommu_init_domains(iommu);
4437         if (ret == 0)
4438                 ret = iommu_alloc_root_entry(iommu);
4439         if (ret)
4440                 goto out;
4441
4442         intel_svm_check(iommu);
4443
4444         if (dmaru->ignored) {
4445                 /*
4446                  * we always have to disable PMRs or DMA may fail on this device
4447                  */
4448                 if (force_on)
4449                         iommu_disable_protect_mem_regions(iommu);
4450                 return 0;
4451         }
4452
4453         intel_iommu_init_qi(iommu);
4454         iommu_flush_write_buffer(iommu);
4455
4456 #ifdef CONFIG_INTEL_IOMMU_SVM
4457         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4458                 ret = intel_svm_enable_prq(iommu);
4459                 if (ret)
4460                         goto disable_iommu;
4461         }
4462 #endif
4463         ret = dmar_set_interrupt(iommu);
4464         if (ret)
4465                 goto disable_iommu;
4466
4467         iommu_set_root_entry(iommu);
4468         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4469         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4470         iommu_enable_translation(iommu);
4471
4472         iommu_disable_protect_mem_regions(iommu);
4473         return 0;
4474
4475 disable_iommu:
4476         disable_dmar_iommu(iommu);
4477 out:
4478         free_dmar_iommu(iommu);
4479         return ret;
4480 }
4481
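/*
 * DMAR unit hotplug entry point: add and enable the IOMMU on insert,
 * otherwise disable and free it. Does nothing until intel_iommu_enabled
 * has been set by the boot-time initialization.
 */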
4482 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4483 {
4484         int ret = 0;
4485         struct intel_iommu *iommu = dmaru->iommu;
4486
4487         if (!intel_iommu_enabled)
4488                 return 0;
4489         if (iommu == NULL)
4490                 return -EINVAL;
4491
4492         if (insert) {
4493                 ret = intel_iommu_add(dmaru);
4494         } else {
4495                 disable_dmar_iommu(iommu);
4496                 free_dmar_iommu(iommu);
4497         }
4498
4499         return ret;
4500 }
4501
4502 static void intel_iommu_free_dmars(void)
4503 {
4504         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4505         struct dmar_atsr_unit *atsru, *atsr_n;
4506
4507         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4508                 list_del(&rmrru->list);
4509                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4510                 kfree(rmrru);
4511         }
4512
4513         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4514                 list_del(&atsru->list);
4515                 intel_iommu_free_atsr(atsru);
4516         }
4517 }
4518
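/*
 * Decide whether ATS may be used for @dev: walk up to the root port and
 * check whether it is covered by an ATSR on the same PCI segment.
 * Returns 1 if ATS is allowed, 0 otherwise.
 */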
4519 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4520 {
4521         int i, ret = 1;
4522         struct pci_bus *bus;
4523         struct pci_dev *bridge = NULL;
4524         struct device *tmp;
4525         struct acpi_dmar_atsr *atsr;
4526         struct dmar_atsr_unit *atsru;
4527
4528         dev = pci_physfn(dev);
4529         for (bus = dev->bus; bus; bus = bus->parent) {
4530                 bridge = bus->self;
4531                 /* If it's an integrated device, allow ATS */
4532                 if (!bridge)
4533                         return 1;
4534                 /* Connected via non-PCIe: no ATS */
4535                 if (!pci_is_pcie(bridge) ||
4536                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4537                         return 0;
4538                 /* If we found the root port, look it up in the ATSR */
4539                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4540                         break;
4541         }
4542
4543         rcu_read_lock();
4544         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4545                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4546                 if (atsr->segment != pci_domain_nr(dev->bus))
4547                         continue;
4548
4549                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4550                         if (tmp == &bridge->dev)
4551                                 goto out;
4552
4553                 if (atsru->include_all)
4554                         goto out;
4555         }
4556         ret = 0;
4557 out:
4558         rcu_read_unlock();
4559
4560         return ret;
4561 }
4562
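/*
 * Called for PCI device add/remove notifications to keep the RMRR and ATSR
 * device scope lists in sync with the devices present on the bus.
 */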
4563 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4564 {
4565         int ret;
4566         struct dmar_rmrr_unit *rmrru;
4567         struct dmar_atsr_unit *atsru;
4568         struct acpi_dmar_atsr *atsr;
4569         struct acpi_dmar_reserved_memory *rmrr;
4570
4571         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4572                 return 0;
4573
4574         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4575                 rmrr = container_of(rmrru->hdr,
4576                                     struct acpi_dmar_reserved_memory, header);
4577                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4578                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4579                                 ((void *)rmrr) + rmrr->header.length,
4580                                 rmrr->segment, rmrru->devices,
4581                                 rmrru->devices_cnt);
4582                         if (ret < 0)
4583                                 return ret;
4584                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4585                         dmar_remove_dev_scope(info, rmrr->segment,
4586                                 rmrru->devices, rmrru->devices_cnt);
4587                 }
4588         }
4589
4590         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4591                 if (atsru->include_all)
4592                         continue;
4593
4594                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4595                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4596                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4597                                         (void *)atsr + atsr->header.length,
4598                                         atsr->segment, atsru->devices,
4599                                         atsru->devices_cnt);
4600                         if (ret > 0)
4601                                 break;
4602                         else if (ret < 0)
4603                                 return ret;
4604                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4605                         if (dmar_remove_dev_scope(info, atsr->segment,
4606                                         atsru->devices, atsru->devices_cnt))
4607                                 break;
4608                 }
4609         }
4610
4611         return 0;
4612 }
4613
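/*
 * Memory hotplug notifier for the static identity (si) domain: add an
 * identity mapping when a memory block goes online, and tear it down
 * (including IOTLB invalidation) when it goes offline again.
 */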
4614 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4615                                        unsigned long val, void *v)
4616 {
4617         struct memory_notify *mhp = v;
4618         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4619         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4620                         mhp->nr_pages - 1);
4621
4622         switch (val) {
4623         case MEM_GOING_ONLINE:
4624                 if (iommu_domain_identity_map(si_domain,
4625                                               start_vpfn, last_vpfn)) {
4626                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
4627                                 start_vpfn, last_vpfn);
4628                         return NOTIFY_BAD;
4629                 }
4630                 break;
4631
4632         case MEM_OFFLINE:
4633         case MEM_CANCEL_ONLINE:
4634                 {
4635                         struct dmar_drhd_unit *drhd;
4636                         struct intel_iommu *iommu;
4637                         struct page *freelist;
4638
4639                         freelist = domain_unmap(si_domain,
4640                                                 start_vpfn, last_vpfn);
4641
4642                         rcu_read_lock();
4643                         for_each_active_iommu(iommu, drhd)
4644                                 iommu_flush_iotlb_psi(iommu, si_domain,
4645                                         start_vpfn, mhp->nr_pages,
4646                                         !freelist, 0);
4647                         rcu_read_unlock();
4648                         dma_free_pagelist(freelist);
4649                 }
4650                 break;
4651         }
4652
4653         return NOTIFY_OK;
4654 }
4655
4656 static struct notifier_block intel_iommu_memory_nb = {
4657         .notifier_call = intel_iommu_memory_notifier,
4658         .priority = 0
4659 };
4660
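/*
 * Free the per-CPU IOVA caches of every DMA API domain on behalf of a CPU
 * that has gone offline (CPU hotplug "dead" callback below).
 */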
4661 static void free_all_cpu_cached_iovas(unsigned int cpu)
4662 {
4663         int i;
4664
4665         for (i = 0; i < g_num_of_iommus; i++) {
4666                 struct intel_iommu *iommu = g_iommus[i];
4667                 struct dmar_domain *domain;
4668                 int did;
4669
4670                 if (!iommu)
4671                         continue;
4672
4673                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4674                         domain = get_iommu_domain(iommu, (u16)did);
4675
4676                         if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4677                                 continue;
4678
4679                         free_cpu_cached_iovas(cpu, &domain->iovad);
4680                 }
4681         }
4682 }
4683
4684 static int intel_iommu_cpu_dead(unsigned int cpu)
4685 {
4686         free_all_cpu_cached_iovas(cpu);
4687         return 0;
4688 }
4689
4690 static void intel_disable_iommus(void)
4691 {
4692         struct intel_iommu *iommu = NULL;
4693         struct dmar_drhd_unit *drhd;
4694
4695         for_each_iommu(iommu, drhd)
4696                 iommu_disable_translation(iommu);
4697 }
4698
4699 void intel_iommu_shutdown(void)
4700 {
4701         struct dmar_drhd_unit *drhd;
4702         struct intel_iommu *iommu = NULL;
4703
4704         if (no_iommu || dmar_disabled)
4705                 return;
4706
4707         down_write(&dmar_global_lock);
4708
4709         /* Disable PMRs explicitly here. */
4710         for_each_iommu(iommu, drhd)
4711                 iommu_disable_protect_mem_regions(iommu);
4712
4713         /* Make sure the IOMMUs are switched off */
4714         intel_disable_iommus();
4715
4716         up_write(&dmar_global_lock);
4717 }
4718
4719 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4720 {
4721         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4722
4723         return container_of(iommu_dev, struct intel_iommu, iommu);
4724 }
4725
4726 static ssize_t intel_iommu_show_version(struct device *dev,
4727                                         struct device_attribute *attr,
4728                                         char *buf)
4729 {
4730         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4731         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4732         return sprintf(buf, "%d:%d\n",
4733                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4734 }
4735 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4736
4737 static ssize_t intel_iommu_show_address(struct device *dev,
4738                                         struct device_attribute *attr,
4739                                         char *buf)
4740 {
4741         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4742         return sprintf(buf, "%llx\n", iommu->reg_phys);
4743 }
4744 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4745
4746 static ssize_t intel_iommu_show_cap(struct device *dev,
4747                                     struct device_attribute *attr,
4748                                     char *buf)
4749 {
4750         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4751         return sprintf(buf, "%llx\n", iommu->cap);
4752 }
4753 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4754
4755 static ssize_t intel_iommu_show_ecap(struct device *dev,
4756                                     struct device_attribute *attr,
4757                                     char *buf)
4758 {
4759         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4760         return sprintf(buf, "%llx\n", iommu->ecap);
4761 }
4762 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4763
4764 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4765                                       struct device_attribute *attr,
4766                                       char *buf)
4767 {
4768         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4769         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4770 }
4771 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4772
4773 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4774                                            struct device_attribute *attr,
4775                                            char *buf)
4776 {
4777         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4778         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4779                                                   cap_ndoms(iommu->cap)));
4780 }
4781 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4782
4783 static struct attribute *intel_iommu_attrs[] = {
4784         &dev_attr_version.attr,
4785         &dev_attr_address.attr,
4786         &dev_attr_cap.attr,
4787         &dev_attr_ecap.attr,
4788         &dev_attr_domains_supported.attr,
4789         &dev_attr_domains_used.attr,
4790         NULL,
4791 };
4792
4793 static struct attribute_group intel_iommu_group = {
4794         .name = "intel-iommu",
4795         .attrs = intel_iommu_attrs,
4796 };
4797
4798 const struct attribute_group *intel_iommu_groups[] = {
4799         &intel_iommu_group,
4800         NULL,
4801 };
4802
4803 static inline bool has_external_pci(void)
4804 {
4805         struct pci_dev *pdev = NULL;
4806
4807         for_each_pci_dev(pdev)
4808                 if (pdev->external_facing)
4809                         return true;
4810
4811         return false;
4812 }
4813
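/*
 * Honour the firmware's DMAR platform opt-in: if the platform requests it
 * and an external-facing PCI device is present, override no_iommu and
 * dmar_disabled so that the IOMMU is forced on.
 */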
4814 static int __init platform_optin_force_iommu(void)
4815 {
4816         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4817                 return 0;
4818
4819         if (no_iommu || dmar_disabled)
4820                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4821
4822         /*
4823          * If Intel-IOMMU is disabled by default, we will apply identity
4824          * map for all devices except those marked as being untrusted.
4825          */
4826         if (dmar_disabled)
4827                 iommu_set_default_passthrough(false);
4828
4829         dmar_disabled = 0;
4830         no_iommu = 0;
4831
4832         return 1;
4833 }
4834
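/*
 * Probe the ACPI namespace devices listed in the DRHD device scopes so that
 * their physical companion devices are attached to the IOMMU as well.
 */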
4835 static int __init probe_acpi_namespace_devices(void)
4836 {
4837         struct dmar_drhd_unit *drhd;
4838         /* To avoid a -Wunused-but-set-variable warning. */
4839         struct intel_iommu *iommu __maybe_unused;
4840         struct device *dev;
4841         int i, ret = 0;
4842
4843         for_each_active_iommu(iommu, drhd) {
4844                 for_each_active_dev_scope(drhd->devices,
4845                                           drhd->devices_cnt, i, dev) {
4846                         struct acpi_device_physical_node *pn;
4847                         struct iommu_group *group;
4848                         struct acpi_device *adev;
4849
4850                         if (dev->bus != &acpi_bus_type)
4851                                 continue;
4852
4853                         adev = to_acpi_device(dev);
4854                         mutex_lock(&adev->physical_node_lock);
4855                         list_for_each_entry(pn,
4856                                             &adev->physical_node_list, node) {
4857                                 group = iommu_group_get(pn->dev);
4858                                 if (group) {
4859                                         iommu_group_put(group);
4860                                         continue;
4861                                 }
4862
4863                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4864                                 ret = iommu_probe_device(pn->dev);
4865                                 if (ret)
4866                                         break;
4867                         }
4868                         mutex_unlock(&adev->physical_node_lock);
4869
4870                         if (ret)
4871                                 return ret;
4872                 }
4873         }
4874
4875         return 0;
4876 }
4877
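/*
 * Main initialization entry point: parse the DMAR table and device scopes,
 * set up DMA remapping via init_dmars(), register each IOMMU with sysfs and
 * the IOMMU core, and finally enable translation. Bails out early (with
 * translation and PMRs switched off) when the IOMMU is disabled or absent.
 */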
4878 int __init intel_iommu_init(void)
4879 {
4880         int ret = -ENODEV;
4881         struct dmar_drhd_unit *drhd;
4882         struct intel_iommu *iommu;
4883
4884         /*
4885          * Intel IOMMU is required for a TXT/tboot launch or platform
4886          * opt in, so enforce that.
4887          */
4888         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4889
4890         if (iommu_init_mempool()) {
4891                 if (force_on)
4892                         panic("tboot: Failed to initialize iommu memory\n");
4893                 return -ENOMEM;
4894         }
4895
4896         down_write(&dmar_global_lock);
4897         if (dmar_table_init()) {
4898                 if (force_on)
4899                         panic("tboot: Failed to initialize DMAR table\n");
4900                 goto out_free_dmar;
4901         }
4902
4903         if (dmar_dev_scope_init() < 0) {
4904                 if (force_on)
4905                         panic("tboot: Failed to initialize DMAR device scope\n");
4906                 goto out_free_dmar;
4907         }
4908
4909         up_write(&dmar_global_lock);
4910
4911         /*
4912          * The bus notifier takes the dmar_global_lock, so lockdep would
4913          * complain if we registered it while still holding the lock.
4914          */

4915         dmar_register_bus_notifier();
4916
4917         down_write(&dmar_global_lock);
4918
4919         if (!no_iommu)
4920                 intel_iommu_debugfs_init();
4921
4922         if (no_iommu || dmar_disabled) {
4923                 /*
4924                  * We exit the function here to ensure the IOMMU's remapping and
4925                  * mempool aren't set up, which means that the IOMMU's PMRs
4926                  * won't be disabled via the call to init_dmars(). So disable
4927                  * them explicitly here. The PMRs were set up by tboot prior to
4928                  * calling SENTER, but the kernel is expected to reset/tear
4929                  * them down.
4930                  */
4931                 if (intel_iommu_tboot_noforce) {
4932                         for_each_iommu(iommu, drhd)
4933                                 iommu_disable_protect_mem_regions(iommu);
4934                 }
4935
4936                 /*
4937                  * Make sure the IOMMUs are switched off, even when we
4938                  * boot into a kexec kernel and the previous kernel left
4939                  * them enabled
4940                  */
4941                 intel_disable_iommus();
4942                 goto out_free_dmar;
4943         }
4944
4945         if (list_empty(&dmar_rmrr_units))
4946                 pr_info("No RMRR found\n");
4947
4948         if (list_empty(&dmar_atsr_units))
4949                 pr_info("No ATSR found\n");
4950
4951         if (dmar_init_reserved_ranges()) {
4952                 if (force_on)
4953                         panic("tboot: Failed to reserve iommu ranges\n");
4954                 goto out_free_reserved_range;
4955         }
4956
4957         if (dmar_map_gfx)
4958                 intel_iommu_gfx_mapped = 1;
4959
4960         init_no_remapping_devices();
4961
4962         ret = init_dmars();
4963         if (ret) {
4964                 if (force_on)
4965                         panic("tboot: Failed to initialize DMARs\n");
4966                 pr_err("Initialization failed\n");
4967                 goto out_free_reserved_range;
4968         }
4969         up_write(&dmar_global_lock);
4970
4971         init_iommu_pm_ops();
4972
4973         down_read(&dmar_global_lock);
4974         for_each_active_iommu(iommu, drhd) {
4975                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4976                                        intel_iommu_groups,
4977                                        "%s", iommu->name);
4978                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4979                 iommu_device_register(&iommu->iommu);
4980         }
4981         up_read(&dmar_global_lock);
4982
4983         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4984         if (si_domain && !hw_pass_through)
4985                 register_memory_notifier(&intel_iommu_memory_nb);
4986         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4987                           intel_iommu_cpu_dead);
4988
4989         down_read(&dmar_global_lock);
4990         if (probe_acpi_namespace_devices())
4991                 pr_warn("ACPI name space devices didn't probe correctly\n");
4992
4993         /* Finally, we enable the DMA remapping hardware. */
4994         for_each_iommu(iommu, drhd) {
4995                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4996                         iommu_enable_translation(iommu);
4997
4998                 iommu_disable_protect_mem_regions(iommu);
4999         }
5000         up_read(&dmar_global_lock);
5001
5002         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5003
5004         intel_iommu_enabled = 1;
5005
5006         return 0;
5007
5008 out_free_reserved_range:
5009         put_iova_domain(&reserved_iova_list);
5010 out_free_dmar:
5011         intel_iommu_free_dmars();
5012         up_write(&dmar_global_lock);
5013         iommu_exit_mempool();
5014         return ret;
5015 }
5016
5017 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5018 {
5019         struct intel_iommu *iommu = opaque;
5020
5021         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5022         return 0;
5023 }
5024
5025 /*
5026  * NB - intel-iommu lacks any sort of reference counting for the users of
5027  * dependent devices.  If multiple endpoints have intersecting dependent
5028  * devices, unbinding the driver from any one of them will possibly leave
5029  * the others unable to operate.
5030  */
5031 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5032 {
5033         if (!iommu || !dev || !dev_is_pci(dev))
5034                 return;
5035
5036         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5037 }
5038
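/*
 * Tear down a device's attachment to its domain: clear the PASID and context
 * table entries, disable the device IOTLB and detach the domain from the
 * IOMMU. Caller must hold device_domain_lock.
 */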
5039 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5040 {
5041         struct dmar_domain *domain;
5042         struct intel_iommu *iommu;
5043         unsigned long flags;
5044
5045         assert_spin_locked(&device_domain_lock);
5046
5047         if (WARN_ON(!info))
5048                 return;
5049
5050         iommu = info->iommu;
5051         domain = info->domain;
5052
5053         if (info->dev) {
5054                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5055                         intel_pasid_tear_down_entry(iommu, info->dev,
5056                                         PASID_RID2PASID, false);
5057
5058                 iommu_disable_dev_iotlb(info);
5059                 if (!dev_is_real_dma_subdevice(info->dev))
5060                         domain_context_clear(iommu, info->dev);
5061                 intel_pasid_free_table(info->dev);
5062         }
5063
5064         unlink_domain_info(info);
5065
5066         spin_lock_irqsave(&iommu->lock, flags);
5067         domain_detach_iommu(domain, iommu);
5068         spin_unlock_irqrestore(&iommu->lock, flags);
5069
5070         free_devinfo_mem(info);
5071 }
5072
5073 static void dmar_remove_one_dev_info(struct device *dev)
5074 {
5075         struct device_domain_info *info;
5076         unsigned long flags;
5077
5078         spin_lock_irqsave(&device_domain_lock, flags);
5079         info = get_domain_info(dev);
5080         if (info)
5081                 __dmar_remove_one_dev_info(info);
5082         spin_unlock_irqrestore(&device_domain_lock, flags);
5083 }
5084
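/*
 * Initialize a domain allocated through the iommu API: derive the adjusted
 * guest address width (agaw) from the requested width and allocate the top
 * level page table.
 */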
5085 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5086 {
5087         int adjust_width;
5088
5089         /* calculate AGAW */
5090         domain->gaw = guest_width;
5091         adjust_width = guestwidth_to_adjustwidth(guest_width);
5092         domain->agaw = width_to_agaw(adjust_width);
5093
5094         domain->iommu_coherency = 0;
5095         domain->iommu_snooping = 0;
5096         domain->iommu_superpage = 0;
5097         domain->max_addr = 0;
5098
5099         /* always allocate the top pgd */
5100         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5101         if (!domain->pgd)
5102                 return -ENOMEM;
5103         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5104         return 0;
5105 }
5106
5107 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5108 {
5109         init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5110         copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5111
5112         if (!intel_iommu_strict &&
5113             init_iova_flush_queue(&dmar_domain->iovad,
5114                                   iommu_flush_iova, iova_entry_free))
5115                 pr_info("iova flush queue initialization failed\n");
5116 }
5117
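/*
 * iommu_ops->domain_alloc: DMA and unmanaged domains get a fresh dmar_domain
 * (DMA domains also get an IOVA allocator); identity requests share the
 * single si_domain.
 */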
5118 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5119 {
5120         struct dmar_domain *dmar_domain;
5121         struct iommu_domain *domain;
5122
5123         switch (type) {
5124         case IOMMU_DOMAIN_DMA:
5125         case IOMMU_DOMAIN_UNMANAGED:
5126                 dmar_domain = alloc_domain(0);
5127                 if (!dmar_domain) {
5128                         pr_err("Can't allocate dmar_domain\n");
5129                         return NULL;
5130                 }
5131                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5132                         pr_err("Domain initialization failed\n");
5133                         domain_exit(dmar_domain);
5134                         return NULL;
5135                 }
5136
5137                 if (type == IOMMU_DOMAIN_DMA)
5138                         intel_init_iova_domain(dmar_domain);
5139
5140                 domain = &dmar_domain->domain;
5141                 domain->geometry.aperture_start = 0;
5142                 domain->geometry.aperture_end   =
5143                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5144                 domain->geometry.force_aperture = true;
5145
5146                 return domain;
5147         case IOMMU_DOMAIN_IDENTITY:
5148                 return &si_domain->domain;
5149         default:
5150                 return NULL;
5151         }
5152
5153         return NULL;
5154 }
5155
5156 static void intel_iommu_domain_free(struct iommu_domain *domain)
5157 {
5158         if (domain != &si_domain->domain)
5159                 domain_exit(to_dmar_domain(domain));
5160 }
5161
5162 /*
5163  * Check whether a @domain could be attached to the @dev through the
5164  * aux-domain attach/detach APIs.
5165  */
5166 static inline bool
5167 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5168 {
5169         struct device_domain_info *info = get_domain_info(dev);
5170
5171         return info && info->auxd_enabled &&
5172                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5173 }
5174
5175 static void auxiliary_link_device(struct dmar_domain *domain,
5176                                   struct device *dev)
5177 {
5178         struct device_domain_info *info = get_domain_info(dev);
5179
5180         assert_spin_locked(&device_domain_lock);
5181         if (WARN_ON(!info))
5182                 return;
5183
5184         domain->auxd_refcnt++;
5185         list_add(&domain->auxd, &info->auxiliary_domains);
5186 }
5187
5188 static void auxiliary_unlink_device(struct dmar_domain *domain,
5189                                     struct device *dev)
5190 {
5191         struct device_domain_info *info = get_domain_info(dev);
5192
5193         assert_spin_locked(&device_domain_lock);
5194         if (WARN_ON(!info))
5195                 return;
5196
5197         list_del(&domain->auxd);
5198         domain->auxd_refcnt--;
5199
5200         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5201                 ioasid_free(domain->default_pasid);
5202 }
5203
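/*
 * Attach @domain to @dev as an auxiliary domain: allocate the domain's
 * default PASID on first use and install a first- or second-level PASID
 * table entry for it.
 */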
5204 static int aux_domain_add_dev(struct dmar_domain *domain,
5205                               struct device *dev)
5206 {
5207         int ret;
5208         unsigned long flags;
5209         struct intel_iommu *iommu;
5210
5211         iommu = device_to_iommu(dev, NULL, NULL);
5212         if (!iommu)
5213                 return -ENODEV;
5214
5215         if (domain->default_pasid <= 0) {
5216                 u32 pasid;
5217
5218                 /* No private data needed for the default pasid */
5219                 pasid = ioasid_alloc(NULL, PASID_MIN,
5220                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5221                                      NULL);
5222                 if (pasid == INVALID_IOASID) {
5223                         pr_err("Can't allocate default pasid\n");
5224                         return -ENODEV;
5225                 }
5226                 domain->default_pasid = pasid;
5227         }
5228
5229         spin_lock_irqsave(&device_domain_lock, flags);
5230         /*
5231          * iommu->lock must be held to attach the domain to the iommu and to
5232          * set up the pasid entry for second level translation.
5233          */
5234         spin_lock(&iommu->lock);
5235         ret = domain_attach_iommu(domain, iommu);
5236         if (ret)
5237                 goto attach_failed;
5238
5239         /* Set up the PASID entry for mediated devices: */
5240         if (domain_use_first_level(domain))
5241                 ret = domain_setup_first_level(iommu, domain, dev,
5242                                                domain->default_pasid);
5243         else
5244                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5245                                                      domain->default_pasid);
5246         if (ret)
5247                 goto table_failed;
5248         spin_unlock(&iommu->lock);
5249
5250         auxiliary_link_device(domain, dev);
5251
5252         spin_unlock_irqrestore(&device_domain_lock, flags);
5253
5254         return 0;
5255
5256 table_failed:
5257         domain_detach_iommu(domain, iommu);
5258 attach_failed:
5259         spin_unlock(&iommu->lock);
5260         spin_unlock_irqrestore(&device_domain_lock, flags);
5261         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5262                 ioasid_free(domain->default_pasid);
5263
5264         return ret;
5265 }
5266
5267 static void aux_domain_remove_dev(struct dmar_domain *domain,
5268                                   struct device *dev)
5269 {
5270         struct device_domain_info *info;
5271         struct intel_iommu *iommu;
5272         unsigned long flags;
5273
5274         if (!is_aux_domain(dev, &domain->domain))
5275                 return;
5276
5277         spin_lock_irqsave(&device_domain_lock, flags);
5278         info = get_domain_info(dev);
5279         iommu = info->iommu;
5280
5281         auxiliary_unlink_device(domain, dev);
5282
5283         spin_lock(&iommu->lock);
5284         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5285         domain_detach_iommu(domain, iommu);
5286         spin_unlock(&iommu->lock);
5287
5288         spin_unlock_irqrestore(&device_domain_lock, flags);
5289 }
5290
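/*
 * Make sure @domain fits the addressing capabilities of the IOMMU that
 * serves @dev, shrinking the page table depth if the domain was built with
 * a wider agaw than the hardware supports.
 */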
5291 static int prepare_domain_attach_device(struct iommu_domain *domain,
5292                                         struct device *dev)
5293 {
5294         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5295         struct intel_iommu *iommu;
5296         int addr_width;
5297
5298         iommu = device_to_iommu(dev, NULL, NULL);
5299         if (!iommu)
5300                 return -ENODEV;
5301
5302         /* check if this iommu agaw is sufficient for max mapped address */
5303         addr_width = agaw_to_width(iommu->agaw);
5304         if (addr_width > cap_mgaw(iommu->cap))
5305                 addr_width = cap_mgaw(iommu->cap);
5306
5307         if (dmar_domain->max_addr > (1LL << addr_width)) {
5308                 dev_err(dev, "%s: iommu width (%d) is not "
5309                         "sufficient for the mapped address (%llx)\n",
5310                         __func__, addr_width, dmar_domain->max_addr);
5311                 return -EFAULT;
5312         }
5313         dmar_domain->gaw = addr_width;
5314
5315         /*
5316          * Knock out extra levels of page tables if necessary
5317          */
5318         while (iommu->agaw < dmar_domain->agaw) {
5319                 struct dma_pte *pte;
5320
5321                 pte = dmar_domain->pgd;
5322                 if (dma_pte_present(pte)) {
5323                         dmar_domain->pgd = (struct dma_pte *)
5324                                 phys_to_virt(dma_pte_addr(pte));
5325                         free_pgtable_page(pte);
5326                 }
5327                 dmar_domain->agaw--;
5328         }
5329
5330         return 0;
5331 }
5332
5333 static int intel_iommu_attach_device(struct iommu_domain *domain,
5334                                      struct device *dev)
5335 {
5336         int ret;
5337
5338         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5339             device_is_rmrr_locked(dev)) {
5340                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5341                 return -EPERM;
5342         }
5343
5344         if (is_aux_domain(dev, domain))
5345                 return -EPERM;
5346
5347         /* normally dev is not mapped */
5348         if (unlikely(domain_context_mapped(dev))) {
5349                 struct dmar_domain *old_domain;
5350
5351                 old_domain = find_domain(dev);
5352                 if (old_domain)
5353                         dmar_remove_one_dev_info(dev);
5354         }
5355
5356         ret = prepare_domain_attach_device(domain, dev);
5357         if (ret)
5358                 return ret;
5359
5360         return domain_add_dev_info(to_dmar_domain(domain), dev);
5361 }
5362
5363 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5364                                          struct device *dev)
5365 {
5366         int ret;
5367
5368         if (!is_aux_domain(dev, domain))
5369                 return -EPERM;
5370
5371         ret = prepare_domain_attach_device(domain, dev);
5372         if (ret)
5373                 return ret;
5374
5375         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5376 }
5377
5378 static void intel_iommu_detach_device(struct iommu_domain *domain,
5379                                       struct device *dev)
5380 {
5381         dmar_remove_one_dev_info(dev);
5382 }
5383
5384 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5385                                           struct device *dev)
5386 {
5387         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5388 }
5389
5390 /*
5391  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5392  * VT-d granularity. Invalidation is typically included in the unmap operation
5393  * as a result of a DMA or VFIO unmap. However, for assigned devices the guest
5394  * owns the first level page tables. Invalidations of translation caches in the
5395  * guest are trapped and passed down to the host.
5396  *
5397  * The vIOMMU in the guest will only expose first level page tables, therefore
5398  * we do not support IOTLB granularity for requests without a PASID (second level).
5399  *
5400  * For example, to find the VT-d granularity encoding for IOTLB
5401  * type and page selective granularity within PASID:
5402  * X: indexed by iommu cache type
5403  * Y: indexed by enum iommu_inv_granularity
5404  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5405  */
5406
5407 static const int
5408 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5409         /*
5410          * PASID based IOTLB invalidation: PASID selective (per PASID),
5411          * page selective (address granularity)
5412          */
5413         {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5414         /* PASID based dev TLBs */
5415         {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5416         /* PASID cache */
5417         {-EINVAL, -EINVAL, -EINVAL}
5418 };
5419
5420 static inline int to_vtd_granularity(int type, int granu)
5421 {
5422         return inv_type_granu_table[type][granu];
5423 }
5424
5425 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5426 {
5427         u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5428
5429         /* VT-d size is encoded as 2^size 4K pages: 0 for 4K, 9 for 2MB, etc.
5430          * The IOMMU cache invalidate API passes granu_size in bytes, plus the
5431          * number of such granules that are contiguous in memory.
5432          */
5433         return order_base_2(nr_pages);
5434 }
5435
5436 #ifdef CONFIG_INTEL_IOMMU_SVM
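/*
 * Handle a cache invalidation request passed down on behalf of a guest
 * vIOMMU: translate the generic cache type and granularity into VT-d queued
 * invalidation operations for the IOTLB and, when ATS is enabled, the
 * device IOTLB.
 */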
5437 static int
5438 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5439                            struct iommu_cache_invalidate_info *inv_info)
5440 {
5441         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5442         struct device_domain_info *info;
5443         struct intel_iommu *iommu;
5444         unsigned long flags;
5445         int cache_type;
5446         u8 bus, devfn;
5447         u16 did, sid;
5448         int ret = 0;
5449         u64 size = 0;
5450
5451         if (!inv_info || !dmar_domain)
5452                 return -EINVAL;
5453
5454         if (!dev || !dev_is_pci(dev))
5455                 return -ENODEV;
5456
5457         iommu = device_to_iommu(dev, &bus, &devfn);
5458         if (!iommu)
5459                 return -ENODEV;
5460
5461         if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5462                 return -EINVAL;
5463
5464         spin_lock_irqsave(&device_domain_lock, flags);
5465         spin_lock(&iommu->lock);
5466         info = get_domain_info(dev);
5467         if (!info) {
5468                 ret = -EINVAL;
5469                 goto out_unlock;
5470         }
5471         did = dmar_domain->iommu_did[iommu->seq_id];
5472         sid = PCI_DEVID(bus, devfn);
5473
5474         /* Size is only valid in address selective invalidation */
5475         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5476                 size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5477                                    inv_info->granu.addr_info.nb_granules);
5478
5479         for_each_set_bit(cache_type,
5480                          (unsigned long *)&inv_info->cache,
5481                          IOMMU_CACHE_INV_TYPE_NR) {
5482                 int granu = 0;
5483                 u64 pasid = 0;
5484                 u64 addr = 0;
5485
5486                 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5487                 if (granu == -EINVAL) {
5488                         pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5489                                            cache_type, inv_info->granularity);
5490                         break;
5491                 }
5492
5493                 /*
5494                  * PASID is stored in different locations based on the
5495                  * granularity.
5496                  */
5497                 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5498                     (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5499                         pasid = inv_info->granu.pasid_info.pasid;
5500                 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5501                          (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5502                         pasid = inv_info->granu.addr_info.pasid;
5503
5504                 switch (BIT(cache_type)) {
5505                 case IOMMU_CACHE_INV_TYPE_IOTLB:
5506                         /* HW will ignore LSB bits based on address mask */
5507                         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5508                             size &&
5509                             (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5510                                 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5511                                                    inv_info->granu.addr_info.addr, size);
5512                         }
5513
5514                         /*
5515                          * If granu is PASID-selective, address is ignored.
5516                          * We use npages = -1 to indicate that.
5517                          */
5518                         qi_flush_piotlb(iommu, did, pasid,
5519                                         mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5520                                         (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5521                                         inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5522
5523                         if (!info->ats_enabled)
5524                                 break;
5525                         /*
5526                          * Always flush device IOTLB if ATS is enabled. vIOMMU
5527                          * in the guest may assume IOTLB flush is inclusive,
5528                          * which is more efficient.
5529                          */
5530                         fallthrough;
5531                 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5532                         /*
5533                          * PASID based device TLB invalidation does not support
5534                          * IOMMU_INV_GRANU_PASID granularity, only
5535                          * IOMMU_INV_GRANU_ADDR. To get the same effect we set
5536                          * the size to cover the entire 64-bit address range.
5537                          * The user only provides PASID info without address
5538                          * info, so we set addr to 0.
5539                          */
5540                         if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5541                                 size = 64 - VTD_PAGE_SHIFT;
5542                                 addr = 0;
5543                         } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5544                                 addr = inv_info->granu.addr_info.addr;
5545                         }
5546
5547                         if (info->ats_enabled)
5548                                 qi_flush_dev_iotlb_pasid(iommu, sid,
5549                                                 info->pfsid, pasid,
5550                                                 info->ats_qdep, addr,
5551                                                 size);
5552                         else
5553                                 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5554                         break;
5555                 default:
5556                         dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5557                                             cache_type);
5558                         ret = -EINVAL;
5559                 }
5560         }
5561 out_unlock:
5562         spin_unlock(&iommu->lock);
5563         spin_unlock_irqrestore(&device_domain_lock, flags);
5564
5565         return ret;
5566 }
5567 #endif
5568
5569 static int intel_iommu_map(struct iommu_domain *domain,
5570                            unsigned long iova, phys_addr_t hpa,
5571                            size_t size, int iommu_prot, gfp_t gfp)
5572 {
5573         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5574         u64 max_addr;
5575         int prot = 0;
5576         int ret;
5577
5578         if (iommu_prot & IOMMU_READ)
5579                 prot |= DMA_PTE_READ;
5580         if (iommu_prot & IOMMU_WRITE)
5581                 prot |= DMA_PTE_WRITE;
5582         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5583                 prot |= DMA_PTE_SNP;
5584
5585         max_addr = iova + size;
5586         if (dmar_domain->max_addr < max_addr) {
5587                 u64 end;
5588
5589                 /* check if minimum agaw is sufficient for mapped address */
5590                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5591                 if (end < max_addr) {
5592                         pr_err("%s: iommu width (%d) is not "
5593                                "sufficient for the mapped address (%llx)\n",
5594                                __func__, dmar_domain->gaw, max_addr);
5595                         return -EFAULT;
5596                 }
5597                 dmar_domain->max_addr = max_addr;
5598         }
5599         /* Round up size to next multiple of PAGE_SIZE, if it and
5600            the low bits of hpa would take us onto the next page */
5601         size = aligned_nrpages(hpa, size);
5602         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5603                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5604         return ret;
5605 }
5606
5607 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5608                                 unsigned long iova, size_t size,
5609                                 struct iommu_iotlb_gather *gather)
5610 {
5611         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5612         struct page *freelist = NULL;
5613         unsigned long start_pfn, last_pfn;
5614         unsigned int npages;
5615         int iommu_id, level = 0;
5616
5617         /* Cope with horrid API which requires us to unmap more than the
5618            size argument if it happens to be a large-page mapping. */
5619         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5620
5621         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5622                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5623
5624         start_pfn = iova >> VTD_PAGE_SHIFT;
5625         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5626
5627         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5628
5629         npages = last_pfn - start_pfn + 1;
5630
5631         for_each_domain_iommu(iommu_id, dmar_domain)
5632                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5633                                       start_pfn, npages, !freelist, 0);
5634
5635         dma_free_pagelist(freelist);
5636
5637         if (dmar_domain->max_addr == iova + size)
5638                 dmar_domain->max_addr = iova;
5639
5640         return size;
5641 }
5642
5643 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5644                                             dma_addr_t iova)
5645 {
5646         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5647         struct dma_pte *pte;
5648         int level = 0;
5649         u64 phys = 0;
5650
5651         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5652         if (pte && dma_pte_present(pte))
5653                 phys = dma_pte_addr(pte) +
5654                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5655                                                 VTD_PAGE_SHIFT) - 1));
5656
5657         return phys;
5658 }
5659
5660 static inline bool scalable_mode_support(void)
5661 {
5662         struct dmar_drhd_unit *drhd;
5663         struct intel_iommu *iommu;
5664         bool ret = true;
5665
5666         rcu_read_lock();
5667         for_each_active_iommu(iommu, drhd) {
5668                 if (!sm_supported(iommu)) {
5669                         ret = false;
5670                         break;
5671                 }
5672         }
5673         rcu_read_unlock();
5674
5675         return ret;
5676 }
5677
5678 static inline bool iommu_pasid_support(void)
5679 {
5680         struct dmar_drhd_unit *drhd;
5681         struct intel_iommu *iommu;
5682         bool ret = true;
5683
5684         rcu_read_lock();
5685         for_each_active_iommu(iommu, drhd) {
5686                 if (!pasid_supported(iommu)) {
5687                         ret = false;
5688                         break;
5689                 }
5690         }
5691         rcu_read_unlock();
5692
5693         return ret;
5694 }
5695
5696 static inline bool nested_mode_support(void)
5697 {
5698         struct dmar_drhd_unit *drhd;
5699         struct intel_iommu *iommu;
5700         bool ret = true;
5701
5702         rcu_read_lock();
5703         for_each_active_iommu(iommu, drhd) {
5704                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5705                         ret = false;
5706                         break;
5707                 }
5708         }
5709         rcu_read_unlock();
5710
5711         return ret;
5712 }
5713
5714 static bool intel_iommu_capable(enum iommu_cap cap)
5715 {
5716         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5717                 return domain_update_iommu_snooping(NULL) == 1;
5718         if (cap == IOMMU_CAP_INTR_REMAP)
5719                 return irq_remapping_enabled == 1;
5720
5721         return false;
5722 }
5723
5724 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5725 {
5726         struct intel_iommu *iommu;
5727
5728         iommu = device_to_iommu(dev, NULL, NULL);
5729         if (!iommu)
5730                 return ERR_PTR(-ENODEV);
5731
5732         if (translation_pre_enabled(iommu))
5733                 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5734
5735         return &iommu->iommu;
5736 }
5737
5738 static void intel_iommu_release_device(struct device *dev)
5739 {
5740         struct intel_iommu *iommu;
5741
5742         iommu = device_to_iommu(dev, NULL, NULL);
5743         if (!iommu)
5744                 return;
5745
5746         dmar_remove_one_dev_info(dev);
5747
5748         set_dma_ops(dev, NULL);
5749 }
5750
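/*
 * Pick the DMA ops for a newly probed device: bounce-buffer ops when the
 * device needs them, the Intel IOMMU DMA ops when the device ended up in a
 * DMA domain, and the default direct ops otherwise.
 */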
5751 static void intel_iommu_probe_finalize(struct device *dev)
5752 {
5753         struct iommu_domain *domain;
5754
5755         domain = iommu_get_domain_for_dev(dev);
5756         if (device_needs_bounce(dev))
5757                 set_dma_ops(dev, &bounce_dma_ops);
5758         else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5759                 set_dma_ops(dev, &intel_dma_ops);
5760         else
5761                 set_dma_ops(dev, NULL);
5762 }
5763
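/*
 * Report reserved regions for @device: RMRRs that cover it (direct-mapped),
 * the first 16MB for ISA bridges when the floppy workaround is enabled, and
 * the MSI window covering the IOAPIC range.
 */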
5764 static void intel_iommu_get_resv_regions(struct device *device,
5765                                          struct list_head *head)
5766 {
5767         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5768         struct iommu_resv_region *reg;
5769         struct dmar_rmrr_unit *rmrr;
5770         struct device *i_dev;
5771         int i;
5772
5773         down_read(&dmar_global_lock);
5774         for_each_rmrr_units(rmrr) {
5775                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5776                                           i, i_dev) {
5777                         struct iommu_resv_region *resv;
5778                         enum iommu_resv_type type;
5779                         size_t length;
5780
5781                         if (i_dev != device &&
5782                             !is_downstream_to_pci_bridge(device, i_dev))
5783                                 continue;
5784
5785                         length = rmrr->end_address - rmrr->base_address + 1;
5786
5787                         type = device_rmrr_is_relaxable(device) ?
5788                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5789
5790                         resv = iommu_alloc_resv_region(rmrr->base_address,
5791                                                        length, prot, type);
5792                         if (!resv)
5793                                 break;
5794
5795                         list_add_tail(&resv->list, head);
5796                 }
5797         }
5798         up_read(&dmar_global_lock);
5799
5800 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5801         if (dev_is_pci(device)) {
5802                 struct pci_dev *pdev = to_pci_dev(device);
5803
5804                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5805                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5806                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5807                         if (reg)
5808                                 list_add_tail(&reg->list, head);
5809                 }
5810         }
5811 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5812
5813         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5814                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5815                                       0, IOMMU_RESV_MSI);
5816         if (!reg)
5817                 return;
5818         list_add_tail(&reg->list, head);
5819 }
5820
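/*
 * Turn on PASID support for @dev: set the PASID-enable bit in its context
 * entry (flushing the context cache if it changed) and enable the PASID
 * capability in the device itself.
 */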
5821 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5822 {
5823         struct device_domain_info *info;
5824         struct context_entry *context;
5825         struct dmar_domain *domain;
5826         unsigned long flags;
5827         u64 ctx_lo;
5828         int ret;
5829
5830         domain = find_domain(dev);
5831         if (!domain)
5832                 return -EINVAL;
5833
5834         spin_lock_irqsave(&device_domain_lock, flags);
5835         spin_lock(&iommu->lock);
5836
5837         ret = -EINVAL;
5838         info = get_domain_info(dev);
5839         if (!info || !info->pasid_supported)
5840                 goto out;
5841
5842         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5843         if (WARN_ON(!context))
5844                 goto out;
5845
5846         ctx_lo = context[0].lo;
5847
5848         if (!(ctx_lo & CONTEXT_PASIDE)) {
5849                 ctx_lo |= CONTEXT_PASIDE;
5850                 context[0].lo = ctx_lo;
5851                 wmb();
5852                 iommu->flush.flush_context(iommu,
5853                                            domain->iommu_did[iommu->seq_id],
5854                                            PCI_DEVID(info->bus, info->devfn),
5855                                            DMA_CCMD_MASK_NOBIT,
5856                                            DMA_CCMD_DEVICE_INVL);
5857         }
5858
5859         /* Enable PASID support in the device, if it wasn't already */
5860         if (!info->pasid_enabled)
5861                 iommu_enable_dev_iotlb(info);
5862
5863         ret = 0;
5864
5865  out:
5866         spin_unlock(&iommu->lock);
5867         spin_unlock_irqrestore(&device_domain_lock, flags);
5868
5869         return ret;
5870 }
5871
5872 static void intel_iommu_apply_resv_region(struct device *dev,
5873                                           struct iommu_domain *domain,
5874                                           struct iommu_resv_region *region)
5875 {
5876         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5877         unsigned long start, end;
5878
5879         start = IOVA_PFN(region->start);
5880         end   = IOVA_PFN(region->start + region->length - 1);
5881
5882         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5883 }
5884
5885 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5886 {
5887         if (dev_is_pci(dev))
5888                 return pci_device_group(dev);
5889         return generic_device_group(dev);
5890 }
5891
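/*
 * Enable auxiliary domain support for @dev (used for mediated devices):
 * requires a scalable-mode IOMMU with PASID support, and turns on PASID for
 * the device before marking it aux-capable.
 */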
5892 static int intel_iommu_enable_auxd(struct device *dev)
5893 {
5894         struct device_domain_info *info;
5895         struct intel_iommu *iommu;
5896         unsigned long flags;
5897         int ret;
5898
5899         iommu = device_to_iommu(dev, NULL, NULL);
5900         if (!iommu || dmar_disabled)
5901                 return -EINVAL;
5902
5903         if (!sm_supported(iommu) || !pasid_supported(iommu))
5904                 return -EINVAL;
5905
5906         ret = intel_iommu_enable_pasid(iommu, dev);
5907         if (ret)
5908                 return -ENODEV;
5909
5910         spin_lock_irqsave(&device_domain_lock, flags);
5911         info = get_domain_info(dev);
5912         info->auxd_enabled = 1;
5913         spin_unlock_irqrestore(&device_domain_lock, flags);
5914
5915         return 0;
5916 }
5917
5918 static int intel_iommu_disable_auxd(struct device *dev)
5919 {
5920         struct device_domain_info *info;
5921         unsigned long flags;
5922
5923         spin_lock_irqsave(&device_domain_lock, flags);
5924         info = get_domain_info(dev);
5925         if (!WARN_ON(!info))
5926                 info->auxd_enabled = 0;
5927         spin_unlock_irqrestore(&device_domain_lock, flags);
5928
5929         return 0;
5930 }
5931
5932 /*
5933  * A PCI Express designated vendor specific extended capability is defined
5934  * in section 3.7 of the Intel scalable I/O virtualization technical spec
5935  * for system software and tools to detect endpoint devices supporting
5936  * Intel scalable I/O virtualization without a host driver dependency.
5937  *
5938  * Returns the offset of the matching extended capability structure within
5939  * the device's PCI configuration space, or 0 if the device does not
5940  * support it.
5941  */
5942 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5943 {
5944         int pos;
5945         u16 vendor, id;
5946
5947         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5948         while (pos) {
5949                 pci_read_config_word(pdev, pos + 4, &vendor);
5950                 pci_read_config_word(pdev, pos + 8, &id);
5951                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5952                         return pos;
5953
5954                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5955         }
5956
5957         return 0;
5958 }
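
/*
 * Illustrative sketch (comment only, not compiled): the raw numbers used
 * above correspond to named constants in <uapi/linux/pci_regs.h> -- 0x23
 * is PCI_EXT_CAP_ID_DVSEC and the offsets 4 and 8 are PCI_DVSEC_HEADER1
 * and PCI_DVSEC_HEADER2 (assuming those definitions are available in this
 * tree). An equivalent, more self-documenting loop would be:
 *
 *	pos = pci_find_next_ext_capability(pdev, 0, PCI_EXT_CAP_ID_DVSEC);
 *	while (pos) {
 *		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vendor);
 *		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2, &id);
 *		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
 *			return pos;	// DVSEC ID 5 identifies Scalable IOV
 *		pos = pci_find_next_ext_capability(pdev, pos,
 *						   PCI_EXT_CAP_ID_DVSEC);
 *	}
 *	return 0;
 */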
5959
5960 static bool
5961 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5962 {
5963         if (feat == IOMMU_DEV_FEAT_AUX) {
5964                 int ret;
5965
5966                 if (!dev_is_pci(dev) || dmar_disabled ||
5967                     !scalable_mode_support() || !iommu_pasid_support())
5968                         return false;
5969
5970                 ret = pci_pasid_features(to_pci_dev(dev));
5971                 if (ret < 0)
5972                         return false;
5973
5974                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5975         }
5976
5977         if (feat == IOMMU_DEV_FEAT_SVA) {
5978                 struct device_domain_info *info = get_domain_info(dev);
5979
5980                 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5981                         info->pasid_supported && info->pri_supported &&
5982                         info->ats_supported;
5983         }
5984
5985         return false;
5986 }
5987
5988 static int
5989 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5990 {
5991         if (feat == IOMMU_DEV_FEAT_AUX)
5992                 return intel_iommu_enable_auxd(dev);
5993
5994         if (feat == IOMMU_DEV_FEAT_SVA) {
5995                 struct device_domain_info *info = get_domain_info(dev);
5996
5997                 if (!info)
5998                         return -EINVAL;
5999
6000                 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
6001                         return 0;
6002         }
6003
6004         return -ENODEV;
6005 }
6006
6007 static int
6008 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6009 {
6010         if (feat == IOMMU_DEV_FEAT_AUX)
6011                 return intel_iommu_disable_auxd(dev);
6012
6013         return -ENODEV;
6014 }
6015
6016 static bool
6017 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6018 {
6019         struct device_domain_info *info = get_domain_info(dev);
6020
6021         if (feat == IOMMU_DEV_FEAT_AUX)
6022                 return scalable_mode_support() && info && info->auxd_enabled;
6023
6024         return false;
6025 }
6026
6027 static int
6028 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6029 {
6030         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6031
6032         return dmar_domain->default_pasid > 0 ?
6033                         dmar_domain->default_pasid : -EINVAL;
6034 }
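
/*
 * Usage sketch (illustrative only): a consumer such as an mdev parent
 * driver would normally reach the aux-domain path above through the
 * generic IOMMU API, roughly as follows (error handling elided):
 *
 *	struct iommu_domain *dom;
 *	int pasid;
 *
 *	if (!iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX))
 *		return -ENODEV;
 *	if (iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX))
 *		return -ENODEV;
 *	dom = iommu_domain_alloc(dev->bus);
 *	iommu_aux_attach_device(dom, dev);
 *	pasid = iommu_aux_get_pasid(dom, dev);
 *	// program the parent device to tag the mdev's DMA with @pasid
 */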
6035
6036 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6037                                            struct device *dev)
6038 {
6039         return attach_deferred(dev);
6040 }
6041
6042 static int
6043 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6044                             enum iommu_attr attr, void *data)
6045 {
6046         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6047         unsigned long flags;
6048         int ret = 0;
6049
6050         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6051                 return -EINVAL;
6052
6053         switch (attr) {
6054         case DOMAIN_ATTR_NESTING:
6055                 spin_lock_irqsave(&device_domain_lock, flags);
6056                 if (nested_mode_support() &&
6057                     list_empty(&dmar_domain->devices)) {
6058                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6059                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6060                 } else {
6061                         ret = -ENODEV;
6062                 }
6063                 spin_unlock_irqrestore(&device_domain_lock, flags);
6064                 break;
6065         default:
6066                 ret = -EINVAL;
6067                 break;
6068         }
6069
6070         return ret;
6071 }
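
/*
 * Illustrative usage (not part of this file): DOMAIN_ATTR_NESTING must
 * be requested on a freshly allocated, unmanaged domain before any
 * device is attached, since the handler above refuses once
 * dmar_domain->devices is non-empty. Roughly:
 *
 *	int nesting = 1;
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	if (iommu_domain_set_attr(dom, DOMAIN_ATTR_NESTING, &nesting))
 *		return -ENODEV;	// or fall back to a non-nested setup
 *	iommu_attach_device(dom, dev);
 */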
6072
6073 /*
6074  * Check that the device does not live on an external-facing PCI port that is
6075  * marked as untrusted. Such devices must not be allowed to apply quirks and
6076  * thereby bypass the IOMMU restrictions.
6077  */
6078 static bool risky_device(struct pci_dev *pdev)
6079 {
6080         if (pdev->untrusted) {
6081                 pci_info(pdev,
6082                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6083                          pdev->vendor, pdev->device);
6084                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6085                 return true;
6086         }
6087         return false;
6088 }
6089
6090 const struct iommu_ops intel_iommu_ops = {
6091         .capable                = intel_iommu_capable,
6092         .domain_alloc           = intel_iommu_domain_alloc,
6093         .domain_free            = intel_iommu_domain_free,
6094         .domain_set_attr        = intel_iommu_domain_set_attr,
6095         .attach_dev             = intel_iommu_attach_device,
6096         .detach_dev             = intel_iommu_detach_device,
6097         .aux_attach_dev         = intel_iommu_aux_attach_device,
6098         .aux_detach_dev         = intel_iommu_aux_detach_device,
6099         .aux_get_pasid          = intel_iommu_aux_get_pasid,
6100         .map                    = intel_iommu_map,
6101         .unmap                  = intel_iommu_unmap,
6102         .iova_to_phys           = intel_iommu_iova_to_phys,
6103         .probe_device           = intel_iommu_probe_device,
6104         .probe_finalize         = intel_iommu_probe_finalize,
6105         .release_device         = intel_iommu_release_device,
6106         .get_resv_regions       = intel_iommu_get_resv_regions,
6107         .put_resv_regions       = generic_iommu_put_resv_regions,
6108         .apply_resv_region      = intel_iommu_apply_resv_region,
6109         .device_group           = intel_iommu_device_group,
6110         .dev_has_feat           = intel_iommu_dev_has_feat,
6111         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6112         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6113         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6114         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6115         .def_domain_type        = device_def_domain_type,
6116         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6117 #ifdef CONFIG_INTEL_IOMMU_SVM
6118         .cache_invalidate       = intel_iommu_sva_invalidate,
6119         .sva_bind_gpasid        = intel_svm_bind_gpasid,
6120         .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
6121         .sva_bind               = intel_svm_bind,
6122         .sva_unbind             = intel_svm_unbind,
6123         .sva_get_pasid          = intel_svm_get_pasid,
6124         .page_response          = intel_svm_page_response,
6125 #endif
6126 };
6127
6128 static void quirk_iommu_igfx(struct pci_dev *dev)
6129 {
6130         if (risky_device(dev))
6131                 return;
6132
6133         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6134         dmar_map_gfx = 0;
6135 }
6136
6137 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6138 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6139 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6140 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6141 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6142 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6143 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6144 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6145
6146 /* Broadwell igfx malfunctions with dmar */
6147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6150 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6151 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6152 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6153 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6154 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6155 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6156 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6157 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6158 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6159 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6160 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6161 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6162 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6163 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6164 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6165 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6166 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6167 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6168 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6169 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6170 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6171
6172 static void quirk_iommu_rwbf(struct pci_dev *dev)
6173 {
6174         if (risky_device(dev))
6175                 return;
6176
6177         /*
6178          * Mobile 4 Series Chipset neglects to set RWBF capability,
6179          * but needs it. Same seems to hold for the desktop versions.
6180          */
6181         pci_info(dev, "Forcing write-buffer flush capability\n");
6182         rwbf_quirk = 1;
6183 }
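
/*
 * In effect, rwbf_quirk makes the driver behave as if cap_rwbf() were
 * set: explicit write-buffer flushes are issued even though the chipset
 * fails to advertise the RWBF capability.
 */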
6184
6185 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6186 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6187 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6188 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6189 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6190 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6191 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6192
6193 #define GGC 0x52
6194 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6195 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6196 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6197 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6198 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6199 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6200 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6201 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6202
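/*
 * Worked example (hypothetical values): a GGC read of 0x0958 has
 * (ggc & GGC_MEMORY_SIZE_MASK) == GGC_MEMORY_SIZE_2M_VT, so
 * GGC_MEMORY_VT_ENABLED is set and the quirk below at most tightens the
 * IOTLB flushing policy; a read of 0x0158 (GGC_MEMORY_SIZE_1M, VT bit
 * clear) means the BIOS allocated no shadow GTT, so graphics must be
 * left untranslated (dmar_map_gfx = 0).
 */
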
6203 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6204 {
6205         unsigned short ggc;
6206
6207         if (risky_device(dev))
6208                 return;
6209
6210         if (pci_read_config_word(dev, GGC, &ggc))
6211                 return;
6212
6213         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6214                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6215                 dmar_map_gfx = 0;
6216         } else if (dmar_map_gfx) {
6217                 /* we have to ensure the gfx device is idle before we flush */
6218                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6219                 intel_iommu_strict = 1;
6220         }
6221 }
6222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6223 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6224 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6225 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6226
6227 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6228 {
6229         unsigned short ver;
6230
6231         if (!IS_GFX_DEVICE(dev))
6232                 return;
6233
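        /*
         * The upper byte of the PCI device ID is used as a coarse
         * graphics-generation check for the platforms this workaround
         * applies to.
         */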
6234         ver = (dev->device >> 8) & 0xff;
6235         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6236             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6237             ver != 0x9a)
6238                 return;
6239
6240         if (risky_device(dev))
6241                 return;
6242
6243         pci_info(dev, "Skip IOMMU disabling for graphics\n");
6244         iommu_skip_te_disable = 1;
6245 }
6246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6247
6248 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6249    ISOCH DMAR unit for the Azalia sound device, but not give it any
6250    TLB entries, which causes it to deadlock. Check for that.  We do
6251    this in a function called from init_dmars(), instead of in a PCI
6252    quirk, because we don't want to print the obnoxious "BIOS broken"
6253    message if VT-d is actually disabled.
6254 */
6255 static void __init check_tylersburg_isoch(void)
6256 {
6257         struct pci_dev *pdev;
6258         uint32_t vtisochctrl;
6259
6260         /* If there's no Azalia in the system anyway, forget it. */
6261         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6262         if (!pdev)
6263                 return;
6264
6265         if (risky_device(pdev)) {
6266                 pci_dev_put(pdev);
6267                 return;
6268         }
6269
6270         pci_dev_put(pdev);
6271
6272         /* System Management Registers. Might be hidden, in which case
6273            we can't do the sanity check. But that's OK, because the
6274            known-broken BIOSes _don't_ actually hide it, so far. */
6275         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6276         if (!pdev)
6277                 return;
6278
6279         if (risky_device(pdev)) {
6280                 pci_dev_put(pdev);
6281                 return;
6282         }
6283
6284         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6285                 pci_dev_put(pdev);
6286                 return;
6287         }
6288
6289         pci_dev_put(pdev);
6290
6291         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6292         if (vtisochctrl & 1)
6293                 return;
6294
6295         /* Drop all bits other than the number of TLB entries */
6296         vtisochctrl &= 0x1c;
6297
6298         /* If we have the recommended number of TLB entries (16), fine. */
6299         if (vtisochctrl == 0x10)
6300                 return;
6301
6302         /* Zero TLB entries? You get to ride the short bus to school. */
6303         if (!vtisochctrl) {
6304                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6305                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6306                      dmi_get_system_info(DMI_BIOS_VENDOR),
6307                      dmi_get_system_info(DMI_BIOS_VERSION),
6308                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6309                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6310                 return;
6311         }
6312
6313         pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6314                vtisochctrl);
6315 }