[linux-2.6-microblaze.git] / drivers/iommu/intel-iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47
48 #include "irq_remapping.h"
49 #include "intel-pasid.h"
50
51 #define ROOT_SIZE               VTD_PAGE_SIZE
52 #define CONTEXT_SIZE            VTD_PAGE_SIZE
53
54 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
55 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
56 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
57 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
58
59 #define IOAPIC_RANGE_START      (0xfee00000)
60 #define IOAPIC_RANGE_END        (0xfeefffff)
61 #define IOVA_START_ADDR         (0x1000)
62
63 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
64
65 #define MAX_AGAW_WIDTH 64
66 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
67
68 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
69 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
70
71 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
72    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
73 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
74                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
75 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
76
77 /* IO virtual address start page frame number */
78 #define IOVA_START_PFN          (1)
79
80 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
81
82 /* page table handling */
83 #define LEVEL_STRIDE            (9)
84 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
85
86 /*
87  * This bitmap is used to advertise the page sizes our hardware supports
88  * to the IOMMU core, which will then use this information to split
89  * physically contiguous memory regions it is mapping into page sizes
90  * that we support.
91  *
92  * Traditionally the IOMMU core just handed us the mappings directly,
93  * after making sure the size is a power-of-two multiple of 4KiB and that the
94  * mapping has natural alignment.
95  *
96  * To retain this behavior, we currently advertise that we support
97  * all page sizes that are a power-of-two multiple of 4KiB.
98  *
99  * If at some point we'd like to utilize the IOMMU core's new behavior,
100  * we could change this to advertise the real page sizes we support.
101  */
102 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
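/*
 * For example, ~0xFFFUL leaves every bit from bit 12 upwards set, so each
 * power-of-two size that is a multiple of 4KiB (4KiB, 8KiB, 16KiB, ...)
 * is advertised to the IOMMU core as supported.
 */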
103
104 static inline int agaw_to_level(int agaw)
105 {
106         return agaw + 2;
107 }
108
109 static inline int agaw_to_width(int agaw)
110 {
111         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
112 }
113
114 static inline int width_to_agaw(int width)
115 {
116         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
117 }
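/*
 * Worked example: a 48-bit address width gives width_to_agaw(48) ==
 * DIV_ROUND_UP(48 - 30, 9) == 2, which corresponds to agaw_to_level(2) == 4
 * page-table levels and agaw_to_width(2) == 48 bits.
 */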
118
119 static inline unsigned int level_to_offset_bits(int level)
120 {
121         return (level - 1) * LEVEL_STRIDE;
122 }
123
124 static inline int pfn_level_offset(unsigned long pfn, int level)
125 {
126         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
127 }
128
129 static inline unsigned long level_mask(int level)
130 {
131         return -1UL << level_to_offset_bits(level);
132 }
133
134 static inline unsigned long level_size(int level)
135 {
136         return 1UL << level_to_offset_bits(level);
137 }
138
139 static inline unsigned long align_to_level(unsigned long pfn, int level)
140 {
141         return (pfn + level_size(level) - 1) & level_mask(level);
142 }
143
144 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
145 {
146         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
147 }
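/*
 * At level 2, for instance, level_to_offset_bits() is 9, so a level-2
 * entry spans level_size(2) == 512 PFNs and lvl_to_nr_pages(2) reports
 * 512 4KiB pages (2MiB) behind one such entry.
 */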
148
149 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
150    are never going to work. */
151 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
152 {
153         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 }
155
156 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
157 {
158         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
159 }
160 static inline unsigned long page_to_dma_pfn(struct page *pg)
161 {
162         return mm_to_dma_pfn(page_to_pfn(pg));
163 }
164 static inline unsigned long virt_to_dma_pfn(void *p)
165 {
166         return page_to_dma_pfn(virt_to_page(p));
167 }
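/*
 * On x86 with 4KiB pages PAGE_SHIFT equals VTD_PAGE_SHIFT, so the pfn
 * conversions above are identity operations; they only shift when the
 * CPU page size is larger than the 4KiB VT-d page size.
 */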
168
169 /* global iommu list, set NULL for ignored DMAR units */
170 static struct intel_iommu **g_iommus;
171
172 static void __init check_tylersburg_isoch(void);
173 static int rwbf_quirk;
174
175 /*
176  * set to 1 to panic the kernel if VT-d can't be successfully enabled
177  * (used when kernel is launched w/ TXT)
178  */
179 static int force_on = 0;
180 int intel_iommu_tboot_noforce;
181 static int no_platform_optin;
182
183 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
184
185 /*
186  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
187  * if marked present.
188  */
189 static phys_addr_t root_entry_lctp(struct root_entry *re)
190 {
191         if (!(re->lo & 1))
192                 return 0;
193
194         return re->lo & VTD_PAGE_MASK;
195 }
196
197 /*
198  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
199  * if marked present.
200  */
201 static phys_addr_t root_entry_uctp(struct root_entry *re)
202 {
203         if (!(re->hi & 1))
204                 return 0;
205
206         return re->hi & VTD_PAGE_MASK;
207 }
208
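/*
 * The helpers below manipulate individual fields of a context entry in
 * place: for example, bit 0 of the low u64 is the present bit, bits 2-3
 * select the translation type, and bits 8-23 of the high u64 hold the
 * domain id.
 */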
209 static inline void context_clear_pasid_enable(struct context_entry *context)
210 {
211         context->lo &= ~(1ULL << 11);
212 }
213
214 static inline bool context_pasid_enabled(struct context_entry *context)
215 {
216         return !!(context->lo & (1ULL << 11));
217 }
218
219 static inline void context_set_copied(struct context_entry *context)
220 {
221         context->hi |= (1ull << 3);
222 }
223
224 static inline bool context_copied(struct context_entry *context)
225 {
226         return !!(context->hi & (1ULL << 3));
227 }
228
229 static inline bool __context_present(struct context_entry *context)
230 {
231         return (context->lo & 1);
232 }
233
234 bool context_present(struct context_entry *context)
235 {
236         return context_pasid_enabled(context) ?
237              __context_present(context) :
238              __context_present(context) && !context_copied(context);
239 }
240
241 static inline void context_set_present(struct context_entry *context)
242 {
243         context->lo |= 1;
244 }
245
246 static inline void context_set_fault_enable(struct context_entry *context)
247 {
248         context->lo &= (((u64)-1) << 2) | 1;
249 }
250
251 static inline void context_set_translation_type(struct context_entry *context,
252                                                 unsigned long value)
253 {
254         context->lo &= (((u64)-1) << 4) | 3;
255         context->lo |= (value & 3) << 2;
256 }
257
258 static inline void context_set_address_root(struct context_entry *context,
259                                             unsigned long value)
260 {
261         context->lo &= ~VTD_PAGE_MASK;
262         context->lo |= value & VTD_PAGE_MASK;
263 }
264
265 static inline void context_set_address_width(struct context_entry *context,
266                                              unsigned long value)
267 {
268         context->hi |= value & 7;
269 }
270
271 static inline void context_set_domain_id(struct context_entry *context,
272                                          unsigned long value)
273 {
274         context->hi |= (value & ((1 << 16) - 1)) << 8;
275 }
276
277 static inline int context_domain_id(struct context_entry *c)
278 {
279         return((c->hi >> 8) & 0xffff);
280 }
281
282 static inline void context_clear_entry(struct context_entry *context)
283 {
284         context->lo = 0;
285         context->hi = 0;
286 }
287
288 /*
289  * This domain is a static identity mapping domain.
290  *      1. This domain creates a static 1:1 mapping to all usable memory.
291  *      2. It maps to each iommu if successful.
292  *      3. Each iommu maps to this domain if successful.
293  */
294 static struct dmar_domain *si_domain;
295 static int hw_pass_through = 1;
296
297 /* si_domain contains multiple devices */
298 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
299
300 /*
301  * This is a DMA domain allocated through the iommu domain allocation
302  * interface. But one or more devices belonging to this domain have
304  * been chosen to use a private domain. We should avoid using the
304  * map/unmap/iova_to_phys APIs on it.
305  */
306 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
307
308 #define for_each_domain_iommu(idx, domain)                      \
309         for (idx = 0; idx < g_num_of_iommus; idx++)             \
310                 if (domain->iommu_refcnt[idx])
311
312 struct dmar_rmrr_unit {
313         struct list_head list;          /* list of rmrr units   */
314         struct acpi_dmar_header *hdr;   /* ACPI header          */
315         u64     base_address;           /* reserved base address*/
316         u64     end_address;            /* reserved end address */
317         struct dmar_dev_scope *devices; /* target devices */
318         int     devices_cnt;            /* target device count */
319 };
320
321 struct dmar_atsr_unit {
322         struct list_head list;          /* list of ATSR units */
323         struct acpi_dmar_header *hdr;   /* ACPI header */
324         struct dmar_dev_scope *devices; /* target devices */
325         int devices_cnt;                /* target device count */
326         u8 include_all:1;               /* include all ports */
327 };
328
329 static LIST_HEAD(dmar_atsr_units);
330 static LIST_HEAD(dmar_rmrr_units);
331
332 #define for_each_rmrr_units(rmrr) \
333         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
334
335 /* bitmap for indexing intel_iommus */
336 static int g_num_of_iommus;
337
338 static void domain_exit(struct dmar_domain *domain);
339 static void domain_remove_dev_info(struct dmar_domain *domain);
340 static void dmar_remove_one_dev_info(struct device *dev);
341 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
342 static int domain_detach_iommu(struct dmar_domain *domain,
343                                struct intel_iommu *iommu);
344 static bool device_is_rmrr_locked(struct device *dev);
345 static int intel_iommu_attach_device(struct iommu_domain *domain,
346                                      struct device *dev);
347
348 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
349 int dmar_disabled = 0;
350 #else
351 int dmar_disabled = 1;
352 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
353
354 int intel_iommu_sm;
355 int intel_iommu_enabled = 0;
356 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
357
358 static int dmar_map_gfx = 1;
359 static int dmar_forcedac;
360 static int intel_iommu_strict;
361 static int intel_iommu_superpage = 1;
362 static int iommu_identity_mapping;
363 static int intel_no_bounce;
364
365 #define IDENTMAP_ALL            1
366 #define IDENTMAP_GFX            2
367 #define IDENTMAP_AZALIA         4
368
369 int intel_iommu_gfx_mapped;
370 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
371
372 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
373 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
374 static DEFINE_SPINLOCK(device_domain_lock);
375 static LIST_HEAD(device_domain_list);
376
377 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
378                                 to_pci_dev(d)->untrusted)
379
380 /*
381  * Iterate over elements in device_domain_list and call the specified
382  * callback @fn against each element.
383  */
384 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
385                                      void *data), void *data)
386 {
387         int ret = 0;
388         unsigned long flags;
389         struct device_domain_info *info;
390
391         spin_lock_irqsave(&device_domain_lock, flags);
392         list_for_each_entry(info, &device_domain_list, global) {
393                 ret = fn(info, data);
394                 if (ret) {
395                         spin_unlock_irqrestore(&device_domain_lock, flags);
396                         return ret;
397                 }
398         }
399         spin_unlock_irqrestore(&device_domain_lock, flags);
400
401         return 0;
402 }
403
404 const struct iommu_ops intel_iommu_ops;
405
406 static bool translation_pre_enabled(struct intel_iommu *iommu)
407 {
408         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
409 }
410
411 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
412 {
413         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
414 }
415
416 static void init_translation_status(struct intel_iommu *iommu)
417 {
418         u32 gsts;
419
420         gsts = readl(iommu->reg + DMAR_GSTS_REG);
421         if (gsts & DMA_GSTS_TES)
422                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
423 }
424
425 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
426 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
427 {
428         return container_of(dom, struct dmar_domain, domain);
429 }
430
431 static int __init intel_iommu_setup(char *str)
432 {
433         if (!str)
434                 return -EINVAL;
435         while (*str) {
436                 if (!strncmp(str, "on", 2)) {
437                         dmar_disabled = 0;
438                         pr_info("IOMMU enabled\n");
439                 } else if (!strncmp(str, "off", 3)) {
440                         dmar_disabled = 1;
441                         no_platform_optin = 1;
442                         pr_info("IOMMU disabled\n");
443                 } else if (!strncmp(str, "igfx_off", 8)) {
444                         dmar_map_gfx = 0;
445                         pr_info("Disable GFX device mapping\n");
446                 } else if (!strncmp(str, "forcedac", 8)) {
447                         pr_info("Forcing DAC for PCI devices\n");
448                         dmar_forcedac = 1;
449                 } else if (!strncmp(str, "strict", 6)) {
450                         pr_info("Disable batched IOTLB flush\n");
451                         intel_iommu_strict = 1;
452                 } else if (!strncmp(str, "sp_off", 6)) {
453                         pr_info("Disable supported super page\n");
454                         intel_iommu_superpage = 0;
455                 } else if (!strncmp(str, "sm_on", 5)) {
456                         pr_info("Intel-IOMMU: scalable mode supported\n");
457                         intel_iommu_sm = 1;
458                 } else if (!strncmp(str, "tboot_noforce", 13)) {
459                         printk(KERN_INFO
460                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
461                         intel_iommu_tboot_noforce = 1;
462                 } else if (!strncmp(str, "nobounce", 8)) {
463                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
464                         intel_no_bounce = 1;
465                 }
466
467                 str += strcspn(str, ",");
468                 while (*str == ',')
469                         str++;
470         }
471         return 0;
472 }
473 __setup("intel_iommu=", intel_iommu_setup);
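/*
 * Example kernel command line combining several of the comma-separated
 * options parsed above: intel_iommu=on,sm_on,strict
 */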
474
475 static struct kmem_cache *iommu_domain_cache;
476 static struct kmem_cache *iommu_devinfo_cache;
477
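/*
 * Per-IOMMU domain pointers are kept in a two-level array: the upper
 * 8 bits of a domain id select a 256-entry chunk of iommu->domains and
 * the lower 8 bits index the dmar_domain pointer within that chunk, as
 * implemented by get_iommu_domain()/set_iommu_domain() below.
 */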
478 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
479 {
480         struct dmar_domain **domains;
481         int idx = did >> 8;
482
483         domains = iommu->domains[idx];
484         if (!domains)
485                 return NULL;
486
487         return domains[did & 0xff];
488 }
489
490 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
491                              struct dmar_domain *domain)
492 {
493         struct dmar_domain **domains;
494         int idx = did >> 8;
495
496         if (!iommu->domains[idx]) {
497                 size_t size = 256 * sizeof(struct dmar_domain *);
498                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
499         }
500
501         domains = iommu->domains[idx];
502         if (WARN_ON(!domains))
503                 return;
504         else
505                 domains[did & 0xff] = domain;
506 }
507
508 void *alloc_pgtable_page(int node)
509 {
510         struct page *page;
511         void *vaddr = NULL;
512
513         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
514         if (page)
515                 vaddr = page_address(page);
516         return vaddr;
517 }
518
519 void free_pgtable_page(void *vaddr)
520 {
521         free_page((unsigned long)vaddr);
522 }
523
524 static inline void *alloc_domain_mem(void)
525 {
526         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
527 }
528
529 static void free_domain_mem(void *vaddr)
530 {
531         kmem_cache_free(iommu_domain_cache, vaddr);
532 }
533
534 static inline void * alloc_devinfo_mem(void)
535 {
536         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
537 }
538
539 static inline void free_devinfo_mem(void *vaddr)
540 {
541         kmem_cache_free(iommu_devinfo_cache, vaddr);
542 }
543
544 static inline int domain_type_is_si(struct dmar_domain *domain)
545 {
546         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
547 }
548
549 static inline int domain_pfn_supported(struct dmar_domain *domain,
550                                        unsigned long pfn)
551 {
552         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
553
554         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
555 }
556
557 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
558 {
559         unsigned long sagaw;
560         int agaw = -1;
561
562         sagaw = cap_sagaw(iommu->cap);
563         for (agaw = width_to_agaw(max_gaw);
564              agaw >= 0; agaw--) {
565                 if (test_bit(agaw, &sagaw))
566                         break;
567         }
568
569         return agaw;
570 }
571
572 /*
573  * Calculate max SAGAW for each iommu.
574  */
575 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
576 {
577         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
578 }
579
580 /*
581  * Calculate agaw for each iommu.
582  * "SAGAW" may differ across iommus, so use a default agaw and fall
583  * back to a smaller supported agaw for iommus that don't support the default.
584  */
585 int iommu_calculate_agaw(struct intel_iommu *iommu)
586 {
587         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
588 }
589
590 /* This function only returns a single iommu for a domain */
591 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
592 {
593         int iommu_id;
594
595         /* si_domain and vm domain should not get here. */
596         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
597                 return NULL;
598
599         for_each_domain_iommu(iommu_id, domain)
600                 break;
601
602         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
603                 return NULL;
604
605         return g_iommus[iommu_id];
606 }
607
608 static void domain_update_iommu_coherency(struct dmar_domain *domain)
609 {
610         struct dmar_drhd_unit *drhd;
611         struct intel_iommu *iommu;
612         bool found = false;
613         int i;
614
615         domain->iommu_coherency = 1;
616
617         for_each_domain_iommu(i, domain) {
618                 found = true;
619                 if (!ecap_coherent(g_iommus[i]->ecap)) {
620                         domain->iommu_coherency = 0;
621                         break;
622                 }
623         }
624         if (found)
625                 return;
626
627         /* No hardware attached; use lowest common denominator */
628         rcu_read_lock();
629         for_each_active_iommu(iommu, drhd) {
630                 if (!ecap_coherent(iommu->ecap)) {
631                         domain->iommu_coherency = 0;
632                         break;
633                 }
634         }
635         rcu_read_unlock();
636 }
637
638 static int domain_update_iommu_snooping(struct intel_iommu *skip)
639 {
640         struct dmar_drhd_unit *drhd;
641         struct intel_iommu *iommu;
642         int ret = 1;
643
644         rcu_read_lock();
645         for_each_active_iommu(iommu, drhd) {
646                 if (iommu != skip) {
647                         if (!ecap_sc_support(iommu->ecap)) {
648                                 ret = 0;
649                                 break;
650                         }
651                 }
652         }
653         rcu_read_unlock();
654
655         return ret;
656 }
657
658 static int domain_update_iommu_superpage(struct intel_iommu *skip)
659 {
660         struct dmar_drhd_unit *drhd;
661         struct intel_iommu *iommu;
662         int mask = 0xf;
663
664         if (!intel_iommu_superpage) {
665                 return 0;
666         }
667
668         /* set iommu_superpage to the smallest common denominator */
669         rcu_read_lock();
670         for_each_active_iommu(iommu, drhd) {
671                 if (iommu != skip) {
672                         mask &= cap_super_page_val(iommu->cap);
673                         if (!mask)
674                                 break;
675                 }
676         }
677         rcu_read_unlock();
678
679         return fls(mask);
680 }
681
682 /* Some capabilities may be different across iommus */
683 static void domain_update_iommu_cap(struct dmar_domain *domain)
684 {
685         domain_update_iommu_coherency(domain);
686         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
687         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
688 }
689
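/*
 * Return the context entry for bus/devfn, optionally allocating the
 * context table on first use. In scalable mode each root entry carries
 * two context-table pointers (root->lo for devfn 0-127, root->hi for
 * devfn 128-255) and scalable-mode context entries are twice the legacy
 * size, hence devfn is folded into the 0-127 range and then doubled.
 */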
690 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
691                                          u8 devfn, int alloc)
692 {
693         struct root_entry *root = &iommu->root_entry[bus];
694         struct context_entry *context;
695         u64 *entry;
696
697         entry = &root->lo;
698         if (sm_supported(iommu)) {
699                 if (devfn >= 0x80) {
700                         devfn -= 0x80;
701                         entry = &root->hi;
702                 }
703                 devfn *= 2;
704         }
705         if (*entry & 1)
706                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
707         else {
708                 unsigned long phy_addr;
709                 if (!alloc)
710                         return NULL;
711
712                 context = alloc_pgtable_page(iommu->node);
713                 if (!context)
714                         return NULL;
715
716                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
717                 phy_addr = virt_to_phys((void *)context);
718                 *entry = phy_addr | 1;
719                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
720         }
721         return &context[devfn];
722 }
723
724 static int iommu_dummy(struct device *dev)
725 {
726         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
727 }
728
729 /**
730  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
731  *                               sub-hierarchy of a candidate PCI-PCI bridge
732  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
733  * @bridge: the candidate PCI-PCI bridge
734  *
735  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
736  */
737 static bool
738 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
739 {
740         struct pci_dev *pdev, *pbridge;
741
742         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
743                 return false;
744
745         pdev = to_pci_dev(dev);
746         pbridge = to_pci_dev(bridge);
747
748         if (pbridge->subordinate &&
749             pbridge->subordinate->number <= pdev->bus->number &&
750             pbridge->subordinate->busn_res.end >= pdev->bus->number)
751                 return true;
752
753         return false;
754 }
755
756 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
757 {
758         struct dmar_drhd_unit *drhd = NULL;
759         struct intel_iommu *iommu;
760         struct device *tmp;
761         struct pci_dev *pdev = NULL;
762         u16 segment = 0;
763         int i;
764
765         if (iommu_dummy(dev))
766                 return NULL;
767
768         if (dev_is_pci(dev)) {
769                 struct pci_dev *pf_pdev;
770
771                 pdev = to_pci_dev(dev);
772
773 #ifdef CONFIG_X86
774                 /* VMD child devices currently cannot be handled individually */
775                 if (is_vmd(pdev->bus))
776                         return NULL;
777 #endif
778
779                 /* VFs aren't listed in scope tables; we need to look up
780                  * the PF instead to find the IOMMU. */
781                 pf_pdev = pci_physfn(pdev);
782                 dev = &pf_pdev->dev;
783                 segment = pci_domain_nr(pdev->bus);
784         } else if (has_acpi_companion(dev))
785                 dev = &ACPI_COMPANION(dev)->dev;
786
787         rcu_read_lock();
788         for_each_active_iommu(iommu, drhd) {
789                 if (pdev && segment != drhd->segment)
790                         continue;
791
792                 for_each_active_dev_scope(drhd->devices,
793                                           drhd->devices_cnt, i, tmp) {
794                         if (tmp == dev) {
795                                 /* For a VF use its original BDF# not that of the PF
796                                  * which we used for the IOMMU lookup. Strictly speaking
797                                  * we could do this for all PCI devices; we only need to
798                                  * get the BDF# from the scope table for ACPI matches. */
799                                 if (pdev && pdev->is_virtfn)
800                                         goto got_pdev;
801
802                                 *bus = drhd->devices[i].bus;
803                                 *devfn = drhd->devices[i].devfn;
804                                 goto out;
805                         }
806
807                         if (is_downstream_to_pci_bridge(dev, tmp))
808                                 goto got_pdev;
809                 }
810
811                 if (pdev && drhd->include_all) {
812                 got_pdev:
813                         *bus = pdev->bus->number;
814                         *devfn = pdev->devfn;
815                         goto out;
816                 }
817         }
818         iommu = NULL;
819  out:
820         rcu_read_unlock();
821
822         return iommu;
823 }
824
825 static void domain_flush_cache(struct dmar_domain *domain,
826                                void *addr, int size)
827 {
828         if (!domain->iommu_coherency)
829                 clflush_cache_range(addr, size);
830 }
831
832 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
833 {
834         struct context_entry *context;
835         int ret = 0;
836         unsigned long flags;
837
838         spin_lock_irqsave(&iommu->lock, flags);
839         context = iommu_context_addr(iommu, bus, devfn, 0);
840         if (context)
841                 ret = context_present(context);
842         spin_unlock_irqrestore(&iommu->lock, flags);
843         return ret;
844 }
845
846 static void free_context_table(struct intel_iommu *iommu)
847 {
848         int i;
849         unsigned long flags;
850         struct context_entry *context;
851
852         spin_lock_irqsave(&iommu->lock, flags);
853         if (!iommu->root_entry) {
854                 goto out;
855         }
856         for (i = 0; i < ROOT_ENTRY_NR; i++) {
857                 context = iommu_context_addr(iommu, i, 0, 0);
858                 if (context)
859                         free_pgtable_page(context);
860
861                 if (!sm_supported(iommu))
862                         continue;
863
864                 context = iommu_context_addr(iommu, i, 0x80, 0);
865                 if (context)
866                         free_pgtable_page(context);
867
868         }
869         free_pgtable_page(iommu->root_entry);
870         iommu->root_entry = NULL;
871 out:
872         spin_unlock_irqrestore(&iommu->lock, flags);
873 }
874
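/*
 * Walk the page table of @domain for @pfn and return the PTE at
 * *target_level, allocating intermediate page-table pages as needed.
 * With *target_level == 0 the walk stops at the first superpage or
 * non-present entry and the level actually reached is reported back
 * through *target_level.
 */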
875 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
876                                       unsigned long pfn, int *target_level)
877 {
878         struct dma_pte *parent, *pte;
879         int level = agaw_to_level(domain->agaw);
880         int offset;
881
882         BUG_ON(!domain->pgd);
883
884         if (!domain_pfn_supported(domain, pfn))
885                 /* Address beyond IOMMU's addressing capabilities. */
886                 return NULL;
887
888         parent = domain->pgd;
889
890         while (1) {
891                 void *tmp_page;
892
893                 offset = pfn_level_offset(pfn, level);
894                 pte = &parent[offset];
895                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
896                         break;
897                 if (level == *target_level)
898                         break;
899
900                 if (!dma_pte_present(pte)) {
901                         uint64_t pteval;
902
903                         tmp_page = alloc_pgtable_page(domain->nid);
904
905                         if (!tmp_page)
906                                 return NULL;
907
908                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
909                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
910                         if (cmpxchg64(&pte->val, 0ULL, pteval))
911                                 /* Someone else set it while we were thinking; use theirs. */
912                                 free_pgtable_page(tmp_page);
913                         else
914                                 domain_flush_cache(domain, pte, sizeof(*pte));
915                 }
916                 if (level == 1)
917                         break;
918
919                 parent = phys_to_virt(dma_pte_addr(pte));
920                 level--;
921         }
922
923         if (!*target_level)
924                 *target_level = level;
925
926         return pte;
927 }
928
929 /* return address's pte at specific level */
930 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
931                                          unsigned long pfn,
932                                          int level, int *large_page)
933 {
934         struct dma_pte *parent, *pte;
935         int total = agaw_to_level(domain->agaw);
936         int offset;
937
938         parent = domain->pgd;
939         while (level <= total) {
940                 offset = pfn_level_offset(pfn, total);
941                 pte = &parent[offset];
942                 if (level == total)
943                         return pte;
944
945                 if (!dma_pte_present(pte)) {
946                         *large_page = total;
947                         break;
948                 }
949
950                 if (dma_pte_superpage(pte)) {
951                         *large_page = total;
952                         return pte;
953                 }
954
955                 parent = phys_to_virt(dma_pte_addr(pte));
956                 total--;
957         }
958         return NULL;
959 }
960
961 /* clear last level pte; a TLB flush should follow */
962 static void dma_pte_clear_range(struct dmar_domain *domain,
963                                 unsigned long start_pfn,
964                                 unsigned long last_pfn)
965 {
966         unsigned int large_page;
967         struct dma_pte *first_pte, *pte;
968
969         BUG_ON(!domain_pfn_supported(domain, start_pfn));
970         BUG_ON(!domain_pfn_supported(domain, last_pfn));
971         BUG_ON(start_pfn > last_pfn);
972
973         /* we don't need lock here; nobody else touches the iova range */
974         do {
975                 large_page = 1;
976                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
977                 if (!pte) {
978                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
979                         continue;
980                 }
981                 do {
982                         dma_clear_pte(pte);
983                         start_pfn += lvl_to_nr_pages(large_page);
984                         pte++;
985                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
986
987                 domain_flush_cache(domain, first_pte,
988                                    (void *)pte - (void *)first_pte);
989
990         } while (start_pfn && start_pfn <= last_pfn);
991 }
992
993 static void dma_pte_free_level(struct dmar_domain *domain, int level,
994                                int retain_level, struct dma_pte *pte,
995                                unsigned long pfn, unsigned long start_pfn,
996                                unsigned long last_pfn)
997 {
998         pfn = max(start_pfn, pfn);
999         pte = &pte[pfn_level_offset(pfn, level)];
1000
1001         do {
1002                 unsigned long level_pfn;
1003                 struct dma_pte *level_pte;
1004
1005                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1006                         goto next;
1007
1008                 level_pfn = pfn & level_mask(level);
1009                 level_pte = phys_to_virt(dma_pte_addr(pte));
1010
1011                 if (level > 2) {
1012                         dma_pte_free_level(domain, level - 1, retain_level,
1013                                            level_pte, level_pfn, start_pfn,
1014                                            last_pfn);
1015                 }
1016
1017                 /*
1018                  * Free the page table if we're below the level we want to
1019                  * retain and the range covers the entire table.
1020                  */
1021                 if (level < retain_level && !(start_pfn > level_pfn ||
1022                       last_pfn < level_pfn + level_size(level) - 1)) {
1023                         dma_clear_pte(pte);
1024                         domain_flush_cache(domain, pte, sizeof(*pte));
1025                         free_pgtable_page(level_pte);
1026                 }
1027 next:
1028                 pfn += level_size(level);
1029         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1030 }
1031
1032 /*
1033  * clear last level (leaf) ptes and free page table pages below the
1034  * level we wish to keep intact.
1035  */
1036 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1037                                    unsigned long start_pfn,
1038                                    unsigned long last_pfn,
1039                                    int retain_level)
1040 {
1041         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1042         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1043         BUG_ON(start_pfn > last_pfn);
1044
1045         dma_pte_clear_range(domain, start_pfn, last_pfn);
1046
1047         /* We don't need lock here; nobody else touches the iova range */
1048         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1049                            domain->pgd, 0, start_pfn, last_pfn);
1050
1051         /* free pgd */
1052         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1053                 free_pgtable_page(domain->pgd);
1054                 domain->pgd = NULL;
1055         }
1056 }
1057
1058 /* When a page at a given level is being unlinked from its parent, we don't
1059    need to *modify* it at all. All we need to do is make a list of all the
1060    pages which can be freed just as soon as we've flushed the IOTLB and we
1061    know the hardware page-walk will no longer touch them.
1062    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1063    be freed. */
1064 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1065                                             int level, struct dma_pte *pte,
1066                                             struct page *freelist)
1067 {
1068         struct page *pg;
1069
1070         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1071         pg->freelist = freelist;
1072         freelist = pg;
1073
1074         if (level == 1)
1075                 return freelist;
1076
1077         pte = page_address(pg);
1078         do {
1079                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1080                         freelist = dma_pte_list_pagetables(domain, level - 1,
1081                                                            pte, freelist);
1082                 pte++;
1083         } while (!first_pte_in_page(pte));
1084
1085         return freelist;
1086 }
1087
1088 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1089                                         struct dma_pte *pte, unsigned long pfn,
1090                                         unsigned long start_pfn,
1091                                         unsigned long last_pfn,
1092                                         struct page *freelist)
1093 {
1094         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1095
1096         pfn = max(start_pfn, pfn);
1097         pte = &pte[pfn_level_offset(pfn, level)];
1098
1099         do {
1100                 unsigned long level_pfn;
1101
1102                 if (!dma_pte_present(pte))
1103                         goto next;
1104
1105                 level_pfn = pfn & level_mask(level);
1106
1107                 /* If range covers entire pagetable, free it */
1108                 if (start_pfn <= level_pfn &&
1109                     last_pfn >= level_pfn + level_size(level) - 1) {
1110                         /* These subordinate page tables are going away entirely. Don't
1111                            bother to clear them; we're just going to *free* them. */
1112                         if (level > 1 && !dma_pte_superpage(pte))
1113                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1114
1115                         dma_clear_pte(pte);
1116                         if (!first_pte)
1117                                 first_pte = pte;
1118                         last_pte = pte;
1119                 } else if (level > 1) {
1120                         /* Recurse down into a level that isn't *entirely* obsolete */
1121                         freelist = dma_pte_clear_level(domain, level - 1,
1122                                                        phys_to_virt(dma_pte_addr(pte)),
1123                                                        level_pfn, start_pfn, last_pfn,
1124                                                        freelist);
1125                 }
1126 next:
1127                 pfn += level_size(level);
1128         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1129
1130         if (first_pte)
1131                 domain_flush_cache(domain, first_pte,
1132                                    (void *)++last_pte - (void *)first_pte);
1133
1134         return freelist;
1135 }
1136
1137 /* We can't just free the pages because the IOMMU may still be walking
1138    the page tables, and may have cached the intermediate levels. The
1139    pages can only be freed after the IOTLB flush has been done. */
1140 static struct page *domain_unmap(struct dmar_domain *domain,
1141                                  unsigned long start_pfn,
1142                                  unsigned long last_pfn)
1143 {
1144         struct page *freelist;
1145
1146         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1147         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1148         BUG_ON(start_pfn > last_pfn);
1149
1150         /* we don't need lock here; nobody else touches the iova range */
1151         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1152                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1153
1154         /* free pgd */
1155         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1156                 struct page *pgd_page = virt_to_page(domain->pgd);
1157                 pgd_page->freelist = freelist;
1158                 freelist = pgd_page;
1159
1160                 domain->pgd = NULL;
1161         }
1162
1163         return freelist;
1164 }
1165
1166 static void dma_free_pagelist(struct page *freelist)
1167 {
1168         struct page *pg;
1169
1170         while ((pg = freelist)) {
1171                 freelist = pg->freelist;
1172                 free_pgtable_page(page_address(pg));
1173         }
1174 }
1175
1176 static void iova_entry_free(unsigned long data)
1177 {
1178         struct page *freelist = (struct page *)data;
1179
1180         dma_free_pagelist(freelist);
1181 }
1182
1183 /* iommu handling */
1184 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1185 {
1186         struct root_entry *root;
1187         unsigned long flags;
1188
1189         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1190         if (!root) {
1191                 pr_err("Allocating root entry for %s failed\n",
1192                         iommu->name);
1193                 return -ENOMEM;
1194         }
1195
1196         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1197
1198         spin_lock_irqsave(&iommu->lock, flags);
1199         iommu->root_entry = root;
1200         spin_unlock_irqrestore(&iommu->lock, flags);
1201
1202         return 0;
1203 }
1204
1205 static void iommu_set_root_entry(struct intel_iommu *iommu)
1206 {
1207         u64 addr;
1208         u32 sts;
1209         unsigned long flag;
1210
1211         addr = virt_to_phys(iommu->root_entry);
1212         if (sm_supported(iommu))
1213                 addr |= DMA_RTADDR_SMT;
1214
1215         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1216         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1217
1218         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1219
1220         /* Make sure hardware complete it */
1221         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1222                       readl, (sts & DMA_GSTS_RTPS), sts);
1223
1224         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1225 }
1226
1227 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1228 {
1229         u32 val;
1230         unsigned long flag;
1231
1232         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1233                 return;
1234
1235         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1236         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1237
1238         /* Make sure hardware complete it */
1239         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240                       readl, (!(val & DMA_GSTS_WBFS)), val);
1241
1242         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1243 }
1244
1245 /* return value determines if we need a write buffer flush */
1246 static void __iommu_flush_context(struct intel_iommu *iommu,
1247                                   u16 did, u16 source_id, u8 function_mask,
1248                                   u64 type)
1249 {
1250         u64 val = 0;
1251         unsigned long flag;
1252
1253         switch (type) {
1254         case DMA_CCMD_GLOBAL_INVL:
1255                 val = DMA_CCMD_GLOBAL_INVL;
1256                 break;
1257         case DMA_CCMD_DOMAIN_INVL:
1258                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1259                 break;
1260         case DMA_CCMD_DEVICE_INVL:
1261                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1262                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1263                 break;
1264         default:
1265                 BUG();
1266         }
1267         val |= DMA_CCMD_ICC;
1268
1269         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1270         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1271
1272         /* Make sure hardware complete it */
1273         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1274                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1275
1276         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1277 }
1278
1279 /* return value determines if we need a write buffer flush */
1280 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1281                                 u64 addr, unsigned int size_order, u64 type)
1282 {
1283         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1284         u64 val = 0, val_iva = 0;
1285         unsigned long flag;
1286
1287         switch (type) {
1288         case DMA_TLB_GLOBAL_FLUSH:
1289                 /* global flush doesn't need set IVA_REG */
1290                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1291                 break;
1292         case DMA_TLB_DSI_FLUSH:
1293                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1294                 break;
1295         case DMA_TLB_PSI_FLUSH:
1296                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1297                 /* IH bit is passed in as part of address */
1298                 val_iva = size_order | addr;
1299                 break;
1300         default:
1301                 BUG();
1302         }
1303         /* Note: set drain read/write */
1304 #if 0
1305         /*
1306          * This is probably just to be extra safe. Looks like we can
1307          * ignore it without any impact.
1308          */
1309         if (cap_read_drain(iommu->cap))
1310                 val |= DMA_TLB_READ_DRAIN;
1311 #endif
1312         if (cap_write_drain(iommu->cap))
1313                 val |= DMA_TLB_WRITE_DRAIN;
1314
1315         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1316         /* Note: Only uses first TLB reg currently */
1317         if (val_iva)
1318                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1319         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1320
1321         /* Make sure hardware complete it */
1322         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1323                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1324
1325         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1326
1327         /* check IOTLB invalidation granularity */
1328         if (DMA_TLB_IAIG(val) == 0)
1329                 pr_err("Flush IOTLB failed\n");
1330         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1331                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1332                         (unsigned long long)DMA_TLB_IIRG(type),
1333                         (unsigned long long)DMA_TLB_IAIG(val));
1334 }
1335
1336 static struct device_domain_info *
1337 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1338                          u8 bus, u8 devfn)
1339 {
1340         struct device_domain_info *info;
1341
1342         assert_spin_locked(&device_domain_lock);
1343
1344         if (!iommu->qi)
1345                 return NULL;
1346
1347         list_for_each_entry(info, &domain->devices, link)
1348                 if (info->iommu == iommu && info->bus == bus &&
1349                     info->devfn == devfn) {
1350                         if (info->ats_supported && info->dev)
1351                                 return info;
1352                         break;
1353                 }
1354
1355         return NULL;
1356 }
1357
1358 static void domain_update_iotlb(struct dmar_domain *domain)
1359 {
1360         struct device_domain_info *info;
1361         bool has_iotlb_device = false;
1362
1363         assert_spin_locked(&device_domain_lock);
1364
1365         list_for_each_entry(info, &domain->devices, link) {
1366                 struct pci_dev *pdev;
1367
1368                 if (!info->dev || !dev_is_pci(info->dev))
1369                         continue;
1370
1371                 pdev = to_pci_dev(info->dev);
1372                 if (pdev->ats_enabled) {
1373                         has_iotlb_device = true;
1374                         break;
1375                 }
1376         }
1377
1378         domain->has_iotlb_device = has_iotlb_device;
1379 }
1380
1381 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1382 {
1383         struct pci_dev *pdev;
1384
1385         assert_spin_locked(&device_domain_lock);
1386
1387         if (!info || !dev_is_pci(info->dev))
1388                 return;
1389
1390         pdev = to_pci_dev(info->dev);
1391         /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1392          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1393          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1394          * reserved, which should be set to 0.
1395          */
1396         if (!ecap_dit(info->iommu->ecap))
1397                 info->pfsid = 0;
1398         else {
1399                 struct pci_dev *pf_pdev;
1400
1401                 /* pdev will be returned if device is not a vf */
1402                 pf_pdev = pci_physfn(pdev);
1403                 info->pfsid = pci_dev_id(pf_pdev);
1404         }
1405
1406 #ifdef CONFIG_INTEL_IOMMU_SVM
1407         /* The PCIe spec, in its wisdom, declares that the behaviour of
1408            the device if you enable PASID support after ATS support is
1409            undefined. So always enable PASID support on devices which
1410            have it, even if we can't yet know if we're ever going to
1411            use it. */
1412         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1413                 info->pasid_enabled = 1;
1414
1415         if (info->pri_supported &&
1416             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1417             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1418                 info->pri_enabled = 1;
1419 #endif
1420         if (!pdev->untrusted && info->ats_supported &&
1421             pci_ats_page_aligned(pdev) &&
1422             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1423                 info->ats_enabled = 1;
1424                 domain_update_iotlb(info->domain);
1425                 info->ats_qdep = pci_ats_queue_depth(pdev);
1426         }
1427 }
1428
1429 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1430 {
1431         struct pci_dev *pdev;
1432
1433         assert_spin_locked(&device_domain_lock);
1434
1435         if (!dev_is_pci(info->dev))
1436                 return;
1437
1438         pdev = to_pci_dev(info->dev);
1439
1440         if (info->ats_enabled) {
1441                 pci_disable_ats(pdev);
1442                 info->ats_enabled = 0;
1443                 domain_update_iotlb(info->domain);
1444         }
1445 #ifdef CONFIG_INTEL_IOMMU_SVM
1446         if (info->pri_enabled) {
1447                 pci_disable_pri(pdev);
1448                 info->pri_enabled = 0;
1449         }
1450         if (info->pasid_enabled) {
1451                 pci_disable_pasid(pdev);
1452                 info->pasid_enabled = 0;
1453         }
1454 #endif
1455 }
1456
1457 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1458                                   u64 addr, unsigned mask)
1459 {
1460         u16 sid, qdep;
1461         unsigned long flags;
1462         struct device_domain_info *info;
1463
1464         if (!domain->has_iotlb_device)
1465                 return;
1466
1467         spin_lock_irqsave(&device_domain_lock, flags);
1468         list_for_each_entry(info, &domain->devices, link) {
1469                 if (!info->ats_enabled)
1470                         continue;
1471
1472                 sid = info->bus << 8 | info->devfn;
1473                 qdep = info->ats_qdep;
1474                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1475                                 qdep, addr, mask);
1476         }
1477         spin_unlock_irqrestore(&device_domain_lock, flags);
1478 }
1479
1480 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1481                                   struct dmar_domain *domain,
1482                                   unsigned long pfn, unsigned int pages,
1483                                   int ih, int map)
1484 {
1485         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1486         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1487         u16 did = domain->iommu_did[iommu->seq_id];
1488
1489         BUG_ON(pages == 0);
1490
1491         if (ih)
1492                 ih = 1 << 6;
1493         /*
1494          * Fall back to domain-selective flush if there is no PSI support or the
1495          * size is too big.
1496          * PSI requires the page size to be 2 ^ x, and the base address to be
1497          * naturally aligned to that size.
1498          */
1499         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1500                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1501                                                 DMA_TLB_DSI_FLUSH);
1502         else
1503                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1504                                                 DMA_TLB_PSI_FLUSH);
1505
1506         /*
1507          * In caching mode, changes of pages from non-present to present require
1508          * flush. However, device IOTLB doesn't need to be flushed in this case.
1509          */
1510         if (!cap_caching_mode(iommu->cap) || !map)
1511                 iommu_flush_dev_iotlb(domain, addr, mask);
1512 }
1513
1514 /* Notification for newly created mappings */
1515 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1516                                         struct dmar_domain *domain,
1517                                         unsigned long pfn, unsigned int pages)
1518 {
1519         /* It's a non-present to present mapping. Only flush if caching mode */
1520         if (cap_caching_mode(iommu->cap))
1521                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1522         else
1523                 iommu_flush_write_buffer(iommu);
1524 }
1525
1526 static void iommu_flush_iova(struct iova_domain *iovad)
1527 {
1528         struct dmar_domain *domain;
1529         int idx;
1530
1531         domain = container_of(iovad, struct dmar_domain, iovad);
1532
1533         for_each_domain_iommu(idx, domain) {
1534                 struct intel_iommu *iommu = g_iommus[idx];
1535                 u16 did = domain->iommu_did[iommu->seq_id];
1536
1537                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1538
1539                 if (!cap_caching_mode(iommu->cap))
1540                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1541                                               0, MAX_AGAW_PFN_WIDTH);
1542         }
1543 }
1544
1545 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1546 {
1547         u32 pmen;
1548         unsigned long flags;
1549
1550         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1551                 return;
1552
1553         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1554         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1555         pmen &= ~DMA_PMEN_EPM;
1556         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1557
1558         /* wait for the protected region status bit to clear */
1559         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1560                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1561
1562         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1563 }
1564
1565 static void iommu_enable_translation(struct intel_iommu *iommu)
1566 {
1567         u32 sts;
1568         unsigned long flags;
1569
1570         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1571         iommu->gcmd |= DMA_GCMD_TE;
1572         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1573
1574         /* Make sure the hardware completes it */
1575         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1576                       readl, (sts & DMA_GSTS_TES), sts);
1577
1578         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1579 }
1580
1581 static void iommu_disable_translation(struct intel_iommu *iommu)
1582 {
1583         u32 sts;
1584         unsigned long flag;
1585
1586         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1587         iommu->gcmd &= ~DMA_GCMD_TE;
1588         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1589
1590         /* Make sure the hardware completes it */
1591         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1592                       readl, (!(sts & DMA_GSTS_TES)), sts);
1593
1594         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1595 }
1596
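/*
 * Allocate per-IOMMU domain-id tracking: a bitmap of usable domain ids
 * and a two-level array of domain pointers, kept in chunks of 256
 * entries so only the chunks that are actually used get allocated.
 */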
1597 static int iommu_init_domains(struct intel_iommu *iommu)
1598 {
1599         u32 ndomains, nlongs;
1600         size_t size;
1601
1602         ndomains = cap_ndoms(iommu->cap);
1603         pr_debug("%s: Number of Domains supported <%d>\n",
1604                  iommu->name, ndomains);
1605         nlongs = BITS_TO_LONGS(ndomains);
1606
1607         spin_lock_init(&iommu->lock);
1608
1609         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1610         if (!iommu->domain_ids) {
1611                 pr_err("%s: Allocating domain id array failed\n",
1612                        iommu->name);
1613                 return -ENOMEM;
1614         }
1615
1616         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1617         iommu->domains = kzalloc(size, GFP_KERNEL);
1618
1619         if (iommu->domains) {
1620                 size = 256 * sizeof(struct dmar_domain *);
1621                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1622         }
1623
1624         if (!iommu->domains || !iommu->domains[0]) {
1625                 pr_err("%s: Allocating domain array failed\n",
1626                        iommu->name);
1627                 kfree(iommu->domain_ids);
1628                 kfree(iommu->domains);
1629                 iommu->domain_ids = NULL;
1630                 iommu->domains    = NULL;
1631                 return -ENOMEM;
1632         }
1633
1634         /*
1635          * If Caching mode is set, then invalid translations are tagged
1636          * with domain-id 0, hence we need to pre-allocate it. We also
1637          * use domain-id 0 as a marker for non-allocated domain-id, so
1638          * make sure it is not used for a real domain.
1639          */
1640         set_bit(0, iommu->domain_ids);
1641
1642         /*
1643          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1644          * entry for first-level or pass-through translation modes should
1645          * be programmed with a domain id different from those used for
1646          * second-level or nested translation. We reserve a domain id for
1647          * this purpose.
1648          */
1649         if (sm_supported(iommu))
1650                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1651
1652         return 0;
1653 }
1654
1655 static void disable_dmar_iommu(struct intel_iommu *iommu)
1656 {
1657         struct device_domain_info *info, *tmp;
1658         unsigned long flags;
1659
1660         if (!iommu->domains || !iommu->domain_ids)
1661                 return;
1662
1663         spin_lock_irqsave(&device_domain_lock, flags);
1664         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1665                 if (info->iommu != iommu)
1666                         continue;
1667
1668                 if (!info->dev || !info->domain)
1669                         continue;
1670
1671                 __dmar_remove_one_dev_info(info);
1672         }
1673         spin_unlock_irqrestore(&device_domain_lock, flags);
1674
1675         if (iommu->gcmd & DMA_GCMD_TE)
1676                 iommu_disable_translation(iommu);
1677 }
1678
1679 static void free_dmar_iommu(struct intel_iommu *iommu)
1680 {
1681         if ((iommu->domains) && (iommu->domain_ids)) {
1682                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1683                 int i;
1684
1685                 for (i = 0; i < elems; i++)
1686                         kfree(iommu->domains[i]);
1687                 kfree(iommu->domains);
1688                 kfree(iommu->domain_ids);
1689                 iommu->domains = NULL;
1690                 iommu->domain_ids = NULL;
1691         }
1692
1693         g_iommus[iommu->seq_id] = NULL;
1694
1695         /* free context mapping */
1696         free_context_table(iommu);
1697
1698 #ifdef CONFIG_INTEL_IOMMU_SVM
1699         if (pasid_supported(iommu)) {
1700                 if (ecap_prs(iommu->ecap))
1701                         intel_svm_finish_prq(iommu);
1702         }
1703 #endif
1704 }
1705
1706 static struct dmar_domain *alloc_domain(int flags)
1707 {
1708         struct dmar_domain *domain;
1709
1710         domain = alloc_domain_mem();
1711         if (!domain)
1712                 return NULL;
1713
1714         memset(domain, 0, sizeof(*domain));
1715         domain->nid = NUMA_NO_NODE;
1716         domain->flags = flags;
1717         domain->has_iotlb_device = false;
1718         INIT_LIST_HEAD(&domain->devices);
1719
1720         return domain;
1721 }
1722
1723 /* Must be called with device_domain_lock and iommu->lock held */
1724 static int domain_attach_iommu(struct dmar_domain *domain,
1725                                struct intel_iommu *iommu)
1726 {
1727         unsigned long ndomains;
1728         int num;
1729
1730         assert_spin_locked(&device_domain_lock);
1731         assert_spin_locked(&iommu->lock);
1732
1733         domain->iommu_refcnt[iommu->seq_id] += 1;
1734         domain->iommu_count += 1;
1735         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1736                 ndomains = cap_ndoms(iommu->cap);
1737                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1738
1739                 if (num >= ndomains) {
1740                         pr_err("%s: No free domain ids\n", iommu->name);
1741                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1742                         domain->iommu_count -= 1;
1743                         return -ENOSPC;
1744                 }
1745
1746                 set_bit(num, iommu->domain_ids);
1747                 set_iommu_domain(iommu, num, domain);
1748
1749                 domain->iommu_did[iommu->seq_id] = num;
1750                 domain->nid                      = iommu->node;
1751
1752                 domain_update_iommu_cap(domain);
1753         }
1754
1755         return 0;
1756 }
1757
1758 static int domain_detach_iommu(struct dmar_domain *domain,
1759                                struct intel_iommu *iommu)
1760 {
1761         int num, count;
1762
1763         assert_spin_locked(&device_domain_lock);
1764         assert_spin_locked(&iommu->lock);
1765
1766         domain->iommu_refcnt[iommu->seq_id] -= 1;
1767         count = --domain->iommu_count;
1768         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1769                 num = domain->iommu_did[iommu->seq_id];
1770                 clear_bit(num, iommu->domain_ids);
1771                 set_iommu_domain(iommu, num, NULL);
1772
1773                 domain_update_iommu_cap(domain);
1774                 domain->iommu_did[iommu->seq_id] = 0;
1775         }
1776
1777         return count;
1778 }
1779
1780 static struct iova_domain reserved_iova_list;
1781 static struct lock_class_key reserved_rbtree_key;
1782
1783 static int dmar_init_reserved_ranges(void)
1784 {
1785         struct pci_dev *pdev = NULL;
1786         struct iova *iova;
1787         int i;
1788
1789         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1790
1791         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1792                 &reserved_rbtree_key);
1793
1794         /* IOAPIC ranges shouldn't be accessed by DMA */
1795         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1796                 IOVA_PFN(IOAPIC_RANGE_END));
1797         if (!iova) {
1798                 pr_err("Reserve IOAPIC range failed\n");
1799                 return -ENODEV;
1800         }
1801
1802         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1803         for_each_pci_dev(pdev) {
1804                 struct resource *r;
1805
1806                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1807                         r = &pdev->resource[i];
1808                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1809                                 continue;
1810                         iova = reserve_iova(&reserved_iova_list,
1811                                             IOVA_PFN(r->start),
1812                                             IOVA_PFN(r->end));
1813                         if (!iova) {
1814                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1815                                 return -ENODEV;
1816                         }
1817                 }
1818         }
1819         return 0;
1820 }
1821
1822 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1823 {
1824         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1825 }
1826
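/*
 * Round a guest address width up to the next adjusted width the page
 * table layout can express (12 plus a multiple of the 9-bit stride),
 * capped at 64 bits: e.g. 39 stays 39, 40 becomes 48, 48 stays 48.
 */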
1827 static inline int guestwidth_to_adjustwidth(int gaw)
1828 {
1829         int agaw;
1830         int r = (gaw - 12) % 9;
1831
1832         if (r == 0)
1833                 agaw = gaw;
1834         else
1835                 agaw = gaw + 9 - r;
1836         if (agaw > 64)
1837                 agaw = 64;
1838         return agaw;
1839 }
1840
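/*
 * First-time initialization of a newly allocated domain on a given IOMMU:
 * set up the IOVA allocator with a deferred flush queue, reserve the
 * special ranges, derive gaw/agaw from the IOMMU capabilities, cache the
 * coherency/snooping/superpage capabilities and allocate the top-level
 * page directory.
 */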
1841 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1842                        int guest_width)
1843 {
1844         int adjust_width, agaw;
1845         unsigned long sagaw;
1846         int err;
1847
1848         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1849
1850         err = init_iova_flush_queue(&domain->iovad,
1851                                     iommu_flush_iova, iova_entry_free);
1852         if (err)
1853                 return err;
1854
1855         domain_reserve_special_ranges(domain);
1856
1857         /* calculate AGAW */
1858         if (guest_width > cap_mgaw(iommu->cap))
1859                 guest_width = cap_mgaw(iommu->cap);
1860         domain->gaw = guest_width;
1861         adjust_width = guestwidth_to_adjustwidth(guest_width);
1862         agaw = width_to_agaw(adjust_width);
1863         sagaw = cap_sagaw(iommu->cap);
1864         if (!test_bit(agaw, &sagaw)) {
1865                 /* hardware doesn't support it, choose a bigger one */
1866                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1867                 agaw = find_next_bit(&sagaw, 5, agaw);
1868                 if (agaw >= 5)
1869                         return -ENODEV;
1870         }
1871         domain->agaw = agaw;
1872
1873         if (ecap_coherent(iommu->ecap))
1874                 domain->iommu_coherency = 1;
1875         else
1876                 domain->iommu_coherency = 0;
1877
1878         if (ecap_sc_support(iommu->ecap))
1879                 domain->iommu_snooping = 1;
1880         else
1881                 domain->iommu_snooping = 0;
1882
1883         if (intel_iommu_superpage)
1884                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1885         else
1886                 domain->iommu_superpage = 0;
1887
1888         domain->nid = iommu->node;
1889
1890         /* always allocate the top pgd */
1891         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1892         if (!domain->pgd)
1893                 return -ENOMEM;
1894         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1895         return 0;
1896 }
1897
1898 static void domain_exit(struct dmar_domain *domain)
1899 {
1900
1901         /* Remove associated devices and clear attached or cached domains */
1902         domain_remove_dev_info(domain);
1903
1904         /* destroy iovas */
1905         put_iova_domain(&domain->iovad);
1906
1907         if (domain->pgd) {
1908                 struct page *freelist;
1909
1910                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1911                 dma_free_pagelist(freelist);
1912         }
1913
1914         free_domain_mem(domain);
1915 }
1916
1917 /*
1918  * Get the PASID directory size for a scalable mode context entry.
1919  * A value of X in the PDTS field of a scalable mode context entry
1920  * indicates a PASID directory with 2^(X + 7) entries.
1921  */
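/*
 * Example (assuming PASID_PDE_SHIFT is 6, i.e. 64 PASID-table entries
 * per directory entry): max_pasid = 1 << 20 gives max_pde = 1 << 14,
 * so pds = 14 - 7 = 7 and the hardware sees a directory of
 * 2^(7 + 7) = 2^14 entries, covering all 2^20 PASIDs.
 */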
1922 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1923 {
1924         int pds, max_pde;
1925
1926         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1927         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1928         if (pds < 7)
1929                 return 0;
1930
1931         return pds - 7;
1932 }
1933
1934 /*
1935  * Set the RID_PASID field of a scalable mode context entry. The
1936  * IOMMU hardware will use the PASID value set in this field for
1937  * DMA translations of DMA requests without PASID.
1938  */
1939 static inline void
1940 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1941 {
1942         context->hi |= pasid & ((1 << 20) - 1);
1943         context->hi |= (1 << 20);
1944 }
1945
1946 /*
1947  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1948  * entry.
1949  */
1950 static inline void context_set_sm_dte(struct context_entry *context)
1951 {
1952         context->lo |= (1 << 2);
1953 }
1954
1955 /*
1956  * Set the PRE(Page Request Enable) field of a scalable mode context
1957  * entry.
1958  */
1959 static inline void context_set_sm_pre(struct context_entry *context)
1960 {
1961         context->lo |= (1 << 4);
1962 }
1963
1964 /* Convert value to context PASID directory size field coding. */
1965 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1966
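/*
 * Program the context entry for (bus, devfn) on this IOMMU. In scalable
 * mode the entry points to the PASID directory; in legacy mode it points
 * to the domain's second-level page tables (or is set up for pass-through).
 * Caches are flushed afterwards as required by caching mode.
 */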
1967 static int domain_context_mapping_one(struct dmar_domain *domain,
1968                                       struct intel_iommu *iommu,
1969                                       struct pasid_table *table,
1970                                       u8 bus, u8 devfn)
1971 {
1972         u16 did = domain->iommu_did[iommu->seq_id];
1973         int translation = CONTEXT_TT_MULTI_LEVEL;
1974         struct device_domain_info *info = NULL;
1975         struct context_entry *context;
1976         unsigned long flags;
1977         int ret;
1978
1979         WARN_ON(did == 0);
1980
1981         if (hw_pass_through && domain_type_is_si(domain))
1982                 translation = CONTEXT_TT_PASS_THROUGH;
1983
1984         pr_debug("Set context mapping for %02x:%02x.%d\n",
1985                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1986
1987         BUG_ON(!domain->pgd);
1988
1989         spin_lock_irqsave(&device_domain_lock, flags);
1990         spin_lock(&iommu->lock);
1991
1992         ret = -ENOMEM;
1993         context = iommu_context_addr(iommu, bus, devfn, 1);
1994         if (!context)
1995                 goto out_unlock;
1996
1997         ret = 0;
1998         if (context_present(context))
1999                 goto out_unlock;
2000
2001         /*
2002          * For kdump cases, old valid entries may be cached due to the
2003          * in-flight DMA and copied pgtable, but there is no unmapping
2004          * behaviour for them, thus we need an explicit cache flush for
2005          * the newly-mapped device. For kdump, at this point, the device
2006          * is supposed to have finished reset at its driver probe stage, so
2007          * no in-flight DMA will exist, and we don't need to worry about it
2008          * hereafter.
2009          */
2010         if (context_copied(context)) {
2011                 u16 did_old = context_domain_id(context);
2012
2013                 if (did_old < cap_ndoms(iommu->cap)) {
2014                         iommu->flush.flush_context(iommu, did_old,
2015                                                    (((u16)bus) << 8) | devfn,
2016                                                    DMA_CCMD_MASK_NOBIT,
2017                                                    DMA_CCMD_DEVICE_INVL);
2018                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2019                                                  DMA_TLB_DSI_FLUSH);
2020                 }
2021         }
2022
2023         context_clear_entry(context);
2024
2025         if (sm_supported(iommu)) {
2026                 unsigned long pds;
2027
2028                 WARN_ON(!table);
2029
2030                 /* Set up the PASID directory pointer: */
2031                 pds = context_get_sm_pds(table);
2032                 context->lo = (u64)virt_to_phys(table->table) |
2033                                 context_pdts(pds);
2034
2035                 /* Set up the RID_PASID field: */
2036                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2037
2038                 /*
2039                  * Set up the Device-TLB Enable bit and the Page Request
2040                  * Enable bit:
2041                  */
2042                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2043                 if (info && info->ats_supported)
2044                         context_set_sm_dte(context);
2045                 if (info && info->pri_supported)
2046                         context_set_sm_pre(context);
2047         } else {
2048                 struct dma_pte *pgd = domain->pgd;
2049                 int agaw;
2050
2051                 context_set_domain_id(context, did);
2052
2053                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2054                         /*
2055                          * Skip top levels of page tables for an iommu which has
2056                          * a smaller agaw than the default. Unnecessary for PT mode.
2057                          */
2058                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2059                                 ret = -ENOMEM;
2060                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2061                                 if (!dma_pte_present(pgd))
2062                                         goto out_unlock;
2063                         }
2064
2065                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2066                         if (info && info->ats_supported)
2067                                 translation = CONTEXT_TT_DEV_IOTLB;
2068                         else
2069                                 translation = CONTEXT_TT_MULTI_LEVEL;
2070
2071                         context_set_address_root(context, virt_to_phys(pgd));
2072                         context_set_address_width(context, agaw);
2073                 } else {
2074                         /*
2075                          * In pass-through mode, AW must be programmed to
2076                          * indicate the largest AGAW value supported by the
2077                          * hardware, and ASR is ignored by the hardware.
2078                          */
2079                         context_set_address_width(context, iommu->msagaw);
2080                 }
2081
2082                 context_set_translation_type(context, translation);
2083         }
2084
2085         context_set_fault_enable(context);
2086         context_set_present(context);
2087         domain_flush_cache(domain, context, sizeof(*context));
2088
2089         /*
2090          * It's a non-present to present mapping. If the hardware doesn't cache
2091          * non-present entries we only need to flush the write-buffer. If it
2092          * _does_ cache non-present entries, then it does so in the special
2093          * domain #0, which we have to flush:
2094          */
2095         if (cap_caching_mode(iommu->cap)) {
2096                 iommu->flush.flush_context(iommu, 0,
2097                                            (((u16)bus) << 8) | devfn,
2098                                            DMA_CCMD_MASK_NOBIT,
2099                                            DMA_CCMD_DEVICE_INVL);
2100                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2101         } else {
2102                 iommu_flush_write_buffer(iommu);
2103         }
2104         iommu_enable_dev_iotlb(info);
2105
2106         ret = 0;
2107
2108 out_unlock:
2109         spin_unlock(&iommu->lock);
2110         spin_unlock_irqrestore(&device_domain_lock, flags);
2111
2112         return ret;
2113 }
2114
2115 static int
2116 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2117 {
2118         struct pasid_table *table;
2119         struct intel_iommu *iommu;
2120         u8 bus, devfn;
2121
2122         iommu = device_to_iommu(dev, &bus, &devfn);
2123         if (!iommu)
2124                 return -ENODEV;
2125
2126         table = intel_pasid_get_table(dev);
2127         return domain_context_mapping_one(domain, iommu, table, bus, devfn);
2128 }
2129
2130 static int domain_context_mapped_cb(struct pci_dev *pdev,
2131                                     u16 alias, void *opaque)
2132 {
2133         struct intel_iommu *iommu = opaque;
2134
2135         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2136 }
2137
2138 static int domain_context_mapped(struct device *dev)
2139 {
2140         struct intel_iommu *iommu;
2141         u8 bus, devfn;
2142
2143         iommu = device_to_iommu(dev, &bus, &devfn);
2144         if (!iommu)
2145                 return -ENODEV;
2146
2147         if (!dev_is_pci(dev))
2148                 return device_context_mapped(iommu, bus, devfn);
2149
2150         return !pci_for_each_dma_alias(to_pci_dev(dev),
2151                                        domain_context_mapped_cb, iommu);
2152 }
2153
2154 /* Return the number of VT-d pages, aligned up to the MM page size */
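/*
 * For example, with 4KiB MM pages, host_addr = 0xfff and size = 2
 * straddle a page boundary and yield 2 pages.
 */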
2155 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2156                                             size_t size)
2157 {
2158         host_addr &= ~PAGE_MASK;
2159         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2160 }
2161
2162 /* Return the largest possible superpage level for a given mapping */
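/*
 * For example, if the domain supports one level of superpages (2MiB)
 * and both iov_pfn and phy_pfn are 512-page aligned with at least 512
 * pages to map, this returns level 2; otherwise level 1 (4KiB).
 */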
2163 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2164                                           unsigned long iov_pfn,
2165                                           unsigned long phy_pfn,
2166                                           unsigned long pages)
2167 {
2168         int support, level = 1;
2169         unsigned long pfnmerge;
2170
2171         support = domain->iommu_superpage;
2172
2173         /* To use a large page, the virtual *and* physical addresses
2174            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2175            of them will mean we have to use smaller pages. So just
2176            merge them and check both at once. */
2177         pfnmerge = iov_pfn | phy_pfn;
2178
2179         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2180                 pages >>= VTD_STRIDE_SHIFT;
2181                 if (!pages)
2182                         break;
2183                 pfnmerge >>= VTD_STRIDE_SHIFT;
2184                 level++;
2185                 support--;
2186         }
2187         return level;
2188 }
2189
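/*
 * Install PTEs for nr_pages VT-d pages starting at iov_pfn, taking the
 * physical addresses either from the scatterlist (if sg is set) or from
 * the contiguous range starting at phys_pfn. Superpage PTEs are used
 * whenever alignment and remaining size allow it.
 */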
2190 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2191                             struct scatterlist *sg, unsigned long phys_pfn,
2192                             unsigned long nr_pages, int prot)
2193 {
2194         struct dma_pte *first_pte = NULL, *pte = NULL;
2195         phys_addr_t uninitialized_var(pteval);
2196         unsigned long sg_res = 0;
2197         unsigned int largepage_lvl = 0;
2198         unsigned long lvl_pages = 0;
2199
2200         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2201
2202         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2203                 return -EINVAL;
2204
2205         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2206
2207         if (!sg) {
2208                 sg_res = nr_pages;
2209                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2210         }
2211
2212         while (nr_pages > 0) {
2213                 uint64_t tmp;
2214
2215                 if (!sg_res) {
2216                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2217
2218                         sg_res = aligned_nrpages(sg->offset, sg->length);
2219                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2220                         sg->dma_length = sg->length;
2221                         pteval = (sg_phys(sg) - pgoff) | prot;
2222                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2223                 }
2224
2225                 if (!pte) {
2226                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2227
2228                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2229                         if (!pte)
2230                                 return -ENOMEM;
2231                         /* It is a large page */
2232                         if (largepage_lvl > 1) {
2233                                 unsigned long nr_superpages, end_pfn;
2234
2235                                 pteval |= DMA_PTE_LARGE_PAGE;
2236                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2237
2238                                 nr_superpages = sg_res / lvl_pages;
2239                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2240
2241                                 /*
2242                                  * Ensure that old small page tables are
2243                                  * removed to make room for superpage(s).
2244                                  * We're adding new large pages, so make sure
2245                                  * we don't remove their parent tables.
2246                                  */
2247                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2248                                                        largepage_lvl + 1);
2249                         } else {
2250                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2251                         }
2252
2253                 }
2254                 /* We don't need a lock here; nobody else
2255                  * touches this IOVA range.
2256                  */
2257                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2258                 if (tmp) {
2259                         static int dumps = 5;
2260                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2261                                 iov_pfn, tmp, (unsigned long long)pteval);
2262                         if (dumps) {
2263                                 dumps--;
2264                                 debug_dma_dump_mappings(NULL);
2265                         }
2266                         WARN_ON(1);
2267                 }
2268
2269                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2270
2271                 BUG_ON(nr_pages < lvl_pages);
2272                 BUG_ON(sg_res < lvl_pages);
2273
2274                 nr_pages -= lvl_pages;
2275                 iov_pfn += lvl_pages;
2276                 phys_pfn += lvl_pages;
2277                 pteval += lvl_pages * VTD_PAGE_SIZE;
2278                 sg_res -= lvl_pages;
2279
2280                 /* If the next PTE would be the first in a new page, then we
2281                    need to flush the cache on the entries we've just written.
2282                    And then we'll need to recalculate 'pte', so clear it and
2283                    let it get set again in the if (!pte) block above.
2284
2285                    If we're done (!nr_pages) we need to flush the cache too.
2286
2287                    Also if we've been setting superpages, we may need to
2288                    recalculate 'pte' and switch back to smaller pages for the
2289                    end of the mapping, if the trailing size is not enough to
2290                    use another superpage (i.e. sg_res < lvl_pages). */
2291                 pte++;
2292                 if (!nr_pages || first_pte_in_page(pte) ||
2293                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2294                         domain_flush_cache(domain, first_pte,
2295                                            (void *)pte - (void *)first_pte);
2296                         pte = NULL;
2297                 }
2298
2299                 if (!sg_res && nr_pages)
2300                         sg = sg_next(sg);
2301         }
2302         return 0;
2303 }
2304
2305 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2306                           struct scatterlist *sg, unsigned long phys_pfn,
2307                           unsigned long nr_pages, int prot)
2308 {
2309         int iommu_id, ret;
2310         struct intel_iommu *iommu;
2311
2312         /* Do the real mapping first */
2313         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2314         if (ret)
2315                 return ret;
2316
2317         for_each_domain_iommu(iommu_id, domain) {
2318                 iommu = g_iommus[iommu_id];
2319                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2320         }
2321
2322         return 0;
2323 }
2324
2325 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2326                                     struct scatterlist *sg, unsigned long nr_pages,
2327                                     int prot)
2328 {
2329         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2330 }
2331
2332 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2333                                      unsigned long phys_pfn, unsigned long nr_pages,
2334                                      int prot)
2335 {
2336         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2337 }
2338
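/*
 * Tear down the context entry for (bus, devfn) and invalidate the
 * context-cache and IOTLB entries tagged with its old domain id.
 */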
2339 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2340 {
2341         unsigned long flags;
2342         struct context_entry *context;
2343         u16 did_old;
2344
2345         if (!iommu)
2346                 return;
2347
2348         spin_lock_irqsave(&iommu->lock, flags);
2349         context = iommu_context_addr(iommu, bus, devfn, 0);
2350         if (!context) {
2351                 spin_unlock_irqrestore(&iommu->lock, flags);
2352                 return;
2353         }
2354         did_old = context_domain_id(context);
2355         context_clear_entry(context);
2356         __iommu_flush_cache(iommu, context, sizeof(*context));
2357         spin_unlock_irqrestore(&iommu->lock, flags);
2358         iommu->flush.flush_context(iommu,
2359                                    did_old,
2360                                    (((u16)bus) << 8) | devfn,
2361                                    DMA_CCMD_MASK_NOBIT,
2362                                    DMA_CCMD_DEVICE_INVL);
2363         iommu->flush.flush_iotlb(iommu,
2364                                  did_old,
2365                                  0,
2366                                  0,
2367                                  DMA_TLB_DSI_FLUSH);
2368 }
2369
2370 static inline void unlink_domain_info(struct device_domain_info *info)
2371 {
2372         assert_spin_locked(&device_domain_lock);
2373         list_del(&info->link);
2374         list_del(&info->global);
2375         if (info->dev)
2376                 info->dev->archdata.iommu = NULL;
2377 }
2378
2379 static void domain_remove_dev_info(struct dmar_domain *domain)
2380 {
2381         struct device_domain_info *info, *tmp;
2382         unsigned long flags;
2383
2384         spin_lock_irqsave(&device_domain_lock, flags);
2385         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2386                 __dmar_remove_one_dev_info(info);
2387         spin_unlock_irqrestore(&device_domain_lock, flags);
2388 }
2389
2390 /*
2391  * find_domain
2392  * Note: the info is stored in struct device->archdata.iommu
2393  */
2394 static struct dmar_domain *find_domain(struct device *dev)
2395 {
2396         struct device_domain_info *info;
2397
2398         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2399                 struct iommu_domain *domain;
2400
2401                 dev->archdata.iommu = NULL;
2402                 domain = iommu_get_domain_for_dev(dev);
2403                 if (domain)
2404                         intel_iommu_attach_device(domain, dev);
2405         }
2406
2407         /* No lock here, assumes no domain exit in normal case */
2408         info = dev->archdata.iommu;
2409
2410         if (likely(info))
2411                 return info->domain;
2412         return NULL;
2413 }
2414
2415 static inline struct device_domain_info *
2416 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2417 {
2418         struct device_domain_info *info;
2419
2420         list_for_each_entry(info, &device_domain_list, global)
2421                 if (info->iommu->segment == segment && info->bus == bus &&
2422                     info->devfn == devfn)
2423                         return info;
2424
2425         return NULL;
2426 }
2427
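/*
 * Bind a device (or a DMA alias when dev is NULL) to a domain: allocate
 * the device_domain_info, probe ATS/PASID/PRI support, attach the domain
 * to the IOMMU, set up the PASID table and RID2PASID entry in scalable
 * mode, and map the context entry. If the device or its alias already
 * has a domain, that existing domain is returned instead.
 */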
2428 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2429                                                     int bus, int devfn,
2430                                                     struct device *dev,
2431                                                     struct dmar_domain *domain)
2432 {
2433         struct dmar_domain *found = NULL;
2434         struct device_domain_info *info;
2435         unsigned long flags;
2436         int ret;
2437
2438         info = alloc_devinfo_mem();
2439         if (!info)
2440                 return NULL;
2441
2442         info->bus = bus;
2443         info->devfn = devfn;
2444         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2445         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2446         info->ats_qdep = 0;
2447         info->dev = dev;
2448         info->domain = domain;
2449         info->iommu = iommu;
2450         info->pasid_table = NULL;
2451         info->auxd_enabled = 0;
2452         INIT_LIST_HEAD(&info->auxiliary_domains);
2453
2454         if (dev && dev_is_pci(dev)) {
2455                 struct pci_dev *pdev = to_pci_dev(info->dev);
2456
2457                 if (!pdev->untrusted &&
2458                     !pci_ats_disabled() &&
2459                     ecap_dev_iotlb_support(iommu->ecap) &&
2460                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2461                     dmar_find_matched_atsr_unit(pdev))
2462                         info->ats_supported = 1;
2463
2464                 if (sm_supported(iommu)) {
2465                         if (pasid_supported(iommu)) {
2466                                 int features = pci_pasid_features(pdev);
2467                                 if (features >= 0)
2468                                         info->pasid_supported = features | 1;
2469                         }
2470
2471                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2472                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2473                                 info->pri_supported = 1;
2474                 }
2475         }
2476
2477         spin_lock_irqsave(&device_domain_lock, flags);
2478         if (dev)
2479                 found = find_domain(dev);
2480
2481         if (!found) {
2482                 struct device_domain_info *info2;
2483                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2484                 if (info2) {
2485                         found      = info2->domain;
2486                         info2->dev = dev;
2487                 }
2488         }
2489
2490         if (found) {
2491                 spin_unlock_irqrestore(&device_domain_lock, flags);
2492                 free_devinfo_mem(info);
2493                 /* Caller must free the original domain */
2494                 return found;
2495         }
2496
2497         spin_lock(&iommu->lock);
2498         ret = domain_attach_iommu(domain, iommu);
2499         spin_unlock(&iommu->lock);
2500
2501         if (ret) {
2502                 spin_unlock_irqrestore(&device_domain_lock, flags);
2503                 free_devinfo_mem(info);
2504                 return NULL;
2505         }
2506
2507         list_add(&info->link, &domain->devices);
2508         list_add(&info->global, &device_domain_list);
2509         if (dev)
2510                 dev->archdata.iommu = info;
2511         spin_unlock_irqrestore(&device_domain_lock, flags);
2512
2513         /* PASID table is mandatory for a PCI device in scalable mode. */
2514         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2515                 ret = intel_pasid_alloc_table(dev);
2516                 if (ret) {
2517                         dev_err(dev, "PASID table allocation failed\n");
2518                         dmar_remove_one_dev_info(dev);
2519                         return NULL;
2520                 }
2521
2522                 /* Set up the PASID entry for requests without PASID: */
2523                 spin_lock(&iommu->lock);
2524                 if (hw_pass_through && domain_type_is_si(domain))
2525                         ret = intel_pasid_setup_pass_through(iommu, domain,
2526                                         dev, PASID_RID2PASID);
2527                 else
2528                         ret = intel_pasid_setup_second_level(iommu, domain,
2529                                         dev, PASID_RID2PASID);
2530                 spin_unlock(&iommu->lock);
2531                 if (ret) {
2532                         dev_err(dev, "Setup RID2PASID failed\n");
2533                         dmar_remove_one_dev_info(dev);
2534                         return NULL;
2535                 }
2536         }
2537
2538         if (dev && domain_context_mapping(domain, dev)) {
2539                 dev_err(dev, "Domain context map failed\n");
2540                 dmar_remove_one_dev_info(dev);
2541                 return NULL;
2542         }
2543
2544         return domain;
2545 }
2546
2547 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2548 {
2549         *(u16 *)opaque = alias;
2550         return 0;
2551 }
2552
2553 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2554 {
2555         struct device_domain_info *info;
2556         struct dmar_domain *domain = NULL;
2557         struct intel_iommu *iommu;
2558         u16 dma_alias;
2559         unsigned long flags;
2560         u8 bus, devfn;
2561
2562         iommu = device_to_iommu(dev, &bus, &devfn);
2563         if (!iommu)
2564                 return NULL;
2565
2566         if (dev_is_pci(dev)) {
2567                 struct pci_dev *pdev = to_pci_dev(dev);
2568
2569                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2570
2571                 spin_lock_irqsave(&device_domain_lock, flags);
2572                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2573                                                       PCI_BUS_NUM(dma_alias),
2574                                                       dma_alias & 0xff);
2575                 if (info) {
2576                         iommu = info->iommu;
2577                         domain = info->domain;
2578                 }
2579                 spin_unlock_irqrestore(&device_domain_lock, flags);
2580
2581                 /* DMA alias already has a domain, use it */
2582                 if (info)
2583                         goto out;
2584         }
2585
2586         /* Allocate and initialize new domain for the device */
2587         domain = alloc_domain(0);
2588         if (!domain)
2589                 return NULL;
2590         if (domain_init(domain, iommu, gaw)) {
2591                 domain_exit(domain);
2592                 return NULL;
2593         }
2594
2595 out:
2596         return domain;
2597 }
2598
2599 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2600                                               struct dmar_domain *domain)
2601 {
2602         struct intel_iommu *iommu;
2603         struct dmar_domain *tmp;
2604         u16 req_id, dma_alias;
2605         u8 bus, devfn;
2606
2607         iommu = device_to_iommu(dev, &bus, &devfn);
2608         if (!iommu)
2609                 return NULL;
2610
2611         req_id = ((u16)bus << 8) | devfn;
2612
2613         if (dev_is_pci(dev)) {
2614                 struct pci_dev *pdev = to_pci_dev(dev);
2615
2616                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2617
2618                 /* register PCI DMA alias device */
2619                 if (req_id != dma_alias) {
2620                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2621                                         dma_alias & 0xff, NULL, domain);
2622
2623                         if (!tmp || tmp != domain)
2624                                 return tmp;
2625                 }
2626         }
2627
2628         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2629         if (!tmp || tmp != domain)
2630                 return tmp;
2631
2632         return domain;
2633 }
2634
2635 static int iommu_domain_identity_map(struct dmar_domain *domain,
2636                                      unsigned long long start,
2637                                      unsigned long long end)
2638 {
2639         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2640         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2641
2642         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2643                           dma_to_mm_pfn(last_vpfn))) {
2644                 pr_err("Reserving iova failed\n");
2645                 return -ENOMEM;
2646         }
2647
2648         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2649         /*
2650          * The RMRR range might overlap with the physical memory range,
2651          * so clear it first.
2652          */
2653         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2654
2655         return __domain_mapping(domain, first_vpfn, NULL,
2656                                 first_vpfn, last_vpfn - first_vpfn + 1,
2657                                 DMA_PTE_READ|DMA_PTE_WRITE);
2658 }
2659
2660 static int domain_prepare_identity_map(struct device *dev,
2661                                        struct dmar_domain *domain,
2662                                        unsigned long long start,
2663                                        unsigned long long end)
2664 {
2665         /* For _hardware_ passthrough, don't bother. But for software
2666            passthrough, we do it anyway -- it may indicate a memory
2667            range which is reserved in E820, and so didn't get set
2668            up in si_domain to start with */
2669         if (domain == si_domain && hw_pass_through) {
2670                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2671                          start, end);
2672                 return 0;
2673         }
2674
2675         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2676
2677         if (end < start) {
2678                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2679                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2680                         dmi_get_system_info(DMI_BIOS_VENDOR),
2681                         dmi_get_system_info(DMI_BIOS_VERSION),
2682                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2683                 return -EIO;
2684         }
2685
2686         if (end >> agaw_to_width(domain->agaw)) {
2687                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2688                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2689                      agaw_to_width(domain->agaw),
2690                      dmi_get_system_info(DMI_BIOS_VENDOR),
2691                      dmi_get_system_info(DMI_BIOS_VERSION),
2692                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2693                 return -EIO;
2694         }
2695
2696         return iommu_domain_identity_map(domain, start, end);
2697 }
2698
2699 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2700
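/*
 * Build the static identity (si) domain: a 1:1 mapping of all usable
 * memory ranges, plus the relaxable RMRRs (graphics/USB), so that
 * devices put in passthrough mode can keep using their RMRR regions.
 * With hardware passthrough, no mappings are set up at all.
 */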
2701 static int __init si_domain_init(int hw)
2702 {
2703         struct dmar_rmrr_unit *rmrr;
2704         struct device *dev;
2705         int i, nid, ret;
2706
2707         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2708         if (!si_domain)
2709                 return -EFAULT;
2710
2711         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2712                 domain_exit(si_domain);
2713                 return -EFAULT;
2714         }
2715
2716         if (hw)
2717                 return 0;
2718
2719         for_each_online_node(nid) {
2720                 unsigned long start_pfn, end_pfn;
2721                 int i;
2722
2723                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2724                         ret = iommu_domain_identity_map(si_domain,
2725                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2726                         if (ret)
2727                                 return ret;
2728                 }
2729         }
2730
2731         /*
2732          * Normally we use DMA domains for devices which have RMRRs. But we
2733          * relax this requirement for graphics and USB devices. Identity map
2734          * the RMRRs for graphics and USB devices so that they can use the
2735          * si_domain.
2736          */
2737         for_each_rmrr_units(rmrr) {
2738                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2739                                           i, dev) {
2740                         unsigned long long start = rmrr->base_address;
2741                         unsigned long long end = rmrr->end_address;
2742
2743                         if (device_is_rmrr_locked(dev))
2744                                 continue;
2745
2746                         if (WARN_ON(end < start ||
2747                                     end >> agaw_to_width(si_domain->agaw)))
2748                                 continue;
2749
2750                         ret = iommu_domain_identity_map(si_domain, start, end);
2751                         if (ret)
2752                                 return ret;
2753                 }
2754         }
2755
2756         return 0;
2757 }
2758
2759 static int identity_mapping(struct device *dev)
2760 {
2761         struct device_domain_info *info;
2762
2763         info = dev->archdata.iommu;
2764         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2765                 return (info->domain == si_domain);
2766
2767         return 0;
2768 }
2769
2770 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2771 {
2772         struct dmar_domain *ndomain;
2773         struct intel_iommu *iommu;
2774         u8 bus, devfn;
2775
2776         iommu = device_to_iommu(dev, &bus, &devfn);
2777         if (!iommu)
2778                 return -ENODEV;
2779
2780         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2781         if (ndomain != domain)
2782                 return -EBUSY;
2783
2784         return 0;
2785 }
2786
2787 static bool device_has_rmrr(struct device *dev)
2788 {
2789         struct dmar_rmrr_unit *rmrr;
2790         struct device *tmp;
2791         int i;
2792
2793         rcu_read_lock();
2794         for_each_rmrr_units(rmrr) {
2795                 /*
2796                  * Return TRUE if this RMRR contains the device that
2797                  * is passed in.
2798                  */
2799                 for_each_active_dev_scope(rmrr->devices,
2800                                           rmrr->devices_cnt, i, tmp)
2801                         if (tmp == dev ||
2802                             is_downstream_to_pci_bridge(dev, tmp)) {
2803                                 rcu_read_unlock();
2804                                 return true;
2805                         }
2806         }
2807         rcu_read_unlock();
2808         return false;
2809 }
2810
2811 /**
2812  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2813  * is relaxable (i.e. is allowed not to be enforced under some conditions)
2814  * @dev: device handle
2815  *
2816  * We assume that PCI USB devices with RMRRs have them largely
2817  * for historical reasons and that the RMRR space is not actively used post
2818  * boot.  This exclusion may change if vendors begin to abuse it.
2819  *
2820  * The same exception is made for graphics devices, with the requirement that
2821  * any use of the RMRR regions will be torn down before assigning the device
2822  * to a guest.
2823  *
2824  * Return: true if the RMRR is relaxable, false otherwise
2825  */
2826 static bool device_rmrr_is_relaxable(struct device *dev)
2827 {
2828         struct pci_dev *pdev;
2829
2830         if (!dev_is_pci(dev))
2831                 return false;
2832
2833         pdev = to_pci_dev(dev);
2834         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2835                 return true;
2836         else
2837                 return false;
2838 }
2839
2840 /*
2841  * There are a couple of cases where we need to restrict the functionality of
2842  * devices associated with RMRRs.  The first is when evaluating a device for
2843  * identity mapping because problems exist when devices are moved in and out
2844  * of domains and their respective RMRR information is lost.  This means that
2845  * a device with associated RMRRs will never be in a "passthrough" domain.
2846  * The second is use of the device through the IOMMU API.  This interface
2847  * expects to have full control of the IOVA space for the device.  We cannot
2848  * satisfy both the requirement that RMRR access is maintained and have an
2849  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2850  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2851  * We therefore prevent devices associated with an RMRR from participating in
2852  * the IOMMU API, which eliminates them from device assignment.
2853  *
2854  * In both cases, devices which have relaxable RMRRs are not concerned by this
2855  * restriction. See device_rmrr_is_relaxable comment.
2856  */
2857 static bool device_is_rmrr_locked(struct device *dev)
2858 {
2859         if (!device_has_rmrr(dev))
2860                 return false;
2861
2862         if (device_rmrr_is_relaxable(dev))
2863                 return false;
2864
2865         return true;
2866 }
2867
2868 /*
2869  * Return the required default domain type for a specific device.
2870  *
2871  * @dev: the device in question
2872  *
2873  * Returns:
2874  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2875  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping
2876  *    domain
2877  *  - 0: both identity and dynamic domains work for this device
2878  */
2879 static int device_def_domain_type(struct device *dev)
2880 {
2881         if (dev_is_pci(dev)) {
2882                 struct pci_dev *pdev = to_pci_dev(dev);
2883
2884                 if (device_is_rmrr_locked(dev))
2885                         return IOMMU_DOMAIN_DMA;
2886
2887                 /*
2888                  * Prevent any device marked as untrusted from getting
2889                  * placed into the static identity mapping domain.
2890                  */
2891                 if (pdev->untrusted)
2892                         return IOMMU_DOMAIN_DMA;
2893
2894                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2895                         return IOMMU_DOMAIN_IDENTITY;
2896
2897                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2898                         return IOMMU_DOMAIN_IDENTITY;
2899
2900                 /*
2901                  * We want to start off with all devices in the 1:1 domain, and
2902                  * take them out later if we find they can't access all of memory.
2903                  *
2904                  * However, we can't do this for PCI devices behind bridges,
2905                  * because all PCI devices behind the same bridge will end up
2906                  * with the same source-id on their transactions.
2907                  *
2908                  * Practically speaking, we can't change things around for these
2909                  * devices at run-time, because we can't be sure there'll be no
2910                  * DMA transactions in flight for any of their siblings.
2911                  *
2912                  * So PCI devices (unless they're on the root bus) as well as
2913                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2914                  * the 1:1 domain, just in _case_ one of their siblings turns out
2915                  * not to be able to map all of memory.
2916                  */
2917                 if (!pci_is_pcie(pdev)) {
2918                         if (!pci_is_root_bus(pdev->bus))
2919                                 return IOMMU_DOMAIN_DMA;
2920                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2921                                 return IOMMU_DOMAIN_DMA;
2922                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2923                         return IOMMU_DOMAIN_DMA;
2924         } else {
2925                 if (device_has_rmrr(dev))
2926                         return IOMMU_DOMAIN_DMA;
2927         }
2928
2929         return (iommu_identity_mapping & IDENTMAP_ALL) ?
2930                         IOMMU_DOMAIN_IDENTITY : 0;
2931 }
2932
2933 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2934 {
2935         /*
2936          * Start from a sane iommu hardware state.
2937          * If queued invalidation was already initialized by us
2938          * (for example, while enabling interrupt remapping), then
2939          * things are already rolling from a sane state.
2940          */
2941         if (!iommu->qi) {
2942                 /*
2943                  * Clear any previous faults.
2944                  */
2945                 dmar_fault(-1, iommu);
2946                 /*
2947                  * Disable queued invalidation if supported and already enabled
2948                  * before OS handover.
2949                  */
2950                 dmar_disable_qi(iommu);
2951         }
2952
2953         if (dmar_enable_qi(iommu)) {
2954                 /*
2955                  * Queued invalidation is not enabled; use register-based invalidation.
2956                  */
2957                 iommu->flush.flush_context = __iommu_flush_context;
2958                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2959                 pr_info("%s: Using Register based invalidation\n",
2960                         iommu->name);
2961         } else {
2962                 iommu->flush.flush_context = qi_flush_context;
2963                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2964                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2965         }
2966 }
2967
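/*
 * kdump support: copy one bus worth of context entries from the old
 * kernel's (possibly extended) context table, reserving the domain ids
 * found there and marking each copied entry so the new kernel can
 * recognise it later.
 */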
2968 static int copy_context_table(struct intel_iommu *iommu,
2969                               struct root_entry *old_re,
2970                               struct context_entry **tbl,
2971                               int bus, bool ext)
2972 {
2973         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2974         struct context_entry *new_ce = NULL, ce;
2975         struct context_entry *old_ce = NULL;
2976         struct root_entry re;
2977         phys_addr_t old_ce_phys;
2978
2979         tbl_idx = ext ? bus * 2 : bus;
2980         memcpy(&re, old_re, sizeof(re));
2981
2982         for (devfn = 0; devfn < 256; devfn++) {
2983                 /* First calculate the correct index */
2984                 idx = (ext ? devfn * 2 : devfn) % 256;
2985
2986                 if (idx == 0) {
2987                         /* First save what we may have and clean up */
2988                         if (new_ce) {
2989                                 tbl[tbl_idx] = new_ce;
2990                                 __iommu_flush_cache(iommu, new_ce,
2991                                                     VTD_PAGE_SIZE);
2992                                 pos = 1;
2993                         }
2994
2995                         if (old_ce)
2996                                 memunmap(old_ce);
2997
2998                         ret = 0;
2999                         if (devfn < 0x80)
3000                                 old_ce_phys = root_entry_lctp(&re);
3001                         else
3002                                 old_ce_phys = root_entry_uctp(&re);
3003
3004                         if (!old_ce_phys) {
3005                                 if (ext && devfn == 0) {
3006                                         /* No LCTP, try UCTP */
3007                                         devfn = 0x7f;
3008                                         continue;
3009                                 } else {
3010                                         goto out;
3011                                 }
3012                         }
3013
3014                         ret = -ENOMEM;
3015                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3016                                         MEMREMAP_WB);
3017                         if (!old_ce)
3018                                 goto out;
3019
3020                         new_ce = alloc_pgtable_page(iommu->node);
3021                         if (!new_ce)
3022                                 goto out_unmap;
3023
3024                         ret = 0;
3025                 }
3026
3027                 /* Now copy the context entry */
3028                 memcpy(&ce, old_ce + idx, sizeof(ce));
3029
3030                 if (!__context_present(&ce))
3031                         continue;
3032
3033                 did = context_domain_id(&ce);
3034                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3035                         set_bit(did, iommu->domain_ids);
3036
3037                 /*
3038                  * We need a marker for copied context entries. This
3039                  * marker needs to work for the old format as well as
3040                  * for extended context entries.
3041                  *
3042                  * Bit 67 of the context entry is used. In the old
3043                  * format this bit is available to software, in the
3044                  * extended format it is the PGE bit, but PGE is ignored
3045                  * by HW if PASIDs are disabled (and thus still
3046                  * available).
3047                  *
3048                  * So disable PASIDs first and then mark the entry
3049                  * copied. This means that we don't copy PASID
3050                  * translations from the old kernel, but this is fine as
3051                  * faults there are not fatal.
3052                  */
3053                 context_clear_pasid_enable(&ce);
3054                 context_set_copied(&ce);
3055
3056                 new_ce[idx] = ce;
3057         }
3058
3059         tbl[tbl_idx + pos] = new_ce;
3060
3061         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3062
3063 out_unmap:
3064         memunmap(old_ce);
3065
3066 out:
3067         return ret;
3068 }
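
/*
 * Layout assumed by the index arithmetic in copy_context_table(): in
 * extended mode every bus gets two new context tables (tbl_idx = bus * 2),
 * the old root entry supplies two table pointers (root_entry_lctp() for
 * devfn 0x00-0x7f, root_entry_uctp() for devfn 0x80-0xff), and each
 * extended entry occupies two legacy-sized slots, hence
 * idx = (devfn * 2) % 256.  In legacy mode a single 256-entry table per
 * bus is copied in one pass.
 */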
3069
3070 static int copy_translation_tables(struct intel_iommu *iommu)
3071 {
3072         struct context_entry **ctxt_tbls;
3073         struct root_entry *old_rt;
3074         phys_addr_t old_rt_phys;
3075         int ctxt_table_entries;
3076         unsigned long flags;
3077         u64 rtaddr_reg;
3078         int bus, ret;
3079         bool new_ext, ext;
3080
3081         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3082         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3083         new_ext    = !!ecap_ecs(iommu->ecap);
3084
3085         /*
3086          * The RTT bit can only be changed when translation is disabled,
3087          * but disabling translation would open a window for data
3088          * corruption. So bail out and don't copy anything if we would
3089          * have to change the bit.
3090          */
3091         if (new_ext != ext)
3092                 return -EINVAL;
3093
3094         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3095         if (!old_rt_phys)
3096                 return -EINVAL;
3097
3098         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3099         if (!old_rt)
3100                 return -ENOMEM;
3101
3102         /* This is too big for the stack - allocate it from slab */
3103         ctxt_table_entries = ext ? 512 : 256;
3104         ret = -ENOMEM;
3105         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3106         if (!ctxt_tbls)
3107                 goto out_unmap;
3108
3109         for (bus = 0; bus < 256; bus++) {
3110                 ret = copy_context_table(iommu, &old_rt[bus],
3111                                          ctxt_tbls, bus, ext);
3112                 if (ret) {
3113                         pr_err("%s: Failed to copy context table for bus %d\n",
3114                                 iommu->name, bus);
3115                         continue;
3116                 }
3117         }
3118
3119         spin_lock_irqsave(&iommu->lock, flags);
3120
3121         /* Context tables are copied, now write them to the root_entry table */
3122         for (bus = 0; bus < 256; bus++) {
3123                 int idx = ext ? bus * 2 : bus;
3124                 u64 val;
3125
3126                 if (ctxt_tbls[idx]) {
3127                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3128                         iommu->root_entry[bus].lo = val;
3129                 }
3130
3131                 if (!ext || !ctxt_tbls[idx + 1])
3132                         continue;
3133
3134                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3135                 iommu->root_entry[bus].hi = val;
3136         }
3137
3138         spin_unlock_irqrestore(&iommu->lock, flags);
3139
3140         kfree(ctxt_tbls);
3141
3142         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3143
3144         ret = 0;
3145
3146 out_unmap:
3147         memunmap(old_rt);
3148
3149         return ret;
3150 }
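
/*
 * Example of how the copied tables end up wired into the root table,
 * assuming ext == true: for bus 3, ctxt_tbls[6] (devfn 0x00-0x7f) is
 * written to iommu->root_entry[3].lo and ctxt_tbls[7] (devfn 0x80-0xff)
 * to iommu->root_entry[3].hi, each with bit 0 set to mark it present.
 */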
3151
3152 static int __init init_dmars(void)
3153 {
3154         struct dmar_drhd_unit *drhd;
3155         struct intel_iommu *iommu;
3156         int ret;
3157
3158         /*
3159          * for each drhd
3160          *    allocate root
3161          *    initialize and program root entry to not present
3162          * endfor
3163          */
3164         for_each_drhd_unit(drhd) {
3165                 /*
3166                  * No lock needed: this is only incremented on the single-
3167                  * threaded kernel __init code path; all other accesses are
3168                  * read-only.
3169                  */
3170                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3171                         g_num_of_iommus++;
3172                         continue;
3173                 }
3174                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3175         }
3176
3177         /* Preallocate enough resources for IOMMU hot-addition */
3178         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3179                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3180
3181         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3182                         GFP_KERNEL);
3183         if (!g_iommus) {
3184                 pr_err("Allocating global iommu array failed\n");
3185                 ret = -ENOMEM;
3186                 goto error;
3187         }
3188
3189         for_each_iommu(iommu, drhd) {
3190                 if (drhd->ignored) {
3191                         iommu_disable_translation(iommu);
3192                         continue;
3193                 }
3194
3195                 /*
3196                  * Find the max PASID size of all IOMMUs in the system.
3197                  * The system-wide PASID table must be no bigger than the
3198                  * smallest size supported by any IOMMU.
3199                  */
3200                 if (pasid_supported(iommu)) {
3201                         u32 temp = 2 << ecap_pss(iommu->ecap);
3202
3203                         intel_pasid_max_id = min_t(u32, temp,
3204                                                    intel_pasid_max_id);
3205                 }
3206
3207                 g_iommus[iommu->seq_id] = iommu;
3208
3209                 intel_iommu_init_qi(iommu);
3210
3211                 ret = iommu_init_domains(iommu);
3212                 if (ret)
3213                         goto free_iommu;
3214
3215                 init_translation_status(iommu);
3216
3217                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3218                         iommu_disable_translation(iommu);
3219                         clear_translation_pre_enabled(iommu);
3220                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3221                                 iommu->name);
3222                 }
3223
3224                 /*
3225                  * TBD:
3226                  * we could share the same root & context tables
3227                  * among all IOMMUs. This needs to be split out later.
3228                  */
3229                 ret = iommu_alloc_root_entry(iommu);
3230                 if (ret)
3231                         goto free_iommu;
3232
3233                 if (translation_pre_enabled(iommu)) {
3234                         pr_info("Translation already enabled - trying to copy translation structures\n");
3235
3236                         ret = copy_translation_tables(iommu);
3237                         if (ret) {
3238                                 /*
3239                                  * We found the IOMMU with translation
3240                                  * enabled - but failed to copy over the
3241                                  * old root-entry table. Try to proceed
3242                                  * by disabling translation now and
3243                                  * allocating a clean root-entry table.
3244                                  * This might cause DMAR faults, but
3245                                  * probably the dump will still succeed.
3246                                  */
3247                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3248                                        iommu->name);
3249                                 iommu_disable_translation(iommu);
3250                                 clear_translation_pre_enabled(iommu);
3251                         } else {
3252                                 pr_info("Copied translation tables from previous kernel for %s\n",
3253                                         iommu->name);
3254                         }
3255                 }
3256
3257                 if (!ecap_pass_through(iommu->ecap))
3258                         hw_pass_through = 0;
3259 #ifdef CONFIG_INTEL_IOMMU_SVM
3260                 if (pasid_supported(iommu))
3261                         intel_svm_init(iommu);
3262 #endif
3263         }
3264
3265         /*
3266          * Now that QI is enabled on all IOMMUs, set the root entry and flush
3267          * caches. This is required on some Intel X58 chipsets; otherwise the
3268          * flush_context function will loop forever and the boot hangs.
3269          */
3270         for_each_active_iommu(iommu, drhd) {
3271                 iommu_flush_write_buffer(iommu);
3272                 iommu_set_root_entry(iommu);
3273                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3274                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3275         }
3276
3277         if (iommu_pass_through)
3278                 iommu_identity_mapping |= IDENTMAP_ALL;
3279
3280 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3281         dmar_map_gfx = 0;
3282 #endif
3283
3284         if (!dmar_map_gfx)
3285                 iommu_identity_mapping |= IDENTMAP_GFX;
3286
3287         check_tylersburg_isoch();
3288
3289         ret = si_domain_init(hw_pass_through);
3290         if (ret)
3291                 goto free_iommu;
3292
3293         /*
3294          * for each drhd
3295          *   enable fault log
3296          *   global invalidate context cache
3297          *   global invalidate iotlb
3298          *   enable translation
3299          */
3300         for_each_iommu(iommu, drhd) {
3301                 if (drhd->ignored) {
3302                         /*
3303                          * we always have to disable PMRs or DMA may fail on
3304                          * this device
3305                          */
3306                         if (force_on)
3307                                 iommu_disable_protect_mem_regions(iommu);
3308                         continue;
3309                 }
3310
3311                 iommu_flush_write_buffer(iommu);
3312
3313 #ifdef CONFIG_INTEL_IOMMU_SVM
3314                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3315                         /*
3316                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3317                          * could cause a lock race, so drop the lock across the call.
3318                          */
3319                         up_write(&dmar_global_lock);
3320                         ret = intel_svm_enable_prq(iommu);
3321                         down_write(&dmar_global_lock);
3322                         if (ret)
3323                                 goto free_iommu;
3324                 }
3325 #endif
3326                 ret = dmar_set_interrupt(iommu);
3327                 if (ret)
3328                         goto free_iommu;
3329         }
3330
3331         return 0;
3332
3333 free_iommu:
3334         for_each_active_iommu(iommu, drhd) {
3335                 disable_dmar_iommu(iommu);
3336                 free_dmar_iommu(iommu);
3337         }
3338
3339         kfree(g_iommus);
3340
3341 error:
3342         return ret;
3343 }
3344
3345 /* This takes a number of _MM_ pages, not VTD pages */
3346 static unsigned long intel_alloc_iova(struct device *dev,
3347                                      struct dmar_domain *domain,
3348                                      unsigned long nrpages, uint64_t dma_mask)
3349 {
3350         unsigned long iova_pfn;
3351
3352         /* Restrict dma_mask to the width that the iommu can handle */
3353         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3354         /* Ensure we reserve the whole size-aligned region */
3355         nrpages = __roundup_pow_of_two(nrpages);
3356
3357         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3358                 /*
3359                  * First try to allocate an I/O virtual address within
3360                  * DMA_BIT_MASK(32); if that fails, try allocating from
3361                  * the higher range.
3362                  */
3363                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3364                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3365                 if (iova_pfn)
3366                         return iova_pfn;
3367         }
3368         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3369                                    IOVA_PFN(dma_mask), true);
3370         if (unlikely(!iova_pfn)) {
3371                 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3372                 return 0;
3373         }
3374
3375         return iova_pfn;
3376 }
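
/*
 * Allocation example for intel_alloc_iova(): a request for 3 MM pages is
 * rounded up to 4 so the whole naturally aligned region gets reserved,
 * and for a 64-bit dma_mask (without forcedac) the allocator first tries
 * to place the IOVA below 4GiB before falling back to the full mask.
 */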
3377
3378 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3379 {
3380         struct dmar_domain *domain, *tmp;
3381         struct dmar_rmrr_unit *rmrr;
3382         struct device *i_dev;
3383         int i, ret;
3384
3385         /* The device shouldn't already be attached to any domain. */
3386         domain = find_domain(dev);
3387         if (domain)
3388                 return NULL;
3389
3390         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3391         if (!domain)
3392                 goto out;
3393
3394         /* We have a new domain - set up possible RMRRs for the device */
3395         rcu_read_lock();
3396         for_each_rmrr_units(rmrr) {
3397                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3398                                           i, i_dev) {
3399                         if (i_dev != dev)
3400                                 continue;
3401
3402                         ret = domain_prepare_identity_map(dev, domain,
3403                                                           rmrr->base_address,
3404                                                           rmrr->end_address);
3405                         if (ret)
3406                                 dev_err(dev, "Mapping reserved region failed\n");
3407                 }
3408         }
3409         rcu_read_unlock();
3410
3411         tmp = set_domain_for_dev(dev, domain);
3412         if (!tmp || domain != tmp) {
3413                 domain_exit(domain);
3414                 domain = tmp;
3415         }
3416
3417 out:
3418         if (!domain)
3419                 dev_err(dev, "Allocating domain failed\n");
3420         else
3421                 domain->domain.type = IOMMU_DOMAIN_DMA;
3422
3423         return domain;
3424 }
3425
3426 /* Check if the dev needs to go through the non-identity map and unmap process. */
3427 static bool iommu_need_mapping(struct device *dev)
3428 {
3429         int ret;
3430
3431         if (iommu_dummy(dev))
3432                 return false;
3433
3434         ret = identity_mapping(dev);
3435         if (ret) {
3436                 u64 dma_mask = *dev->dma_mask;
3437
3438                 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3439                         dma_mask = dev->coherent_dma_mask;
3440
3441                 if (dma_mask >= dma_get_required_mask(dev))
3442                         return false;
3443
3444                 /*
3445                  * A device limited to 32-bit DMA is removed from si_domain
3446                  * and falls back to a non-identity mapping.
3447                  */
3448                 dmar_remove_one_dev_info(dev);
3449                 ret = iommu_request_dma_domain_for_dev(dev);
3450                 if (ret) {
3451                         struct iommu_domain *domain;
3452                         struct dmar_domain *dmar_domain;
3453
3454                         domain = iommu_get_domain_for_dev(dev);
3455                         if (domain) {
3456                                 dmar_domain = to_dmar_domain(domain);
3457                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3458                         }
3459                         dmar_remove_one_dev_info(dev);
3460                         get_private_domain_for_dev(dev);
3461                 }
3462
3463                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3464         }
3465
3466         return true;
3467 }
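
/*
 * The DMA ops below use iommu_need_mapping() as the gate between the
 * IOMMU path and direct mapping: when it returns false they simply call
 * the corresponding dma_direct_* helper (see e.g. intel_map_page()).
 */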
3468
3469 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3470                                      size_t size, int dir, u64 dma_mask)
3471 {
3472         struct dmar_domain *domain;
3473         phys_addr_t start_paddr;
3474         unsigned long iova_pfn;
3475         int prot = 0;
3476         int ret;
3477         struct intel_iommu *iommu;
3478         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3479
3480         BUG_ON(dir == DMA_NONE);
3481
3482         domain = find_domain(dev);
3483         if (!domain)
3484                 return DMA_MAPPING_ERROR;
3485
3486         iommu = domain_get_iommu(domain);
3487         size = aligned_nrpages(paddr, size);
3488
3489         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3490         if (!iova_pfn)
3491                 goto error;
3492
3493         /*
3494          * Check if DMAR supports zero-length reads on write-only
3495          * mappings.
3496          */
3497         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3498                         !cap_zlr(iommu->cap))
3499                 prot |= DMA_PTE_READ;
3500         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3501                 prot |= DMA_PTE_WRITE;
3502         /*
3503          * paddr to (paddr + size) might span a partial page, so map the whole
3504          * page.  Note: if two parts of one page are mapped separately, we
3505          * might end up with two guest addresses mapping to the same host
3506          * paddr, but this is not a big problem.
3507          */
3508         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3509                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3510         if (ret)
3511                 goto error;
3512
3513         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3514         start_paddr += paddr & ~PAGE_MASK;
3515         return start_paddr;
3516
3517 error:
3518         if (iova_pfn)
3519                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3520         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3521                 size, (unsigned long long)paddr, dir);
3522         return DMA_MAPPING_ERROR;
3523 }
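
/*
 * Note on the handle returned by __intel_map_single(): the sub-page
 * offset of paddr is preserved.  With 4KiB pages, mapping paddr
 * 0x12345678 behind IOVA pfn 0xabcd yields dma_addr 0xabcd678.
 */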
3524
3525 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3526                                  unsigned long offset, size_t size,
3527                                  enum dma_data_direction dir,
3528                                  unsigned long attrs)
3529 {
3530         if (iommu_need_mapping(dev))
3531                 return __intel_map_single(dev, page_to_phys(page) + offset,
3532                                 size, dir, *dev->dma_mask);
3533         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3534 }
3535
3536 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3537                                      size_t size, enum dma_data_direction dir,
3538                                      unsigned long attrs)
3539 {
3540         if (iommu_need_mapping(dev))
3541                 return __intel_map_single(dev, phys_addr, size, dir,
3542                                 *dev->dma_mask);
3543         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3544 }
3545
3546 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3547 {
3548         struct dmar_domain *domain;
3549         unsigned long start_pfn, last_pfn;
3550         unsigned long nrpages;
3551         unsigned long iova_pfn;
3552         struct intel_iommu *iommu;
3553         struct page *freelist;
3554         struct pci_dev *pdev = NULL;
3555
3556         domain = find_domain(dev);
3557         BUG_ON(!domain);
3558
3559         iommu = domain_get_iommu(domain);
3560
3561         iova_pfn = IOVA_PFN(dev_addr);
3562
3563         nrpages = aligned_nrpages(dev_addr, size);
3564         start_pfn = mm_to_dma_pfn(iova_pfn);
3565         last_pfn = start_pfn + nrpages - 1;
3566
3567         if (dev_is_pci(dev))
3568                 pdev = to_pci_dev(dev);
3569
3570         dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3571
3572         freelist = domain_unmap(domain, start_pfn, last_pfn);
3573
3574         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3575                         !has_iova_flush_queue(&domain->iovad)) {
3576                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3577                                       nrpages, !freelist, 0);
3578                 /* free iova */
3579                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3580                 dma_free_pagelist(freelist);
3581         } else {
3582                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3583                            (unsigned long)freelist);
3584                 /*
3585                  * Queue up the release of the unmap to save the ~1/6th of the
3586                  * CPU time used up by the IOTLB flush operation.
3587                  */
3588         }
3589 }
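
/*
 * Two unmap flavours above: the strict path (intel_iommu_strict, an
 * untrusted device, or no flush queue) invalidates the IOTLB and frees
 * the IOVA immediately, while the default path defers both through
 * queue_iova() so the IOVA flush queue can batch the invalidations.
 */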
3590
3591 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3592                              size_t size, enum dma_data_direction dir,
3593                              unsigned long attrs)
3594 {
3595         if (iommu_need_mapping(dev))
3596                 intel_unmap(dev, dev_addr, size);
3597         else
3598                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3599 }
3600
3601 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3602                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3603 {
3604         if (iommu_need_mapping(dev))
3605                 intel_unmap(dev, dev_addr, size);
3606 }
3607
3608 static void *intel_alloc_coherent(struct device *dev, size_t size,
3609                                   dma_addr_t *dma_handle, gfp_t flags,
3610                                   unsigned long attrs)
3611 {
3612         struct page *page = NULL;
3613         int order;
3614
3615         if (!iommu_need_mapping(dev))
3616                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3617
3618         size = PAGE_ALIGN(size);
3619         order = get_order(size);
3620
3621         if (gfpflags_allow_blocking(flags)) {
3622                 unsigned int count = size >> PAGE_SHIFT;
3623
3624                 page = dma_alloc_from_contiguous(dev, count, order,
3625                                                  flags & __GFP_NOWARN);
3626         }
3627
3628         if (!page)
3629                 page = alloc_pages(flags, order);
3630         if (!page)
3631                 return NULL;
3632         memset(page_address(page), 0, size);
3633
3634         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3635                                          DMA_BIDIRECTIONAL,
3636                                          dev->coherent_dma_mask);
3637         if (*dma_handle != DMA_MAPPING_ERROR)
3638                 return page_address(page);
3639         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3640                 __free_pages(page, order);
3641
3642         return NULL;
3643 }
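
/*
 * Allocation order in intel_alloc_coherent(): try the contiguous (CMA)
 * allocator when the gfp flags allow blocking, fall back to
 * alloc_pages(), zero the buffer, then map it DMA_BIDIRECTIONAL against
 * the device's coherent DMA mask; the pages are released again if the
 * mapping fails.
 */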
3644
3645 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3646                                 dma_addr_t dma_handle, unsigned long attrs)
3647 {
3648         int order;
3649         struct page *page = virt_to_page(vaddr);
3650
3651         if (!iommu_need_mapping(dev))
3652                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3653
3654         size = PAGE_ALIGN(size);
3655         order = get_order(size);
3656
3657         intel_unmap(dev, dma_handle, size);
3658         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3659                 __free_pages(page, order);
3660 }
3661
3662 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3663                            int nelems, enum dma_data_direction dir,
3664                            unsigned long attrs)
3665 {
3666         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3667         unsigned long nrpages = 0;
3668         struct scatterlist *sg;
3669         int i;
3670
3671         if (!iommu_need_mapping(dev))
3672                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3673
3674         for_each_sg(sglist, sg, nelems, i) {
3675                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3676         }
3677
3678         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3679 }
3680
3681 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3682                         enum dma_data_direction dir, unsigned long attrs)
3683 {
3684         int i;
3685         struct dmar_domain *domain;
3686         size_t size = 0;
3687         int prot = 0;
3688         unsigned long iova_pfn;
3689         int ret;
3690         struct scatterlist *sg;
3691         unsigned long start_vpfn;
3692         struct intel_iommu *iommu;
3693
3694         BUG_ON(dir == DMA_NONE);
3695         if (!iommu_need_mapping(dev))
3696                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3697
3698         domain = find_domain(dev);
3699         if (!domain)
3700                 return 0;
3701
3702         iommu = domain_get_iommu(domain);
3703
3704         for_each_sg(sglist, sg, nelems, i)
3705                 size += aligned_nrpages(sg->offset, sg->length);
3706
3707         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3708                                 *dev->dma_mask);
3709         if (!iova_pfn) {
3710                 sglist->dma_length = 0;
3711                 return 0;
3712         }
3713
3714         /*
3715          * Check if DMAR supports zero-length reads on write-only
3716          * mappings.
3717          */
3718         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3719                         !cap_zlr(iommu->cap))
3720                 prot |= DMA_PTE_READ;
3721         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3722                 prot |= DMA_PTE_WRITE;
3723
3724         start_vpfn = mm_to_dma_pfn(iova_pfn);
3725
3726         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3727         if (unlikely(ret)) {
3728                 dma_pte_free_pagetable(domain, start_vpfn,
3729                                        start_vpfn + size - 1,
3730                                        agaw_to_level(domain->agaw) + 1);
3731                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3732                 return 0;
3733         }
3734
3735         return nelems;
3736 }
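
/*
 * Sizing in intel_map_sg(): the per-segment page counts from
 * aligned_nrpages() are summed first, a single IOVA range covering the
 * whole scatterlist is allocated, and domain_sg_mapping() lays the
 * segments out starting at start_vpfn; on failure the partially built
 * page tables and the IOVA range are torn down again.
 */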
3737
3738 static const struct dma_map_ops intel_dma_ops = {
3739         .alloc = intel_alloc_coherent,
3740         .free = intel_free_coherent,
3741         .map_sg = intel_map_sg,
3742         .unmap_sg = intel_unmap_sg,
3743         .map_page = intel_map_page,
3744         .unmap_page = intel_unmap_page,
3745         .map_resource = intel_map_resource,
3746         .unmap_resource = intel_unmap_resource,
3747         .dma_supported = dma_direct_supported,
3748 };
3749
3750 static inline int iommu_domain_cache_init(void)
3751 {
3752         int ret = 0;
3753
3754         iommu_domain_cache = kmem_cache_create("iommu_domain",
3755                                          sizeof(struct dmar_domain),
3756                                          0,
3757                                          SLAB_HWCACHE_ALIGN,
3758
3759                                          NULL);
3760         if (!iommu_domain_cache) {
3761                 pr_err("Couldn't create iommu_domain cache\n");
3762                 ret = -ENOMEM;
3763         }
3764
3765         return ret;
3766 }
3767
3768 static inline int iommu_devinfo_cache_init(void)
3769 {
3770         int ret = 0;
3771
3772         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3773                                          sizeof(struct device_domain_info),
3774                                          0,
3775                                          SLAB_HWCACHE_ALIGN,
3776                                          NULL);
3777         if (!iommu_devinfo_cache) {
3778                 pr_err("Couldn't create devinfo cache\n");
3779                 ret = -ENOMEM;
3780         }
3781
3782         return ret;
3783 }
3784
3785 static int __init iommu_init_mempool(void)
3786 {
3787         int ret;
3788         ret = iova_cache_get();
3789         if (ret)
3790                 return ret;
3791
3792         ret = iommu_domain_cache_init();
3793         if (ret)
3794                 goto domain_error;
3795
3796         ret = iommu_devinfo_cache_init();
3797         if (!ret)
3798                 return ret;
3799
3800         kmem_cache_destroy(iommu_domain_cache);
3801 domain_error:
3802         iova_cache_put();
3803
3804         return -ENOMEM;
3805 }
3806
3807 static void __init iommu_exit_mempool(void)
3808 {
3809         kmem_cache_destroy(iommu_devinfo_cache);
3810         kmem_cache_destroy(iommu_domain_cache);
3811         iova_cache_put();
3812 }
3813
3814 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3815 {
3816         struct dmar_drhd_unit *drhd;
3817         u32 vtbar;
3818         int rc;
3819
3820         /* We know that this device on this chipset has its own IOMMU.
3821          * If we find it under a different IOMMU, then the BIOS is lying
3822          * to us. Hope that the IOMMU for this device is actually
3823          * disabled, and it needs no translation...
3824          */
3825         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3826         if (rc) {
3827                 /* "can't" happen */
3828                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3829                 return;
3830         }
3831         vtbar &= 0xffff0000;
3832
3833         /* We know that this IOMMU should be at offset 0xa000 from vtbar */
3834         drhd = dmar_find_matched_drhd_unit(pdev);
3835         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3836                             TAINT_FIRMWARE_WORKAROUND,
3837                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3838                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3839 }
3840 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3841
3842 static void __init init_no_remapping_devices(void)
3843 {
3844         struct dmar_drhd_unit *drhd;
3845         struct device *dev;
3846         int i;
3847
3848         for_each_drhd_unit(drhd) {
3849                 if (!drhd->include_all) {
3850                         for_each_active_dev_scope(drhd->devices,
3851                                                   drhd->devices_cnt, i, dev)
3852                                 break;
3853                         /* ignore DMAR unit if no devices exist */
3854                         if (i == drhd->devices_cnt)
3855                                 drhd->ignored = 1;
3856                 }
3857         }
3858
3859         for_each_active_drhd_unit(drhd) {
3860                 if (drhd->include_all)
3861                         continue;
3862
3863                 for_each_active_dev_scope(drhd->devices,
3864                                           drhd->devices_cnt, i, dev)
3865                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3866                                 break;
3867                 if (i < drhd->devices_cnt)
3868                         continue;
3869
3870                 /* This IOMMU has *only* gfx devices. Either bypass it or
3871                    set the gfx_mapped flag, as appropriate */
3872                 if (!dmar_map_gfx) {
3873                         drhd->ignored = 1;
3874                         for_each_active_dev_scope(drhd->devices,
3875                                                   drhd->devices_cnt, i, dev)
3876                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3877                 }
3878         }
3879 }
3880
3881 #ifdef CONFIG_SUSPEND
3882 static int init_iommu_hw(void)
3883 {
3884         struct dmar_drhd_unit *drhd;
3885         struct intel_iommu *iommu = NULL;
3886
3887         for_each_active_iommu(iommu, drhd)
3888                 if (iommu->qi)
3889                         dmar_reenable_qi(iommu);
3890
3891         for_each_iommu(iommu, drhd) {
3892                 if (drhd->ignored) {
3893                         /*
3894                          * we always have to disable PMRs or DMA may fail on
3895                          * this device
3896                          */
3897                         if (force_on)
3898                                 iommu_disable_protect_mem_regions(iommu);
3899                         continue;
3900                 }
3901
3902                 iommu_flush_write_buffer(iommu);
3903
3904                 iommu_set_root_entry(iommu);
3905
3906                 iommu->flush.flush_context(iommu, 0, 0, 0,
3907                                            DMA_CCMD_GLOBAL_INVL);
3908                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3909                 iommu_enable_translation(iommu);
3910                 iommu_disable_protect_mem_regions(iommu);
3911         }
3912
3913         return 0;
3914 }
3915
3916 static void iommu_flush_all(void)
3917 {
3918         struct dmar_drhd_unit *drhd;
3919         struct intel_iommu *iommu;
3920
3921         for_each_active_iommu(iommu, drhd) {
3922                 iommu->flush.flush_context(iommu, 0, 0, 0,
3923                                            DMA_CCMD_GLOBAL_INVL);
3924                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3925                                          DMA_TLB_GLOBAL_FLUSH);
3926         }
3927 }
3928
3929 static int iommu_suspend(void)
3930 {
3931         struct dmar_drhd_unit *drhd;
3932         struct intel_iommu *iommu = NULL;
3933         unsigned long flag;
3934
3935         for_each_active_iommu(iommu, drhd) {
3936                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3937                                                  GFP_ATOMIC);
3938                 if (!iommu->iommu_state)
3939                         goto nomem;
3940         }
3941
3942         iommu_flush_all();
3943
3944         for_each_active_iommu(iommu, drhd) {
3945                 iommu_disable_translation(iommu);
3946
3947                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3948
3949                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3950                         readl(iommu->reg + DMAR_FECTL_REG);
3951                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3952                         readl(iommu->reg + DMAR_FEDATA_REG);
3953                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3954                         readl(iommu->reg + DMAR_FEADDR_REG);
3955                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3956                         readl(iommu->reg + DMAR_FEUADDR_REG);
3957
3958                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3959         }
3960         return 0;
3961
3962 nomem:
3963         for_each_active_iommu(iommu, drhd)
3964                 kfree(iommu->iommu_state);
3965
3966         return -ENOMEM;
3967 }
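
/*
 * Only the fault-event registers are saved across suspend: FECTL
 * (control), FEDATA (data), FEADDR (address) and FEUADDR (upper
 * address).  iommu_resume() below re-initializes the hardware via
 * init_iommu_hw() and then writes these values back.
 */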
3968
3969 static void iommu_resume(void)
3970 {
3971         struct dmar_drhd_unit *drhd;
3972         struct intel_iommu *iommu = NULL;
3973         unsigned long flag;
3974
3975         if (init_iommu_hw()) {
3976                 if (force_on)
3977                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3978                 else
3979                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3980                 return;
3981         }
3982
3983         for_each_active_iommu(iommu, drhd) {
3984
3985                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3986
3987                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3988                         iommu->reg + DMAR_FECTL_REG);
3989                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3990                         iommu->reg + DMAR_FEDATA_REG);
3991                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3992                         iommu->reg + DMAR_FEADDR_REG);
3993                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3994                         iommu->reg + DMAR_FEUADDR_REG);
3995
3996                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3997         }
3998
3999         for_each_active_iommu(iommu, drhd)
4000                 kfree(iommu->iommu_state);
4001 }
4002
4003 static struct syscore_ops iommu_syscore_ops = {
4004         .resume         = iommu_resume,
4005         .suspend        = iommu_suspend,
4006 };
4007
4008 static void __init init_iommu_pm_ops(void)
4009 {
4010         register_syscore_ops(&iommu_syscore_ops);
4011 }
4012
4013 #else
4014 static inline void init_iommu_pm_ops(void) {}
4015 #endif  /* CONFIG_SUSPEND */
4016
4017 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4018 {
4019         struct acpi_dmar_reserved_memory *rmrr;
4020         struct dmar_rmrr_unit *rmrru;
4021
4022         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4023         if (!rmrru)
4024                 goto out;
4025
4026         rmrru->hdr = header;
4027         rmrr = (struct acpi_dmar_reserved_memory *)header;
4028         rmrru->base_address = rmrr->base_address;
4029         rmrru->end_address = rmrr->end_address;
4030
4031         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4032                                 ((void *)rmrr) + rmrr->header.length,
4033                                 &rmrru->devices_cnt);
4034         if (rmrru->devices_cnt && rmrru->devices == NULL)
4035                 goto free_rmrru;
4036
4037         list_add(&rmrru->list, &dmar_rmrr_units);
4038
4039         return 0;
4040 free_rmrru:
4041         kfree(rmrru);
4042 out:
4043         return -ENOMEM;
4044 }
4045
4046 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4047 {
4048         struct dmar_atsr_unit *atsru;
4049         struct acpi_dmar_atsr *tmp;
4050
4051         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4052                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4053                 if (atsr->segment != tmp->segment)
4054                         continue;
4055                 if (atsr->header.length != tmp->header.length)
4056                         continue;
4057                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4058                         return atsru;
4059         }
4060
4061         return NULL;
4062 }
4063
4064 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4065 {
4066         struct acpi_dmar_atsr *atsr;
4067         struct dmar_atsr_unit *atsru;
4068
4069         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4070                 return 0;
4071
4072         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4073         atsru = dmar_find_atsr(atsr);
4074         if (atsru)
4075                 return 0;
4076
4077         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4078         if (!atsru)
4079                 return -ENOMEM;
4080
4081         /*
4082          * If memory is allocated from slab by the ACPI _DSM method, we need to
4083          * copy the memory content because the memory buffer will be freed
4084          * on return.
4085          */
4086         atsru->hdr = (void *)(atsru + 1);
4087         memcpy(atsru->hdr, hdr, hdr->length);
4088         atsru->include_all = atsr->flags & 0x1;
4089         if (!atsru->include_all) {
4090                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4091                                 (void *)atsr + atsr->header.length,
4092                                 &atsru->devices_cnt);
4093                 if (atsru->devices_cnt && atsru->devices == NULL) {
4094                         kfree(atsru);
4095                         return -ENOMEM;
4096                 }
4097         }
4098
4099         list_add_rcu(&atsru->list, &dmar_atsr_units);
4100
4101         return 0;
4102 }
4103
4104 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4105 {
4106         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4107         kfree(atsru);
4108 }
4109
4110 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4111 {
4112         struct acpi_dmar_atsr *atsr;
4113         struct dmar_atsr_unit *atsru;
4114
4115         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4116         atsru = dmar_find_atsr(atsr);
4117         if (atsru) {
4118                 list_del_rcu(&atsru->list);
4119                 synchronize_rcu();
4120                 intel_iommu_free_atsr(atsru);
4121         }
4122
4123         return 0;
4124 }
4125
4126 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4127 {
4128         int i;
4129         struct device *dev;
4130         struct acpi_dmar_atsr *atsr;
4131         struct dmar_atsr_unit *atsru;
4132
4133         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4134         atsru = dmar_find_atsr(atsr);
4135         if (!atsru)
4136                 return 0;
4137
4138         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4139                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4140                                           i, dev)
4141                         return -EBUSY;
4142         }
4143
4144         return 0;
4145 }
4146
4147 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4148 {
4149         int sp, ret;
4150         struct intel_iommu *iommu = dmaru->iommu;
4151
4152         if (g_iommus[iommu->seq_id])
4153                 return 0;
4154
4155         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4156                 pr_warn("%s: Doesn't support hardware pass through.\n",
4157                         iommu->name);
4158                 return -ENXIO;
4159         }
4160         if (!ecap_sc_support(iommu->ecap) &&
4161             domain_update_iommu_snooping(iommu)) {
4162                 pr_warn("%s: Doesn't support snooping.\n",
4163                         iommu->name);
4164                 return -ENXIO;
4165         }
4166         sp = domain_update_iommu_superpage(iommu) - 1;
4167         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4168                 pr_warn("%s: Doesn't support large page.\n",
4169                         iommu->name);
4170                 return -ENXIO;
4171         }
4172
4173         /*
4174          * Disable translation if already enabled prior to OS handover.
4175          */
4176         if (iommu->gcmd & DMA_GCMD_TE)
4177                 iommu_disable_translation(iommu);
4178
4179         g_iommus[iommu->seq_id] = iommu;
4180         ret = iommu_init_domains(iommu);
4181         if (ret == 0)
4182                 ret = iommu_alloc_root_entry(iommu);
4183         if (ret)
4184                 goto out;
4185
4186 #ifdef CONFIG_INTEL_IOMMU_SVM
4187         if (pasid_supported(iommu))
4188                 intel_svm_init(iommu);
4189 #endif
4190
4191         if (dmaru->ignored) {
4192                 /*
4193                  * we always have to disable PMRs or DMA may fail on this device
4194                  */
4195                 if (force_on)
4196                         iommu_disable_protect_mem_regions(iommu);
4197                 return 0;
4198         }
4199
4200         intel_iommu_init_qi(iommu);
4201         iommu_flush_write_buffer(iommu);
4202
4203 #ifdef CONFIG_INTEL_IOMMU_SVM
4204         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4205                 ret = intel_svm_enable_prq(iommu);
4206                 if (ret)
4207                         goto disable_iommu;
4208         }
4209 #endif
4210         ret = dmar_set_interrupt(iommu);
4211         if (ret)
4212                 goto disable_iommu;
4213
4214         iommu_set_root_entry(iommu);
4215         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4216         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4217         iommu_enable_translation(iommu);
4218
4219         iommu_disable_protect_mem_regions(iommu);
4220         return 0;
4221
4222 disable_iommu:
4223         disable_dmar_iommu(iommu);
4224 out:
4225         free_dmar_iommu(iommu);
4226         return ret;
4227 }
4228
4229 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4230 {
4231         int ret = 0;
4232         struct intel_iommu *iommu = dmaru->iommu;
4233
4234         if (!intel_iommu_enabled)
4235                 return 0;
4236         if (iommu == NULL)
4237                 return -EINVAL;
4238
4239         if (insert) {
4240                 ret = intel_iommu_add(dmaru);
4241         } else {
4242                 disable_dmar_iommu(iommu);
4243                 free_dmar_iommu(iommu);
4244         }
4245
4246         return ret;
4247 }
4248
4249 static void intel_iommu_free_dmars(void)
4250 {
4251         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4252         struct dmar_atsr_unit *atsru, *atsr_n;
4253
4254         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4255                 list_del(&rmrru->list);
4256                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4257                 kfree(rmrru);
4258         }
4259
4260         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4261                 list_del(&atsru->list);
4262                 intel_iommu_free_atsr(atsru);
4263         }
4264 }
4265
4266 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4267 {
4268         int i, ret = 1;
4269         struct pci_bus *bus;
4270         struct pci_dev *bridge = NULL;
4271         struct device *tmp;
4272         struct acpi_dmar_atsr *atsr;
4273         struct dmar_atsr_unit *atsru;
4274
4275         dev = pci_physfn(dev);
4276         for (bus = dev->bus; bus; bus = bus->parent) {
4277                 bridge = bus->self;
4278                 /* If it's an integrated device, allow ATS */
4279                 if (!bridge)
4280                         return 1;
4281                 /* Connected via non-PCIe: no ATS */
4282                 if (!pci_is_pcie(bridge) ||
4283                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4284                         return 0;
4285                 /* If we found the root port, look it up in the ATSR */
4286                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4287                         break;
4288         }
4289
4290         rcu_read_lock();
4291         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4292                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4293                 if (atsr->segment != pci_domain_nr(dev->bus))
4294                         continue;
4295
4296                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4297                         if (tmp == &bridge->dev)
4298                                 goto out;
4299
4300                 if (atsru->include_all)
4301                         goto out;
4302         }
4303         ret = 0;
4304 out:
4305         rcu_read_unlock();
4306
4307         return ret;
4308 }
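
/*
 * Decision summary for dmar_find_matched_atsr_unit(): an integrated
 * device (no parent bridge) is always allowed ATS; anything reached
 * through a non-PCIe link or a PCIe-to-PCI bridge is not; otherwise the
 * root port the device sits under must appear in an ATSR device scope,
 * or the ATSR must be marked include_all, for ATS to be allowed.
 */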
4309
4310 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4311 {
4312         int ret;
4313         struct dmar_rmrr_unit *rmrru;
4314         struct dmar_atsr_unit *atsru;
4315         struct acpi_dmar_atsr *atsr;
4316         struct acpi_dmar_reserved_memory *rmrr;
4317
4318         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4319                 return 0;
4320
4321         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4322                 rmrr = container_of(rmrru->hdr,
4323                                     struct acpi_dmar_reserved_memory, header);
4324                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4325                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4326                                 ((void *)rmrr) + rmrr->header.length,
4327                                 rmrr->segment, rmrru->devices,
4328                                 rmrru->devices_cnt);
4329                         if (ret < 0)
4330                                 return ret;
4331                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4332                         dmar_remove_dev_scope(info, rmrr->segment,
4333                                 rmrru->devices, rmrru->devices_cnt);
4334                 }
4335         }
4336
4337         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4338                 if (atsru->include_all)
4339                         continue;
4340
4341                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4342                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4343                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4344                                         (void *)atsr + atsr->header.length,
4345                                         atsr->segment, atsru->devices,
4346                                         atsru->devices_cnt);
4347                         if (ret > 0)
4348                                 break;
4349                         else if (ret < 0)
4350                                 return ret;
4351                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4352                         if (dmar_remove_dev_scope(info, atsr->segment,
4353                                         atsru->devices, atsru->devices_cnt))
4354                                 break;
4355                 }
4356         }
4357
4358         return 0;
4359 }
4360
4361 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4362                                        unsigned long val, void *v)
4363 {
4364         struct memory_notify *mhp = v;
4365         unsigned long long start, end;
4366         unsigned long start_vpfn, last_vpfn;
4367
4368         switch (val) {
4369         case MEM_GOING_ONLINE:
4370                 start = mhp->start_pfn << PAGE_SHIFT;
4371                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4372                 if (iommu_domain_identity_map(si_domain, start, end)) {
4373                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4374                                 start, end);
4375                         return NOTIFY_BAD;
4376                 }
4377                 break;
4378
4379         case MEM_OFFLINE:
4380         case MEM_CANCEL_ONLINE:
4381                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4382                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4383                 while (start_vpfn <= last_vpfn) {
4384                         struct iova *iova;
4385                         struct dmar_drhd_unit *drhd;
4386                         struct intel_iommu *iommu;
4387                         struct page *freelist;
4388
4389                         iova = find_iova(&si_domain->iovad, start_vpfn);
4390                         if (iova == NULL) {
4391                                 pr_debug("Failed get IOVA for PFN %lx\n",
4392                                          start_vpfn);
4393                                 break;
4394                         }
4395
4396                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4397                                                      start_vpfn, last_vpfn);
4398                         if (iova == NULL) {
4399                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4400                                         start_vpfn, last_vpfn);
4401                                 return NOTIFY_BAD;
4402                         }
4403
4404                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4405                                                iova->pfn_hi);
4406
4407                         rcu_read_lock();
4408                         for_each_active_iommu(iommu, drhd)
4409                                 iommu_flush_iotlb_psi(iommu, si_domain,
4410                                         iova->pfn_lo, iova_size(iova),
4411                                         !freelist, 0);
4412                         rcu_read_unlock();
4413                         dma_free_pagelist(freelist);
4414
4415                         start_vpfn = iova->pfn_hi + 1;
4416                         free_iova_mem(iova);
4417                 }
4418                 break;
4419         }
4420
4421         return NOTIFY_OK;
4422 }
4423
4424 static struct notifier_block intel_iommu_memory_nb = {
4425         .notifier_call = intel_iommu_memory_notifier,
4426         .priority = 0
4427 };
4428
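/*
 * CPU hotplug support: when a CPU goes offline (see intel_iommu_cpu_dead()
 * below), release the IOVAs cached on that CPU for every domain of every
 * IOMMU so that they can be reused elsewhere.
 */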
4429 static void free_all_cpu_cached_iovas(unsigned int cpu)
4430 {
4431         int i;
4432
4433         for (i = 0; i < g_num_of_iommus; i++) {
4434                 struct intel_iommu *iommu = g_iommus[i];
4435                 struct dmar_domain *domain;
4436                 int did;
4437
4438                 if (!iommu)
4439                         continue;
4440
4441                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4442                         domain = get_iommu_domain(iommu, (u16)did);
4443
4444                         if (!domain)
4445                                 continue;
4446                         free_cpu_cached_iovas(cpu, &domain->iovad);
4447                 }
4448         }
4449 }
4450
4451 static int intel_iommu_cpu_dead(unsigned int cpu)
4452 {
4453         free_all_cpu_cached_iovas(cpu);
4454         return 0;
4455 }
4456
4457 static void intel_disable_iommus(void)
4458 {
4459         struct intel_iommu *iommu = NULL;
4460         struct dmar_drhd_unit *drhd;
4461
4462         for_each_iommu(iommu, drhd)
4463                 iommu_disable_translation(iommu);
4464 }
4465
4466 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4467 {
4468         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4469
4470         return container_of(iommu_dev, struct intel_iommu, iommu);
4471 }
4472
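/*
 * sysfs attributes for each DMAR unit, exposed in the "intel-iommu" group
 * registered via iommu_device_sysfs_add() in intel_iommu_init(): the
 * architecture version, register base address, raw capability and extended
 * capability registers, and the supported/used domain counts.
 */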
4473 static ssize_t intel_iommu_show_version(struct device *dev,
4474                                         struct device_attribute *attr,
4475                                         char *buf)
4476 {
4477         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4478         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4479         return sprintf(buf, "%d:%d\n",
4480                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4481 }
4482 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4483
4484 static ssize_t intel_iommu_show_address(struct device *dev,
4485                                         struct device_attribute *attr,
4486                                         char *buf)
4487 {
4488         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4489         return sprintf(buf, "%llx\n", iommu->reg_phys);
4490 }
4491 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4492
4493 static ssize_t intel_iommu_show_cap(struct device *dev,
4494                                     struct device_attribute *attr,
4495                                     char *buf)
4496 {
4497         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4498         return sprintf(buf, "%llx\n", iommu->cap);
4499 }
4500 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4501
4502 static ssize_t intel_iommu_show_ecap(struct device *dev,
4503                                     struct device_attribute *attr,
4504                                     char *buf)
4505 {
4506         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4507         return sprintf(buf, "%llx\n", iommu->ecap);
4508 }
4509 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4510
4511 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4512                                       struct device_attribute *attr,
4513                                       char *buf)
4514 {
4515         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4516         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4517 }
4518 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4519
4520 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4521                                            struct device_attribute *attr,
4522                                            char *buf)
4523 {
4524         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4525         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4526                                                   cap_ndoms(iommu->cap)));
4527 }
4528 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4529
4530 static struct attribute *intel_iommu_attrs[] = {
4531         &dev_attr_version.attr,
4532         &dev_attr_address.attr,
4533         &dev_attr_cap.attr,
4534         &dev_attr_ecap.attr,
4535         &dev_attr_domains_supported.attr,
4536         &dev_attr_domains_used.attr,
4537         NULL,
4538 };
4539
4540 static struct attribute_group intel_iommu_group = {
4541         .name = "intel-iommu",
4542         .attrs = intel_iommu_attrs,
4543 };
4544
4545 const struct attribute_group *intel_iommu_groups[] = {
4546         &intel_iommu_group,
4547         NULL,
4548 };
4549
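/*
 * Returns true if any PCI device in the system is marked untrusted
 * (typically a device behind an external-facing port such as Thunderbolt).
 * Used to decide whether platform opt-in forces the IOMMU on and whether
 * swiotlb bounce buffering needs to stay around.
 */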
4550 static inline bool has_untrusted_dev(void)
4551 {
4552         struct pci_dev *pdev = NULL;
4553
4554         for_each_pci_dev(pdev)
4555                 if (pdev->untrusted)
4556                         return true;
4557
4558         return false;
4559 }
4560
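/*
 * DMAR platform opt-in: if the firmware asks for DMA protection and an
 * untrusted device is present, force the IOMMU on even when it was disabled
 * on the command line; if Intel-IOMMU was disabled by default, fall back to
 * identity mapping for all devices except untrusted ones.
 */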
4561 static int __init platform_optin_force_iommu(void)
4562 {
4563         if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4564                 return 0;
4565
4566         if (no_iommu || dmar_disabled)
4567                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4568
4569         /*
4570          * If Intel-IOMMU is disabled by default, we will apply identity
4571          * map for all devices except those marked as being untrusted.
4572          */
4573         if (dmar_disabled)
4574                 iommu_identity_mapping |= IDENTMAP_ALL;
4575
4576         dmar_disabled = 0;
4577         no_iommu = 0;
4578
4579         return 1;
4580 }
4581
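/*
 * Probe the ACPI namespace devices listed in the DRHD device scopes so that
 * their physical companion devices are attached to the Intel IOMMU ops;
 * devices that already belong to an IOMMU group are skipped.
 */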
4582 static int __init probe_acpi_namespace_devices(void)
4583 {
4584         struct dmar_drhd_unit *drhd;
4585         /* To avoid a -Wunused-but-set-variable warning. */
4586         struct intel_iommu *iommu __maybe_unused;
4587         struct device *dev;
4588         int i, ret = 0;
4589
4590         for_each_active_iommu(iommu, drhd) {
4591                 for_each_active_dev_scope(drhd->devices,
4592                                           drhd->devices_cnt, i, dev) {
4593                         struct acpi_device_physical_node *pn;
4594                         struct iommu_group *group;
4595                         struct acpi_device *adev;
4596
4597                         if (dev->bus != &acpi_bus_type)
4598                                 continue;
4599
4600                         adev = to_acpi_device(dev);
4601                         mutex_lock(&adev->physical_node_lock);
4602                         list_for_each_entry(pn,
4603                                             &adev->physical_node_list, node) {
4604                                 group = iommu_group_get(pn->dev);
4605                                 if (group) {
4606                                         iommu_group_put(group);
4607                                         continue;
4608                                 }
4609
4610                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4611                                 ret = iommu_probe_device(pn->dev);
4612                                 if (ret)
4613                                         break;
4614                         }
4615                         mutex_unlock(&adev->physical_node_lock);
4616
4617                         if (ret)
4618                                 return ret;
4619                 }
4620         }
4621
4622         return 0;
4623 }
4624
4625 int __init intel_iommu_init(void)
4626 {
4627         int ret = -ENODEV;
4628         struct dmar_drhd_unit *drhd;
4629         struct intel_iommu *iommu;
4630
4631         /*
4632          * Intel IOMMU is required for a TXT/tboot launch or platform
4633          * opt in, so enforce that.
4634          */
4635         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4636
4637         if (iommu_init_mempool()) {
4638                 if (force_on)
4639                         panic("tboot: Failed to initialize iommu memory\n");
4640                 return -ENOMEM;
4641         }
4642
4643         down_write(&dmar_global_lock);
4644         if (dmar_table_init()) {
4645                 if (force_on)
4646                         panic("tboot: Failed to initialize DMAR table\n");
4647                 goto out_free_dmar;
4648         }
4649
4650         if (dmar_dev_scope_init() < 0) {
4651                 if (force_on)
4652                         panic("tboot: Failed to initialize DMAR device scope\n");
4653                 goto out_free_dmar;
4654         }
4655
4656         up_write(&dmar_global_lock);
4657
4658         /*
4659          * The bus notifier takes the dmar_global_lock, so lockdep will
4660          * complain later when we register it under the lock.
4661          */
4662         dmar_register_bus_notifier();
4663
4664         down_write(&dmar_global_lock);
4665
4666         if (no_iommu || dmar_disabled) {
4667                 /*
4668                  * We exit the function here to ensure IOMMU's remapping and
4669                  * mempool aren't set up, which means that the IOMMU's PMRs
4670                  * won't be disabled via the call to init_dmars(). So disable
4671                  * them explicitly here. The PMRs were set up by tboot prior to
4672                  * calling SENTER, but the kernel is expected to reset/tear
4673                  * down the PMRs.
4674                  */
4675                 if (intel_iommu_tboot_noforce) {
4676                         for_each_iommu(iommu, drhd)
4677                                 iommu_disable_protect_mem_regions(iommu);
4678                 }
4679
4680                 /*
4681                  * Make sure the IOMMUs are switched off, even when we
4682                  * boot into a kexec kernel and the previous kernel left
4683                  * them enabled.
4684                  */
4685                 intel_disable_iommus();
4686                 goto out_free_dmar;
4687         }
4688
4689         if (list_empty(&dmar_rmrr_units))
4690                 pr_info("No RMRR found\n");
4691
4692         if (list_empty(&dmar_atsr_units))
4693                 pr_info("No ATSR found\n");
4694
4695         if (dmar_init_reserved_ranges()) {
4696                 if (force_on)
4697                         panic("tboot: Failed to reserve iommu ranges\n");
4698                 goto out_free_reserved_range;
4699         }
4700
4701         if (dmar_map_gfx)
4702                 intel_iommu_gfx_mapped = 1;
4703
4704         init_no_remapping_devices();
4705
4706         ret = init_dmars();
4707         if (ret) {
4708                 if (force_on)
4709                         panic("tboot: Failed to initialize DMARs\n");
4710                 pr_err("Initialization failed\n");
4711                 goto out_free_reserved_range;
4712         }
4713         up_write(&dmar_global_lock);
4714
4715 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4716         /*
4717          * If the system has no untrusted device or the user has decided
4718          * to disable the bounce page mechanism, we don't need swiotlb.
4719          * Note this here so that the pre-allocated bounce pages can be
4720          * released later.
4721          */
4722         if (!has_untrusted_dev() || intel_no_bounce)
4723                 swiotlb = 0;
4724 #endif
4725         dma_ops = &intel_dma_ops;
4726
4727         init_iommu_pm_ops();
4728
4729         for_each_active_iommu(iommu, drhd) {
4730                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4731                                        intel_iommu_groups,
4732                                        "%s", iommu->name);
4733                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4734                 iommu_device_register(&iommu->iommu);
4735         }
4736
4737         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4738         if (si_domain && !hw_pass_through)
4739                 register_memory_notifier(&intel_iommu_memory_nb);
4740         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4741                           intel_iommu_cpu_dead);
4742
4743         down_read(&dmar_global_lock);
4744         if (probe_acpi_namespace_devices())
4745                 pr_warn("ACPI namespace devices didn't probe correctly\n");
4746         up_read(&dmar_global_lock);
4747
4748         /* Finally, we enable the DMA remapping hardware. */
4749         for_each_iommu(iommu, drhd) {
4750                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4751                         iommu_enable_translation(iommu);
4752
4753                 iommu_disable_protect_mem_regions(iommu);
4754         }
4755         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4756
4757         intel_iommu_enabled = 1;
4758         intel_iommu_debugfs_init();
4759
4760         return 0;
4761
4762 out_free_reserved_range:
4763         put_iova_domain(&reserved_iova_list);
4764 out_free_dmar:
4765         intel_iommu_free_dmars();
4766         up_write(&dmar_global_lock);
4767         iommu_exit_mempool();
4768         return ret;
4769 }
4770
4771 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4772 {
4773         struct dmar_domain *domain;
4774         struct intel_iommu *iommu;
4775         unsigned long flags;
4776
4777         assert_spin_locked(&device_domain_lock);
4778
4779         if (WARN_ON(!info))
4780                 return;
4781
4782         iommu = info->iommu;
4783         domain = info->domain;
4784
4785         if (info->dev) {
4786                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4787                         intel_pasid_tear_down_entry(iommu, info->dev,
4788                                         PASID_RID2PASID);
4789
4790                 iommu_disable_dev_iotlb(info);
4791                 domain_context_clear_one(iommu, info->bus, info->devfn);
4792                 intel_pasid_free_table(info->dev);
4793         }
4794
4795         unlink_domain_info(info);
4796
4797         spin_lock_irqsave(&iommu->lock, flags);
4798         domain_detach_iommu(domain, iommu);
4799         spin_unlock_irqrestore(&iommu->lock, flags);
4800
4801         /* free the private domain */
4802         if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
4803             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
4804             list_empty(&domain->devices))
4805                 domain_exit(info->domain);
4806
4807         free_devinfo_mem(info);
4808 }
4809
4810 static void dmar_remove_one_dev_info(struct device *dev)
4811 {
4812         struct device_domain_info *info;
4813         unsigned long flags;
4814
4815         spin_lock_irqsave(&device_domain_lock, flags);
4816         info = dev->archdata.iommu;
4817         if (info)
4818                 __dmar_remove_one_dev_info(info);
4819         spin_unlock_irqrestore(&device_domain_lock, flags);
4820 }
4821
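/*
 * Initialize a domain created through the external IOMMU API (see
 * intel_iommu_domain_alloc()): set up its IOVA allocator and reserved
 * ranges, derive the AGAW from the requested guest address width, and
 * allocate the top level page table page.
 */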
4822 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4823 {
4824         int adjust_width;
4825
4826         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4827         domain_reserve_special_ranges(domain);
4828
4829         /* calculate AGAW */
4830         domain->gaw = guest_width;
4831         adjust_width = guestwidth_to_adjustwidth(guest_width);
4832         domain->agaw = width_to_agaw(adjust_width);
4833
4834         domain->iommu_coherency = 0;
4835         domain->iommu_snooping = 0;
4836         domain->iommu_superpage = 0;
4837         domain->max_addr = 0;
4838
4839         /* always allocate the top pgd */
4840         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4841         if (!domain->pgd)
4842                 return -ENOMEM;
4843         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4844         return 0;
4845 }
4846
4847 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4848 {
4849         struct dmar_domain *dmar_domain;
4850         struct iommu_domain *domain;
4851
4852         switch (type) {
4853         case IOMMU_DOMAIN_DMA:
4854         /* fallthrough */
4855         case IOMMU_DOMAIN_UNMANAGED:
4856                 dmar_domain = alloc_domain(0);
4857                 if (!dmar_domain) {
4858                         pr_err("Can't allocate dmar_domain\n");
4859                         return NULL;
4860                 }
4861                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4862                         pr_err("Domain initialization failed\n");
4863                         domain_exit(dmar_domain);
4864                         return NULL;
4865                 }
4866
4867                 if (type == IOMMU_DOMAIN_DMA &&
4868                     init_iova_flush_queue(&dmar_domain->iovad,
4869                                           iommu_flush_iova, iova_entry_free)) {
4870                         pr_warn("iova flush queue initialization failed\n");
4871                         intel_iommu_strict = 1;
4872                 }
4873
4874                 domain_update_iommu_cap(dmar_domain);
4875
4876                 domain = &dmar_domain->domain;
4877                 domain->geometry.aperture_start = 0;
4878                 domain->geometry.aperture_end   =
4879                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4880                 domain->geometry.force_aperture = true;
4881
4882                 return domain;
4883         case IOMMU_DOMAIN_IDENTITY:
4884                 return &si_domain->domain;
4885         default:
4886                 return NULL;
4887         }
4888
4889         return NULL;
4890 }
4891
4892 static void intel_iommu_domain_free(struct iommu_domain *domain)
4893 {
4894         if (domain != &si_domain->domain)
4895                 domain_exit(to_dmar_domain(domain));
4896 }
4897
4898 /*
4899  * Check whether a @domain could be attached to the @dev through the
4900  * aux-domain attach/detach APIs.
4901  */
4902 static inline bool
4903 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4904 {
4905         struct device_domain_info *info = dev->archdata.iommu;
4906
4907         return info && info->auxd_enabled &&
4908                         domain->type == IOMMU_DOMAIN_UNMANAGED;
4909 }
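
/*
 * Rough sketch of the expected caller flow for the aux-domain path (for
 * instance a mediated device driver), going through the generic iommu core
 * wrappers that back the dev_*_feat and aux_* ops at the bottom of this
 * file; the wrapper names are listed for illustration only:
 *
 *        if (!iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX)) {
 *                domain = iommu_domain_alloc(dev->bus);
 *                iommu_aux_attach_device(domain, dev);
 *                pasid = iommu_aux_get_pasid(domain, dev);
 *                ...
 *                iommu_aux_detach_device(domain, dev);
 *                iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
 *        }
 */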
4910
4911 static void auxiliary_link_device(struct dmar_domain *domain,
4912                                   struct device *dev)
4913 {
4914         struct device_domain_info *info = dev->archdata.iommu;
4915
4916         assert_spin_locked(&device_domain_lock);
4917         if (WARN_ON(!info))
4918                 return;
4919
4920         domain->auxd_refcnt++;
4921         list_add(&domain->auxd, &info->auxiliary_domains);
4922 }
4923
4924 static void auxiliary_unlink_device(struct dmar_domain *domain,
4925                                     struct device *dev)
4926 {
4927         struct device_domain_info *info = dev->archdata.iommu;
4928
4929         assert_spin_locked(&device_domain_lock);
4930         if (WARN_ON(!info))
4931                 return;
4932
4933         list_del(&domain->auxd);
4934         domain->auxd_refcnt--;
4935
4936         if (!domain->auxd_refcnt && domain->default_pasid > 0)
4937                 intel_pasid_free_id(domain->default_pasid);
4938 }
4939
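/*
 * Attach @dev to @domain as an auxiliary domain: allocate the domain's
 * default PASID on first use, attach the domain to the device's IOMMU and
 * install a second level PASID table entry for that PASID.
 */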
4940 static int aux_domain_add_dev(struct dmar_domain *domain,
4941                               struct device *dev)
4942 {
4943         int ret;
4944         u8 bus, devfn;
4945         unsigned long flags;
4946         struct intel_iommu *iommu;
4947
4948         iommu = device_to_iommu(dev, &bus, &devfn);
4949         if (!iommu)
4950                 return -ENODEV;
4951
4952         if (domain->default_pasid <= 0) {
4953                 int pasid;
4954
4955                 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
4956                                              pci_max_pasids(to_pci_dev(dev)),
4957                                              GFP_KERNEL);
4958                 if (pasid <= 0) {
4959                         pr_err("Can't allocate default pasid\n");
4960                         return -ENODEV;
4961                 }
4962                 domain->default_pasid = pasid;
4963         }
4964
4965         spin_lock_irqsave(&device_domain_lock, flags);
4966         /*
4967          * iommu->lock must be held to attach domain to iommu and setup the
4968          * pasid entry for second level translation.
4969          */
4970         spin_lock(&iommu->lock);
4971         ret = domain_attach_iommu(domain, iommu);
4972         if (ret)
4973                 goto attach_failed;
4974
4975         /* Set up the PASID entry for mediated devices: */
4976         ret = intel_pasid_setup_second_level(iommu, domain, dev,
4977                                              domain->default_pasid);
4978         if (ret)
4979                 goto table_failed;
4980         spin_unlock(&iommu->lock);
4981
4982         auxiliary_link_device(domain, dev);
4983
4984         spin_unlock_irqrestore(&device_domain_lock, flags);
4985
4986         return 0;
4987
4988 table_failed:
4989         domain_detach_iommu(domain, iommu);
4990 attach_failed:
4991         spin_unlock(&iommu->lock);
4992         spin_unlock_irqrestore(&device_domain_lock, flags);
4993         if (!domain->auxd_refcnt && domain->default_pasid > 0)
4994                 intel_pasid_free_id(domain->default_pasid);
4995
4996         return ret;
4997 }
4998
4999 static void aux_domain_remove_dev(struct dmar_domain *domain,
5000                                   struct device *dev)
5001 {
5002         struct device_domain_info *info;
5003         struct intel_iommu *iommu;
5004         unsigned long flags;
5005
5006         if (!is_aux_domain(dev, &domain->domain))
5007                 return;
5008
5009         spin_lock_irqsave(&device_domain_lock, flags);
5010         info = dev->archdata.iommu;
5011         iommu = info->iommu;
5012
5013         auxiliary_unlink_device(domain, dev);
5014
5015         spin_lock(&iommu->lock);
5016         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5017         domain_detach_iommu(domain, iommu);
5018         spin_unlock(&iommu->lock);
5019
5020         spin_unlock_irqrestore(&device_domain_lock, flags);
5021 }
5022
5023 static int prepare_domain_attach_device(struct iommu_domain *domain,
5024                                         struct device *dev)
5025 {
5026         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5027         struct intel_iommu *iommu;
5028         int addr_width;
5029         u8 bus, devfn;
5030
5031         iommu = device_to_iommu(dev, &bus, &devfn);
5032         if (!iommu)
5033                 return -ENODEV;
5034
5035         /* check if this iommu agaw is sufficient for max mapped address */
5036         addr_width = agaw_to_width(iommu->agaw);
5037         if (addr_width > cap_mgaw(iommu->cap))
5038                 addr_width = cap_mgaw(iommu->cap);
5039
5040         if (dmar_domain->max_addr > (1LL << addr_width)) {
5041                 dev_err(dev,
5042                         "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5043                         __func__, addr_width, dmar_domain->max_addr);
5044                 return -EFAULT;
5045         }
5046         dmar_domain->gaw = addr_width;
5047
5048         /*
5049          * Knock out extra levels of page tables if necessary
5050          */
5051         while (iommu->agaw < dmar_domain->agaw) {
5052                 struct dma_pte *pte;
5053
5054                 pte = dmar_domain->pgd;
5055                 if (dma_pte_present(pte)) {
5056                         dmar_domain->pgd = (struct dma_pte *)
5057                                 phys_to_virt(dma_pte_addr(pte));
5058                         free_pgtable_page(pte);
5059                 }
5060                 dmar_domain->agaw--;
5061         }
5062
5063         return 0;
5064 }
5065
5066 static int intel_iommu_attach_device(struct iommu_domain *domain,
5067                                      struct device *dev)
5068 {
5069         int ret;
5070
5071         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5072             device_is_rmrr_locked(dev)) {
5073                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5074                 return -EPERM;
5075         }
5076
5077         if (is_aux_domain(dev, domain))
5078                 return -EPERM;
5079
5080         /* normally dev is not mapped */
5081         if (unlikely(domain_context_mapped(dev))) {
5082                 struct dmar_domain *old_domain;
5083
5084                 old_domain = find_domain(dev);
5085                 if (old_domain)
5086                         dmar_remove_one_dev_info(dev);
5087         }
5088
5089         ret = prepare_domain_attach_device(domain, dev);
5090         if (ret)
5091                 return ret;
5092
5093         return domain_add_dev_info(to_dmar_domain(domain), dev);
5094 }
5095
5096 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5097                                          struct device *dev)
5098 {
5099         int ret;
5100
5101         if (!is_aux_domain(dev, domain))
5102                 return -EPERM;
5103
5104         ret = prepare_domain_attach_device(domain, dev);
5105         if (ret)
5106                 return ret;
5107
5108         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5109 }
5110
5111 static void intel_iommu_detach_device(struct iommu_domain *domain,
5112                                       struct device *dev)
5113 {
5114         dmar_remove_one_dev_info(dev);
5115 }
5116
5117 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5118                                           struct device *dev)
5119 {
5120         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5121 }
5122
5123 static int intel_iommu_map(struct iommu_domain *domain,
5124                            unsigned long iova, phys_addr_t hpa,
5125                            size_t size, int iommu_prot)
5126 {
5127         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5128         u64 max_addr;
5129         int prot = 0;
5130         int ret;
5131
5132         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5133                 return -EINVAL;
5134
5135         if (iommu_prot & IOMMU_READ)
5136                 prot |= DMA_PTE_READ;
5137         if (iommu_prot & IOMMU_WRITE)
5138                 prot |= DMA_PTE_WRITE;
5139         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5140                 prot |= DMA_PTE_SNP;
5141
5142         max_addr = iova + size;
5143         if (dmar_domain->max_addr < max_addr) {
5144                 u64 end;
5145
5146                 /* check if minimum agaw is sufficient for mapped address */
5147                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5148                 if (end < max_addr) {
5149                         pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5150                                __func__, dmar_domain->gaw,
5151                                max_addr);
5152                         return -EFAULT;
5153                 }
5154                 dmar_domain->max_addr = max_addr;
5155         }
5156         /* Round up size to next multiple of PAGE_SIZE, if it and
5157            the low bits of hpa would take us onto the next page */
5158         size = aligned_nrpages(hpa, size);
5159         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5160                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5161         return ret;
5162 }
5163
5164 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5165                                 unsigned long iova, size_t size)
5166 {
5167         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5168         struct page *freelist = NULL;
5169         unsigned long start_pfn, last_pfn;
5170         unsigned int npages;
5171         int iommu_id, level = 0;
5172
5173         /* Cope with horrid API which requires us to unmap more than the
5174            size argument if it happens to be a large-page mapping. */
5175         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5176         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5177                 return 0;
5178
5179         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5180                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5181
5182         start_pfn = iova >> VTD_PAGE_SHIFT;
5183         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5184
5185         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5186
5187         npages = last_pfn - start_pfn + 1;
5188
5189         for_each_domain_iommu(iommu_id, dmar_domain)
5190                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5191                                       start_pfn, npages, !freelist, 0);
5192
5193         dma_free_pagelist(freelist);
5194
5195         if (dmar_domain->max_addr == iova + size)
5196                 dmar_domain->max_addr = iova;
5197
5198         return size;
5199 }
5200
5201 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5202                                             dma_addr_t iova)
5203 {
5204         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5205         struct dma_pte *pte;
5206         int level = 0;
5207         u64 phys = 0;
5208
5209         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5210                 return 0;
5211
5212         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5213         if (pte)
5214                 phys = dma_pte_addr(pte);
5215
5216         return phys;
5217 }
5218
5219 static inline bool scalable_mode_support(void)
5220 {
5221         struct dmar_drhd_unit *drhd;
5222         struct intel_iommu *iommu;
5223         bool ret = true;
5224
5225         rcu_read_lock();
5226         for_each_active_iommu(iommu, drhd) {
5227                 if (!sm_supported(iommu)) {
5228                         ret = false;
5229                         break;
5230                 }
5231         }
5232         rcu_read_unlock();
5233
5234         return ret;
5235 }
5236
5237 static inline bool iommu_pasid_support(void)
5238 {
5239         struct dmar_drhd_unit *drhd;
5240         struct intel_iommu *iommu;
5241         bool ret = true;
5242
5243         rcu_read_lock();
5244         for_each_active_iommu(iommu, drhd) {
5245                 if (!pasid_supported(iommu)) {
5246                         ret = false;
5247                         break;
5248                 }
5249         }
5250         rcu_read_unlock();
5251
5252         return ret;
5253 }
5254
5255 static bool intel_iommu_capable(enum iommu_cap cap)
5256 {
5257         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5258                 return domain_update_iommu_snooping(NULL) == 1;
5259         if (cap == IOMMU_CAP_INTR_REMAP)
5260                 return irq_remapping_enabled == 1;
5261
5262         return false;
5263 }
5264
5265 static int intel_iommu_add_device(struct device *dev)
5266 {
5267         struct dmar_domain *dmar_domain;
5268         struct iommu_domain *domain;
5269         struct intel_iommu *iommu;
5270         struct iommu_group *group;
5271         u8 bus, devfn;
5272         int ret;
5273
5274         iommu = device_to_iommu(dev, &bus, &devfn);
5275         if (!iommu)
5276                 return -ENODEV;
5277
5278         iommu_device_link(&iommu->iommu, dev);
5279
5280         if (translation_pre_enabled(iommu))
5281                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5282
5283         group = iommu_group_get_for_dev(dev);
5284
5285         if (IS_ERR(group))
5286                 return PTR_ERR(group);
5287
5288         iommu_group_put(group);
5289
5290         domain = iommu_get_domain_for_dev(dev);
5291         dmar_domain = to_dmar_domain(domain);
5292         if (domain->type == IOMMU_DOMAIN_DMA) {
5293                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5294                         ret = iommu_request_dm_for_dev(dev);
5295                         if (ret) {
5296                                 dmar_remove_one_dev_info(dev);
5297                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5298                                 domain_add_dev_info(si_domain, dev);
5299                                 dev_info(dev,
5300                                          "Device uses a private identity domain.\n");
5301                         }
5302                 }
5303         } else {
5304                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5305                         ret = iommu_request_dma_domain_for_dev(dev);
5306                         if (ret) {
5307                                 dmar_remove_one_dev_info(dev);
5308                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5309                                 if (!get_private_domain_for_dev(dev)) {
5310                                         dev_warn(dev,
5311                                                  "Failed to get a private domain.\n");
5312                                         return -ENOMEM;
5313                                 }
5314
5315                                 dev_info(dev,
5316                                          "Device uses a private dma domain.\n");
5317                         }
5318                 }
5319         }
5320
5321         return 0;
5322 }
5323
5324 static void intel_iommu_remove_device(struct device *dev)
5325 {
5326         struct intel_iommu *iommu;
5327         u8 bus, devfn;
5328
5329         iommu = device_to_iommu(dev, &bus, &devfn);
5330         if (!iommu)
5331                 return;
5332
5333         dmar_remove_one_dev_info(dev);
5334
5335         iommu_group_remove_device(dev);
5336
5337         iommu_device_unlink(&iommu->iommu, dev);
5338 }
5339
5340 static void intel_iommu_get_resv_regions(struct device *device,
5341                                          struct list_head *head)
5342 {
5343         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5344         struct iommu_resv_region *reg;
5345         struct dmar_rmrr_unit *rmrr;
5346         struct device *i_dev;
5347         int i;
5348
5349         down_read(&dmar_global_lock);
5350         for_each_rmrr_units(rmrr) {
5351                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5352                                           i, i_dev) {
5353                         struct iommu_resv_region *resv;
5354                         enum iommu_resv_type type;
5355                         size_t length;
5356
5357                         if (i_dev != device &&
5358                             !is_downstream_to_pci_bridge(device, i_dev))
5359                                 continue;
5360
5361                         length = rmrr->end_address - rmrr->base_address + 1;
5362
5363                         type = device_rmrr_is_relaxable(device) ?
5364                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5365
5366                         resv = iommu_alloc_resv_region(rmrr->base_address,
5367                                                        length, prot, type);
5368                         if (!resv)
5369                                 break;
5370
5371                         list_add_tail(&resv->list, head);
5372                 }
5373         }
5374         up_read(&dmar_global_lock);
5375
5376 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5377         if (dev_is_pci(device)) {
5378                 struct pci_dev *pdev = to_pci_dev(device);
5379
5380                 if (IS_ISA_DEVICE(pdev)) {
5381                         reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5382                                                       IOMMU_RESV_DIRECT);
5383                         if (reg)
5384                                 list_add_tail(&reg->list, head);
5385                 }
5386         }
5387 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5388
5389         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5390                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5391                                       0, IOMMU_RESV_MSI);
5392         if (!reg)
5393                 return;
5394         list_add_tail(&reg->list, head);
5395 }
5396
5397 static void intel_iommu_put_resv_regions(struct device *dev,
5398                                          struct list_head *head)
5399 {
5400         struct iommu_resv_region *entry, *next;
5401
5402         list_for_each_entry_safe(entry, next, head, list)
5403                 kfree(entry);
5404 }
5405
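/*
 * Enable PASID support for @dev behind @iommu: set the PASID-enable bit in
 * the device's context entry (flushing the context cache entry for that
 * device if the bit was clear), and enable PASID/device-TLB support in the
 * device itself via iommu_enable_dev_iotlb() if it was not already enabled.
 */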
5406 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5407 {
5408         struct device_domain_info *info;
5409         struct context_entry *context;
5410         struct dmar_domain *domain;
5411         unsigned long flags;
5412         u64 ctx_lo;
5413         int ret;
5414
5415         domain = find_domain(dev);
5416         if (!domain)
5417                 return -EINVAL;
5418
5419         spin_lock_irqsave(&device_domain_lock, flags);
5420         spin_lock(&iommu->lock);
5421
5422         ret = -EINVAL;
5423         info = dev->archdata.iommu;
5424         if (!info || !info->pasid_supported)
5425                 goto out;
5426
5427         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5428         if (WARN_ON(!context))
5429                 goto out;
5430
5431         ctx_lo = context[0].lo;
5432
5433         if (!(ctx_lo & CONTEXT_PASIDE)) {
5434                 ctx_lo |= CONTEXT_PASIDE;
5435                 context[0].lo = ctx_lo;
5436                 wmb();
5437                 iommu->flush.flush_context(iommu,
5438                                            domain->iommu_did[iommu->seq_id],
5439                                            PCI_DEVID(info->bus, info->devfn),
5440                                            DMA_CCMD_MASK_NOBIT,
5441                                            DMA_CCMD_DEVICE_INVL);
5442         }
5443
5444         /* Enable PASID support in the device, if it wasn't already */
5445         if (!info->pasid_enabled)
5446                 iommu_enable_dev_iotlb(info);
5447
5448         ret = 0;
5449
5450  out:
5451         spin_unlock(&iommu->lock);
5452         spin_unlock_irqrestore(&device_domain_lock, flags);
5453
5454         return ret;
5455 }
5456
5457 static void intel_iommu_apply_resv_region(struct device *dev,
5458                                           struct iommu_domain *domain,
5459                                           struct iommu_resv_region *region)
5460 {
5461         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5462         unsigned long start, end;
5463
5464         start = IOVA_PFN(region->start);
5465         end   = IOVA_PFN(region->start + region->length - 1);
5466
5467         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5468 }
5469
5470 #ifdef CONFIG_INTEL_IOMMU_SVM
5471 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5472 {
5473         struct intel_iommu *iommu;
5474         u8 bus, devfn;
5475
5476         if (iommu_dummy(dev)) {
5477                 dev_warn(dev,
5478                          "No IOMMU translation for device; cannot enable SVM\n");
5479                 return NULL;
5480         }
5481
5482         iommu = device_to_iommu(dev, &bus, &devfn);
5483         if (!iommu) {
5484                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5485                 return NULL;
5486         }
5487
5488         return iommu;
5489 }
5490 #endif /* CONFIG_INTEL_IOMMU_SVM */
5491
5492 static int intel_iommu_enable_auxd(struct device *dev)
5493 {
5494         struct device_domain_info *info;
5495         struct intel_iommu *iommu;
5496         unsigned long flags;
5497         u8 bus, devfn;
5498         int ret;
5499
5500         iommu = device_to_iommu(dev, &bus, &devfn);
5501         if (!iommu || dmar_disabled)
5502                 return -EINVAL;
5503
5504         if (!sm_supported(iommu) || !pasid_supported(iommu))
5505                 return -EINVAL;
5506
5507         ret = intel_iommu_enable_pasid(iommu, dev);
5508         if (ret)
5509                 return -ENODEV;
5510
5511         spin_lock_irqsave(&device_domain_lock, flags);
5512         info = dev->archdata.iommu;
5513         info->auxd_enabled = 1;
5514         spin_unlock_irqrestore(&device_domain_lock, flags);
5515
5516         return 0;
5517 }
5518
5519 static int intel_iommu_disable_auxd(struct device *dev)
5520 {
5521         struct device_domain_info *info;
5522         unsigned long flags;
5523
5524         spin_lock_irqsave(&device_domain_lock, flags);
5525         info = dev->archdata.iommu;
5526         if (!WARN_ON(!info))
5527                 info->auxd_enabled = 0;
5528         spin_unlock_irqrestore(&device_domain_lock, flags);
5529
5530         return 0;
5531 }
5532
5533 /*
5534  * A PCI express designated vendor specific extended capability is defined
5535  * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
5536  * so that system software and tools can detect endpoint devices that
5537  * support Intel Scalable I/O Virtualization without a host driver
5538  * dependency.
5539  *
5540  * Returns the config space offset of the matching extended capability
5541  * structure, or 0 if the device does not support it.
5542  */
5543 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5544 {
5545         int pos;
5546         u16 vendor, id;
5547
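        /*
         * Walk the Designated Vendor-Specific Extended Capabilities
         * (extended capability ID 0x23).  DVSEC header 1 at offset 4 holds
         * the vendor ID and DVSEC header 2 at offset 8 the DVSEC ID; an
         * Intel DVSEC with ID 5 marks a Scalable IOV capable endpoint.
         */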
5548         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5549         while (pos) {
5550                 pci_read_config_word(pdev, pos + 4, &vendor);
5551                 pci_read_config_word(pdev, pos + 8, &id);
5552                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5553                         return pos;
5554
5555                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5556         }
5557
5558         return 0;
5559 }
5560
5561 static bool
5562 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5563 {
5564         if (feat == IOMMU_DEV_FEAT_AUX) {
5565                 int ret;
5566
5567                 if (!dev_is_pci(dev) || dmar_disabled ||
5568                     !scalable_mode_support() || !iommu_pasid_support())
5569                         return false;
5570
5571                 ret = pci_pasid_features(to_pci_dev(dev));
5572                 if (ret < 0)
5573                         return false;
5574
5575                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5576         }
5577
5578         return false;
5579 }
5580
5581 static int
5582 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5583 {
5584         if (feat == IOMMU_DEV_FEAT_AUX)
5585                 return intel_iommu_enable_auxd(dev);
5586
5587         return -ENODEV;
5588 }
5589
5590 static int
5591 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5592 {
5593         if (feat == IOMMU_DEV_FEAT_AUX)
5594                 return intel_iommu_disable_auxd(dev);
5595
5596         return -ENODEV;
5597 }
5598
5599 static bool
5600 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5601 {
5602         struct device_domain_info *info = dev->archdata.iommu;
5603
5604         if (feat == IOMMU_DEV_FEAT_AUX)
5605                 return scalable_mode_support() && info && info->auxd_enabled;
5606
5607         return false;
5608 }
5609
5610 static int
5611 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5612 {
5613         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5614
5615         return dmar_domain->default_pasid > 0 ?
5616                         dmar_domain->default_pasid : -EINVAL;
5617 }
5618
5619 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5620                                            struct device *dev)
5621 {
5622         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5623 }
5624
5625 const struct iommu_ops intel_iommu_ops = {
5626         .capable                = intel_iommu_capable,
5627         .domain_alloc           = intel_iommu_domain_alloc,
5628         .domain_free            = intel_iommu_domain_free,
5629         .attach_dev             = intel_iommu_attach_device,
5630         .detach_dev             = intel_iommu_detach_device,
5631         .aux_attach_dev         = intel_iommu_aux_attach_device,
5632         .aux_detach_dev         = intel_iommu_aux_detach_device,
5633         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5634         .map                    = intel_iommu_map,
5635         .unmap                  = intel_iommu_unmap,
5636         .iova_to_phys           = intel_iommu_iova_to_phys,
5637         .add_device             = intel_iommu_add_device,
5638         .remove_device          = intel_iommu_remove_device,
5639         .get_resv_regions       = intel_iommu_get_resv_regions,
5640         .put_resv_regions       = intel_iommu_put_resv_regions,
5641         .apply_resv_region      = intel_iommu_apply_resv_region,
5642         .device_group           = pci_device_group,
5643         .dev_has_feat           = intel_iommu_dev_has_feat,
5644         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5645         .dev_enable_feat        = intel_iommu_dev_enable_feat,
5646         .dev_disable_feat       = intel_iommu_dev_disable_feat,
5647         .is_attach_deferred     = intel_iommu_is_attach_deferred,
5648         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5649 };
5650
5651 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5652 {
5653         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5654         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5655         dmar_map_gfx = 0;
5656 }
5657
5658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5659 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5660 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5661 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5663 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5664 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5665
5666 static void quirk_iommu_rwbf(struct pci_dev *dev)
5667 {
5668         /*
5669          * Mobile 4 Series Chipset neglects to set RWBF capability,
5670          * but needs it. Same seems to hold for the desktop versions.
5671          */
5672         pci_info(dev, "Forcing write-buffer flush capability\n");
5673         rwbf_quirk = 1;
5674 }
5675
5676 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5677 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5678 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5679 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5680 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5681 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5682 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5683
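/*
 * GGC is read from config offset 0x52 of the devices the quirk below binds
 * to; bits 11:8 encode how much stolen memory the BIOS reserved for the
 * graphics GTT and whether a VT-d capable (shadow) GTT was allocated.  The
 * quirk disables the graphics IOMMU mapping when no shadow GTT exists.
 */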
5684 #define GGC 0x52
5685 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5686 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5687 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5688 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5689 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5690 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5691 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5692 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5693
5694 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5695 {
5696         unsigned short ggc;
5697
5698         if (pci_read_config_word(dev, GGC, &ggc))
5699                 return;
5700
5701         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5702                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5703                 dmar_map_gfx = 0;
5704         } else if (dmar_map_gfx) {
5705                 /* we have to ensure the gfx device is idle before we flush */
5706                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5707                 intel_iommu_strict = 1;
5708         }
5709 }
5710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5714
5715 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5716    ISOCH DMAR unit for the Azalia sound device, but not give it any
5717    TLB entries, which causes it to deadlock. Check for that.  We do
5718    this in a function called from init_dmars(), instead of in a PCI
5719    quirk, because we don't want to print the obnoxious "BIOS broken"
5720    message if VT-d is actually disabled.
5721 */
5722 static void __init check_tylersburg_isoch(void)
5723 {
5724         struct pci_dev *pdev;
5725         uint32_t vtisochctrl;
5726
5727         /* If there's no Azalia in the system anyway, forget it. */
5728         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5729         if (!pdev)
5730                 return;
5731         pci_dev_put(pdev);
5732
5733         /* System Management Registers. Might be hidden, in which case
5734            we can't do the sanity check. But that's OK, because the
5735            known-broken BIOSes _don't_ actually hide it, so far. */
5736         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5737         if (!pdev)
5738                 return;
5739
5740         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5741                 pci_dev_put(pdev);
5742                 return;
5743         }
5744
5745         pci_dev_put(pdev);
5746
5747         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5748         if (vtisochctrl & 1)
5749                 return;
5750
5751         /* Drop all bits other than the number of TLB entries */
5752         vtisochctrl &= 0x1c;
5753
5754         /* If we have the recommended number of TLB entries (16), fine. */
5755         if (vtisochctrl == 0x10)
5756                 return;
5757
5758         /* Zero TLB entries? You get to ride the short bus to school. */
5759         if (!vtisochctrl) {
5760                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5761                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5762                      dmi_get_system_info(DMI_BIOS_VENDOR),
5763                      dmi_get_system_info(DMI_BIOS_VERSION),
5764                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5765                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5766                 return;
5767         }
5768
5769         pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5770                vtisochctrl);
5771 }