iommu/vt-d: Fix IOMMU lookup for SR-IOV Virtual Functions
[linux-2.6-microblaze.git] / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/cpu.h>
37 #include <linux/timer.h>
38 #include <linux/io.h>
39 #include <linux/iova.h>
40 #include <linux/iommu.h>
41 #include <linux/intel-iommu.h>
42 #include <linux/syscore_ops.h>
43 #include <linux/tboot.h>
44 #include <linux/dmi.h>
45 #include <linux/pci-ats.h>
46 #include <linux/memblock.h>
47 #include <linux/dma-contiguous.h>
48 #include <linux/crash_dump.h>
49 #include <asm/irq_remapping.h>
50 #include <asm/cacheflush.h>
51 #include <asm/iommu.h>
52
53 #include "irq_remapping.h"
54
55 #define ROOT_SIZE               VTD_PAGE_SIZE
56 #define CONTEXT_SIZE            VTD_PAGE_SIZE
57
58 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
59 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
60 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
61 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
62
63 #define IOAPIC_RANGE_START      (0xfee00000)
64 #define IOAPIC_RANGE_END        (0xfeefffff)
65 #define IOVA_START_ADDR         (0x1000)
66
67 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
68
69 #define MAX_AGAW_WIDTH 64
70 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
71
72 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
73 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
74
75 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
76    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
77 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
78                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
79 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
80
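/*
 * Worked example: with the default 48-bit guest address width,
 * __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1 = 0xFFFFFFFFF, which already
 * fits in an unsigned long on 64-bit, so DOMAIN_MAX_PFN(48) is the same
 * value and DOMAIN_MAX_ADDR(48) = 0xFFFFFFFFF000.
 */
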
81 /* IO virtual address start page frame number */
82 #define IOVA_START_PFN          (1)
83
84 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
85 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
86 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
87
88 /* page table handling */
89 #define LEVEL_STRIDE            (9)
90 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
91
92 /*
93  * This bitmap is used to advertise the page sizes our hardware supports
94  * to the IOMMU core, which will then use this information to split
95  * physically contiguous memory regions it is mapping into page sizes
96  * that we support.
97  *
98  * Traditionally the IOMMU core just handed us the mappings directly,
99  * after making sure the size is an order of a 4KiB page and that the
100  * mapping has natural alignment.
101  *
102  * To retain this behavior, we currently advertise that we support
103  * all page sizes that are an order of 4KiB.
104  *
105  * If at some point we'd like to utilize the IOMMU core's new behavior,
106  * we could change this to advertise the real page sizes we support.
107  */
108 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
109
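/*
 * For illustration: ~0xFFFUL leaves bits 12 and above set, i.e. every
 * power-of-two size from 4KiB upwards (4KiB, 8KiB, ..., 2MiB, 1GiB, ...)
 * is advertised, matching the "all orders of 4KiB" behaviour described
 * above.
 */
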
110 static inline int agaw_to_level(int agaw)
111 {
112         return agaw + 2;
113 }
114
115 static inline int agaw_to_width(int agaw)
116 {
117         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
118 }
119
120 static inline int width_to_agaw(int width)
121 {
122         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
123 }
124
125 static inline unsigned int level_to_offset_bits(int level)
126 {
127         return (level - 1) * LEVEL_STRIDE;
128 }
129
130 static inline int pfn_level_offset(unsigned long pfn, int level)
131 {
132         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
133 }
134
135 static inline unsigned long level_mask(int level)
136 {
137         return -1UL << level_to_offset_bits(level);
138 }
139
140 static inline unsigned long level_size(int level)
141 {
142         return 1UL << level_to_offset_bits(level);
143 }
144
145 static inline unsigned long align_to_level(unsigned long pfn, int level)
146 {
147         return (pfn + level_size(level) - 1) & level_mask(level);
148 }
149
150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
151 {
152         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
153 }
154
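/*
 * Worked example for the helpers above, assuming the default 48-bit
 * domain address width: width_to_agaw(48) = DIV_ROUND_UP(18, 9) = 2,
 * agaw_to_level(2) = 4 (a four-level page table), level_to_offset_bits(2)
 * = 9, and level_size(2) = lvl_to_nr_pages(2) = 512, i.e. one level-2
 * entry covers 512 4KiB pages (2MiB).
 */
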
155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
156    are never going to work. */
157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
158 {
159         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 }
161
162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
163 {
164         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
165 }
166 static inline unsigned long page_to_dma_pfn(struct page *pg)
167 {
168         return mm_to_dma_pfn(page_to_pfn(pg));
169 }
170 static inline unsigned long virt_to_dma_pfn(void *p)
171 {
172         return page_to_dma_pfn(virt_to_page(p));
173 }
174
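/*
 * Note on the pfn conversions above: on x86 both PAGE_SHIFT and
 * VTD_PAGE_SHIFT are 12, so dma_to_mm_pfn()/mm_to_dma_pfn() shift by zero
 * and are effectively identity; the conversions exist so the code stays
 * correct should the kernel page size ever exceed the 4KiB VT-d page size.
 */
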
175 /* global iommu list, set NULL for ignored DMAR units */
176 static struct intel_iommu **g_iommus;
177
178 static void __init check_tylersburg_isoch(void);
179 static int rwbf_quirk;
180
181 /*
182  * set to 1 to panic kernel if can't successfully enable VT-d
183  * (used when kernel is launched w/ TXT)
184  */
185 static int force_on = 0;
186
187 /*
188  * 0: Present
189  * 1-11: Reserved
190  * 12-63: Context Ptr (12 - (haw-1))
191  * 64-127: Reserved
192  */
193 struct root_entry {
194         u64     lo;
195         u64     hi;
196 };
197 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
198
199 /*
200  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_lctp(struct root_entry *re)
204 {
205         if (!(re->lo & 1))
206                 return 0;
207
208         return re->lo & VTD_PAGE_MASK;
209 }
210
211 /*
212  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
213  * if marked present.
214  */
215 static phys_addr_t root_entry_uctp(struct root_entry *re)
216 {
217         if (!(re->hi & 1))
218                 return 0;
219
220         return re->hi & VTD_PAGE_MASK;
221 }
222 /*
223  * low 64 bits:
224  * 0: present
225  * 1: fault processing disable
226  * 2-3: translation type
227  * 12-63: address space root
228  * high 64 bits:
229  * 0-2: address width
230  * 3-6: aval
231  * 8-23: domain id
232  */
233 struct context_entry {
234         u64 lo;
235         u64 hi;
236 };
237
238 static inline void context_clear_pasid_enable(struct context_entry *context)
239 {
240         context->lo &= ~(1ULL << 11);
241 }
242
243 static inline bool context_pasid_enabled(struct context_entry *context)
244 {
245         return !!(context->lo & (1ULL << 11));
246 }
247
248 static inline void context_set_copied(struct context_entry *context)
249 {
250         context->hi |= (1ull << 3);
251 }
252
253 static inline bool context_copied(struct context_entry *context)
254 {
255         return !!(context->hi & (1ULL << 3));
256 }
257
258 static inline bool __context_present(struct context_entry *context)
259 {
260         return (context->lo & 1);
261 }
262
263 static inline bool context_present(struct context_entry *context)
264 {
265         return context_pasid_enabled(context) ?
266              __context_present(context) :
267              __context_present(context) && !context_copied(context);
268 }
269
270 static inline void context_set_present(struct context_entry *context)
271 {
272         context->lo |= 1;
273 }
274
275 static inline void context_set_fault_enable(struct context_entry *context)
276 {
277         context->lo &= (((u64)-1) << 2) | 1;
278 }
279
280 static inline void context_set_translation_type(struct context_entry *context,
281                                                 unsigned long value)
282 {
283         context->lo &= (((u64)-1) << 4) | 3;
284         context->lo |= (value & 3) << 2;
285 }
286
287 static inline void context_set_address_root(struct context_entry *context,
288                                             unsigned long value)
289 {
290         context->lo &= ~VTD_PAGE_MASK;
291         context->lo |= value & VTD_PAGE_MASK;
292 }
293
294 static inline void context_set_address_width(struct context_entry *context,
295                                              unsigned long value)
296 {
297         context->hi |= value & 7;
298 }
299
300 static inline void context_set_domain_id(struct context_entry *context,
301                                          unsigned long value)
302 {
303         context->hi |= (value & ((1 << 16) - 1)) << 8;
304 }
305
306 static inline int context_domain_id(struct context_entry *c)
307 {
308         return (c->hi >> 8) & 0xffff;
309 }
310
311 static inline void context_clear_entry(struct context_entry *context)
312 {
313         context->lo = 0;
314         context->hi = 0;
315 }
316
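/*
 * Illustrative sketch (the real caller lives further down in this file):
 * programming a context entry for second-level translation amounts to
 * something like
 *
 *      context_clear_entry(ce);
 *      context_set_domain_id(ce, did);
 *      context_set_address_root(ce, virt_to_phys(domain->pgd));
 *      context_set_address_width(ce, domain->agaw);
 *      context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
 *      context_set_fault_enable(ce);
 *      context_set_present(ce);
 *
 * followed by a cache flush of the entry on non-coherent hardware.
 */
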
317 /*
318  * 0: readable
319  * 1: writable
320  * 2-6: reserved
321  * 7: super page
322  * 8-10: available
323  * 11: snoop behavior
324  * 12-63: Host physical address
325  */
326 struct dma_pte {
327         u64 val;
328 };
329
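/*
 * Example encoding: a present, read/write 2MiB superpage backed by host
 * physical address 0x40000000 would be stored as
 *      pte->val = 0x40000000 | DMA_PTE_READ | DMA_PTE_WRITE |
 *                 DMA_PTE_LARGE_PAGE;
 * i.e. bits 0, 1 and 7 set plus the page-aligned address in bits 12-63.
 */
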
330 static inline void dma_clear_pte(struct dma_pte *pte)
331 {
332         pte->val = 0;
333 }
334
335 static inline u64 dma_pte_addr(struct dma_pte *pte)
336 {
337 #ifdef CONFIG_64BIT
338         return pte->val & VTD_PAGE_MASK;
339 #else
340         /* Must have a full atomic 64-bit read */
341         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
342 #endif
343 }
344
345 static inline bool dma_pte_present(struct dma_pte *pte)
346 {
347         return (pte->val & 3) != 0;
348 }
349
350 static inline bool dma_pte_superpage(struct dma_pte *pte)
351 {
352         return (pte->val & DMA_PTE_LARGE_PAGE);
353 }
354
355 static inline int first_pte_in_page(struct dma_pte *pte)
356 {
357         return !((unsigned long)pte & ~VTD_PAGE_MASK);
358 }
359
360 /*
361  * This domain is a static identity mapping domain.
362  *      1. This domain creates a static 1:1 mapping to all usable memory.
363  *      2. It maps to each iommu if successful.
364  *      3. Each iommu maps to this domain if successful.
365  */
366 static struct dmar_domain *si_domain;
367 static int hw_pass_through = 1;
368
369 /*
370  * Domain represents a virtual machine; more than one device
371  * across iommus may be owned by one domain, e.g. a kvm guest.
372  */
373 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
374
375 /* si_domain contains multiple devices */
376 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
377
378 #define for_each_domain_iommu(idx, domain)                      \
379         for (idx = 0; idx < g_num_of_iommus; idx++)             \
380                 if (domain->iommu_refcnt[idx])
381
382 struct dmar_domain {
383         int     nid;                    /* node id */
384
385         unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
386                                         /* Refcount of devices per iommu */
387
388
389         u16             iommu_did[DMAR_UNITS_SUPPORTED];
390                                         /* Domain ids per IOMMU. Use u16 since
391                                          * domain ids are 16 bit wide according
392                                          * to VT-d spec, section 9.3 */
393
394         bool has_iotlb_device;
395         struct list_head devices;       /* all devices' list */
396         struct iova_domain iovad;       /* iova's that belong to this domain */
397
398         struct dma_pte  *pgd;           /* virtual address */
399         int             gaw;            /* max guest address width */
400
401         /* adjusted guest address width, 0 is level 2 30-bit */
402         int             agaw;
403
404         int             flags;          /* flags to find out type of domain */
405
406         int             iommu_coherency;/* indicate coherency of iommu access */
407         int             iommu_snooping; /* indicate snooping control feature*/
408         int             iommu_count;    /* reference count of iommu */
409         int             iommu_superpage;/* Level of superpages supported:
410                                            0 == 4KiB (no superpages), 1 == 2MiB,
411                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
412         u64             max_addr;       /* maximum mapped address */
413
414         struct iommu_domain domain;     /* generic domain data structure for
415                                            iommu core */
416 };
417
418 /* PCI domain-device relationship */
419 struct device_domain_info {
420         struct list_head link;  /* link to domain siblings */
421         struct list_head global; /* link to global list */
422         u8 bus;                 /* PCI bus number */
423         u8 devfn;               /* PCI devfn number */
424         u8 pasid_supported:3;
425         u8 pasid_enabled:1;
426         u8 pri_supported:1;
427         u8 pri_enabled:1;
428         u8 ats_supported:1;
429         u8 ats_enabled:1;
430         u8 ats_qdep;
431         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
432         struct intel_iommu *iommu; /* IOMMU used by this device */
433         struct dmar_domain *domain; /* pointer to domain */
434 };
435
436 struct dmar_rmrr_unit {
437         struct list_head list;          /* list of rmrr units   */
438         struct acpi_dmar_header *hdr;   /* ACPI header          */
439         u64     base_address;           /* reserved base address*/
440         u64     end_address;            /* reserved end address */
441         struct dmar_dev_scope *devices; /* target devices */
442         int     devices_cnt;            /* target device count */
443 };
444
445 struct dmar_atsr_unit {
446         struct list_head list;          /* list of ATSR units */
447         struct acpi_dmar_header *hdr;   /* ACPI header */
448         struct dmar_dev_scope *devices; /* target devices */
449         int devices_cnt;                /* target device count */
450         u8 include_all:1;               /* include all ports */
451 };
452
453 static LIST_HEAD(dmar_atsr_units);
454 static LIST_HEAD(dmar_rmrr_units);
455
456 #define for_each_rmrr_units(rmrr) \
457         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
458
459 static void flush_unmaps_timeout(unsigned long data);
460
461 struct deferred_flush_entry {
462         unsigned long iova_pfn;
463         unsigned long nrpages;
464         struct dmar_domain *domain;
465         struct page *freelist;
466 };
467
468 #define HIGH_WATER_MARK 250
469 struct deferred_flush_table {
470         int next;
471         struct deferred_flush_entry entries[HIGH_WATER_MARK];
472 };
473
474 struct deferred_flush_data {
475         spinlock_t lock;
476         int timer_on;
477         struct timer_list timer;
478         long size;
479         struct deferred_flush_table *tables;
480 };
481
482 static DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
483
484 /* bitmap for indexing intel_iommus */
485 static int g_num_of_iommus;
486
487 static void domain_exit(struct dmar_domain *domain);
488 static void domain_remove_dev_info(struct dmar_domain *domain);
489 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
490                                      struct device *dev);
491 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
492 static void domain_context_clear(struct intel_iommu *iommu,
493                                  struct device *dev);
494 static int domain_detach_iommu(struct dmar_domain *domain,
495                                struct intel_iommu *iommu);
496
497 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
498 int dmar_disabled = 0;
499 #else
500 int dmar_disabled = 1;
501 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
502
503 int intel_iommu_enabled = 0;
504 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
505
506 static int dmar_map_gfx = 1;
507 static int dmar_forcedac;
508 static int intel_iommu_strict;
509 static int intel_iommu_superpage = 1;
510 static int intel_iommu_ecs = 1;
511 static int intel_iommu_pasid28;
512 static int iommu_identity_mapping;
513
514 #define IDENTMAP_ALL            1
515 #define IDENTMAP_GFX            2
516 #define IDENTMAP_AZALIA         4
517
518 /* Broadwell and Skylake have broken ECS support — normal so-called "second
519  * level" translation of DMA requests-without-PASID doesn't actually happen
520  * unless you also set the NESTE bit in an extended context-entry. Which of
521  * course means that SVM doesn't work because it's trying to do nested
522  * translation of the physical addresses it finds in the process page tables,
523  * through the IOVA->phys mapping found in the "second level" page tables.
524  *
525  * The VT-d specification was retroactively changed to change the definition
526  * of the capability bits and pretend that Broadwell/Skylake never happened...
527  * but unfortunately the wrong bit was changed. It's ECS which is broken, but
528  * for some reason it was the PASID capability bit which was redefined (from
529  * bit 28 on BDW/SKL to bit 40 in future).
530  *
531  * So our test for ECS needs to eschew those implementations which set the old
532  * PASID capability bit 28, since those are the ones on which ECS is broken.
533  * Unless we are working around the 'pasid28' limitations, that is, by putting
534  * the device into passthrough mode for normal DMA and thus masking the bug.
535  */
536 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
537                             (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
538 /* PASID support is thus enabled if ECS is enabled and *either* of the old
539  * or new capability bits are set. */
540 #define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
541                               (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
542
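/*
 * Concretely: on a Broadwell/Skylake part reporting the old bit-28 PASID
 * capability (ecap_broken_pasid), ecs_enabled() evaluates false and the
 * driver stays on the legacy root/context table format, unless the user
 * opts in with intel_iommu=on,pasid28, which also forces identity mapping
 * for graphics to mask the erratum.
 */
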
543 int intel_iommu_gfx_mapped;
544 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
545
546 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
547 static DEFINE_SPINLOCK(device_domain_lock);
548 static LIST_HEAD(device_domain_list);
549
550 static const struct iommu_ops intel_iommu_ops;
551
552 static bool translation_pre_enabled(struct intel_iommu *iommu)
553 {
554         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
555 }
556
557 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
558 {
559         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
560 }
561
562 static void init_translation_status(struct intel_iommu *iommu)
563 {
564         u32 gsts;
565
566         gsts = readl(iommu->reg + DMAR_GSTS_REG);
567         if (gsts & DMA_GSTS_TES)
568                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
569 }
570
571 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
572 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
573 {
574         return container_of(dom, struct dmar_domain, domain);
575 }
576
577 static int __init intel_iommu_setup(char *str)
578 {
579         if (!str)
580                 return -EINVAL;
581         while (*str) {
582                 if (!strncmp(str, "on", 2)) {
583                         dmar_disabled = 0;
584                         pr_info("IOMMU enabled\n");
585                 } else if (!strncmp(str, "off", 3)) {
586                         dmar_disabled = 1;
587                         pr_info("IOMMU disabled\n");
588                 } else if (!strncmp(str, "igfx_off", 8)) {
589                         dmar_map_gfx = 0;
590                         pr_info("Disable GFX device mapping\n");
591                 } else if (!strncmp(str, "forcedac", 8)) {
592                         pr_info("Forcing DAC for PCI devices\n");
593                         dmar_forcedac = 1;
594                 } else if (!strncmp(str, "strict", 6)) {
595                         pr_info("Disable batched IOTLB flush\n");
596                         intel_iommu_strict = 1;
597                 } else if (!strncmp(str, "sp_off", 6)) {
598                         pr_info("Disable supported super page\n");
599                         intel_iommu_superpage = 0;
600                 } else if (!strncmp(str, "ecs_off", 7)) {
601                         printk(KERN_INFO
602                                 "Intel-IOMMU: disable extended context table support\n");
603                         intel_iommu_ecs = 0;
604                 } else if (!strncmp(str, "pasid28", 7)) {
605                         printk(KERN_INFO
606                                 "Intel-IOMMU: enable pre-production PASID support\n");
607                         intel_iommu_pasid28 = 1;
608                         iommu_identity_mapping |= IDENTMAP_GFX;
609                 }
610
611                 str += strcspn(str, ",");
612                 while (*str == ',')
613                         str++;
614         }
615         return 0;
616 }
617 __setup("intel_iommu=", intel_iommu_setup);
618
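/*
 * Example: booting with intel_iommu=on,strict,sp_off walks the comma
 * separated tokens above and ends up with dmar_disabled = 0,
 * intel_iommu_strict = 1 and intel_iommu_superpage = 0.
 */
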
619 static struct kmem_cache *iommu_domain_cache;
620 static struct kmem_cache *iommu_devinfo_cache;
621
622 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
623 {
624         struct dmar_domain **domains;
625         int idx = did >> 8;
626
627         domains = iommu->domains[idx];
628         if (!domains)
629                 return NULL;
630
631         return domains[did & 0xff];
632 }
633
634 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
635                              struct dmar_domain *domain)
636 {
637         struct dmar_domain **domains;
638         int idx = did >> 8;
639
640         if (!iommu->domains[idx]) {
641                 size_t size = 256 * sizeof(struct dmar_domain *);
642                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
643         }
644
645         domains = iommu->domains[idx];
646         if (WARN_ON(!domains))
647                 return;
648         else
649                 domains[did & 0xff] = domain;
650 }
651
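/*
 * The domain-id lookup above is a two-level table: for did 0x012a,
 * idx = 0x012a >> 8 = 1 selects iommu->domains[1] (an array of 256
 * dmar_domain pointers, allocated lazily in set_iommu_domain()), and
 * slot 0x012a & 0xff = 0x2a holds the pointer itself.
 */
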
652 static inline void *alloc_pgtable_page(int node)
653 {
654         struct page *page;
655         void *vaddr = NULL;
656
657         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
658         if (page)
659                 vaddr = page_address(page);
660         return vaddr;
661 }
662
663 static inline void free_pgtable_page(void *vaddr)
664 {
665         free_page((unsigned long)vaddr);
666 }
667
668 static inline void *alloc_domain_mem(void)
669 {
670         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
671 }
672
673 static void free_domain_mem(void *vaddr)
674 {
675         kmem_cache_free(iommu_domain_cache, vaddr);
676 }
677
678 static inline void *alloc_devinfo_mem(void)
679 {
680         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
681 }
682
683 static inline void free_devinfo_mem(void *vaddr)
684 {
685         kmem_cache_free(iommu_devinfo_cache, vaddr);
686 }
687
688 static inline int domain_type_is_vm(struct dmar_domain *domain)
689 {
690         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
691 }
692
693 static inline int domain_type_is_si(struct dmar_domain *domain)
694 {
695         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
696 }
697
698 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
699 {
700         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
701                                 DOMAIN_FLAG_STATIC_IDENTITY);
702 }
703
704 static inline int domain_pfn_supported(struct dmar_domain *domain,
705                                        unsigned long pfn)
706 {
707         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
708
709         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
710 }
711
712 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
713 {
714         unsigned long sagaw;
715         int agaw = -1;
716
717         sagaw = cap_sagaw(iommu->cap);
718         for (agaw = width_to_agaw(max_gaw);
719              agaw >= 0; agaw--) {
720                 if (test_bit(agaw, &sagaw))
721                         break;
722         }
723
724         return agaw;
725 }
726
727 /*
728  * Calculate max SAGAW for each iommu.
729  */
730 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
731 {
732         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
733 }
734
735 /*
736  * Calculate agaw for each iommu.
737  * "SAGAW" may be different across iommus; use a default agaw and fall
738  * back to a smaller supported agaw for iommus that don't support the default.
739  */
740 int iommu_calculate_agaw(struct intel_iommu *iommu)
741 {
742         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
743 }
744
745 /* This function only returns a single iommu in a domain */
746 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
747 {
748         int iommu_id;
749
750         /* si_domain and vm domain should not get here. */
751         BUG_ON(domain_type_is_vm_or_si(domain));
752         for_each_domain_iommu(iommu_id, domain)
753                 break;
754
755         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
756                 return NULL;
757
758         return g_iommus[iommu_id];
759 }
760
761 static void domain_update_iommu_coherency(struct dmar_domain *domain)
762 {
763         struct dmar_drhd_unit *drhd;
764         struct intel_iommu *iommu;
765         bool found = false;
766         int i;
767
768         domain->iommu_coherency = 1;
769
770         for_each_domain_iommu(i, domain) {
771                 found = true;
772                 if (!ecap_coherent(g_iommus[i]->ecap)) {
773                         domain->iommu_coherency = 0;
774                         break;
775                 }
776         }
777         if (found)
778                 return;
779
780         /* No hardware attached; use lowest common denominator */
781         rcu_read_lock();
782         for_each_active_iommu(iommu, drhd) {
783                 if (!ecap_coherent(iommu->ecap)) {
784                         domain->iommu_coherency = 0;
785                         break;
786                 }
787         }
788         rcu_read_unlock();
789 }
790
791 static int domain_update_iommu_snooping(struct intel_iommu *skip)
792 {
793         struct dmar_drhd_unit *drhd;
794         struct intel_iommu *iommu;
795         int ret = 1;
796
797         rcu_read_lock();
798         for_each_active_iommu(iommu, drhd) {
799                 if (iommu != skip) {
800                         if (!ecap_sc_support(iommu->ecap)) {
801                                 ret = 0;
802                                 break;
803                         }
804                 }
805         }
806         rcu_read_unlock();
807
808         return ret;
809 }
810
811 static int domain_update_iommu_superpage(struct intel_iommu *skip)
812 {
813         struct dmar_drhd_unit *drhd;
814         struct intel_iommu *iommu;
815         int mask = 0xf;
816
817         if (!intel_iommu_superpage) {
818                 return 0;
819         }
820
821         /* set iommu_superpage to the smallest common denominator */
822         rcu_read_lock();
823         for_each_active_iommu(iommu, drhd) {
824                 if (iommu != skip) {
825                         mask &= cap_super_page_val(iommu->cap);
826                         if (!mask)
827                                 break;
828                 }
829         }
830         rcu_read_unlock();
831
832         return fls(mask);
833 }
834
835 /* Some capabilities may be different across iommus */
836 static void domain_update_iommu_cap(struct dmar_domain *domain)
837 {
838         domain_update_iommu_coherency(domain);
839         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
840         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
841 }
842
843 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
844                                                        u8 bus, u8 devfn, int alloc)
845 {
846         struct root_entry *root = &iommu->root_entry[bus];
847         struct context_entry *context;
848         u64 *entry;
849
850         entry = &root->lo;
851         if (ecs_enabled(iommu)) {
852                 if (devfn >= 0x80) {
853                         devfn -= 0x80;
854                         entry = &root->hi;
855                 }
856                 devfn *= 2;
857         }
858         if (*entry & 1)
859                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
860         else {
861                 unsigned long phy_addr;
862                 if (!alloc)
863                         return NULL;
864
865                 context = alloc_pgtable_page(iommu->node);
866                 if (!context)
867                         return NULL;
868
869                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
870                 phy_addr = virt_to_phys((void *)context);
871                 *entry = phy_addr | 1;
872                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
873         }
874         return &context[devfn];
875 }
876
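/*
 * Example of the extended-context layout handled above: with ECS enabled,
 * (bus 0, devfn 0x83) uses the upper half of the root entry (root->hi)
 * and index (0x83 - 0x80) * 2 = 6, because extended context entries are
 * twice the size of legacy ones; each half of a root entry therefore
 * covers 128 devfns.
 */
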
877 static int iommu_dummy(struct device *dev)
878 {
879         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
880 }
881
882 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
883 {
884         struct dmar_drhd_unit *drhd = NULL;
885         struct intel_iommu *iommu;
886         struct device *tmp;
887         struct pci_dev *ptmp, *pdev = NULL;
888         u16 segment = 0;
889         int i;
890
891         if (iommu_dummy(dev))
892                 return NULL;
893
894         if (dev_is_pci(dev)) {
895                 struct pci_dev *pf_pdev;
896
897                 pdev = to_pci_dev(dev);
898                 /* VFs aren't listed in scope tables; we need to look up
899                  * the PF instead to find the IOMMU. */
900                 pf_pdev = pci_physfn(pdev);
901                 dev = &pf_pdev->dev;
902                 segment = pci_domain_nr(pdev->bus);
903         } else if (has_acpi_companion(dev))
904                 dev = &ACPI_COMPANION(dev)->dev;
905
906         rcu_read_lock();
907         for_each_active_iommu(iommu, drhd) {
908                 if (pdev && segment != drhd->segment)
909                         continue;
910
911                 for_each_active_dev_scope(drhd->devices,
912                                           drhd->devices_cnt, i, tmp) {
913                         if (tmp == dev) {
914                                 /* For a VF use its original BDF# not that of the PF
915                                  * which we used for the IOMMU lookup. Strictly speaking
916                                  * we could do this for all PCI devices; we only need to
917                                  * get the BDF# from the scope table for ACPI matches. */
918                                 if (pdev && pdev->is_virtfn)
919                                         goto got_pdev;
920
921                                 *bus = drhd->devices[i].bus;
922                                 *devfn = drhd->devices[i].devfn;
923                                 goto out;
924                         }
925
926                         if (!pdev || !dev_is_pci(tmp))
927                                 continue;
928
929                         ptmp = to_pci_dev(tmp);
930                         if (ptmp->subordinate &&
931                             ptmp->subordinate->number <= pdev->bus->number &&
932                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
933                                 goto got_pdev;
934                 }
935
936                 if (pdev && drhd->include_all) {
937                 got_pdev:
938                         *bus = pdev->bus->number;
939                         *devfn = pdev->devfn;
940                         goto out;
941                 }
942         }
943         iommu = NULL;
944  out:
945         rcu_read_unlock();
946
947         return iommu;
948 }
949
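/*
 * Example of the VF handling above (hypothetical BDFs): an SR-IOV VF such
 * as 0000:03:10.2 never appears in the DMAR scope tables, so the walk is
 * done against its PF 0000:03:00.0 to find the right IOMMU; once a match
 * is found we jump to got_pdev so that *bus/*devfn are filled in from the
 * VF's own BDF, which is what ends up in the context table.
 */
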
950 static void domain_flush_cache(struct dmar_domain *domain,
951                                void *addr, int size)
952 {
953         if (!domain->iommu_coherency)
954                 clflush_cache_range(addr, size);
955 }
956
957 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
958 {
959         struct context_entry *context;
960         int ret = 0;
961         unsigned long flags;
962
963         spin_lock_irqsave(&iommu->lock, flags);
964         context = iommu_context_addr(iommu, bus, devfn, 0);
965         if (context)
966                 ret = context_present(context);
967         spin_unlock_irqrestore(&iommu->lock, flags);
968         return ret;
969 }
970
971 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
972 {
973         struct context_entry *context;
974         unsigned long flags;
975
976         spin_lock_irqsave(&iommu->lock, flags);
977         context = iommu_context_addr(iommu, bus, devfn, 0);
978         if (context) {
979                 context_clear_entry(context);
980                 __iommu_flush_cache(iommu, context, sizeof(*context));
981         }
982         spin_unlock_irqrestore(&iommu->lock, flags);
983 }
984
985 static void free_context_table(struct intel_iommu *iommu)
986 {
987         int i;
988         unsigned long flags;
989         struct context_entry *context;
990
991         spin_lock_irqsave(&iommu->lock, flags);
992         if (!iommu->root_entry) {
993                 goto out;
994         }
995         for (i = 0; i < ROOT_ENTRY_NR; i++) {
996                 context = iommu_context_addr(iommu, i, 0, 0);
997                 if (context)
998                         free_pgtable_page(context);
999
1000                 if (!ecs_enabled(iommu))
1001                         continue;
1002
1003                 context = iommu_context_addr(iommu, i, 0x80, 0);
1004                 if (context)
1005                         free_pgtable_page(context);
1006
1007         }
1008         free_pgtable_page(iommu->root_entry);
1009         iommu->root_entry = NULL;
1010 out:
1011         spin_unlock_irqrestore(&iommu->lock, flags);
1012 }
1013
1014 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1015                                       unsigned long pfn, int *target_level)
1016 {
1017         struct dma_pte *parent, *pte = NULL;
1018         int level = agaw_to_level(domain->agaw);
1019         int offset;
1020
1021         BUG_ON(!domain->pgd);
1022
1023         if (!domain_pfn_supported(domain, pfn))
1024                 /* Address beyond IOMMU's addressing capabilities. */
1025                 return NULL;
1026
1027         parent = domain->pgd;
1028
1029         while (1) {
1030                 void *tmp_page;
1031
1032                 offset = pfn_level_offset(pfn, level);
1033                 pte = &parent[offset];
1034                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1035                         break;
1036                 if (level == *target_level)
1037                         break;
1038
1039                 if (!dma_pte_present(pte)) {
1040                         uint64_t pteval;
1041
1042                         tmp_page = alloc_pgtable_page(domain->nid);
1043
1044                         if (!tmp_page)
1045                                 return NULL;
1046
1047                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1048                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1049                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1050                                 /* Someone else set it while we were thinking; use theirs. */
1051                                 free_pgtable_page(tmp_page);
1052                         else
1053                                 domain_flush_cache(domain, pte, sizeof(*pte));
1054                 }
1055                 if (level == 1)
1056                         break;
1057
1058                 parent = phys_to_virt(dma_pte_addr(pte));
1059                 level--;
1060         }
1061
1062         if (!*target_level)
1063                 *target_level = level;
1064
1065         return pte;
1066 }
1067
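/*
 * Indexing example for the walk above: in a four-level table (agaw 2),
 * pfn 0x12345 selects offset 0 at levels 4 and 3, offset
 * (0x12345 >> 9) & 0x1ff = 0x91 at level 2 and 0x12345 & 0x1ff = 0x145 at
 * level 1; missing directories are installed with cmpxchg64(), and the
 * loser of a race frees its freshly allocated page and reuses the winner's.
 */
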
1068
1069 /* return address's pte at specific level */
1070 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1071                                          unsigned long pfn,
1072                                          int level, int *large_page)
1073 {
1074         struct dma_pte *parent, *pte = NULL;
1075         int total = agaw_to_level(domain->agaw);
1076         int offset;
1077
1078         parent = domain->pgd;
1079         while (level <= total) {
1080                 offset = pfn_level_offset(pfn, total);
1081                 pte = &parent[offset];
1082                 if (level == total)
1083                         return pte;
1084
1085                 if (!dma_pte_present(pte)) {
1086                         *large_page = total;
1087                         break;
1088                 }
1089
1090                 if (dma_pte_superpage(pte)) {
1091                         *large_page = total;
1092                         return pte;
1093                 }
1094
1095                 parent = phys_to_virt(dma_pte_addr(pte));
1096                 total--;
1097         }
1098         return NULL;
1099 }
1100
1101 /* clear last level pte; a tlb flush should follow */
1102 static void dma_pte_clear_range(struct dmar_domain *domain,
1103                                 unsigned long start_pfn,
1104                                 unsigned long last_pfn)
1105 {
1106         unsigned int large_page = 1;
1107         struct dma_pte *first_pte, *pte;
1108
1109         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1110         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1111         BUG_ON(start_pfn > last_pfn);
1112
1113         /* we don't need lock here; nobody else touches the iova range */
1114         do {
1115                 large_page = 1;
1116                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1117                 if (!pte) {
1118                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1119                         continue;
1120                 }
1121                 do {
1122                         dma_clear_pte(pte);
1123                         start_pfn += lvl_to_nr_pages(large_page);
1124                         pte++;
1125                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1126
1127                 domain_flush_cache(domain, first_pte,
1128                                    (void *)pte - (void *)first_pte);
1129
1130         } while (start_pfn && start_pfn <= last_pfn);
1131 }
1132
1133 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1134                                struct dma_pte *pte, unsigned long pfn,
1135                                unsigned long start_pfn, unsigned long last_pfn)
1136 {
1137         pfn = max(start_pfn, pfn);
1138         pte = &pte[pfn_level_offset(pfn, level)];
1139
1140         do {
1141                 unsigned long level_pfn;
1142                 struct dma_pte *level_pte;
1143
1144                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1145                         goto next;
1146
1147                 level_pfn = pfn & level_mask(level);
1148                 level_pte = phys_to_virt(dma_pte_addr(pte));
1149
1150                 if (level > 2)
1151                         dma_pte_free_level(domain, level - 1, level_pte,
1152                                            level_pfn, start_pfn, last_pfn);
1153
1154                 /* If range covers entire pagetable, free it */
1155                 if (!(start_pfn > level_pfn ||
1156                       last_pfn < level_pfn + level_size(level) - 1)) {
1157                         dma_clear_pte(pte);
1158                         domain_flush_cache(domain, pte, sizeof(*pte));
1159                         free_pgtable_page(level_pte);
1160                 }
1161 next:
1162                 pfn += level_size(level);
1163         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1164 }
1165
1166 /* clear last level (leaf) ptes and free page table pages. */
1167 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1168                                    unsigned long start_pfn,
1169                                    unsigned long last_pfn)
1170 {
1171         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1172         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1173         BUG_ON(start_pfn > last_pfn);
1174
1175         dma_pte_clear_range(domain, start_pfn, last_pfn);
1176
1177         /* We don't need lock here; nobody else touches the iova range */
1178         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1179                            domain->pgd, 0, start_pfn, last_pfn);
1180
1181         /* free pgd */
1182         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1183                 free_pgtable_page(domain->pgd);
1184                 domain->pgd = NULL;
1185         }
1186 }
1187
1188 /* When a page at a given level is being unlinked from its parent, we don't
1189    need to *modify* it at all. All we need to do is make a list of all the
1190    pages which can be freed just as soon as we've flushed the IOTLB and we
1191    know the hardware page-walk will no longer touch them.
1192    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1193    be freed. */
1194 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1195                                             int level, struct dma_pte *pte,
1196                                             struct page *freelist)
1197 {
1198         struct page *pg;
1199
1200         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1201         pg->freelist = freelist;
1202         freelist = pg;
1203
1204         if (level == 1)
1205                 return freelist;
1206
1207         pte = page_address(pg);
1208         do {
1209                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1210                         freelist = dma_pte_list_pagetables(domain, level - 1,
1211                                                            pte, freelist);
1212                 pte++;
1213         } while (!first_pte_in_page(pte));
1214
1215         return freelist;
1216 }
1217
1218 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1219                                         struct dma_pte *pte, unsigned long pfn,
1220                                         unsigned long start_pfn,
1221                                         unsigned long last_pfn,
1222                                         struct page *freelist)
1223 {
1224         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1225
1226         pfn = max(start_pfn, pfn);
1227         pte = &pte[pfn_level_offset(pfn, level)];
1228
1229         do {
1230                 unsigned long level_pfn;
1231
1232                 if (!dma_pte_present(pte))
1233                         goto next;
1234
1235                 level_pfn = pfn & level_mask(level);
1236
1237                 /* If range covers entire pagetable, free it */
1238                 if (start_pfn <= level_pfn &&
1239                     last_pfn >= level_pfn + level_size(level) - 1) {
1240                         /* These subordinate page tables are going away entirely. Don't
1241                            bother to clear them; we're just going to *free* them. */
1242                         if (level > 1 && !dma_pte_superpage(pte))
1243                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1244
1245                         dma_clear_pte(pte);
1246                         if (!first_pte)
1247                                 first_pte = pte;
1248                         last_pte = pte;
1249                 } else if (level > 1) {
1250                         /* Recurse down into a level that isn't *entirely* obsolete */
1251                         freelist = dma_pte_clear_level(domain, level - 1,
1252                                                        phys_to_virt(dma_pte_addr(pte)),
1253                                                        level_pfn, start_pfn, last_pfn,
1254                                                        freelist);
1255                 }
1256 next:
1257                 pfn += level_size(level);
1258         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1259
1260         if (first_pte)
1261                 domain_flush_cache(domain, first_pte,
1262                                    (void *)++last_pte - (void *)first_pte);
1263
1264         return freelist;
1265 }
1266
1267 /* We can't just free the pages because the IOMMU may still be walking
1268    the page tables, and may have cached the intermediate levels. The
1269    pages can only be freed after the IOTLB flush has been done. */
1270 static struct page *domain_unmap(struct dmar_domain *domain,
1271                                  unsigned long start_pfn,
1272                                  unsigned long last_pfn)
1273 {
1274         struct page *freelist = NULL;
1275
1276         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1277         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1278         BUG_ON(start_pfn > last_pfn);
1279
1280         /* we don't need lock here; nobody else touches the iova range */
1281         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1282                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1283
1284         /* free pgd */
1285         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1286                 struct page *pgd_page = virt_to_page(domain->pgd);
1287                 pgd_page->freelist = freelist;
1288                 freelist = pgd_page;
1289
1290                 domain->pgd = NULL;
1291         }
1292
1293         return freelist;
1294 }
1295
1296 static void dma_free_pagelist(struct page *freelist)
1297 {
1298         struct page *pg;
1299
1300         while ((pg = freelist)) {
1301                 freelist = pg->freelist;
1302                 free_pgtable_page(page_address(pg));
1303         }
1304 }
1305
1306 /* iommu handling */
1307 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1308 {
1309         struct root_entry *root;
1310         unsigned long flags;
1311
1312         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1313         if (!root) {
1314                 pr_err("Allocating root entry for %s failed\n",
1315                         iommu->name);
1316                 return -ENOMEM;
1317         }
1318
1319         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1320
1321         spin_lock_irqsave(&iommu->lock, flags);
1322         iommu->root_entry = root;
1323         spin_unlock_irqrestore(&iommu->lock, flags);
1324
1325         return 0;
1326 }
1327
1328 static void iommu_set_root_entry(struct intel_iommu *iommu)
1329 {
1330         u64 addr;
1331         u32 sts;
1332         unsigned long flag;
1333
1334         addr = virt_to_phys(iommu->root_entry);
1335         if (ecs_enabled(iommu))
1336                 addr |= DMA_RTADDR_RTT;
1337
1338         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1339         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1340
1341         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1342
1343         /* Make sure the hardware completes it */
1344         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1345                       readl, (sts & DMA_GSTS_RTPS), sts);
1346
1347         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1348 }
1349
1350 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1351 {
1352         u32 val;
1353         unsigned long flag;
1354
1355         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1356                 return;
1357
1358         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1359         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1360
1361         /* Make sure the hardware completes it */
1362         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1363                       readl, (!(val & DMA_GSTS_WBFS)), val);
1364
1365         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1366 }
1367
1368 /* return value determines if we need a write buffer flush */
1369 static void __iommu_flush_context(struct intel_iommu *iommu,
1370                                   u16 did, u16 source_id, u8 function_mask,
1371                                   u64 type)
1372 {
1373         u64 val = 0;
1374         unsigned long flag;
1375
1376         switch (type) {
1377         case DMA_CCMD_GLOBAL_INVL:
1378                 val = DMA_CCMD_GLOBAL_INVL;
1379                 break;
1380         case DMA_CCMD_DOMAIN_INVL:
1381                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1382                 break;
1383         case DMA_CCMD_DEVICE_INVL:
1384                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1385                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1386                 break;
1387         default:
1388                 BUG();
1389         }
1390         val |= DMA_CCMD_ICC;
1391
1392         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1393         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1394
1395         /* Make sure the hardware completes it */
1396         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1397                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1398
1399         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1400 }
1401
1402 /* return value determines if we need a write buffer flush */
1403 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1404                                 u64 addr, unsigned int size_order, u64 type)
1405 {
1406         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1407         u64 val = 0, val_iva = 0;
1408         unsigned long flag;
1409
1410         switch (type) {
1411         case DMA_TLB_GLOBAL_FLUSH:
1412                 /* global flush doesn't need to set IVA_REG */
1413                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1414                 break;
1415         case DMA_TLB_DSI_FLUSH:
1416                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1417                 break;
1418         case DMA_TLB_PSI_FLUSH:
1419                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1420                 /* IH bit is passed in as part of address */
1421                 val_iva = size_order | addr;
1422                 break;
1423         default:
1424                 BUG();
1425         }
1426         /* Note: set drain read/write */
1427 #if 0
1428         /*
1429          * This is probably meant to be extra safe. It looks like we can
1430          * ignore it without any impact.
1431          */
1432         if (cap_read_drain(iommu->cap))
1433                 val |= DMA_TLB_READ_DRAIN;
1434 #endif
1435         if (cap_write_drain(iommu->cap))
1436                 val |= DMA_TLB_WRITE_DRAIN;
1437
1438         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1439         /* Note: Only uses first TLB reg currently */
1440         if (val_iva)
1441                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1442         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1443
1444         /* Make sure the hardware completes it */
1445         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1446                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1447
1448         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1449
1450         /* check IOTLB invalidation granularity */
1451         if (DMA_TLB_IAIG(val) == 0)
1452                 pr_err("Flush IOTLB failed\n");
1453         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1454                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1455                         (unsigned long long)DMA_TLB_IIRG(type),
1456                         (unsigned long long)DMA_TLB_IAIG(val));
1457 }
1458
1459 static struct device_domain_info *
1460 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1461                          u8 bus, u8 devfn)
1462 {
1463         struct device_domain_info *info;
1464
1465         assert_spin_locked(&device_domain_lock);
1466
1467         if (!iommu->qi)
1468                 return NULL;
1469
1470         list_for_each_entry(info, &domain->devices, link)
1471                 if (info->iommu == iommu && info->bus == bus &&
1472                     info->devfn == devfn) {
1473                         if (info->ats_supported && info->dev)
1474                                 return info;
1475                         break;
1476                 }
1477
1478         return NULL;
1479 }
1480
1481 static void domain_update_iotlb(struct dmar_domain *domain)
1482 {
1483         struct device_domain_info *info;
1484         bool has_iotlb_device = false;
1485
1486         assert_spin_locked(&device_domain_lock);
1487
1488         list_for_each_entry(info, &domain->devices, link) {
1489                 struct pci_dev *pdev;
1490
1491                 if (!info->dev || !dev_is_pci(info->dev))
1492                         continue;
1493
1494                 pdev = to_pci_dev(info->dev);
1495                 if (pdev->ats_enabled) {
1496                         has_iotlb_device = true;
1497                         break;
1498                 }
1499         }
1500
1501         domain->has_iotlb_device = has_iotlb_device;
1502 }
1503
1504 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1505 {
1506         struct pci_dev *pdev;
1507
1508         assert_spin_locked(&device_domain_lock);
1509
1510         if (!info || !dev_is_pci(info->dev))
1511                 return;
1512
1513         pdev = to_pci_dev(info->dev);
1514
1515 #ifdef CONFIG_INTEL_IOMMU_SVM
1516         /* The PCIe spec, in its wisdom, declares that the behaviour of
1517            the device if you enable PASID support after ATS support is
1518            undefined. So always enable PASID support on devices which
1519            have it, even if we can't yet know if we're ever going to
1520            use it. */
1521         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1522                 info->pasid_enabled = 1;
1523
1524         if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1525                 info->pri_enabled = 1;
1526 #endif
1527         if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1528                 info->ats_enabled = 1;
1529                 domain_update_iotlb(info->domain);
1530                 info->ats_qdep = pci_ats_queue_depth(pdev);
1531         }
1532 }
1533
1534 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1535 {
1536         struct pci_dev *pdev;
1537
1538         assert_spin_locked(&device_domain_lock);
1539
1540         if (!dev_is_pci(info->dev))
1541                 return;
1542
1543         pdev = to_pci_dev(info->dev);
1544
1545         if (info->ats_enabled) {
1546                 pci_disable_ats(pdev);
1547                 info->ats_enabled = 0;
1548                 domain_update_iotlb(info->domain);
1549         }
1550 #ifdef CONFIG_INTEL_IOMMU_SVM
1551         if (info->pri_enabled) {
1552                 pci_disable_pri(pdev);
1553                 info->pri_enabled = 0;
1554         }
1555         if (info->pasid_enabled) {
1556                 pci_disable_pasid(pdev);
1557                 info->pasid_enabled = 0;
1558         }
1559 #endif
1560 }
1561
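/*
 * Issue ATS ("device IOTLB") invalidations to every ATS-enabled device
 * attached to the domain.  The PCI source-id is built as bus << 8 | devfn;
 * for example bus 0x03, device 2, function 0 gives devfn 0x10 and thus
 * sid 0x0310.  The ATS queue depth bounds how many invalidation requests
 * the endpoint can have outstanding.
 */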
1562 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1563                                   u64 addr, unsigned mask)
1564 {
1565         u16 sid, qdep;
1566         unsigned long flags;
1567         struct device_domain_info *info;
1568
1569         if (!domain->has_iotlb_device)
1570                 return;
1571
1572         spin_lock_irqsave(&device_domain_lock, flags);
1573         list_for_each_entry(info, &domain->devices, link) {
1574                 if (!info->ats_enabled)
1575                         continue;
1576
1577                 sid = info->bus << 8 | info->devfn;
1578                 qdep = info->ats_qdep;
1579                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1580         }
1581         spin_unlock_irqrestore(&device_domain_lock, flags);
1582 }
1583
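/*
 * Page-selective-within-domain invalidation (PSI) always flushes an
 * aligned, power-of-two-sized range, so the page count is rounded up
 * first: for example pages = 9 rounds up to 16 and gives mask = 4,
 * i.e. a 16-page flush.  The ih argument, when set, becomes the
 * invalidation-hint bit (bit 6) in the address handed to the IOTLB
 * invalidation.
 */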
1584 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1585                                   struct dmar_domain *domain,
1586                                   unsigned long pfn, unsigned int pages,
1587                                   int ih, int map)
1588 {
1589         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1590         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1591         u16 did = domain->iommu_did[iommu->seq_id];
1592
1593         BUG_ON(pages == 0);
1594
1595         if (ih)
1596                 ih = 1 << 6;
1597         /*
1598          * Fall back to domain-selective flush if there is no PSI support or
1599          * the size is too big.
1600          * PSI requires the page size to be 2 ^ x, and the base address to be
1601          * naturally aligned to that size.
1602          */
1603         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1604                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1605                                                 DMA_TLB_DSI_FLUSH);
1606         else
1607                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1608                                                 DMA_TLB_PSI_FLUSH);
1609
1610         /*
1611          * In caching mode, changes of pages from non-present to present require
1612          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1613          */
1614         if (!cap_caching_mode(iommu->cap) || !map)
1615                 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1616                                       addr, mask);
1617 }
1618
1619 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1620 {
1621         u32 pmen;
1622         unsigned long flags;
1623
1624         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1625         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1626         pmen &= ~DMA_PMEN_EPM;
1627         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1628
1629         /* wait for the protected region status bit to clear */
1630         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1631                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1632
1633         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1634 }
1635
1636 static void iommu_enable_translation(struct intel_iommu *iommu)
1637 {
1638         u32 sts;
1639         unsigned long flags;
1640
1641         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1642         iommu->gcmd |= DMA_GCMD_TE;
1643         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1644
1645         /* Make sure hardware completes it */
1646         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1647                       readl, (sts & DMA_GSTS_TES), sts);
1648
1649         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1650 }
1651
1652 static void iommu_disable_translation(struct intel_iommu *iommu)
1653 {
1654         u32 sts;
1655         unsigned long flag;
1656
1657         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1658         iommu->gcmd &= ~DMA_GCMD_TE;
1659         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1660
1661         /* Make sure hardware completes it */
1662         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1663                       readl, (!(sts & DMA_GSTS_TES)), sts);
1664
1665         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1666 }
1667
1668
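/*
 * Per-IOMMU domain bookkeeping: domain_ids is a bitmap of cap_ndoms()
 * domain-ids, and domains[] is a two-level array of dmar_domain
 * pointers -- one top-level slot per 256 domain-ids, each pointing to
 * a page of 256 entries allocated on demand.  Only the first page is
 * allocated here; e.g. with 65536 supported domain-ids there are 256
 * top-level slots, of which only domains[0] is populated up front.
 */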
1669 static int iommu_init_domains(struct intel_iommu *iommu)
1670 {
1671         u32 ndomains, nlongs;
1672         size_t size;
1673
1674         ndomains = cap_ndoms(iommu->cap);
1675         pr_debug("%s: Number of Domains supported <%d>\n",
1676                  iommu->name, ndomains);
1677         nlongs = BITS_TO_LONGS(ndomains);
1678
1679         spin_lock_init(&iommu->lock);
1680
1681         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1682         if (!iommu->domain_ids) {
1683                 pr_err("%s: Allocating domain id array failed\n",
1684                        iommu->name);
1685                 return -ENOMEM;
1686         }
1687
1688         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1689         iommu->domains = kzalloc(size, GFP_KERNEL);
1690
1691         if (iommu->domains) {
1692                 size = 256 * sizeof(struct dmar_domain *);
1693                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1694         }
1695
1696         if (!iommu->domains || !iommu->domains[0]) {
1697                 pr_err("%s: Allocating domain array failed\n",
1698                        iommu->name);
1699                 kfree(iommu->domain_ids);
1700                 kfree(iommu->domains);
1701                 iommu->domain_ids = NULL;
1702                 iommu->domains    = NULL;
1703                 return -ENOMEM;
1704         }
1705
1706
1707
1708         /*
1709          * If Caching mode is set, then invalid translations are tagged
1710          * with domain-id 0, hence we need to pre-allocate it. We also
1711          * use domain-id 0 as a marker for non-allocated domain-id, so
1712          * make sure it is not used for a real domain.
1713          */
1714         set_bit(0, iommu->domain_ids);
1715
1716         return 0;
1717 }
1718
1719 static void disable_dmar_iommu(struct intel_iommu *iommu)
1720 {
1721         struct device_domain_info *info, *tmp;
1722         unsigned long flags;
1723
1724         if (!iommu->domains || !iommu->domain_ids)
1725                 return;
1726
1727         spin_lock_irqsave(&device_domain_lock, flags);
1728         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1729                 struct dmar_domain *domain;
1730
1731                 if (info->iommu != iommu)
1732                         continue;
1733
1734                 if (!info->dev || !info->domain)
1735                         continue;
1736
1737                 domain = info->domain;
1738
1739                 dmar_remove_one_dev_info(domain, info->dev);
1740
1741                 if (!domain_type_is_vm_or_si(domain))
1742                         domain_exit(domain);
1743         }
1744         spin_unlock_irqrestore(&device_domain_lock, flags);
1745
1746         if (iommu->gcmd & DMA_GCMD_TE)
1747                 iommu_disable_translation(iommu);
1748 }
1749
1750 static void free_dmar_iommu(struct intel_iommu *iommu)
1751 {
1752         if ((iommu->domains) && (iommu->domain_ids)) {
1753                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1754                 int i;
1755
1756                 for (i = 0; i < elems; i++)
1757                         kfree(iommu->domains[i]);
1758                 kfree(iommu->domains);
1759                 kfree(iommu->domain_ids);
1760                 iommu->domains = NULL;
1761                 iommu->domain_ids = NULL;
1762         }
1763
1764         g_iommus[iommu->seq_id] = NULL;
1765
1766         /* free context mapping */
1767         free_context_table(iommu);
1768
1769 #ifdef CONFIG_INTEL_IOMMU_SVM
1770         if (pasid_enabled(iommu)) {
1771                 if (ecap_prs(iommu->ecap))
1772                         intel_svm_finish_prq(iommu);
1773                 intel_svm_free_pasid_tables(iommu);
1774         }
1775 #endif
1776 }
1777
1778 static struct dmar_domain *alloc_domain(int flags)
1779 {
1780         struct dmar_domain *domain;
1781
1782         domain = alloc_domain_mem();
1783         if (!domain)
1784                 return NULL;
1785
1786         memset(domain, 0, sizeof(*domain));
1787         domain->nid = -1;
1788         domain->flags = flags;
1789         domain->has_iotlb_device = false;
1790         INIT_LIST_HEAD(&domain->devices);
1791
1792         return domain;
1793 }
1794
1795 /* Must be called with device_domain_lock and iommu->lock held */
1796 static int domain_attach_iommu(struct dmar_domain *domain,
1797                                struct intel_iommu *iommu)
1798 {
1799         unsigned long ndomains;
1800         int num;
1801
1802         assert_spin_locked(&device_domain_lock);
1803         assert_spin_locked(&iommu->lock);
1804
1805         domain->iommu_refcnt[iommu->seq_id] += 1;
1806         domain->iommu_count += 1;
1807         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1808                 ndomains = cap_ndoms(iommu->cap);
1809                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1810
1811                 if (num >= ndomains) {
1812                         pr_err("%s: No free domain ids\n", iommu->name);
1813                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1814                         domain->iommu_count -= 1;
1815                         return -ENOSPC;
1816                 }
1817
1818                 set_bit(num, iommu->domain_ids);
1819                 set_iommu_domain(iommu, num, domain);
1820
1821                 domain->iommu_did[iommu->seq_id] = num;
1822                 domain->nid                      = iommu->node;
1823
1824                 domain_update_iommu_cap(domain);
1825         }
1826
1827         return 0;
1828 }
1829
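/*
 * Counterpart of domain_attach_iommu().  Drops the domain's per-IOMMU
 * reference; once it reaches zero the domain-id is returned to this
 * IOMMU's bitmap.  Returns the domain's remaining total attachment
 * count.  Same locking rules as domain_attach_iommu().
 */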
1830 static int domain_detach_iommu(struct dmar_domain *domain,
1831                                struct intel_iommu *iommu)
1832 {
1833         int num, count = INT_MAX;
1834
1835         assert_spin_locked(&device_domain_lock);
1836         assert_spin_locked(&iommu->lock);
1837
1838         domain->iommu_refcnt[iommu->seq_id] -= 1;
1839         count = --domain->iommu_count;
1840         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1841                 num = domain->iommu_did[iommu->seq_id];
1842                 clear_bit(num, iommu->domain_ids);
1843                 set_iommu_domain(iommu, num, NULL);
1844
1845                 domain_update_iommu_cap(domain);
1846                 domain->iommu_did[iommu->seq_id] = 0;
1847         }
1848
1849         return count;
1850 }
1851
1852 static struct iova_domain reserved_iova_list;
1853 static struct lock_class_key reserved_rbtree_key;
1854
1855 static int dmar_init_reserved_ranges(void)
1856 {
1857         struct pci_dev *pdev = NULL;
1858         struct iova *iova;
1859         int i;
1860
1861         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1862                         DMA_32BIT_PFN);
1863
1864         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1865                 &reserved_rbtree_key);
1866
1867         /* IOAPIC ranges shouldn't be accessed by DMA */
1868         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1869                 IOVA_PFN(IOAPIC_RANGE_END));
1870         if (!iova) {
1871                 pr_err("Reserve IOAPIC range failed\n");
1872                 return -ENODEV;
1873         }
1874
1875         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1876         for_each_pci_dev(pdev) {
1877                 struct resource *r;
1878
1879                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1880                         r = &pdev->resource[i];
1881                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1882                                 continue;
1883                         iova = reserve_iova(&reserved_iova_list,
1884                                             IOVA_PFN(r->start),
1885                                             IOVA_PFN(r->end));
1886                         if (!iova) {
1887                                 pr_err("Reserve iova failed\n");
1888                                 return -ENODEV;
1889                         }
1890                 }
1891         }
1892         return 0;
1893 }
1894
1895 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1896 {
1897         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1898 }
1899
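/*
 * Round a guest address width up to the next width the page-table
 * hierarchy can actually express (12 bits of page offset plus a
 * multiple of 9 bits per level), capped at 64.  For example 39 and 48
 * map to themselves, while 40 becomes 48.
 */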
1900 static inline int guestwidth_to_adjustwidth(int gaw)
1901 {
1902         int agaw;
1903         int r = (gaw - 12) % 9;
1904
1905         if (r == 0)
1906                 agaw = gaw;
1907         else
1908                 agaw = gaw + 9 - r;
1909         if (agaw > 64)
1910                 agaw = 64;
1911         return agaw;
1912 }
1913
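/*
 * Set up the IOVA allocator, address width and page-table root for a
 * newly allocated domain.  The adjusted width is mapped to an AGAW
 * value (roughly, the number of page-table levels: a 48-bit adjusted
 * width corresponds to a 4-level table) and, if the hardware's SAGAW
 * capability does not include that exact value, the next larger
 * supported one is chosen instead.
 */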
1914 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1915                        int guest_width)
1916 {
1917         int adjust_width, agaw;
1918         unsigned long sagaw;
1919
1920         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1921                         DMA_32BIT_PFN);
1922         domain_reserve_special_ranges(domain);
1923
1924         /* calculate AGAW */
1925         if (guest_width > cap_mgaw(iommu->cap))
1926                 guest_width = cap_mgaw(iommu->cap);
1927         domain->gaw = guest_width;
1928         adjust_width = guestwidth_to_adjustwidth(guest_width);
1929         agaw = width_to_agaw(adjust_width);
1930         sagaw = cap_sagaw(iommu->cap);
1931         if (!test_bit(agaw, &sagaw)) {
1932                 /* hardware doesn't support it, choose a bigger one */
1933                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1934                 agaw = find_next_bit(&sagaw, 5, agaw);
1935                 if (agaw >= 5)
1936                         return -ENODEV;
1937         }
1938         domain->agaw = agaw;
1939
1940         if (ecap_coherent(iommu->ecap))
1941                 domain->iommu_coherency = 1;
1942         else
1943                 domain->iommu_coherency = 0;
1944
1945         if (ecap_sc_support(iommu->ecap))
1946                 domain->iommu_snooping = 1;
1947         else
1948                 domain->iommu_snooping = 0;
1949
1950         if (intel_iommu_superpage)
1951                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1952         else
1953                 domain->iommu_superpage = 0;
1954
1955         domain->nid = iommu->node;
1956
1957         /* always allocate the top pgd */
1958         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1959         if (!domain->pgd)
1960                 return -ENOMEM;
1961         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1962         return 0;
1963 }
1964
1965 static void domain_exit(struct dmar_domain *domain)
1966 {
1967         struct page *freelist = NULL;
1968
1969         /* Domain 0 is reserved, so don't process it */
1970         if (!domain)
1971                 return;
1972
1973         /* Flush any lazy unmaps that may reference this domain */
1974         if (!intel_iommu_strict) {
1975                 int cpu;
1976
1977                 for_each_possible_cpu(cpu)
1978                         flush_unmaps_timeout(cpu);
1979         }
1980
1981         /* Remove associated devices and clear attached or cached domains */
1982         rcu_read_lock();
1983         domain_remove_dev_info(domain);
1984         rcu_read_unlock();
1985
1986         /* destroy iovas */
1987         put_iova_domain(&domain->iovad);
1988
1989         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1990
1991         dma_free_pagelist(freelist);
1992
1993         free_domain_mem(domain);
1994 }
1995
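/*
 * Install the context entry for (bus, devfn) on this IOMMU: program the
 * domain-id, the page-table root (skipping upper levels when the IOMMU
 * supports a smaller AGAW than the domain), the address width and the
 * translation type, then flush the context/IOTLB caches as required by
 * caching mode.  Nothing is changed if a context entry is already present.
 */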
1996 static int domain_context_mapping_one(struct dmar_domain *domain,
1997                                       struct intel_iommu *iommu,
1998                                       u8 bus, u8 devfn)
1999 {
2000         u16 did = domain->iommu_did[iommu->seq_id];
2001         int translation = CONTEXT_TT_MULTI_LEVEL;
2002         struct device_domain_info *info = NULL;
2003         struct context_entry *context;
2004         unsigned long flags;
2005         struct dma_pte *pgd;
2006         int ret, agaw;
2007
2008         WARN_ON(did == 0);
2009
2010         if (hw_pass_through && domain_type_is_si(domain))
2011                 translation = CONTEXT_TT_PASS_THROUGH;
2012
2013         pr_debug("Set context mapping for %02x:%02x.%d\n",
2014                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2015
2016         BUG_ON(!domain->pgd);
2017
2018         spin_lock_irqsave(&device_domain_lock, flags);
2019         spin_lock(&iommu->lock);
2020
2021         ret = -ENOMEM;
2022         context = iommu_context_addr(iommu, bus, devfn, 1);
2023         if (!context)
2024                 goto out_unlock;
2025
2026         ret = 0;
2027         if (context_present(context))
2028                 goto out_unlock;
2029
2030         pgd = domain->pgd;
2031
2032         context_clear_entry(context);
2033         context_set_domain_id(context, did);
2034
2035         /*
2036          * Skip the top levels of the page tables for an iommu which has a
2037          * smaller agaw than the default.  Unnecessary for PT mode.
2038          */
2039         if (translation != CONTEXT_TT_PASS_THROUGH) {
2040                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2041                         ret = -ENOMEM;
2042                         pgd = phys_to_virt(dma_pte_addr(pgd));
2043                         if (!dma_pte_present(pgd))
2044                                 goto out_unlock;
2045                 }
2046
2047                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2048                 if (info && info->ats_supported)
2049                         translation = CONTEXT_TT_DEV_IOTLB;
2050                 else
2051                         translation = CONTEXT_TT_MULTI_LEVEL;
2052
2053                 context_set_address_root(context, virt_to_phys(pgd));
2054                 context_set_address_width(context, iommu->agaw);
2055         } else {
2056                 /*
2057                  * In pass through mode, AW must be programmed to
2058                  * indicate the largest AGAW value supported by
2059                  * hardware. And ASR is ignored by hardware.
2060                  */
2061                 context_set_address_width(context, iommu->msagaw);
2062         }
2063
2064         context_set_translation_type(context, translation);
2065         context_set_fault_enable(context);
2066         context_set_present(context);
2067         domain_flush_cache(domain, context, sizeof(*context));
2068
2069         /*
2070          * It's a non-present to present mapping. If hardware doesn't cache
2071          * non-present entries we only need to flush the write-buffer. If it
2072          * _does_ cache non-present entries, then it does so in the special
2073          * domain #0, which we have to flush:
2074          */
2075         if (cap_caching_mode(iommu->cap)) {
2076                 iommu->flush.flush_context(iommu, 0,
2077                                            (((u16)bus) << 8) | devfn,
2078                                            DMA_CCMD_MASK_NOBIT,
2079                                            DMA_CCMD_DEVICE_INVL);
2080                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2081         } else {
2082                 iommu_flush_write_buffer(iommu);
2083         }
2084         iommu_enable_dev_iotlb(info);
2085
2086         ret = 0;
2087
2088 out_unlock:
2089         spin_unlock(&iommu->lock);
2090         spin_unlock_irqrestore(&device_domain_lock, flags);
2091
2092         return ret;
2093 }
2094
2095 struct domain_context_mapping_data {
2096         struct dmar_domain *domain;
2097         struct intel_iommu *iommu;
2098 };
2099
2100 static int domain_context_mapping_cb(struct pci_dev *pdev,
2101                                      u16 alias, void *opaque)
2102 {
2103         struct domain_context_mapping_data *data = opaque;
2104
2105         return domain_context_mapping_one(data->domain, data->iommu,
2106                                           PCI_BUS_NUM(alias), alias & 0xff);
2107 }
2108
2109 static int
2110 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2111 {
2112         struct intel_iommu *iommu;
2113         u8 bus, devfn;
2114         struct domain_context_mapping_data data;
2115
2116         iommu = device_to_iommu(dev, &bus, &devfn);
2117         if (!iommu)
2118                 return -ENODEV;
2119
2120         if (!dev_is_pci(dev))
2121                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2122
2123         data.domain = domain;
2124         data.iommu = iommu;
2125
2126         return pci_for_each_dma_alias(to_pci_dev(dev),
2127                                       &domain_context_mapping_cb, &data);
2128 }
2129
2130 static int domain_context_mapped_cb(struct pci_dev *pdev,
2131                                     u16 alias, void *opaque)
2132 {
2133         struct intel_iommu *iommu = opaque;
2134
2135         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2136 }
2137
2138 static int domain_context_mapped(struct device *dev)
2139 {
2140         struct intel_iommu *iommu;
2141         u8 bus, devfn;
2142
2143         iommu = device_to_iommu(dev, &bus, &devfn);
2144         if (!iommu)
2145                 return -ENODEV;
2146
2147         if (!dev_is_pci(dev))
2148                 return device_context_mapped(iommu, bus, devfn);
2149
2150         return !pci_for_each_dma_alias(to_pci_dev(dev),
2151                                        domain_context_mapped_cb, iommu);
2152 }
2153
2154 /* Returns the number of VTD pages, but aligned to the MM page size */
2155 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2156                                             size_t size)
2157 {
2158         host_addr &= ~PAGE_MASK;
2159         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2160 }
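/*
 * For example, with 4KiB pages, host_addr = 0x1234 and size = 0x2000
 * leaves an offset of 0x234 within the first page; 0x234 + 0x2000
 * rounds up to 0x3000, i.e. three VTD pages.
 */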
2161
2162 /* Return largest possible superpage level for a given mapping */
2163 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2164                                           unsigned long iov_pfn,
2165                                           unsigned long phy_pfn,
2166                                           unsigned long pages)
2167 {
2168         int support, level = 1;
2169         unsigned long pfnmerge;
2170
2171         support = domain->iommu_superpage;
2172
2173         /* To use a large page, the virtual *and* physical addresses
2174            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2175            of them will mean we have to use smaller pages. So just
2176            merge them and check both at once. */
2177         pfnmerge = iov_pfn | phy_pfn;
2178
2179         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2180                 pages >>= VTD_STRIDE_SHIFT;
2181                 if (!pages)
2182                         break;
2183                 pfnmerge >>= VTD_STRIDE_SHIFT;
2184                 level++;
2185                 support--;
2186         }
2187         return level;
2188 }
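/*
 * For example, if both pfns above are multiples of 512 (2MiB aligned)
 * and at least 512 pages remain, level 2 (2MiB superpages) is returned,
 * provided domain->iommu_superpage indicates the hardware supports it;
 * otherwise the walk stops at level 1 (4KiB pages).
 */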
2189
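/*
 * Map nr_pages VTD pages starting at iov_pfn, taking the physical
 * addresses either from the scatterlist or from the contiguous
 * phys_pfn.  The largest usable superpage level is chosen per chunk;
 * lvl_to_nr_pages() then yields the step size (1 page at level 1,
 * 512 at level 2, 262144 at level 3).
 */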
2190 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2191                             struct scatterlist *sg, unsigned long phys_pfn,
2192                             unsigned long nr_pages, int prot)
2193 {
2194         struct dma_pte *first_pte = NULL, *pte = NULL;
2195         phys_addr_t uninitialized_var(pteval);
2196         unsigned long sg_res = 0;
2197         unsigned int largepage_lvl = 0;
2198         unsigned long lvl_pages = 0;
2199
2200         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2201
2202         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2203                 return -EINVAL;
2204
2205         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2206
2207         if (!sg) {
2208                 sg_res = nr_pages;
2209                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2210         }
2211
2212         while (nr_pages > 0) {
2213                 uint64_t tmp;
2214
2215                 if (!sg_res) {
2216                         sg_res = aligned_nrpages(sg->offset, sg->length);
2217                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2218                         sg->dma_length = sg->length;
2219                         pteval = page_to_phys(sg_page(sg)) | prot;
2220                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2221                 }
2222
2223                 if (!pte) {
2224                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2225
2226                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2227                         if (!pte)
2228                                 return -ENOMEM;
2229                         /* It is a large page */
2230                         if (largepage_lvl > 1) {
2231                                 unsigned long nr_superpages, end_pfn;
2232
2233                                 pteval |= DMA_PTE_LARGE_PAGE;
2234                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2235
2236                                 nr_superpages = sg_res / lvl_pages;
2237                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2238
2239                                 /*
2240                                  * Ensure that old small page tables are
2241                                  * removed to make room for superpage(s).
2242                                  */
2243                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn);
2244                         } else {
2245                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2246                         }
2247
2248                 }
2249                 /* We don't need a lock here; nobody else
2250                  * touches this iova range
2251                  */
2252                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2253                 if (tmp) {
2254                         static int dumps = 5;
2255                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2256                                 iov_pfn, tmp, (unsigned long long)pteval);
2257                         if (dumps) {
2258                                 dumps--;
2259                                 debug_dma_dump_mappings(NULL);
2260                         }
2261                         WARN_ON(1);
2262                 }
2263
2264                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2265
2266                 BUG_ON(nr_pages < lvl_pages);
2267                 BUG_ON(sg_res < lvl_pages);
2268
2269                 nr_pages -= lvl_pages;
2270                 iov_pfn += lvl_pages;
2271                 phys_pfn += lvl_pages;
2272                 pteval += lvl_pages * VTD_PAGE_SIZE;
2273                 sg_res -= lvl_pages;
2274
2275                 /* If the next PTE would be the first in a new page, then we
2276                    need to flush the cache on the entries we've just written.
2277                    And then we'll need to recalculate 'pte', so clear it and
2278                    let it get set again in the if (!pte) block above.
2279
2280                    If we're done (!nr_pages) we need to flush the cache too.
2281
2282                    Also if we've been setting superpages, we may need to
2283                    recalculate 'pte' and switch back to smaller pages for the
2284                    end of the mapping, if the trailing size is not enough to
2285                    use another superpage (i.e. sg_res < lvl_pages). */
2286                 pte++;
2287                 if (!nr_pages || first_pte_in_page(pte) ||
2288                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2289                         domain_flush_cache(domain, first_pte,
2290                                            (void *)pte - (void *)first_pte);
2291                         pte = NULL;
2292                 }
2293
2294                 if (!sg_res && nr_pages)
2295                         sg = sg_next(sg);
2296         }
2297         return 0;
2298 }
2299
2300 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2301                                     struct scatterlist *sg, unsigned long nr_pages,
2302                                     int prot)
2303 {
2304         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2305 }
2306
2307 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2308                                      unsigned long phys_pfn, unsigned long nr_pages,
2309                                      int prot)
2310 {
2311         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2312 }
2313
2314 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2315 {
2316         if (!iommu)
2317                 return;
2318
2319         clear_context_table(iommu, bus, devfn);
2320         iommu->flush.flush_context(iommu, 0, 0, 0,
2321                                            DMA_CCMD_GLOBAL_INVL);
2322         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2323 }
2324
2325 static inline void unlink_domain_info(struct device_domain_info *info)
2326 {
2327         assert_spin_locked(&device_domain_lock);
2328         list_del(&info->link);
2329         list_del(&info->global);
2330         if (info->dev)
2331                 info->dev->archdata.iommu = NULL;
2332 }
2333
2334 static void domain_remove_dev_info(struct dmar_domain *domain)
2335 {
2336         struct device_domain_info *info, *tmp;
2337         unsigned long flags;
2338
2339         spin_lock_irqsave(&device_domain_lock, flags);
2340         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2341                 __dmar_remove_one_dev_info(info);
2342         spin_unlock_irqrestore(&device_domain_lock, flags);
2343 }
2344
2345 /*
2346  * find_domain
2347  * Note: we use struct device->archdata.iommu to store the domain info
2348  */
2349 static struct dmar_domain *find_domain(struct device *dev)
2350 {
2351         struct device_domain_info *info;
2352
2353         /* No lock here; we assume no domain exits in the normal case */
2354         info = dev->archdata.iommu;
2355         if (info)
2356                 return info->domain;
2357         return NULL;
2358 }
2359
2360 static inline struct device_domain_info *
2361 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2362 {
2363         struct device_domain_info *info;
2364
2365         list_for_each_entry(info, &device_domain_list, global)
2366                 if (info->iommu->segment == segment && info->bus == bus &&
2367                     info->devfn == devfn)
2368                         return info;
2369
2370         return NULL;
2371 }
2372
2373 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2374                                                     int bus, int devfn,
2375                                                     struct device *dev,
2376                                                     struct dmar_domain *domain)
2377 {
2378         struct dmar_domain *found = NULL;
2379         struct device_domain_info *info;
2380         unsigned long flags;
2381         int ret;
2382
2383         info = alloc_devinfo_mem();
2384         if (!info)
2385                 return NULL;
2386
2387         info->bus = bus;
2388         info->devfn = devfn;
2389         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2390         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2391         info->ats_qdep = 0;
2392         info->dev = dev;
2393         info->domain = domain;
2394         info->iommu = iommu;
2395
2396         if (dev && dev_is_pci(dev)) {
2397                 struct pci_dev *pdev = to_pci_dev(info->dev);
2398
2399                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2400                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2401                     dmar_find_matched_atsr_unit(pdev))
2402                         info->ats_supported = 1;
2403
2404                 if (ecs_enabled(iommu)) {
2405                         if (pasid_enabled(iommu)) {
2406                                 int features = pci_pasid_features(pdev);
2407                                 if (features >= 0)
2408                                         info->pasid_supported = features | 1;
2409                         }
2410
2411                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2412                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2413                                 info->pri_supported = 1;
2414                 }
2415         }
2416
2417         spin_lock_irqsave(&device_domain_lock, flags);
2418         if (dev)
2419                 found = find_domain(dev);
2420
2421         if (!found) {
2422                 struct device_domain_info *info2;
2423                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2424                 if (info2) {
2425                         found      = info2->domain;
2426                         info2->dev = dev;
2427                 }
2428         }
2429
2430         if (found) {
2431                 spin_unlock_irqrestore(&device_domain_lock, flags);
2432                 free_devinfo_mem(info);
2433                 /* Caller must free the original domain */
2434                 return found;
2435         }
2436
2437         spin_lock(&iommu->lock);
2438         ret = domain_attach_iommu(domain, iommu);
2439         spin_unlock(&iommu->lock);
2440
2441         if (ret) {
2442                 spin_unlock_irqrestore(&device_domain_lock, flags);
2443                 free_devinfo_mem(info);
2444                 return NULL;
2445         }
2446
2447         list_add(&info->link, &domain->devices);
2448         list_add(&info->global, &device_domain_list);
2449         if (dev)
2450                 dev->archdata.iommu = info;
2451         spin_unlock_irqrestore(&device_domain_lock, flags);
2452
2453         if (dev && domain_context_mapping(domain, dev)) {
2454                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2455                 dmar_remove_one_dev_info(domain, dev);
2456                 return NULL;
2457         }
2458
2459         return domain;
2460 }
2461
2462 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2463 {
2464         *(u16 *)opaque = alias;
2465         return 0;
2466 }
2467
2468 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2469 {
2470         struct device_domain_info *info = NULL;
2471         struct dmar_domain *domain = NULL;
2472         struct intel_iommu *iommu;
2473         u16 req_id, dma_alias;
2474         unsigned long flags;
2475         u8 bus, devfn;
2476
2477         iommu = device_to_iommu(dev, &bus, &devfn);
2478         if (!iommu)
2479                 return NULL;
2480
2481         req_id = ((u16)bus << 8) | devfn;
2482
2483         if (dev_is_pci(dev)) {
2484                 struct pci_dev *pdev = to_pci_dev(dev);
2485
2486                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2487
2488                 spin_lock_irqsave(&device_domain_lock, flags);
2489                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2490                                                       PCI_BUS_NUM(dma_alias),
2491                                                       dma_alias & 0xff);
2492                 if (info) {
2493                         iommu = info->iommu;
2494                         domain = info->domain;
2495                 }
2496                 spin_unlock_irqrestore(&device_domain_lock, flags);
2497
2498                 /* DMA alias already has a domain, use it */
2499                 if (info)
2500                         goto out;
2501         }
2502
2503         /* Allocate and initialize new domain for the device */
2504         domain = alloc_domain(0);
2505         if (!domain)
2506                 return NULL;
2507         if (domain_init(domain, iommu, gaw)) {
2508                 domain_exit(domain);
2509                 return NULL;
2510         }
2511
2512 out:
2513
2514         return domain;
2515 }
2516
2517 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2518                                               struct dmar_domain *domain)
2519 {
2520         struct intel_iommu *iommu;
2521         struct dmar_domain *tmp;
2522         u16 req_id, dma_alias;
2523         u8 bus, devfn;
2524
2525         iommu = device_to_iommu(dev, &bus, &devfn);
2526         if (!iommu)
2527                 return NULL;
2528
2529         req_id = ((u16)bus << 8) | devfn;
2530
2531         if (dev_is_pci(dev)) {
2532                 struct pci_dev *pdev = to_pci_dev(dev);
2533
2534                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2535
2536                 /* register PCI DMA alias device */
2537                 if (req_id != dma_alias) {
2538                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2539                                         dma_alias & 0xff, NULL, domain);
2540
2541                         if (!tmp || tmp != domain)
2542                                 return tmp;
2543                 }
2544         }
2545
2546         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2547         if (!tmp || tmp != domain)
2548                 return tmp;
2549
2550         return domain;
2551 }
2552
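/*
 * Find the domain a device should use: reuse an existing one if the
 * device (or its PCI DMA alias) already has one, otherwise allocate
 * and initialise a fresh domain and bind it to the device via
 * set_domain_for_dev().  If a racing caller installed a different
 * domain first, the freshly allocated one is torn down and the
 * winner is returned.
 */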
2553 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2554 {
2555         struct dmar_domain *domain, *tmp;
2556
2557         domain = find_domain(dev);
2558         if (domain)
2559                 goto out;
2560
2561         domain = find_or_alloc_domain(dev, gaw);
2562         if (!domain)
2563                 goto out;
2564
2565         tmp = set_domain_for_dev(dev, domain);
2566         if (!tmp || domain != tmp) {
2567                 domain_exit(domain);
2568                 domain = tmp;
2569         }
2570
2571 out:
2572
2573         return domain;
2574 }
2575
2576 static int iommu_domain_identity_map(struct dmar_domain *domain,
2577                                      unsigned long long start,
2578                                      unsigned long long end)
2579 {
2580         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2581         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2582
2583         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2584                           dma_to_mm_pfn(last_vpfn))) {
2585                 pr_err("Reserving iova failed\n");
2586                 return -ENOMEM;
2587         }
2588
2589         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2590         /*
2591          * The RMRR range might overlap the physical memory range,
2592          * so clear it first.
2593          */
2594         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2595
2596         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2597                                   last_vpfn - first_vpfn + 1,
2598                                   DMA_PTE_READ|DMA_PTE_WRITE);
2599 }
2600
2601 static int domain_prepare_identity_map(struct device *dev,
2602                                        struct dmar_domain *domain,
2603                                        unsigned long long start,
2604                                        unsigned long long end)
2605 {
2606         /* For _hardware_ passthrough, don't bother. But for software
2607            passthrough, we do it anyway -- it may indicate a memory
2608            range which is reserved in E820 and thus didn't get set
2609            up to start with in si_domain */
2610         if (domain == si_domain && hw_pass_through) {
2611                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2612                         dev_name(dev), start, end);
2613                 return 0;
2614         }
2615
2616         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2617                 dev_name(dev), start, end);
2618
2619         if (end < start) {
2620                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2621                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2622                         dmi_get_system_info(DMI_BIOS_VENDOR),
2623                         dmi_get_system_info(DMI_BIOS_VERSION),
2624                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2625                 return -EIO;
2626         }
2627
2628         if (end >> agaw_to_width(domain->agaw)) {
2629                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2630                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2631                      agaw_to_width(domain->agaw),
2632                      dmi_get_system_info(DMI_BIOS_VENDOR),
2633                      dmi_get_system_info(DMI_BIOS_VERSION),
2634                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2635                 return -EIO;
2636         }
2637
2638         return iommu_domain_identity_map(domain, start, end);
2639 }
2640
2641 static int iommu_prepare_identity_map(struct device *dev,
2642                                       unsigned long long start,
2643                                       unsigned long long end)
2644 {
2645         struct dmar_domain *domain;
2646         int ret;
2647
2648         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2649         if (!domain)
2650                 return -ENOMEM;
2651
2652         ret = domain_prepare_identity_map(dev, domain, start, end);
2653         if (ret)
2654                 domain_exit(domain);
2655
2656         return ret;
2657 }
2658
2659 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2660                                          struct device *dev)
2661 {
2662         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2663                 return 0;
2664         return iommu_prepare_identity_map(dev, rmrr->base_address,
2665                                           rmrr->end_address);
2666 }
2667
2668 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2669 static inline void iommu_prepare_isa(void)
2670 {
2671         struct pci_dev *pdev;
2672         int ret;
2673
2674         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2675         if (!pdev)
2676                 return;
2677
2678         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2679         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2680
2681         if (ret)
2682                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2683
2684         pci_dev_put(pdev);
2685 }
2686 #else
2687 static inline void iommu_prepare_isa(void)
2688 {
2689         return;
2690 }
2691 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2692
2693 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2694
2695 static int __init si_domain_init(int hw)
2696 {
2697         int nid, ret = 0;
2698
2699         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2700         if (!si_domain)
2701                 return -EFAULT;
2702
2703         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2704                 domain_exit(si_domain);
2705                 return -EFAULT;
2706         }
2707
2708         pr_debug("Identity mapping domain allocated\n");
2709
2710         if (hw)
2711                 return 0;
2712
2713         for_each_online_node(nid) {
2714                 unsigned long start_pfn, end_pfn;
2715                 int i;
2716
2717                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2718                         ret = iommu_domain_identity_map(si_domain,
2719                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2720                         if (ret)
2721                                 return ret;
2722                 }
2723         }
2724
2725         return 0;
2726 }
2727
2728 static int identity_mapping(struct device *dev)
2729 {
2730         struct device_domain_info *info;
2731
2732         if (likely(!iommu_identity_mapping))
2733                 return 0;
2734
2735         info = dev->archdata.iommu;
2736         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2737                 return (info->domain == si_domain);
2738
2739         return 0;
2740 }
2741
2742 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2743 {
2744         struct dmar_domain *ndomain;
2745         struct intel_iommu *iommu;
2746         u8 bus, devfn;
2747
2748         iommu = device_to_iommu(dev, &bus, &devfn);
2749         if (!iommu)
2750                 return -ENODEV;
2751
2752         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2753         if (ndomain != domain)
2754                 return -EBUSY;
2755
2756         return 0;
2757 }
2758
2759 static bool device_has_rmrr(struct device *dev)
2760 {
2761         struct dmar_rmrr_unit *rmrr;
2762         struct device *tmp;
2763         int i;
2764
2765         rcu_read_lock();
2766         for_each_rmrr_units(rmrr) {
2767                 /*
2768                  * Return TRUE if this RMRR contains the device that
2769                  * is passed in.
2770                  */
2771                 for_each_active_dev_scope(rmrr->devices,
2772                                           rmrr->devices_cnt, i, tmp)
2773                         if (tmp == dev) {
2774                                 rcu_read_unlock();
2775                                 return true;
2776                         }
2777         }
2778         rcu_read_unlock();
2779         return false;
2780 }
2781
2782 /*
2783  * There are a couple cases where we need to restrict the functionality of
2784  * devices associated with RMRRs.  The first is when evaluating a device for
2785  * identity mapping because problems exist when devices are moved in and out
2786  * of domains and their respective RMRR information is lost.  This means that
2787  * a device with associated RMRRs will never be in a "passthrough" domain.
2788  * The second is use of the device through the IOMMU API.  This interface
2789  * expects to have full control of the IOVA space for the device.  We cannot
2790  * satisfy both the requirement that RMRR access is maintained and have an
2791  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2792  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2793  * We therefore prevent devices associated with an RMRR from participating in
2794  * the IOMMU API, which eliminates them from device assignment.
2795  *
2796  * In both cases we assume that PCI USB devices with RMRRs have them largely
2797  * for historical reasons and that the RMRR space is not actively used post
2798  * boot.  This exclusion may change if vendors begin to abuse it.
2799  *
2800  * The same exception is made for graphics devices, with the requirement that
2801  * any use of the RMRR regions will be torn down before assigning the device
2802  * to a guest.
2803  */
2804 static bool device_is_rmrr_locked(struct device *dev)
2805 {
2806         if (!device_has_rmrr(dev))
2807                 return false;
2808
2809         if (dev_is_pci(dev)) {
2810                 struct pci_dev *pdev = to_pci_dev(dev);
2811
2812                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2813                         return false;
2814         }
2815
2816         return true;
2817 }
2818
2819 static int iommu_should_identity_map(struct device *dev, int startup)
2820 {
2821
2822         if (dev_is_pci(dev)) {
2823                 struct pci_dev *pdev = to_pci_dev(dev);
2824
2825                 if (device_is_rmrr_locked(dev))
2826                         return 0;
2827
2828                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2829                         return 1;
2830
2831                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2832                         return 1;
2833
2834                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2835                         return 0;
2836
2837                 /*
2838                  * We want to start off with all devices in the 1:1 domain, and
2839                  * take them out later if we find they can't access all of memory.
2840                  *
2841                  * However, we can't do this for PCI devices behind bridges,
2842                  * because all PCI devices behind the same bridge will end up
2843                  * with the same source-id on their transactions.
2844                  *
2845                  * Practically speaking, we can't change things around for these
2846                  * devices at run-time, because we can't be sure there'll be no
2847                  * DMA transactions in flight for any of their siblings.
2848                  *
2849                  * So PCI devices (unless they're on the root bus) as well as
2850                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2851                  * the 1:1 domain, just in _case_ one of their siblings turns out
2852                  * not to be able to map all of memory.
2853                  */
2854                 if (!pci_is_pcie(pdev)) {
2855                         if (!pci_is_root_bus(pdev->bus))
2856                                 return 0;
2857                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2858                                 return 0;
2859                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2860                         return 0;
2861         } else {
2862                 if (device_has_rmrr(dev))
2863                         return 0;
2864         }
2865
2866         /*
2867          * At boot time, we don't yet know if devices will be 64-bit capable.
2868          * Assume that they will — if they turn out not to be, then we can
2869          * take them out of the 1:1 domain later.
2870          */
2871         if (!startup) {
2872                 /*
2873                  * If the device's dma_mask is less than the system's memory
2874                  * size then this is not a candidate for identity mapping.
2875                  */
2876                 u64 dma_mask = *dev->dma_mask;
2877
2878                 if (dev->coherent_dma_mask &&
2879                     dev->coherent_dma_mask < dma_mask)
2880                         dma_mask = dev->coherent_dma_mask;
2881
2882                 return dma_mask >= dma_get_required_mask(dev);
2883         }
2884
2885         return 1;
2886 }
2887
2888 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2889 {
2890         int ret;
2891
2892         if (!iommu_should_identity_map(dev, 1))
2893                 return 0;
2894
2895         ret = domain_add_dev_info(si_domain, dev);
2896         if (!ret)
2897                 pr_info("%s identity mapping for device %s\n",
2898                         hw ? "Hardware" : "Software", dev_name(dev));
2899         else if (ret == -ENODEV)
2900                 /* device not associated with an iommu */
2901                 ret = 0;
2902
2903         return ret;
2904 }
2905
2906
2907 static int __init iommu_prepare_static_identity_mapping(int hw)
2908 {
2909         struct pci_dev *pdev = NULL;
2910         struct dmar_drhd_unit *drhd;
2911         struct intel_iommu *iommu;
2912         struct device *dev;
2913         int i;
2914         int ret = 0;
2915
2916         for_each_pci_dev(pdev) {
2917                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2918                 if (ret)
2919                         return ret;
2920         }
2921
2922         for_each_active_iommu(iommu, drhd)
2923                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2924                         struct acpi_device_physical_node *pn;
2925                         struct acpi_device *adev;
2926
2927                         if (dev->bus != &acpi_bus_type)
2928                                 continue;
2929
2930                         adev = to_acpi_device(dev);
2931                         mutex_lock(&adev->physical_node_lock);
2932                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2933                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2934                                 if (ret)
2935                                         break;
2936                         }
2937                         mutex_unlock(&adev->physical_node_lock);
2938                         if (ret)
2939                                 return ret;
2940                 }
2941
2942         return 0;
2943 }
2944
2945 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2946 {
2947         /*
2948          * Start from a sane iommu hardware state.
2949          * If queued invalidation was already initialized by us
2950          * (for example, while enabling interrupt remapping) then
2951          * we already have things rolling from a sane state.
2952          */
2953         if (!iommu->qi) {
2954                 /*
2955                  * Clear any previous faults.
2956                  */
2957                 dmar_fault(-1, iommu);
2958                 /*
2959                  * Disable queued invalidation if supported and already enabled
2960                  * before OS handover.
2961                  */
2962                 dmar_disable_qi(iommu);
2963         }
2964
2965         if (dmar_enable_qi(iommu)) {
2966                 /*
2967                  * Queued invalidation is not enabled; use register-based invalidation
2968                  */
2969                 iommu->flush.flush_context = __iommu_flush_context;
2970                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2971                 pr_info("%s: Using Register based invalidation\n",
2972                         iommu->name);
2973         } else {
2974                 iommu->flush.flush_context = qi_flush_context;
2975                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2976                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2977         }
2978 }
2979
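/*
 * Copy one bus's worth of context entries from the old kernel's tables
 * (used when a crash/kdump kernel inherits live translations).  With
 * extended context entries each entry is twice as wide, so a bus spans
 * two new pages: tbl[bus * 2] holds devfns 0x00-0x7f (reached via the
 * lower context-table pointer) and tbl[bus * 2 + 1] holds devfns
 * 0x80-0xff (via the upper pointer).  Domain-ids found in present
 * entries are marked as used so they are not handed out again.
 */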
2980 static int copy_context_table(struct intel_iommu *iommu,
2981                               struct root_entry *old_re,
2982                               struct context_entry **tbl,
2983                               int bus, bool ext)
2984 {
2985         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2986         struct context_entry *new_ce = NULL, ce;
2987         struct context_entry *old_ce = NULL;
2988         struct root_entry re;
2989         phys_addr_t old_ce_phys;
2990
2991         tbl_idx = ext ? bus * 2 : bus;
2992         memcpy(&re, old_re, sizeof(re));
2993
2994         for (devfn = 0; devfn < 256; devfn++) {
2995                 /* First calculate the correct index */
2996                 idx = (ext ? devfn * 2 : devfn) % 256;
2997
2998                 if (idx == 0) {
2999                         /* First save what we may have and clean up */
3000                         if (new_ce) {
3001                                 tbl[tbl_idx] = new_ce;
3002                                 __iommu_flush_cache(iommu, new_ce,
3003                                                     VTD_PAGE_SIZE);
3004                                 pos = 1;
3005                         }
3006
3007                         if (old_ce)
3008                                 iounmap(old_ce);
3009
3010                         ret = 0;
3011                         if (devfn < 0x80)
3012                                 old_ce_phys = root_entry_lctp(&re);
3013                         else
3014                                 old_ce_phys = root_entry_uctp(&re);
3015
3016                         if (!old_ce_phys) {
3017                                 if (ext && devfn == 0) {
3018                                         /* No LCTP; skip to devfn 0x80 and try the UCTP */
3019                                         devfn = 0x7f;
3020                                         continue;
3021                                 } else {
3022                                         goto out;
3023                                 }
3024                         }
3025
3026                         ret = -ENOMEM;
3027                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3028                                         MEMREMAP_WB);
3029                         if (!old_ce)
3030                                 goto out;
3031
3032                         new_ce = alloc_pgtable_page(iommu->node);
3033                         if (!new_ce)
3034                                 goto out_unmap;
3035
3036                         ret = 0;
3037                 }
3038
3039                 /* Now copy the context entry */
3040                 memcpy(&ce, old_ce + idx, sizeof(ce));
3041
3042                 if (!__context_present(&ce))
3043                         continue;
3044
3045                 did = context_domain_id(&ce);
3046                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3047                         set_bit(did, iommu->domain_ids);
3048
3049                 /*
3050                  * We need a marker for copied context entries. This
3051                  * marker needs to work for the old format as well as
3052                  * for extended context entries.
3053                  *
3054                  * Bit 67 of the context entry is used. In the old
3055                  * format this bit is available to software, in the
3056                  * extended format it is the PGE bit, but PGE is ignored
3057                  * by HW if PASIDs are disabled (and thus still
3058                  * available).
3059                  *
3060                  * So disable PASIDs first and then mark the entry
3061                  * copied. This means that we don't copy PASID
3062                  * translations from the old kernel, but this is fine as
3063                  * faults there are not fatal.
3064                  */
3065                 context_clear_pasid_enable(&ce);
3066                 context_set_copied(&ce);
3067
3068                 new_ce[idx] = ce;
3069         }
3070
3071         tbl[tbl_idx + pos] = new_ce;
3072
3073         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3074
3075 out_unmap:
3076         memunmap(old_ce);
3077
3078 out:
3079         return ret;
3080 }
3081
3082 static int copy_translation_tables(struct intel_iommu *iommu)
3083 {
3084         struct context_entry **ctxt_tbls;
3085         struct root_entry *old_rt;
3086         phys_addr_t old_rt_phys;
3087         int ctxt_table_entries;
3088         unsigned long flags;
3089         u64 rtaddr_reg;
3090         int bus, ret;
3091         bool new_ext, ext;
3092
3093         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3094         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3095         new_ext    = !!ecap_ecs(iommu->ecap);
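        /*
         * "ext" is the root-table format the previous kernel programmed (the
         * RTT bit in the root-table address register); "new_ext" is the
         * format this kernel would pick based on the extended-context
         * capability.
         */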
3096
3097         /*
3098          * The RTT bit can only be changed when translation is disabled,
3099          * but disabling translation would open a window for data
3100          * corruption. So bail out and don't copy anything if we would
3101          * have to change the bit.
3102          */
3103         if (new_ext != ext)
3104                 return -EINVAL;
3105
3106         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3107         if (!old_rt_phys)
3108                 return -EINVAL;
3109
3110         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3111         if (!old_rt)
3112                 return -ENOMEM;
3113
3114         /* This is too big for the stack - allocate it from slab */
3115         ctxt_table_entries = ext ? 512 : 256;
3116         ret = -ENOMEM;
3117         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3118         if (!ctxt_tbls)
3119                 goto out_unmap;
3120
3121         for (bus = 0; bus < 256; bus++) {
3122                 ret = copy_context_table(iommu, &old_rt[bus],
3123                                          ctxt_tbls, bus, ext);
3124                 if (ret) {
3125                         pr_err("%s: Failed to copy context table for bus %d\n",
3126                                 iommu->name, bus);
3127                         continue;
3128                 }
3129         }
3130
3131         spin_lock_irqsave(&iommu->lock, flags);
3132
3133         /* Context tables are copied, now write them to the root_entry table */
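        /*
         * Bit 0 of each context-table pointer is the Present bit, hence the
         * "| 1" below.
         */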
3134         for (bus = 0; bus < 256; bus++) {
3135                 int idx = ext ? bus * 2 : bus;
3136                 u64 val;
3137
3138                 if (ctxt_tbls[idx]) {
3139                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3140                         iommu->root_entry[bus].lo = val;
3141                 }
3142
3143                 if (!ext || !ctxt_tbls[idx + 1])
3144                         continue;
3145
3146                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3147                 iommu->root_entry[bus].hi = val;
3148         }
3149
3150         spin_unlock_irqrestore(&iommu->lock, flags);
3151
3152         kfree(ctxt_tbls);
3153
3154         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3155
3156         ret = 0;
3157
3158 out_unmap:
3159         memunmap(old_rt);
3160
3161         return ret;
3162 }
3163
3164 static int __init init_dmars(void)
3165 {
3166         struct dmar_drhd_unit *drhd;
3167         struct dmar_rmrr_unit *rmrr;
3168         bool copied_tables = false;
3169         struct device *dev;
3170         struct intel_iommu *iommu;
3171         int i, ret, cpu;
3172
3173         /*
3174          * for each drhd
3175          *    allocate root
3176          *    initialize and program root entry to not present
3177          * endfor
3178          */
3179         for_each_drhd_unit(drhd) {
3180                 /*
3181                  * No lock needed: this is only incremented in the
3182                  * single-threaded kernel __init code path; all other
3183                  * accesses are read-only.
3184                  */
3185                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3186                         g_num_of_iommus++;
3187                         continue;
3188                 }
3189                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3190         }
3191
3192         /* Preallocate enough resources for IOMMU hot-addition */
3193         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3194                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3195
3196         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3197                         GFP_KERNEL);
3198         if (!g_iommus) {
3199                 pr_err("Allocating global iommu array failed\n");
3200                 ret = -ENOMEM;
3201                 goto error;
3202         }
3203
3204         for_each_possible_cpu(cpu) {
3205                 struct deferred_flush_data *dfd = per_cpu_ptr(&deferred_flush,
3206                                                               cpu);
3207
3208                 dfd->tables = kzalloc(g_num_of_iommus *
3209                                       sizeof(struct deferred_flush_table),
3210                                       GFP_KERNEL);
3211                 if (!dfd->tables) {
3212                         ret = -ENOMEM;
3213                         goto free_g_iommus;
3214                 }
3215
3216                 spin_lock_init(&dfd->lock);
3217                 setup_timer(&dfd->timer, flush_unmaps_timeout, cpu);
3218         }
3219
3220         for_each_active_iommu(iommu, drhd) {
3221                 g_iommus[iommu->seq_id] = iommu;
3222
3223                 intel_iommu_init_qi(iommu);
3224
3225                 ret = iommu_init_domains(iommu);
3226                 if (ret)
3227                         goto free_iommu;
3228
3229                 init_translation_status(iommu);
3230
3231                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3232                         iommu_disable_translation(iommu);
3233                         clear_translation_pre_enabled(iommu);
3234                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3235                                 iommu->name);
3236                 }
3237
3238                 /*
3239                  * TBD:
3240                  * we could share the same root & context tables
3241                  * among all IOMMUs; this needs to be split out later.
3242                  */
3243                 ret = iommu_alloc_root_entry(iommu);
3244                 if (ret)
3245                         goto free_iommu;
3246
3247                 if (translation_pre_enabled(iommu)) {
3248                         pr_info("Translation already enabled - trying to copy translation structures\n");
3249
3250                         ret = copy_translation_tables(iommu);
3251                         if (ret) {
3252                                 /*
3253                                  * We found the IOMMU with translation
3254                                  * enabled - but failed to copy over the
3255                                  * old root-entry table. Try to proceed
3256                                  * by disabling translation now and
3257                                  * allocating a clean root-entry table.
3258                                  * This might cause DMAR faults, but
3259                                  * probably the dump will still succeed.
3260                                  */
3261                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3262                                        iommu->name);
3263                                 iommu_disable_translation(iommu);
3264                                 clear_translation_pre_enabled(iommu);
3265                         } else {
3266                                 pr_info("Copied translation tables from previous kernel for %s\n",
3267                                         iommu->name);
3268                                 copied_tables = true;
3269                         }
3270                 }
3271
3272                 if (!ecap_pass_through(iommu->ecap))
3273                         hw_pass_through = 0;
3274 #ifdef CONFIG_INTEL_IOMMU_SVM
3275                 if (pasid_enabled(iommu))
3276                         intel_svm_alloc_pasid_tables(iommu);
3277 #endif
3278         }
3279
3280         /*
3281          * Now that qi is enabled on all iommus, set the root entry and flush
3282          * caches. This is required on some Intel X58 chipsets, otherwise the
3283          * flush_context function will loop forever and the boot hangs.
3284          */
3285         for_each_active_iommu(iommu, drhd) {
3286                 iommu_flush_write_buffer(iommu);
3287                 iommu_set_root_entry(iommu);
3288                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3289                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3290         }
3291
3292         if (iommu_pass_through)
3293                 iommu_identity_mapping |= IDENTMAP_ALL;
3294
3295 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3296         iommu_identity_mapping |= IDENTMAP_GFX;
3297 #endif
3298
3299         if (iommu_identity_mapping) {
3300                 ret = si_domain_init(hw_pass_through);
3301                 if (ret)
3302                         goto free_iommu;
3303         }
3304
3305         check_tylersburg_isoch();
3306
3307         /*
3308          * If we copied translations from a previous kernel in the kdump
3309          * case, we can not assign the devices to domains now, as that
3310          * would eliminate the old mappings. So skip this part and defer
3311          * the assignment to device driver initialization time.
3312          */
3313         if (copied_tables)
3314                 goto domains_done;
3315
3316         /*
3317          * If pass-through is not set or not enabled, set up context entries
3318          * for identity mappings for RMRR, GFX and ISA, and possibly fall back
3319          * to static identity mapping if iommu_identity_mapping is set.
3320          */
3321         if (iommu_identity_mapping) {
3322                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3323                 if (ret) {
3324                         pr_crit("Failed to setup IOMMU pass-through\n");
3325                         goto free_iommu;
3326                 }
3327         }
3328         /*
3329          * For each rmrr
3330          *   for each dev attached to rmrr
3331          *   do
3332          *     locate drhd for dev, alloc domain for dev
3333          *     allocate free domain
3334          *     allocate page table entries for rmrr
3335          *     if context not allocated for bus
3336          *           allocate and init context
3337          *           set present in root table for this bus
3338          *     init context with domain, translation etc
3339          *    endfor
3340          * endfor
3341          */
3342         pr_info("Setting RMRR:\n");
3343         for_each_rmrr_units(rmrr) {
3344                 /* some BIOSes list non-existent devices in the DMAR table. */
3345                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3346                                           i, dev) {
3347                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3348                         if (ret)
3349                                 pr_err("Mapping reserved region failed\n");
3350                 }
3351         }
3352
3353         iommu_prepare_isa();
3354
3355 domains_done:
3356
3357         /*
3358          * for each drhd
3359          *   enable fault log
3360          *   global invalidate context cache
3361          *   global invalidate iotlb
3362          *   enable translation
3363          */
3364         for_each_iommu(iommu, drhd) {
3365                 if (drhd->ignored) {
3366                         /*
3367                          * we always have to disable PMRs or DMA may fail on
3368                          * this device
3369                          */
3370                         if (force_on)
3371                                 iommu_disable_protect_mem_regions(iommu);
3372                         continue;
3373                 }
3374
3375                 iommu_flush_write_buffer(iommu);
3376
3377 #ifdef CONFIG_INTEL_IOMMU_SVM
3378                 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3379                         ret = intel_svm_enable_prq(iommu);
3380                         if (ret)
3381                                 goto free_iommu;
3382                 }
3383 #endif
3384                 ret = dmar_set_interrupt(iommu);
3385                 if (ret)
3386                         goto free_iommu;
3387
3388                 if (!translation_pre_enabled(iommu))
3389                         iommu_enable_translation(iommu);
3390
3391                 iommu_disable_protect_mem_regions(iommu);
3392         }
3393
3394         return 0;
3395
3396 free_iommu:
3397         for_each_active_iommu(iommu, drhd) {
3398                 disable_dmar_iommu(iommu);
3399                 free_dmar_iommu(iommu);
3400         }
3401 free_g_iommus:
3402         for_each_possible_cpu(cpu)
3403                 kfree(per_cpu_ptr(&deferred_flush, cpu)->tables);
3404         kfree(g_iommus);
3405 error:
3406         return ret;
3407 }
3408
3409 /* This takes a number of _MM_ pages, not VTD pages */
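/*
 * (MM pages are PAGE_SIZE; VT-d pages are always 4KiB, so the two can differ
 * when PAGE_SIZE > 4KiB, hence the dma_to_mm_pfn()/mm_to_dma_pfn()
 * conversions in the callers.)
 */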
3410 static unsigned long intel_alloc_iova(struct device *dev,
3411                                      struct dmar_domain *domain,
3412                                      unsigned long nrpages, uint64_t dma_mask)
3413 {
3414         unsigned long iova_pfn = 0;
3415
3416         /* Restrict dma_mask to the width that the iommu can handle */
3417         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3418         /* Ensure we reserve the whole size-aligned region */
3419         nrpages = __roundup_pow_of_two(nrpages);
3420
3421         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3422                 /*
3423                  * First try to allocate an io virtual address in
3424                  * DMA_BIT_MASK(32) and if that fails then try allocating
3425                  * from higher range
3426                  */
3427                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3428                                            IOVA_PFN(DMA_BIT_MASK(32)));
3429                 if (iova_pfn)
3430                         return iova_pfn;
3431         }
3432         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, IOVA_PFN(dma_mask));
3433         if (unlikely(!iova_pfn)) {
3434                 pr_err("Allocating %ld-page iova for %s failed",
3435                        nrpages, dev_name(dev));
3436                 return 0;
3437         }
3438
3439         return iova_pfn;
3440 }
3441
3442 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3443 {
3444         struct dmar_domain *domain, *tmp;
3445         struct dmar_rmrr_unit *rmrr;
3446         struct device *i_dev;
3447         int i, ret;
3448
3449         domain = find_domain(dev);
3450         if (domain)
3451                 goto out;
3452
3453         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3454         if (!domain)
3455                 goto out;
3456
3457         /* We have a new domain - setup possible RMRRs for the device */
3458         rcu_read_lock();
3459         for_each_rmrr_units(rmrr) {
3460                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3461                                           i, i_dev) {
3462                         if (i_dev != dev)
3463                                 continue;
3464
3465                         ret = domain_prepare_identity_map(dev, domain,
3466                                                           rmrr->base_address,
3467                                                           rmrr->end_address);
3468                         if (ret)
3469                                 dev_err(dev, "Mapping reserved region failed\n");
3470                 }
3471         }
3472         rcu_read_unlock();
3473
3474         tmp = set_domain_for_dev(dev, domain);
3475         if (!tmp || domain != tmp) {
3476                 domain_exit(domain);
3477                 domain = tmp;
3478         }
3479
3480 out:
3481
3482         if (!domain)
3483                 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3484
3485
3486         return domain;
3487 }
3488
3489 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3490 {
3491         struct device_domain_info *info;
3492
3493         /* No lock here, assumes no domain exit in normal case */
3494         info = dev->archdata.iommu;
3495         if (likely(info))
3496                 return info->domain;
3497
3498         return __get_valid_domain_for_dev(dev);
3499 }
3500
3501 /* Check if the dev needs to go through the non-identity map and unmap process. */
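/*
 * Returns 1 if the device can skip DMA translation entirely (dummy or
 * identity mapped), 0 if it must go through the regular map/unmap path.
 */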
3502 static int iommu_no_mapping(struct device *dev)
3503 {
3504         int found;
3505
3506         if (iommu_dummy(dev))
3507                 return 1;
3508
3509         if (!iommu_identity_mapping)
3510                 return 0;
3511
3512         found = identity_mapping(dev);
3513         if (found) {
3514                 if (iommu_should_identity_map(dev, 0))
3515                         return 1;
3516                 else {
3517                         /*
3518                          * The device is only capable of 32-bit DMA: remove it
3519                          * from si_domain and fall back to non-identity mapping.
3520                          */
3521                         dmar_remove_one_dev_info(si_domain, dev);
3522                         pr_info("32bit %s uses non-identity mapping\n",
3523                                 dev_name(dev));
3524                         return 0;
3525                 }
3526         } else {
3527                 /*
3528                  * A 64-bit DMA capable device that was detached from a VM is
3529                  * put back into si_domain for identity mapping.
3530                  */
3531                 if (iommu_should_identity_map(dev, 0)) {
3532                         int ret;
3533                         ret = domain_add_dev_info(si_domain, dev);
3534                         if (!ret) {
3535                                 pr_info("64bit %s uses identity mapping\n",
3536                                         dev_name(dev));
3537                                 return 1;
3538                         }
3539                 }
3540         }
3541
3542         return 0;
3543 }
3544
3545 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3546                                      size_t size, int dir, u64 dma_mask)
3547 {
3548         struct dmar_domain *domain;
3549         phys_addr_t start_paddr;
3550         unsigned long iova_pfn;
3551         int prot = 0;
3552         int ret;
3553         struct intel_iommu *iommu;
3554         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3555
3556         BUG_ON(dir == DMA_NONE);
3557
3558         if (iommu_no_mapping(dev))
3559                 return paddr;
3560
3561         domain = get_valid_domain_for_dev(dev);
3562         if (!domain)
3563                 return 0;
3564
3565         iommu = domain_get_iommu(domain);
3566         size = aligned_nrpages(paddr, size);
3567
3568         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3569         if (!iova_pfn)
3570                 goto error;
3571
3572         /*
3573          * Check if DMAR supports zero-length reads on write only
3574          * mappings.
3575          */
3576         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3577                         !cap_zlr(iommu->cap))
3578                 prot |= DMA_PTE_READ;
3579         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3580                 prot |= DMA_PTE_WRITE;
3581         /*
3582          * The range paddr .. paddr + size might cover partial pages, so map
3583          * whole pages.  Note: if two parts of one page are mapped separately,
3584          * two IOVAs may end up mapping to the same host paddr, but this
3585          * is not a big problem.
3586          */
3587         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3588                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3589         if (ret)
3590                 goto error;
3591
3592         /* it's a non-present to present mapping. Only flush if caching mode */
3593         if (cap_caching_mode(iommu->cap))
3594                 iommu_flush_iotlb_psi(iommu, domain,
3595                                       mm_to_dma_pfn(iova_pfn),
3596                                       size, 0, 1);
3597         else
3598                 iommu_flush_write_buffer(iommu);
3599
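        /* Return the new IOVA plus the original offset within the page. */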
3600         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3601         start_paddr += paddr & ~PAGE_MASK;
3602         return start_paddr;
3603
3604 error:
3605         if (iova_pfn)
3606                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3607         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3608                 dev_name(dev), size, (unsigned long long)paddr, dir);
3609         return 0;
3610 }
3611
3612 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3613                                  unsigned long offset, size_t size,
3614                                  enum dma_data_direction dir,
3615                                  unsigned long attrs)
3616 {
3617         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3618                                   dir, *dev->dma_mask);
3619 }
3620
3621 static void flush_unmaps(struct deferred_flush_data *flush_data)
3622 {
3623         int i, j;
3624
3625         flush_data->timer_on = 0;
3626
3627         /* just flush them all */
3628         for (i = 0; i < g_num_of_iommus; i++) {
3629                 struct intel_iommu *iommu = g_iommus[i];
3630                 struct deferred_flush_table *flush_table =
3631                                 &flush_data->tables[i];
3632                 if (!iommu)
3633                         continue;
3634
3635                 if (!flush_table->next)
3636                         continue;
3637
3638                 /* In caching mode, global flushes make emulation expensive */
3639                 if (!cap_caching_mode(iommu->cap))
3640                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3641                                          DMA_TLB_GLOBAL_FLUSH);
3642                 for (j = 0; j < flush_table->next; j++) {
3643                         unsigned long mask;
3644                         struct deferred_flush_entry *entry =
3645                                                 &flush_table->entries[j];
3646                         unsigned long iova_pfn = entry->iova_pfn;
3647                         unsigned long nrpages = entry->nrpages;
3648                         struct dmar_domain *domain = entry->domain;
3649                         struct page *freelist = entry->freelist;
3650
3651                         /* On real hardware multiple invalidations are expensive */
3652                         if (cap_caching_mode(iommu->cap))
3653                                 iommu_flush_iotlb_psi(iommu, domain,
3654                                         mm_to_dma_pfn(iova_pfn),
3655                                         nrpages, !freelist, 0);
3656                         else {
3657                                 mask = ilog2(nrpages);
3658                                 iommu_flush_dev_iotlb(domain,
3659                                                 (uint64_t)iova_pfn << PAGE_SHIFT, mask);
3660                         }
3661                         free_iova_fast(&domain->iovad, iova_pfn, nrpages);
3662                         if (freelist)
3663                                 dma_free_pagelist(freelist);
3664                 }
3665                 flush_table->next = 0;
3666         }
3667
3668         flush_data->size = 0;
3669 }
3670
3671 static void flush_unmaps_timeout(unsigned long cpuid)
3672 {
3673         struct deferred_flush_data *flush_data = per_cpu_ptr(&deferred_flush, cpuid);
3674         unsigned long flags;
3675
3676         spin_lock_irqsave(&flush_data->lock, flags);
3677         flush_unmaps(flush_data);
3678         spin_unlock_irqrestore(&flush_data->lock, flags);
3679 }
3680
3681 static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
3682                       unsigned long nrpages, struct page *freelist)
3683 {
3684         unsigned long flags;
3685         int entry_id, iommu_id;
3686         struct intel_iommu *iommu;
3687         struct deferred_flush_entry *entry;
3688         struct deferred_flush_data *flush_data;
3689         unsigned int cpuid;
3690
3691         cpuid = get_cpu();
3692         flush_data = per_cpu_ptr(&deferred_flush, cpuid);
3693
3694         /* Flush all CPUs' entries to avoid deferring too much.  If
3695          * this becomes a bottleneck, we could flush just this CPU's
3696          * entries and rely on the flush timer for the rest.
3697          */
3698         if (flush_data->size == HIGH_WATER_MARK) {
3699                 int cpu;
3700
3701                 for_each_online_cpu(cpu)
3702                         flush_unmaps_timeout(cpu);
3703         }
3704
3705         spin_lock_irqsave(&flush_data->lock, flags);
3706
3707         iommu = domain_get_iommu(dom);
3708         iommu_id = iommu->seq_id;
3709
3710         entry_id = flush_data->tables[iommu_id].next;
3711         ++(flush_data->tables[iommu_id].next);
3712
3713         entry = &flush_data->tables[iommu_id].entries[entry_id];
3714         entry->domain = dom;
3715         entry->iova_pfn = iova_pfn;
3716         entry->nrpages = nrpages;
3717         entry->freelist = freelist;
3718
3719         if (!flush_data->timer_on) {
3720                 mod_timer(&flush_data->timer, jiffies + msecs_to_jiffies(10));
3721                 flush_data->timer_on = 1;
3722         }
3723         flush_data->size++;
3724         spin_unlock_irqrestore(&flush_data->lock, flags);
3725
3726         put_cpu();
3727 }
3728
3729 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3730 {
3731         struct dmar_domain *domain;
3732         unsigned long start_pfn, last_pfn;
3733         unsigned long nrpages;
3734         unsigned long iova_pfn;
3735         struct intel_iommu *iommu;
3736         struct page *freelist;
3737
3738         if (iommu_no_mapping(dev))
3739                 return;
3740
3741         domain = find_domain(dev);
3742         BUG_ON(!domain);
3743
3744         iommu = domain_get_iommu(domain);
3745
3746         iova_pfn = IOVA_PFN(dev_addr);
3747
3748         nrpages = aligned_nrpages(dev_addr, size);
3749         start_pfn = mm_to_dma_pfn(iova_pfn);
3750         last_pfn = start_pfn + nrpages - 1;
3751
3752         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3753                  dev_name(dev), start_pfn, last_pfn);
3754
3755         freelist = domain_unmap(domain, start_pfn, last_pfn);
3756
3757         if (intel_iommu_strict) {
3758                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3759                                       nrpages, !freelist, 0);
3760                 /* free iova */
3761                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3762                 dma_free_pagelist(freelist);
3763         } else {
3764                 add_unmap(domain, iova_pfn, nrpages, freelist);
3765                 /*
3766                  * Queue up the release of the unmap to save roughly 1/6th of
3767                  * the CPU time otherwise used up by the IOTLB flush operation...
3768                  */
3769         }
3770 }
3771
3772 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3773                              size_t size, enum dma_data_direction dir,
3774                              unsigned long attrs)
3775 {
3776         intel_unmap(dev, dev_addr, size);
3777 }
3778
3779 static void *intel_alloc_coherent(struct device *dev, size_t size,
3780                                   dma_addr_t *dma_handle, gfp_t flags,
3781                                   unsigned long attrs)
3782 {
3783         struct page *page = NULL;
3784         int order;
3785
3786         size = PAGE_ALIGN(size);
3787         order = get_order(size);
3788
3789         if (!iommu_no_mapping(dev))
3790                 flags &= ~(GFP_DMA | GFP_DMA32);
3791         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3792                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3793                         flags |= GFP_DMA;
3794                 else
3795                         flags |= GFP_DMA32;
3796         }
3797
3798         if (gfpflags_allow_blocking(flags)) {
3799                 unsigned int count = size >> PAGE_SHIFT;
3800
3801                 page = dma_alloc_from_contiguous(dev, count, order);
3802                 if (page && iommu_no_mapping(dev) &&
3803                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3804                         dma_release_from_contiguous(dev, page, count);
3805                         page = NULL;
3806                 }
3807         }
3808
3809         if (!page)
3810                 page = alloc_pages(flags, order);
3811         if (!page)
3812                 return NULL;
3813         memset(page_address(page), 0, size);
3814
3815         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3816                                          DMA_BIDIRECTIONAL,
3817                                          dev->coherent_dma_mask);
3818         if (*dma_handle)
3819                 return page_address(page);
3820         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3821                 __free_pages(page, order);
3822
3823         return NULL;
3824 }
3825
3826 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3827                                 dma_addr_t dma_handle, unsigned long attrs)
3828 {
3829         int order;
3830         struct page *page = virt_to_page(vaddr);
3831
3832         size = PAGE_ALIGN(size);
3833         order = get_order(size);
3834
3835         intel_unmap(dev, dma_handle, size);
3836         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3837                 __free_pages(page, order);
3838 }
3839
3840 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3841                            int nelems, enum dma_data_direction dir,
3842                            unsigned long attrs)
3843 {
3844         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3845         unsigned long nrpages = 0;
3846         struct scatterlist *sg;
3847         int i;
3848
3849         for_each_sg(sglist, sg, nelems, i) {
3850                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3851         }
3852
3853         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3854 }
3855
3856 static int intel_nontranslate_map_sg(struct device *hddev,
3857         struct scatterlist *sglist, int nelems, int dir)
3858 {
3859         int i;
3860         struct scatterlist *sg;
3861
3862         for_each_sg(sglist, sg, nelems, i) {
3863                 BUG_ON(!sg_page(sg));
3864                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3865                 sg->dma_length = sg->length;
3866         }
3867         return nelems;
3868 }
3869
3870 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3871                         enum dma_data_direction dir, unsigned long attrs)
3872 {
3873         int i;
3874         struct dmar_domain *domain;
3875         size_t size = 0;
3876         int prot = 0;
3877         unsigned long iova_pfn;
3878         int ret;
3879         struct scatterlist *sg;
3880         unsigned long start_vpfn;
3881         struct intel_iommu *iommu;
3882
3883         BUG_ON(dir == DMA_NONE);
3884         if (iommu_no_mapping(dev))
3885                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3886
3887         domain = get_valid_domain_for_dev(dev);
3888         if (!domain)
3889                 return 0;
3890
3891         iommu = domain_get_iommu(domain);
3892
3893         for_each_sg(sglist, sg, nelems, i)
3894                 size += aligned_nrpages(sg->offset, sg->length);
3895
3896         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3897                                 *dev->dma_mask);
3898         if (!iova_pfn) {
3899                 sglist->dma_length = 0;
3900                 return 0;
3901         }
3902
3903         /*
3904          * Check if DMAR supports zero-length reads on write only
3905          * mappings..
3906          */
3907         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3908                         !cap_zlr(iommu->cap))
3909                 prot |= DMA_PTE_READ;
3910         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3911                 prot |= DMA_PTE_WRITE;
3912
3913         start_vpfn = mm_to_dma_pfn(iova_pfn);
3914
3915         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3916         if (unlikely(ret)) {
3917                 dma_pte_free_pagetable(domain, start_vpfn,
3918                                        start_vpfn + size - 1);
3919                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3920                 return 0;
3921         }
3922
3923         /* it's a non-present to present mapping. Only flush if caching mode */
3924         if (cap_caching_mode(iommu->cap))
3925                 iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3926         else
3927                 iommu_flush_write_buffer(iommu);
3928
3929         return nelems;
3930 }
3931
3932 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3933 {
3934         return !dma_addr;
3935 }
3936
3937 struct dma_map_ops intel_dma_ops = {
3938         .alloc = intel_alloc_coherent,
3939         .free = intel_free_coherent,
3940         .map_sg = intel_map_sg,
3941         .unmap_sg = intel_unmap_sg,
3942         .map_page = intel_map_page,
3943         .unmap_page = intel_unmap_page,
3944         .mapping_error = intel_mapping_error,
3945 };
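/*
 * These ops are not called directly; they are reached through the generic DMA
 * API.  For example, a driver call such as (illustrative only):
 *
 *      dma_addr_t handle = dma_map_page(dev, page, 0, PAGE_SIZE, DMA_TO_DEVICE);
 *      if (dma_mapping_error(dev, handle))
 *              return -ENOMEM;
 *
 * ends up in intel_map_page() and intel_mapping_error() above when the device
 * sits behind an enabled VT-d unit.
 */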
3946
3947 static inline int iommu_domain_cache_init(void)
3948 {
3949         int ret = 0;
3950
3951         iommu_domain_cache = kmem_cache_create("iommu_domain",
3952                                          sizeof(struct dmar_domain),
3953                                          0,
3954                                          SLAB_HWCACHE_ALIGN,
3955                                          NULL);
3957         if (!iommu_domain_cache) {
3958                 pr_err("Couldn't create iommu_domain cache\n");
3959                 ret = -ENOMEM;
3960         }
3961
3962         return ret;
3963 }
3964
3965 static inline int iommu_devinfo_cache_init(void)
3966 {
3967         int ret = 0;
3968
3969         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3970                                          sizeof(struct device_domain_info),
3971                                          0,
3972                                          SLAB_HWCACHE_ALIGN,
3973                                          NULL);
3974         if (!iommu_devinfo_cache) {
3975                 pr_err("Couldn't create devinfo cache\n");
3976                 ret = -ENOMEM;
3977         }
3978
3979         return ret;
3980 }
3981
3982 static int __init iommu_init_mempool(void)
3983 {
3984         int ret;
3985         ret = iova_cache_get();
3986         if (ret)
3987                 return ret;
3988
3989         ret = iommu_domain_cache_init();
3990         if (ret)
3991                 goto domain_error;
3992
3993         ret = iommu_devinfo_cache_init();
3994         if (!ret)
3995                 return ret;
3996
3997         kmem_cache_destroy(iommu_domain_cache);
3998 domain_error:
3999         iova_cache_put();
4000
4001         return -ENOMEM;
4002 }
4003
4004 static void __init iommu_exit_mempool(void)
4005 {
4006         kmem_cache_destroy(iommu_devinfo_cache);
4007         kmem_cache_destroy(iommu_domain_cache);
4008         iova_cache_put();
4009 }
4010
4011 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4012 {
4013         struct dmar_drhd_unit *drhd;
4014         u32 vtbar;
4015         int rc;
4016
4017         /* We know that this device on this chipset has its own IOMMU.
4018          * If we find it under a different IOMMU, then the BIOS is lying
4019          * to us. Hope that the IOMMU for this device is actually
4020          * disabled, and it needs no translation...
4021          */
4022         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4023         if (rc) {
4024                 /* "can't" happen */
4025                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4026                 return;
4027         }
4028         vtbar &= 0xffff0000;
4029
4030         /* we know that this iommu should be at offset 0xa000 from vtbar */
4031         drhd = dmar_find_matched_drhd_unit(pdev);
4032         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4033                             TAINT_FIRMWARE_WORKAROUND,
4034                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4035                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4036 }
4037 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4038
4039 static void __init init_no_remapping_devices(void)
4040 {
4041         struct dmar_drhd_unit *drhd;
4042         struct device *dev;
4043         int i;
4044
4045         for_each_drhd_unit(drhd) {
4046                 if (!drhd->include_all) {
4047                         for_each_active_dev_scope(drhd->devices,
4048                                                   drhd->devices_cnt, i, dev)
4049                                 break;
4050                         /* ignore DMAR unit if no devices exist */
4051                         if (i == drhd->devices_cnt)
4052                                 drhd->ignored = 1;
4053                 }
4054         }
4055
4056         for_each_active_drhd_unit(drhd) {
4057                 if (drhd->include_all)
4058                         continue;
4059
4060                 for_each_active_dev_scope(drhd->devices,
4061                                           drhd->devices_cnt, i, dev)
4062                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4063                                 break;
4064                 if (i < drhd->devices_cnt)
4065                         continue;
4066
4067                 /* This IOMMU has *only* gfx devices. Either bypass it or
4068                    set the gfx_mapped flag, as appropriate */
4069                 if (dmar_map_gfx) {
4070                         intel_iommu_gfx_mapped = 1;
4071                 } else {
4072                         drhd->ignored = 1;
4073                         for_each_active_dev_scope(drhd->devices,
4074                                                   drhd->devices_cnt, i, dev)
4075                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4076                 }
4077         }
4078 }
4079
4080 #ifdef CONFIG_SUSPEND
4081 static int init_iommu_hw(void)
4082 {
4083         struct dmar_drhd_unit *drhd;
4084         struct intel_iommu *iommu = NULL;
4085
4086         for_each_active_iommu(iommu, drhd)
4087                 if (iommu->qi)
4088                         dmar_reenable_qi(iommu);
4089
4090         for_each_iommu(iommu, drhd) {
4091                 if (drhd->ignored) {
4092                         /*
4093                          * we always have to disable PMRs or DMA may fail on
4094                          * this device
4095                          */
4096                         if (force_on)
4097                                 iommu_disable_protect_mem_regions(iommu);
4098                         continue;
4099                 }
4100
4101                 iommu_flush_write_buffer(iommu);
4102
4103                 iommu_set_root_entry(iommu);
4104
4105                 iommu->flush.flush_context(iommu, 0, 0, 0,
4106                                            DMA_CCMD_GLOBAL_INVL);
4107                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4108                 iommu_enable_translation(iommu);
4109                 iommu_disable_protect_mem_regions(iommu);
4110         }
4111
4112         return 0;
4113 }
4114
4115 static void iommu_flush_all(void)
4116 {
4117         struct dmar_drhd_unit *drhd;
4118         struct intel_iommu *iommu;
4119
4120         for_each_active_iommu(iommu, drhd) {
4121                 iommu->flush.flush_context(iommu, 0, 0, 0,
4122                                            DMA_CCMD_GLOBAL_INVL);
4123                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4124                                          DMA_TLB_GLOBAL_FLUSH);
4125         }
4126 }
4127
4128 static int iommu_suspend(void)
4129 {
4130         struct dmar_drhd_unit *drhd;
4131         struct intel_iommu *iommu = NULL;
4132         unsigned long flag;
4133
4134         for_each_active_iommu(iommu, drhd) {
4135                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
4136                                                  GFP_ATOMIC);
4137                 if (!iommu->iommu_state)
4138                         goto nomem;
4139         }
4140
4141         iommu_flush_all();
4142
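        /*
         * Save the fault-event registers (FECTL/FEDATA/FEADDR/FEUADDR) below
         * so that iommu_resume() can write them back after the IOMMU loses
         * its state across suspend.
         */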
4143         for_each_active_iommu(iommu, drhd) {
4144                 iommu_disable_translation(iommu);
4145
4146                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4147
4148                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4149                         readl(iommu->reg + DMAR_FECTL_REG);
4150                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4151                         readl(iommu->reg + DMAR_FEDATA_REG);
4152                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4153                         readl(iommu->reg + DMAR_FEADDR_REG);
4154                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4155                         readl(iommu->reg + DMAR_FEUADDR_REG);
4156
4157                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4158         }
4159         return 0;
4160
4161 nomem:
4162         for_each_active_iommu(iommu, drhd)
4163                 kfree(iommu->iommu_state);
4164
4165         return -ENOMEM;
4166 }
4167
4168 static void iommu_resume(void)
4169 {
4170         struct dmar_drhd_unit *drhd;
4171         struct intel_iommu *iommu = NULL;
4172         unsigned long flag;
4173
4174         if (init_iommu_hw()) {
4175                 if (force_on)
4176                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4177                 else
4178                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4179                 return;
4180         }
4181
4182         for_each_active_iommu(iommu, drhd) {
4183
4184                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4185
4186                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4187                         iommu->reg + DMAR_FECTL_REG);
4188                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4189                         iommu->reg + DMAR_FEDATA_REG);
4190                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4191                         iommu->reg + DMAR_FEADDR_REG);
4192                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4193                         iommu->reg + DMAR_FEUADDR_REG);
4194
4195                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4196         }
4197
4198         for_each_active_iommu(iommu, drhd)
4199                 kfree(iommu->iommu_state);
4200 }
4201
4202 static struct syscore_ops iommu_syscore_ops = {
4203         .resume         = iommu_resume,
4204         .suspend        = iommu_suspend,
4205 };
4206
4207 static void __init init_iommu_pm_ops(void)
4208 {
4209         register_syscore_ops(&iommu_syscore_ops);
4210 }
4211
4212 #else
4213 static inline void init_iommu_pm_ops(void) {}
4214 #endif  /* CONFIG_SUSPEND */
4215
4216
4217 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4218 {
4219         struct acpi_dmar_reserved_memory *rmrr;
4220         struct dmar_rmrr_unit *rmrru;
4221
4222         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4223         if (!rmrru)
4224                 return -ENOMEM;
4225
4226         rmrru->hdr = header;
4227         rmrr = (struct acpi_dmar_reserved_memory *)header;
4228         rmrru->base_address = rmrr->base_address;
4229         rmrru->end_address = rmrr->end_address;
4230         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4231                                 ((void *)rmrr) + rmrr->header.length,
4232                                 &rmrru->devices_cnt);
4233         if (rmrru->devices_cnt && rmrru->devices == NULL) {
4234                 kfree(rmrru);
4235                 return -ENOMEM;
4236         }
4237
4238         list_add(&rmrru->list, &dmar_rmrr_units);
4239
4240         return 0;
4241 }
4242
4243 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4244 {
4245         struct dmar_atsr_unit *atsru;
4246         struct acpi_dmar_atsr *tmp;
4247
4248         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4249                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4250                 if (atsr->segment != tmp->segment)
4251                         continue;
4252                 if (atsr->header.length != tmp->header.length)
4253                         continue;
4254                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4255                         return atsru;
4256         }
4257
4258         return NULL;
4259 }
4260
4261 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4262 {
4263         struct acpi_dmar_atsr *atsr;
4264         struct dmar_atsr_unit *atsru;
4265
4266         if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
4267                 return 0;
4268
4269         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4270         atsru = dmar_find_atsr(atsr);
4271         if (atsru)
4272                 return 0;
4273
4274         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4275         if (!atsru)
4276                 return -ENOMEM;
4277
4278         /*
4279          * If the memory was allocated from slab by the ACPI _DSM method, we
4280          * need to copy the memory content because the buffer will be freed
4281          * on return.
4282          */
4283         atsru->hdr = (void *)(atsru + 1);
4284         memcpy(atsru->hdr, hdr, hdr->length);
4285         atsru->include_all = atsr->flags & 0x1;
4286         if (!atsru->include_all) {
4287                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4288                                 (void *)atsr + atsr->header.length,
4289                                 &atsru->devices_cnt);
4290                 if (atsru->devices_cnt && atsru->devices == NULL) {
4291                         kfree(atsru);
4292                         return -ENOMEM;
4293                 }
4294         }
4295
4296         list_add_rcu(&atsru->list, &dmar_atsr_units);
4297
4298         return 0;
4299 }
4300
4301 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4302 {
4303         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4304         kfree(atsru);
4305 }
4306
4307 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4308 {
4309         struct acpi_dmar_atsr *atsr;
4310         struct dmar_atsr_unit *atsru;
4311
4312         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4313         atsru = dmar_find_atsr(atsr);
4314         if (atsru) {
4315                 list_del_rcu(&atsru->list);
4316                 synchronize_rcu();
4317                 intel_iommu_free_atsr(atsru);
4318         }
4319
4320         return 0;
4321 }
4322
4323 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4324 {
4325         int i;
4326         struct device *dev;
4327         struct acpi_dmar_atsr *atsr;
4328         struct dmar_atsr_unit *atsru;
4329
4330         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4331         atsru = dmar_find_atsr(atsr);
4332         if (!atsru)
4333                 return 0;
4334
4335         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4336                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4337                                           i, dev)
4338                         return -EBUSY;
4339         }
4340
4341         return 0;
4342 }
4343
4344 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4345 {
4346         int sp, ret = 0;
4347         struct intel_iommu *iommu = dmaru->iommu;
4348
4349         if (g_iommus[iommu->seq_id])
4350                 return 0;
4351
4352         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4353                 pr_warn("%s: Doesn't support hardware pass through.\n",
4354                         iommu->name);
4355                 return -ENXIO;
4356         }
4357         if (!ecap_sc_support(iommu->ecap) &&
4358             domain_update_iommu_snooping(iommu)) {
4359                 pr_warn("%s: Doesn't support snooping.\n",
4360                         iommu->name);
4361                 return -ENXIO;
4362         }
4363         sp = domain_update_iommu_superpage(iommu) - 1;
4364         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4365                 pr_warn("%s: Doesn't support large page.\n",
4366                         iommu->name);
4367                 return -ENXIO;
4368         }
4369
4370         /*
4371          * Disable translation if already enabled prior to OS handover.
4372          */
4373         if (iommu->gcmd & DMA_GCMD_TE)
4374                 iommu_disable_translation(iommu);
4375
4376         g_iommus[iommu->seq_id] = iommu;
4377         ret = iommu_init_domains(iommu);
4378         if (ret == 0)
4379                 ret = iommu_alloc_root_entry(iommu);
4380         if (ret)
4381                 goto out;
4382
4383 #ifdef CONFIG_INTEL_IOMMU_SVM
4384         if (pasid_enabled(iommu))
4385                 intel_svm_alloc_pasid_tables(iommu);
4386 #endif
4387
4388         if (dmaru->ignored) {
4389                 /*
4390                  * we always have to disable PMRs or DMA may fail on this device
4391                  */
4392                 if (force_on)
4393                         iommu_disable_protect_mem_regions(iommu);
4394                 return 0;
4395         }
4396
4397         intel_iommu_init_qi(iommu);
4398         iommu_flush_write_buffer(iommu);
4399
4400 #ifdef CONFIG_INTEL_IOMMU_SVM
4401         if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4402                 ret = intel_svm_enable_prq(iommu);
4403                 if (ret)
4404                         goto disable_iommu;
4405         }
4406 #endif
4407         ret = dmar_set_interrupt(iommu);
4408         if (ret)
4409                 goto disable_iommu;
4410
4411         iommu_set_root_entry(iommu);
4412         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4413         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4414         iommu_enable_translation(iommu);
4415
4416         iommu_disable_protect_mem_regions(iommu);
4417         return 0;
4418
4419 disable_iommu:
4420         disable_dmar_iommu(iommu);
4421 out:
4422         free_dmar_iommu(iommu);
4423         return ret;
4424 }
4425
4426 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4427 {
4428         int ret = 0;
4429         struct intel_iommu *iommu = dmaru->iommu;
4430
4431         if (!intel_iommu_enabled)
4432                 return 0;
4433         if (iommu == NULL)
4434                 return -EINVAL;
4435
4436         if (insert) {
4437                 ret = intel_iommu_add(dmaru);
4438         } else {
4439                 disable_dmar_iommu(iommu);
4440                 free_dmar_iommu(iommu);
4441         }
4442
4443         return ret;
4444 }
4445
4446 static void intel_iommu_free_dmars(void)
4447 {
4448         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4449         struct dmar_atsr_unit *atsru, *atsr_n;
4450
4451         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4452                 list_del(&rmrru->list);
4453                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4454                 kfree(rmrru);
4455         }
4456
4457         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4458                 list_del(&atsru->list);
4459                 intel_iommu_free_atsr(atsru);
4460         }
4461 }
4462
4463 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4464 {
4465         int i, ret = 1;
4466         struct pci_bus *bus;
4467         struct pci_dev *bridge = NULL;
4468         struct device *tmp;
4469         struct acpi_dmar_atsr *atsr;
4470         struct dmar_atsr_unit *atsru;
4471
4472         dev = pci_physfn(dev);
4473         for (bus = dev->bus; bus; bus = bus->parent) {
4474                 bridge = bus->self;
4475                 /* If it's an integrated device, allow ATS */
4476                 if (!bridge)
4477                         return 1;
4478                 /* Connected via non-PCIe: no ATS */
4479                 if (!pci_is_pcie(bridge) ||
4480                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4481                         return 0;
4482                 /* If we found the root port, look it up in the ATSR */
4483                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4484                         break;
4485         }
4486
4487         rcu_read_lock();
4488         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4489                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4490                 if (atsr->segment != pci_domain_nr(dev->bus))
4491                         continue;
4492
4493                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4494                         if (tmp == &bridge->dev)
4495                                 goto out;
4496
4497                 if (atsru->include_all)
4498                         goto out;
4499         }
4500         ret = 0;
4501 out:
4502         rcu_read_unlock();
4503
4504         return ret;
4505 }
4506
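/*
 * PCI bus notification helper: keep the cached device lists of every RMRR
 * and every non-INCLUDE_ALL ATSR in sync as devices covered by their scopes
 * are added to or removed from the system.
 */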
4507 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4508 {
4509         int ret = 0;
4510         struct dmar_rmrr_unit *rmrru;
4511         struct dmar_atsr_unit *atsru;
4512         struct acpi_dmar_atsr *atsr;
4513         struct acpi_dmar_reserved_memory *rmrr;
4514
4515         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4516                 return 0;
4517
4518         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4519                 rmrr = container_of(rmrru->hdr,
4520                                     struct acpi_dmar_reserved_memory, header);
4521                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4522                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4523                                 ((void *)rmrr) + rmrr->header.length,
4524                                 rmrr->segment, rmrru->devices,
4525                                 rmrru->devices_cnt);
4526                         if (ret < 0)
4527                                 return ret;
4528                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4529                         dmar_remove_dev_scope(info, rmrr->segment,
4530                                 rmrru->devices, rmrru->devices_cnt);
4531                 }
4532         }
4533
4534         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4535                 if (atsru->include_all)
4536                         continue;
4537
4538                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4539                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4540                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4541                                         (void *)atsr + atsr->header.length,
4542                                         atsr->segment, atsru->devices,
4543                                         atsru->devices_cnt);
4544                         if (ret > 0)
4545                                 break;
4546                         else if (ret < 0)
4547                                 return ret;
4548                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4549                         if (dmar_remove_dev_scope(info, atsr->segment,
4550                                         atsru->devices, atsru->devices_cnt))
4551                                 break;
4552                 }
4553         }
4554
4555         return 0;
4556 }
4557
4558 /*
4559  * Here we only respond to a device being unbound from its driver.
4560  *
4561  * A newly added device is not attached to its DMAR domain here yet; that
4562  * happens when the device is first mapped to an iova.
4563  */
4564 static int device_notifier(struct notifier_block *nb,
4565                                   unsigned long action, void *data)
4566 {
4567         struct device *dev = data;
4568         struct dmar_domain *domain;
4569
4570         if (iommu_dummy(dev))
4571                 return 0;
4572
4573         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4574                 return 0;
4575
4576         domain = find_domain(dev);
4577         if (!domain)
4578                 return 0;
4579
4580         dmar_remove_one_dev_info(domain, dev);
4581         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4582                 domain_exit(domain);
4583
4584         return 0;
4585 }
4586
4587 static struct notifier_block device_nb = {
4588         .notifier_call = device_notifier,
4589 };
4590
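/*
 * Keep the static identity map (si_domain) in sync with memory hotplug:
 * map newly onlined ranges 1:1 before they can be used for DMA, and unmap
 * ranges (and flush the IOTLBs) when they go offline again.
 */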
4591 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4592                                        unsigned long val, void *v)
4593 {
4594         struct memory_notify *mhp = v;
4595         unsigned long long start, end;
4596         unsigned long start_vpfn, last_vpfn;
4597
4598         switch (val) {
4599         case MEM_GOING_ONLINE:
4600                 start = mhp->start_pfn << PAGE_SHIFT;
4601                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4602                 if (iommu_domain_identity_map(si_domain, start, end)) {
4603                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4604                                 start, end);
4605                         return NOTIFY_BAD;
4606                 }
4607                 break;
4608
4609         case MEM_OFFLINE:
4610         case MEM_CANCEL_ONLINE:
4611                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4612                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4613                 while (start_vpfn <= last_vpfn) {
4614                         struct iova *iova;
4615                         struct dmar_drhd_unit *drhd;
4616                         struct intel_iommu *iommu;
4617                         struct page *freelist;
4618
4619                         iova = find_iova(&si_domain->iovad, start_vpfn);
4620                         if (!iova) {
4621                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4622                                          start_vpfn);
4623                                 break;
4624                         }
4625
4626                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4627                                                      start_vpfn, last_vpfn);
4628                         if (!iova) {
4629                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4630                                         start_vpfn, last_vpfn);
4631                                 return NOTIFY_BAD;
4632                         }
4633
4634                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4635                                                iova->pfn_hi);
4636
4637                         rcu_read_lock();
4638                         for_each_active_iommu(iommu, drhd)
4639                                 iommu_flush_iotlb_psi(iommu, si_domain,
4640                                         iova->pfn_lo, iova_size(iova),
4641                                         !freelist, 0);
4642                         rcu_read_unlock();
4643                         dma_free_pagelist(freelist);
4644
4645                         start_vpfn = iova->pfn_hi + 1;
4646                         free_iova_mem(iova);
4647                 }
4648                 break;
4649         }
4650
4651         return NOTIFY_OK;
4652 }
4653
4654 static struct notifier_block intel_iommu_memory_nb = {
4655         .notifier_call = intel_iommu_memory_notifier,
4656         .priority = 0
4657 };
4658
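/*
 * A dead CPU may still hold IOVAs in its per-CPU caches.  Return the cached
 * IOVAs of every domain on every IOMMU to the global pools; the CPU notifier
 * below also drains that CPU's deferred unmaps via flush_unmaps_timeout().
 */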
4659 static void free_all_cpu_cached_iovas(unsigned int cpu)
4660 {
4661         int i;
4662
4663         for (i = 0; i < g_num_of_iommus; i++) {
4664                 struct intel_iommu *iommu = g_iommus[i];
4665                 struct dmar_domain *domain;
4666                 int did;
4667
4668                 if (!iommu)
4669                         continue;
4670
4671                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4672                         domain = get_iommu_domain(iommu, (u16)did);
4673
4674                         if (!domain)
4675                                 continue;
4676                         free_cpu_cached_iovas(cpu, &domain->iovad);
4677                 }
4678         }
4679 }
4680
4681 static int intel_iommu_cpu_notifier(struct notifier_block *nfb,
4682                                     unsigned long action, void *v)
4683 {
4684         unsigned int cpu = (unsigned long)v;
4685
4686         switch (action) {
4687         case CPU_DEAD:
4688         case CPU_DEAD_FROZEN:
4689                 free_all_cpu_cached_iovas(cpu);
4690                 flush_unmaps_timeout(cpu);
4691                 break;
4692         }
4693         return NOTIFY_OK;
4694 }
4695
4696 static struct notifier_block intel_iommu_cpu_nb = {
4697         .notifier_call = intel_iommu_cpu_notifier,
4698 };
4699
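/*
 * Read-only sysfs attributes exported for each hardware unit through the
 * "intel-iommu" group registered by iommu_device_create() in
 * intel_iommu_init(), typically visible as (illustrative path)
 * /sys/class/iommu/dmar0/intel-iommu/{version,address,cap,ecap,
 * domains_supported,domains_used}.
 */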
4700 static ssize_t intel_iommu_show_version(struct device *dev,
4701                                         struct device_attribute *attr,
4702                                         char *buf)
4703 {
4704         struct intel_iommu *iommu = dev_get_drvdata(dev);
4705         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4706         return sprintf(buf, "%d:%d\n",
4707                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4708 }
4709 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4710
4711 static ssize_t intel_iommu_show_address(struct device *dev,
4712                                         struct device_attribute *attr,
4713                                         char *buf)
4714 {
4715         struct intel_iommu *iommu = dev_get_drvdata(dev);
4716         return sprintf(buf, "%llx\n", iommu->reg_phys);
4717 }
4718 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4719
4720 static ssize_t intel_iommu_show_cap(struct device *dev,
4721                                     struct device_attribute *attr,
4722                                     char *buf)
4723 {
4724         struct intel_iommu *iommu = dev_get_drvdata(dev);
4725         return sprintf(buf, "%llx\n", iommu->cap);
4726 }
4727 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4728
4729 static ssize_t intel_iommu_show_ecap(struct device *dev,
4730                                     struct device_attribute *attr,
4731                                     char *buf)
4732 {
4733         struct intel_iommu *iommu = dev_get_drvdata(dev);
4734         return sprintf(buf, "%llx\n", iommu->ecap);
4735 }
4736 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4737
4738 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4739                                       struct device_attribute *attr,
4740                                       char *buf)
4741 {
4742         struct intel_iommu *iommu = dev_get_drvdata(dev);
4743         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4744 }
4745 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4746
4747 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4748                                            struct device_attribute *attr,
4749                                            char *buf)
4750 {
4751         struct intel_iommu *iommu = dev_get_drvdata(dev);
4752         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4753                                                   cap_ndoms(iommu->cap)));
4754 }
4755 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4756
4757 static struct attribute *intel_iommu_attrs[] = {
4758         &dev_attr_version.attr,
4759         &dev_attr_address.attr,
4760         &dev_attr_cap.attr,
4761         &dev_attr_ecap.attr,
4762         &dev_attr_domains_supported.attr,
4763         &dev_attr_domains_used.attr,
4764         NULL,
4765 };
4766
4767 static struct attribute_group intel_iommu_group = {
4768         .name = "intel-iommu",
4769         .attrs = intel_iommu_attrs,
4770 };
4771
4772 const struct attribute_group *intel_iommu_groups[] = {
4773         &intel_iommu_group,
4774         NULL,
4775 };
4776
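/*
 * Boot-time entry point: parse the DMAR table and device scopes, reserve the
 * special IOVA ranges, bring up every DMAR unit via init_dmars(), install
 * intel_dma_ops as the DMA API backend, and register the bus, memory-hotplug
 * and CPU notifiers plus the per-IOMMU sysfs devices.
 */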
4777 int __init intel_iommu_init(void)
4778 {
4779         int ret = -ENODEV;
4780         struct dmar_drhd_unit *drhd;
4781         struct intel_iommu *iommu;
4782
4783         /* VT-d is required for a TXT/tboot launch, so enforce that */
4784         force_on = tboot_force_iommu();
4785
4786         if (iommu_init_mempool()) {
4787                 if (force_on)
4788                         panic("tboot: Failed to initialize iommu memory\n");
4789                 return -ENOMEM;
4790         }
4791
4792         down_write(&dmar_global_lock);
4793         if (dmar_table_init()) {
4794                 if (force_on)
4795                         panic("tboot: Failed to initialize DMAR table\n");
4796                 goto out_free_dmar;
4797         }
4798
4799         if (dmar_dev_scope_init() < 0) {
4800                 if (force_on)
4801                         panic("tboot: Failed to initialize DMAR device scope\n");
4802                 goto out_free_dmar;
4803         }
4804
4805         if (no_iommu || dmar_disabled)
4806                 goto out_free_dmar;
4807
4808         if (list_empty(&dmar_rmrr_units))
4809                 pr_info("No RMRR found\n");
4810
4811         if (list_empty(&dmar_atsr_units))
4812                 pr_info("No ATSR found\n");
4813
4814         if (dmar_init_reserved_ranges()) {
4815                 if (force_on)
4816                         panic("tboot: Failed to reserve iommu ranges\n");
4817                 goto out_free_reserved_range;
4818         }
4819
4820         init_no_remapping_devices();
4821
4822         ret = init_dmars();
4823         if (ret) {
4824                 if (force_on)
4825                         panic("tboot: Failed to initialize DMARs\n");
4826                 pr_err("Initialization failed\n");
4827                 goto out_free_reserved_range;
4828         }
4829         up_write(&dmar_global_lock);
4830         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4831
4832 #ifdef CONFIG_SWIOTLB
4833         swiotlb = 0;
4834 #endif
4835         dma_ops = &intel_dma_ops;
4836
4837         init_iommu_pm_ops();
4838
4839         for_each_active_iommu(iommu, drhd)
4840                 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4841                                                        intel_iommu_groups,
4842                                                        "%s", iommu->name);
4843
4844         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4845         bus_register_notifier(&pci_bus_type, &device_nb);
4846         if (si_domain && !hw_pass_through)
4847                 register_memory_notifier(&intel_iommu_memory_nb);
4848         register_hotcpu_notifier(&intel_iommu_cpu_nb);
4849
4850         intel_iommu_enabled = 1;
4851
4852         return 0;
4853
4854 out_free_reserved_range:
4855         put_iova_domain(&reserved_iova_list);
4856 out_free_dmar:
4857         intel_iommu_free_dmars();
4858         up_write(&dmar_global_lock);
4859         iommu_exit_mempool();
4860         return ret;
4861 }
4862
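/* pci_for_each_dma_alias() callback: clear the context entry of one alias. */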
4863 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4864 {
4865         struct intel_iommu *iommu = opaque;
4866
4867         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4868         return 0;
4869 }
4870
4871 /*
4872  * NB - intel-iommu lacks any sort of reference counting for the users of
4873  * dependent devices.  If multiple endpoints have intersecting dependent
4874  * devices, unbinding the driver from any one of them will possibly leave
4875  * the others unable to operate.
4876  */
4877 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4878 {
4879         if (!iommu || !dev || !dev_is_pci(dev))
4880                 return;
4881
4882         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4883 }
4884
4885 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4886 {
4887         struct intel_iommu *iommu;
4888         unsigned long flags;
4889
4890         assert_spin_locked(&device_domain_lock);
4891
4892         if (WARN_ON(!info))
4893                 return;
4894
4895         iommu = info->iommu;
4896
4897         if (info->dev) {
4898                 iommu_disable_dev_iotlb(info);
4899                 domain_context_clear(iommu, info->dev);
4900         }
4901
4902         unlink_domain_info(info);
4903
4904         spin_lock_irqsave(&iommu->lock, flags);
4905         domain_detach_iommu(info->domain, iommu);
4906         spin_unlock_irqrestore(&iommu->lock, flags);
4907
4908         free_devinfo_mem(info);
4909 }
4910
4911 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4912                                      struct device *dev)
4913 {
4914         struct device_domain_info *info;
4915         unsigned long flags;
4916
4917         spin_lock_irqsave(&device_domain_lock, flags);
4918         info = dev->archdata.iommu;
4919         __dmar_remove_one_dev_info(info);
4920         spin_unlock_irqrestore(&device_domain_lock, flags);
4921 }
4922
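/*
 * Minimal setup for a domain created through the external IOMMU API: reserve
 * the special IOVA ranges, derive the agaw from the requested guest address
 * width and allocate the top-level page table.  The iommu_coherency/snooping/
 * superpage flags start out clear and are recomputed by the caller via
 * domain_update_iommu_cap().
 */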
4923 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4924 {
4925         int adjust_width;
4926
4927         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4928                         DMA_32BIT_PFN);
4929         domain_reserve_special_ranges(domain);
4930
4931         /* calculate AGAW */
4932         domain->gaw = guest_width;
4933         adjust_width = guestwidth_to_adjustwidth(guest_width);
4934         domain->agaw = width_to_agaw(adjust_width);
4935
4936         domain->iommu_coherency = 0;
4937         domain->iommu_snooping = 0;
4938         domain->iommu_superpage = 0;
4939         domain->max_addr = 0;
4940
4941         /* always allocate the top pgd */
4942         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4943         if (!domain->pgd)
4944                 return -ENOMEM;
4945         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4946         return 0;
4947 }
4948
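/*
 * IOMMU API: this driver only hands out unmanaged domains, backed by a
 * virtual-machine dmar_domain sized to DEFAULT_DOMAIN_ADDRESS_WIDTH.
 */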
4949 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4950 {
4951         struct dmar_domain *dmar_domain;
4952         struct iommu_domain *domain;
4953
4954         if (type != IOMMU_DOMAIN_UNMANAGED)
4955                 return NULL;
4956
4957         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4958         if (!dmar_domain) {
4959                 pr_err("Can't allocate dmar_domain\n");
4960                 return NULL;
4961         }
4962         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4963                 pr_err("Domain initialization failed\n");
4964                 domain_exit(dmar_domain);
4965                 return NULL;
4966         }
4967         domain_update_iommu_cap(dmar_domain);
4968
4969         domain = &dmar_domain->domain;
4970         domain->geometry.aperture_start = 0;
4971         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4972         domain->geometry.force_aperture = true;
4973
4974         return domain;
4975 }
4976
4977 static void intel_iommu_domain_free(struct iommu_domain *domain)
4978 {
4979         domain_exit(to_dmar_domain(domain));
4980 }
4981
4982 static int intel_iommu_attach_device(struct iommu_domain *domain,
4983                                      struct device *dev)
4984 {
4985         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4986         struct intel_iommu *iommu;
4987         int addr_width;
4988         u8 bus, devfn;
4989
4990         if (device_is_rmrr_locked(dev)) {
4991                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4992                 return -EPERM;
4993         }
4994
4995         /* normally dev is not mapped */
4996         if (unlikely(domain_context_mapped(dev))) {
4997                 struct dmar_domain *old_domain;
4998
4999                 old_domain = find_domain(dev);
5000                 if (old_domain) {
5001                         rcu_read_lock();
5002                         dmar_remove_one_dev_info(old_domain, dev);
5003                         rcu_read_unlock();
5004
5005                         if (!domain_type_is_vm_or_si(old_domain) &&
5006                              list_empty(&old_domain->devices))
5007                                 domain_exit(old_domain);
5008                 }
5009         }
5010
5011         iommu = device_to_iommu(dev, &bus, &devfn);
5012         if (!iommu)
5013                 return -ENODEV;
5014
5015         /* check if this iommu agaw is sufficient for max mapped address */
5016         addr_width = agaw_to_width(iommu->agaw);
5017         if (addr_width > cap_mgaw(iommu->cap))
5018                 addr_width = cap_mgaw(iommu->cap);
5019
5020         if (dmar_domain->max_addr > (1LL << addr_width)) {
5021                 pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5022                        __func__, addr_width, dmar_domain->max_addr);
5024                 return -EFAULT;
5025         }
5026         dmar_domain->gaw = addr_width;
5027
5028         /*
5029          * Knock out extra levels of page tables if necessary
5030          */
5031         while (iommu->agaw < dmar_domain->agaw) {
5032                 struct dma_pte *pte;
5033
5034                 pte = dmar_domain->pgd;
5035                 if (dma_pte_present(pte)) {
5036                         dmar_domain->pgd = (struct dma_pte *)
5037                                 phys_to_virt(dma_pte_addr(pte));
5038                         free_pgtable_page(pte);
5039                 }
5040                 dmar_domain->agaw--;
5041         }
5042
5043         return domain_add_dev_info(dmar_domain, dev);
5044 }
5045
5046 static void intel_iommu_detach_device(struct iommu_domain *domain,
5047                                       struct device *dev)
5048 {
5049         dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
5050 }
5051
5052 static int intel_iommu_map(struct iommu_domain *domain,
5053                            unsigned long iova, phys_addr_t hpa,
5054                            size_t size, int iommu_prot)
5055 {
5056         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5057         u64 max_addr;
5058         int prot = 0;
5059         int ret;
5060
5061         if (iommu_prot & IOMMU_READ)
5062                 prot |= DMA_PTE_READ;
5063         if (iommu_prot & IOMMU_WRITE)
5064                 prot |= DMA_PTE_WRITE;
5065         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5066                 prot |= DMA_PTE_SNP;
5067
5068         max_addr = iova + size;
5069         if (dmar_domain->max_addr < max_addr) {
5070                 u64 end;
5071
5072                 /* check if minimum agaw is sufficient for mapped address */
5073                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5074                 if (end < max_addr) {
5075                         pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5076                                __func__, dmar_domain->gaw, max_addr);
5078                         return -EFAULT;
5079                 }
5080                 dmar_domain->max_addr = max_addr;
5081         }
5082         /* Round up size to next multiple of PAGE_SIZE, if it and
5083            the low bits of hpa would take us onto the next page */
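        /* e.g. hpa = 0x1ffc with size = 0x10 touches two 4KiB pages, so two
           page frames are mapped even though size < VTD_PAGE_SIZE (values
           are illustrative only). */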
5084         size = aligned_nrpages(hpa, size);
5085         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5086                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5087         return ret;
5088 }
5089
5090 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5091                                 unsigned long iova, size_t size)
5092 {
5093         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5094         struct page *freelist = NULL;
5095         struct intel_iommu *iommu;
5096         unsigned long start_pfn, last_pfn;
5097         unsigned int npages;
5098         int iommu_id, level = 0;
5099
5100         /* Cope with horrid API which requires us to unmap more than the
5101            size argument if it happens to be a large-page mapping. */
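        /* e.g. a request to unmap 4KiB in the middle of a 2MiB superpage
           (level 2 PTE) is widened to the whole 2MiB that PTE covers
           (illustrative numbers). */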
5102         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5103
5104         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5105                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5106
5107         start_pfn = iova >> VTD_PAGE_SHIFT;
5108         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5109
5110         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5111
5112         npages = last_pfn - start_pfn + 1;
5113
5114         for_each_domain_iommu(iommu_id, dmar_domain) {
5115                 iommu = g_iommus[iommu_id];
5116
5117                 iommu_flush_iotlb_psi(iommu, dmar_domain,
5118                                       start_pfn, npages, !freelist, 0);
5119         }
5120
5121         dma_free_pagelist(freelist);
5122
5123         if (dmar_domain->max_addr == iova + size)
5124                 dmar_domain->max_addr = iova;
5125
5126         return size;
5127 }
5128
5129 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5130                                             dma_addr_t iova)
5131 {
5132         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5133         struct dma_pte *pte;
5134         int level = 0;
5135         u64 phys = 0;
5136
5137         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5138         if (pte)
5139                 phys = dma_pte_addr(pte);
5140
5141         return phys;
5142 }
5143
5144 static bool intel_iommu_capable(enum iommu_cap cap)
5145 {
5146         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5147                 return domain_update_iommu_snooping(NULL) == 1;
5148         if (cap == IOMMU_CAP_INTR_REMAP)
5149                 return irq_remapping_enabled == 1;
5150
5151         return false;
5152 }
5153
5154 static int intel_iommu_add_device(struct device *dev)
5155 {
5156         struct intel_iommu *iommu;
5157         struct iommu_group *group;
5158         u8 bus, devfn;
5159
5160         iommu = device_to_iommu(dev, &bus, &devfn);
5161         if (!iommu)
5162                 return -ENODEV;
5163
5164         iommu_device_link(iommu->iommu_dev, dev);
5165
5166         group = iommu_group_get_for_dev(dev);
5167
5168         if (IS_ERR(group))
5169                 return PTR_ERR(group);
5170
5171         iommu_group_put(group);
5172         return 0;
5173 }
5174
5175 static void intel_iommu_remove_device(struct device *dev)
5176 {
5177         struct intel_iommu *iommu;
5178         u8 bus, devfn;
5179
5180         iommu = device_to_iommu(dev, &bus, &devfn);
5181         if (!iommu)
5182                 return;
5183
5184         iommu_group_remove_device(dev);
5185
5186         iommu_device_unlink(iommu->iommu_dev, dev);
5187 }
5188
5189 #ifdef CONFIG_INTEL_IOMMU_SVM
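/*
 * Prepare a device for Shared Virtual Memory: point its context entry at the
 * PASID and PASID-state tables, set PASIDE (converting pass-through contexts
 * to a PASID-capable translation type), and record the domain id, source id
 * and ATS invalidation parameters in @sdev.
 */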
5190 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5191 {
5192         struct device_domain_info *info;
5193         struct context_entry *context;
5194         struct dmar_domain *domain;
5195         unsigned long flags;
5196         u64 ctx_lo;
5197         int ret;
5198
5199         domain = get_valid_domain_for_dev(sdev->dev);
5200         if (!domain)
5201                 return -EINVAL;
5202
5203         spin_lock_irqsave(&device_domain_lock, flags);
5204         spin_lock(&iommu->lock);
5205
5206         ret = -EINVAL;
5207         info = sdev->dev->archdata.iommu;
5208         if (!info || !info->pasid_supported)
5209                 goto out;
5210
5211         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5212         if (WARN_ON(!context))
5213                 goto out;
5214
5215         ctx_lo = context[0].lo;
5216
5217         sdev->did = domain->iommu_did[iommu->seq_id];
5218         sdev->sid = PCI_DEVID(info->bus, info->devfn);
5219
5220         if (!(ctx_lo & CONTEXT_PASIDE)) {
5221                 context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5222                 context[1].lo = (u64)virt_to_phys(iommu->pasid_table) | ecap_pss(iommu->ecap);
5223                 wmb();
5224                 /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5225                  * extended to permit requests-with-PASID if the PASIDE bit
5226                  * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
5227                  * however, the PASIDE bit is ignored and requests-with-PASID
5228                  * are unconditionally blocked, which makes less sense.
5229                  * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5230                  * "guest mode" translation types depending on whether ATS
5231                  * is available or not. Annoyingly, we can't use the new
5232                  * modes *unless* PASIDE is set. */
5233                 if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5234                         ctx_lo &= ~CONTEXT_TT_MASK;
5235                         if (info->ats_supported)
5236                                 ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5237                         else
5238                                 ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5239                 }
5240                 ctx_lo |= CONTEXT_PASIDE;
5241                 if (iommu->pasid_state_table)
5242                         ctx_lo |= CONTEXT_DINVE;
5243                 if (info->pri_supported)
5244                         ctx_lo |= CONTEXT_PRS;
5245                 context[0].lo = ctx_lo;
5246                 wmb();
5247                 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5248                                            DMA_CCMD_MASK_NOBIT,
5249                                            DMA_CCMD_DEVICE_INVL);
5250         }
5251
5252         /* Enable PASID support in the device, if it wasn't already */
5253         if (!info->pasid_enabled)
5254                 iommu_enable_dev_iotlb(info);
5255
5256         if (info->ats_enabled) {
5257                 sdev->dev_iotlb = 1;
5258                 sdev->qdep = info->ats_qdep;
5259                 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5260                         sdev->qdep = 0;
5261         }
5262         ret = 0;
5263
5264  out:
5265         spin_unlock(&iommu->lock);
5266         spin_unlock_irqrestore(&device_domain_lock, flags);
5267
5268         return ret;
5269 }
5270
5271 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5272 {
5273         struct intel_iommu *iommu;
5274         u8 bus, devfn;
5275
5276         if (iommu_dummy(dev)) {
5277                 dev_warn(dev,
5278                          "No IOMMU translation for device; cannot enable SVM\n");
5279                 return NULL;
5280         }
5281
5282         iommu = device_to_iommu(dev, &bus, &devfn);
5283         if (!iommu) {
5284                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5285                 return NULL;
5286         }
5287
5288         if (!iommu->pasid_table) {
5289                 dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5290                 return NULL;
5291         }
5292
5293         return iommu;
5294 }
5295 #endif /* CONFIG_INTEL_IOMMU_SVM */
5296
5297 static const struct iommu_ops intel_iommu_ops = {
5298         .capable        = intel_iommu_capable,
5299         .domain_alloc   = intel_iommu_domain_alloc,
5300         .domain_free    = intel_iommu_domain_free,
5301         .attach_dev     = intel_iommu_attach_device,
5302         .detach_dev     = intel_iommu_detach_device,
5303         .map            = intel_iommu_map,
5304         .unmap          = intel_iommu_unmap,
5305         .map_sg         = default_iommu_map_sg,
5306         .iova_to_phys   = intel_iommu_iova_to_phys,
5307         .add_device     = intel_iommu_add_device,
5308         .remove_device  = intel_iommu_remove_device,
5309         .device_group   = pci_device_group,
5310         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
5311 };
5312
5313 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5314 {
5315         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5316         pr_info("Disabling IOMMU for graphics on this chipset\n");
5317         dmar_map_gfx = 0;
5318 }
5319
5320 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5321 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5322 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5323 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5324 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5325 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5326 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5327
5328 static void quirk_iommu_rwbf(struct pci_dev *dev)
5329 {
5330         /*
5331          * Mobile 4 Series Chipset neglects to set RWBF capability,
5332          * but needs it. Same seems to hold for the desktop versions.
5333          */
5334         pr_info("Forcing write-buffer flush capability\n");
5335         rwbf_quirk = 1;
5336 }
5337
5338 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5339 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5340 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5341 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5342 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5343 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5344 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5345
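/*
 * GGC is the graphics control register in the Ironlake-era host bridge config
 * space; bits 11:8 describe how much stolen memory the BIOS set aside for the
 * GTT and whether extra "shadow GTT" space for VT-d was included.  Interpreted
 * by the Calpella quirk below.
 */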
5346 #define GGC 0x52
5347 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5348 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5349 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5350 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5351 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5352 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5353 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5354 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5355
5356 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5357 {
5358         unsigned short ggc;
5359
5360         if (pci_read_config_word(dev, GGC, &ggc))
5361                 return;
5362
5363         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5364                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5365                 dmar_map_gfx = 0;
5366         } else if (dmar_map_gfx) {
5367                 /* we have to ensure the gfx device is idle before we flush */
5368                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5369                 intel_iommu_strict = 1;
5370         }
5371 }
5372 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5373 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5374 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5375 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5376
5377 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5378    ISOCH DMAR unit for the Azalia sound device, but not give it any
5379    TLB entries, which causes it to deadlock. Check for that.  We do
5380    this in a function called from init_dmars(), instead of in a PCI
5381    quirk, because we don't want to print the obnoxious "BIOS broken"
5382    message if VT-d is actually disabled.
5383 */
5384 static void __init check_tylersburg_isoch(void)
5385 {
5386         struct pci_dev *pdev;
5387         uint32_t vtisochctrl;
5388
5389         /* If there's no Azalia in the system anyway, forget it. */
5390         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5391         if (!pdev)
5392                 return;
5393         pci_dev_put(pdev);
5394
5395         /* System Management Registers. Might be hidden, in which case
5396            we can't do the sanity check. But that's OK, because the
5397            known-broken BIOSes _don't_ actually hide it, so far. */
5398         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5399         if (!pdev)
5400                 return;
5401
5402         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5403                 pci_dev_put(pdev);
5404                 return;
5405         }
5406
5407         pci_dev_put(pdev);
5408
5409         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5410         if (vtisochctrl & 1)
5411                 return;
5412
5413         /* Drop all bits other than the number of TLB entries */
5414         vtisochctrl &= 0x1c;
5415
5416         /* If we have the recommended number of TLB entries (16), fine. */
5417         if (vtisochctrl == 0x10)
5418                 return;
5419
5420         /* Zero TLB entries? You get to ride the short bus to school. */
5421         if (!vtisochctrl) {
5422                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5423                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5424                      dmi_get_system_info(DMI_BIOS_VENDOR),
5425                      dmi_get_system_info(DMI_BIOS_VERSION),
5426                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5427                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5428                 return;
5429         }
5430
5431         pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
5432                 vtisochctrl);
5433 }