drivers/iommu/intel-iommu.c (linux-2.6-microblaze.git, df53855a5f3ebd4cf640ca2f814f3ab2f252b5e9)
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <linux/dma-contiguous.h>
46 #include <linux/crash_dump.h>
47 #include <asm/irq_remapping.h>
48 #include <asm/cacheflush.h>
49 #include <asm/iommu.h>
50
51 #include "irq_remapping.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
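
/*
 * For example, with the default 48-bit guest address width:
 *	__DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 == 0xFFFFFFFFFULL
 * which fits in an unsigned long on 64-bit builds, so DOMAIN_MAX_PFN(48)
 * is the same value there; on a 32-bit build it is clamped to ULONG_MAX.
 */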
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
84 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
85
86 /* page table handling */
87 #define LEVEL_STRIDE            (9)
88 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
89
90 /*
91  * This bitmap is used to advertise the page sizes our hardware supports
92  * to the IOMMU core, which will then use this information to split
93  * physically contiguous memory regions it is mapping into page sizes
94  * that we support.
95  *
96  * Traditionally the IOMMU core just handed us the mappings directly,
97  * after making sure the size is a power of two no smaller than 4KiB and that the
98  * mapping has natural alignment.
99  *
100  * To retain this behavior, we currently advertise that we support
101  * every page size that is a power of two no smaller than 4KiB.
102  *
103  * If at some point we'd like to utilize the IOMMU core's new behavior,
104  * we could change this to advertise the real page sizes we support.
105  */
106 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
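
/*
 * Concretely, ~0xFFFUL has every bit from 12 upwards set, so every
 * power-of-two size from 4KiB up is advertised, e.g. (using the SZ_*
 * constants from <linux/sizes.h>):
 *
 *	(INTEL_IOMMU_PGSIZES & SZ_4K) != 0
 *	(INTEL_IOMMU_PGSIZES & SZ_2M) != 0
 *	(INTEL_IOMMU_PGSIZES & SZ_1G) != 0
 */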
107
108 static inline int agaw_to_level(int agaw)
109 {
110         return agaw + 2;
111 }
112
113 static inline int agaw_to_width(int agaw)
114 {
115         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 }
117
118 static inline int width_to_agaw(int width)
119 {
120         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 }
122
123 static inline unsigned int level_to_offset_bits(int level)
124 {
125         return (level - 1) * LEVEL_STRIDE;
126 }
127
128 static inline int pfn_level_offset(unsigned long pfn, int level)
129 {
130         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 }
132
133 static inline unsigned long level_mask(int level)
134 {
135         return -1UL << level_to_offset_bits(level);
136 }
137
138 static inline unsigned long level_size(int level)
139 {
140         return 1UL << level_to_offset_bits(level);
141 }
142
143 static inline unsigned long align_to_level(unsigned long pfn, int level)
144 {
145         return (pfn + level_size(level) - 1) & level_mask(level);
146 }
147
148 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
149 {
150         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
151 }
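
/*
 * A worked example of the helpers above, assuming the default 48-bit
 * domain width: width_to_agaw(48) == 2 and agaw_to_level(2) == 4, i.e.
 * a four-level page table in which each level consumes LEVEL_STRIDE (9)
 * bits of the DMA pfn:
 *
 *	level 4 index: (pfn >> 27) & LEVEL_MASK
 *	level 3 index: (pfn >> 18) & LEVEL_MASK
 *	level 2 index: (pfn >>  9) & LEVEL_MASK
 *	level 1 index:  pfn        & LEVEL_MASK
 */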
152
153 /* VT-d pages must never be larger than MM pages. Otherwise things
154    are never going to work. */
155 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
156 {
157         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 }
159
160 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
161 {
162         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
163 }
164 static inline unsigned long page_to_dma_pfn(struct page *pg)
165 {
166         return mm_to_dma_pfn(page_to_pfn(pg));
167 }
168 static inline unsigned long virt_to_dma_pfn(void *p)
169 {
170         return page_to_dma_pfn(virt_to_page(p));
171 }
172
173 /* global iommu list, set NULL for ignored DMAR units */
174 static struct intel_iommu **g_iommus;
175
176 static void __init check_tylersburg_isoch(void);
177 static int rwbf_quirk;
178
179 /*
180  * set to 1 to panic kernel if can't successfully enable VT-d
181  * (used when kernel is launched w/ TXT)
182  */
183 static int force_on = 0;
184
185 /*
186  * 0: Present
187  * 1-11: Reserved
188  * 12-63: Context Ptr (12 - (haw-1))
189  * 64-127: Reserved
190  */
191 struct root_entry {
192         u64     lo;
193         u64     hi;
194 };
195 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
196
197 /*
198  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
199  * if marked present.
200  */
201 static phys_addr_t root_entry_lctp(struct root_entry *re)
202 {
203         if (!(re->lo & 1))
204                 return 0;
205
206         return re->lo & VTD_PAGE_MASK;
207 }
208
209 /*
210  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
211  * if marked present.
212  */
213 static phys_addr_t root_entry_uctp(struct root_entry *re)
214 {
215         if (!(re->hi & 1))
216                 return 0;
217
218         return re->hi & VTD_PAGE_MASK;
219 }
220 /*
221  * low 64 bits:
222  * 0: present
223  * 1: fault processing disable
224  * 2-3: translation type
225  * 12-63: address space root
226  * high 64 bits:
227  * 0-2: address width
228  * 3-6: available
229  * 8-23: domain id
230  */
231 struct context_entry {
232         u64 lo;
233         u64 hi;
234 };
235
236 static inline void context_clear_pasid_enable(struct context_entry *context)
237 {
238         context->lo &= ~(1ULL << 11);
239 }
240
241 static inline bool context_pasid_enabled(struct context_entry *context)
242 {
243         return !!(context->lo & (1ULL << 11));
244 }
245
246 static inline void context_set_copied(struct context_entry *context)
247 {
248         context->hi |= (1ull << 3);
249 }
250
251 static inline bool context_copied(struct context_entry *context)
252 {
253         return !!(context->hi & (1ULL << 3));
254 }
255
256 static inline bool __context_present(struct context_entry *context)
257 {
258         return (context->lo & 1);
259 }
260
261 static inline bool context_present(struct context_entry *context)
262 {
263         return context_pasid_enabled(context) ?
264              __context_present(context) :
265              __context_present(context) && !context_copied(context);
266 }
267
268 static inline void context_set_present(struct context_entry *context)
269 {
270         context->lo |= 1;
271 }
272
273 static inline void context_set_fault_enable(struct context_entry *context)
274 {
275         context->lo &= (((u64)-1) << 2) | 1;
276 }
277
278 static inline void context_set_translation_type(struct context_entry *context,
279                                                 unsigned long value)
280 {
281         context->lo &= (((u64)-1) << 4) | 3;
282         context->lo |= (value & 3) << 2;
283 }
284
285 static inline void context_set_address_root(struct context_entry *context,
286                                             unsigned long value)
287 {
288         context->lo &= ~VTD_PAGE_MASK;
289         context->lo |= value & VTD_PAGE_MASK;
290 }
291
292 static inline void context_set_address_width(struct context_entry *context,
293                                              unsigned long value)
294 {
295         context->hi |= value & 7;
296 }
297
298 static inline void context_set_domain_id(struct context_entry *context,
299                                          unsigned long value)
300 {
301         context->hi |= (value & ((1 << 16) - 1)) << 8;
302 }
303
304 static inline int context_domain_id(struct context_entry *c)
305 {
306         return((c->hi >> 8) & 0xffff);
307 }
308
309 static inline void context_clear_entry(struct context_entry *context)
310 {
311         context->lo = 0;
312         context->hi = 0;
313 }
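
/*
 * As a rough sketch of how the helpers above combine when a device is
 * attached to a domain (roughly what domain_context_mapping_one() does
 * later in this file; illustrative only, not the exact sequence):
 *
 *	context_clear_entry(context);
 *	context_set_domain_id(context, did);
 *	context_set_address_width(context, domain->agaw);
 *	context_set_address_root(context, virt_to_phys(domain->pgd));
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 */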
314
315 /*
316  * 0: readable
317  * 1: writable
318  * 2-6: reserved
319  * 7: super page
320  * 8-10: available
321  * 11: snoop behavior
322  * 12-63: Host physical address
323  */
324 struct dma_pte {
325         u64 val;
326 };
327
328 static inline void dma_clear_pte(struct dma_pte *pte)
329 {
330         pte->val = 0;
331 }
332
333 static inline u64 dma_pte_addr(struct dma_pte *pte)
334 {
335 #ifdef CONFIG_64BIT
336         return pte->val & VTD_PAGE_MASK;
337 #else
338         /* Must have a full atomic 64-bit read */
339         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
340 #endif
341 }
342
343 static inline bool dma_pte_present(struct dma_pte *pte)
344 {
345         return (pte->val & 3) != 0;
346 }
347
348 static inline bool dma_pte_superpage(struct dma_pte *pte)
349 {
350         return (pte->val & DMA_PTE_LARGE_PAGE);
351 }
352
353 static inline int first_pte_in_page(struct dma_pte *pte)
354 {
355         return !((unsigned long)pte & ~VTD_PAGE_MASK);
356 }
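
/*
 * For illustration, a present 4KiB leaf PTE mapping host physical
 * address 'phys' read/write would look like:
 *
 *	pte->val = (phys & VTD_PAGE_MASK) | DMA_PTE_READ | DMA_PTE_WRITE;
 *
 * so dma_pte_present() sees bit 0 or 1 set and dma_pte_addr() recovers
 * phys & VTD_PAGE_MASK. (The driver itself installs PTE values with
 * cmpxchg64() rather than a plain store.)
 */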
357
358 /*
359  * This domain is a static identity mapping domain.
360  *      1. This domain creates a static 1:1 mapping to all usable memory.
361  *      2. It maps to each iommu if successful.
362  *      3. Each iommu maps to this domain if successful.
363  */
364 static struct dmar_domain *si_domain;
365 static int hw_pass_through = 1;
366
367 /*
368  * Domain represents a virtual machine; more than one device, possibly
369  * spanning several iommus, may be owned by one domain, e.g. a kvm guest.
370  */
371 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
372
373 /* si_domain contains multiple devices */
374 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
375
376 #define for_each_domain_iommu(idx, domain)                      \
377         for (idx = 0; idx < g_num_of_iommus; idx++)             \
378                 if (domain->iommu_refcnt[idx])
379
380 struct dmar_domain {
381         int     nid;                    /* node id */
382
383         unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
384                                         /* Refcount of devices per iommu */
385
386
387         u16             iommu_did[DMAR_UNITS_SUPPORTED];
388                                         /* Domain ids per IOMMU. Use u16 since
389                                          * domain ids are 16 bit wide according
390                                          * to VT-d spec, section 9.3 */
391
392         struct list_head devices;       /* all devices' list */
393         struct iova_domain iovad;       /* iova's that belong to this domain */
394
395         struct dma_pte  *pgd;           /* virtual address */
396         int             gaw;            /* max guest address width */
397
398         /* adjusted guest address width, 0 is level 2 30-bit */
399         int             agaw;
400
401         int             flags;          /* flags to find out type of domain */
402
403         int             iommu_coherency;/* indicate coherency of iommu access */
404         int             iommu_snooping; /* indicate snooping control feature*/
405         int             iommu_count;    /* reference count of iommu */
406         int             iommu_superpage;/* Level of superpages supported:
407                                            0 == 4KiB (no superpages), 1 == 2MiB,
408                                            2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
409         u64             max_addr;       /* maximum mapped address */
410
411         struct iommu_domain domain;     /* generic domain data structure for
412                                            iommu core */
413 };
414
415 /* PCI domain-device relationship */
416 struct device_domain_info {
417         struct list_head link;  /* link to domain siblings */
418         struct list_head global; /* link to global list */
419         u8 bus;                 /* PCI bus number */
420         u8 devfn;               /* PCI devfn number */
421         struct {
422                 u8 enabled:1;
423                 u8 qdep;
424         } ats;                  /* ATS state */
425         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
426         struct intel_iommu *iommu; /* IOMMU used by this device */
427         struct dmar_domain *domain; /* pointer to domain */
428 };
429
430 struct dmar_rmrr_unit {
431         struct list_head list;          /* list of rmrr units   */
432         struct acpi_dmar_header *hdr;   /* ACPI header          */
433         u64     base_address;           /* reserved base address*/
434         u64     end_address;            /* reserved end address */
435         struct dmar_dev_scope *devices; /* target devices */
436         int     devices_cnt;            /* target device count */
437 };
438
439 struct dmar_atsr_unit {
440         struct list_head list;          /* list of ATSR units */
441         struct acpi_dmar_header *hdr;   /* ACPI header */
442         struct dmar_dev_scope *devices; /* target devices */
443         int devices_cnt;                /* target device count */
444         u8 include_all:1;               /* include all ports */
445 };
446
447 static LIST_HEAD(dmar_atsr_units);
448 static LIST_HEAD(dmar_rmrr_units);
449
450 #define for_each_rmrr_units(rmrr) \
451         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
452
453 static void flush_unmaps_timeout(unsigned long data);
454
455 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
456
457 #define HIGH_WATER_MARK 250
458 struct deferred_flush_tables {
459         int next;
460         struct iova *iova[HIGH_WATER_MARK];
461         struct dmar_domain *domain[HIGH_WATER_MARK];
462         struct page *freelist[HIGH_WATER_MARK];
463 };
464
465 static struct deferred_flush_tables *deferred_flush;
466
467 /* number of intel_iommus; bounds the indexes used for g_iommus */
468 static int g_num_of_iommus;
469
470 static DEFINE_SPINLOCK(async_umap_flush_lock);
471 static LIST_HEAD(unmaps_to_do);
472
473 static int timer_on;
474 static long list_size;
475
476 static void domain_exit(struct dmar_domain *domain);
477 static void domain_remove_dev_info(struct dmar_domain *domain);
478 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
479                                      struct device *dev);
480 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
481 static void domain_context_clear(struct intel_iommu *iommu,
482                                  struct device *dev);
483 static int domain_detach_iommu(struct dmar_domain *domain,
484                                struct intel_iommu *iommu);
485
486 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
487 int dmar_disabled = 0;
488 #else
489 int dmar_disabled = 1;
490 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
491
492 int intel_iommu_enabled = 0;
493 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
494
495 static int dmar_map_gfx = 1;
496 static int dmar_forcedac;
497 static int intel_iommu_strict;
498 static int intel_iommu_superpage = 1;
499 static int intel_iommu_ecs = 1;
500
501 /* We only actually use ECS when PASID support (on the new bit 40)
502  * is also advertised. Some early implementations — the ones with
503  * PASID support on bit 28 — have issues even when we *only* use
504  * extended root/context tables. */
505 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
506                             ecap_pasid(iommu->ecap))
507
508 int intel_iommu_gfx_mapped;
509 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
510
511 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
512 static DEFINE_SPINLOCK(device_domain_lock);
513 static LIST_HEAD(device_domain_list);
514
515 static const struct iommu_ops intel_iommu_ops;
516
517 static bool translation_pre_enabled(struct intel_iommu *iommu)
518 {
519         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
520 }
521
522 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
523 {
524         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
525 }
526
527 static void init_translation_status(struct intel_iommu *iommu)
528 {
529         u32 gsts;
530
531         gsts = readl(iommu->reg + DMAR_GSTS_REG);
532         if (gsts & DMA_GSTS_TES)
533                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
534 }
535
536 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
537 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
538 {
539         return container_of(dom, struct dmar_domain, domain);
540 }
541
542 static int __init intel_iommu_setup(char *str)
543 {
544         if (!str)
545                 return -EINVAL;
546         while (*str) {
547                 if (!strncmp(str, "on", 2)) {
548                         dmar_disabled = 0;
549                         pr_info("IOMMU enabled\n");
550                 } else if (!strncmp(str, "off", 3)) {
551                         dmar_disabled = 1;
552                         pr_info("IOMMU disabled\n");
553                 } else if (!strncmp(str, "igfx_off", 8)) {
554                         dmar_map_gfx = 0;
555                         pr_info("Disable GFX device mapping\n");
556                 } else if (!strncmp(str, "forcedac", 8)) {
557                         pr_info("Forcing DAC for PCI devices\n");
558                         dmar_forcedac = 1;
559                 } else if (!strncmp(str, "strict", 6)) {
560                         pr_info("Disable batched IOTLB flush\n");
561                         intel_iommu_strict = 1;
562                 } else if (!strncmp(str, "sp_off", 6)) {
563                         pr_info("Disable supported super page\n");
564                         intel_iommu_superpage = 0;
565                 } else if (!strncmp(str, "ecs_off", 7)) {
566                         printk(KERN_INFO
567                                 "Intel-IOMMU: disable extended context table support\n");
568                         intel_iommu_ecs = 0;
569                 }
570
571                 str += strcspn(str, ",");
572                 while (*str == ',')
573                         str++;
574         }
575         return 0;
576 }
577 __setup("intel_iommu=", intel_iommu_setup);
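
/*
 * The options above are parsed as a comma-separated list on the kernel
 * command line; for example
 *
 *	intel_iommu=on,strict,sp_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and disables
 * superpage use.
 */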
578
579 static struct kmem_cache *iommu_domain_cache;
580 static struct kmem_cache *iommu_devinfo_cache;
581
582 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
583 {
584         struct dmar_domain **domains;
585         int idx = did >> 8;
586
587         domains = iommu->domains[idx];
588         if (!domains)
589                 return NULL;
590
591         return domains[did & 0xff];
592 }
593
594 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
595                              struct dmar_domain *domain)
596 {
597         struct dmar_domain **domains;
598         int idx = did >> 8;
599
600         if (!iommu->domains[idx]) {
601                 size_t size = 256 * sizeof(struct dmar_domain *);
602                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
603         }
604
605         domains = iommu->domains[idx];
606         if (WARN_ON(!domains))
607                 return;
608         else
609                 domains[did & 0xff] = domain;
610 }
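
/*
 * Domain IDs are kept in a two-level table indexed by the high and low
 * bytes of the ID: for did == 0x1234 the lookup above resolves to
 * iommu->domains[0x12][0x34], and set_iommu_domain() allocates the
 * 256-entry second-level array the first time an ID in that range is
 * used.
 */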
611
612 static inline void *alloc_pgtable_page(int node)
613 {
614         struct page *page;
615         void *vaddr = NULL;
616
617         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
618         if (page)
619                 vaddr = page_address(page);
620         return vaddr;
621 }
622
623 static inline void free_pgtable_page(void *vaddr)
624 {
625         free_page((unsigned long)vaddr);
626 }
627
628 static inline void *alloc_domain_mem(void)
629 {
630         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
631 }
632
633 static void free_domain_mem(void *vaddr)
634 {
635         kmem_cache_free(iommu_domain_cache, vaddr);
636 }
637
638 static inline void * alloc_devinfo_mem(void)
639 {
640         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
641 }
642
643 static inline void free_devinfo_mem(void *vaddr)
644 {
645         kmem_cache_free(iommu_devinfo_cache, vaddr);
646 }
647
648 static inline int domain_type_is_vm(struct dmar_domain *domain)
649 {
650         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
651 }
652
653 static inline int domain_type_is_si(struct dmar_domain *domain)
654 {
655         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
656 }
657
658 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
659 {
660         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
661                                 DOMAIN_FLAG_STATIC_IDENTITY);
662 }
663
664 static inline int domain_pfn_supported(struct dmar_domain *domain,
665                                        unsigned long pfn)
666 {
667         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
668
669         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
670 }
671
672 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
673 {
674         unsigned long sagaw;
675         int agaw = -1;
676
677         sagaw = cap_sagaw(iommu->cap);
678         for (agaw = width_to_agaw(max_gaw);
679              agaw >= 0; agaw--) {
680                 if (test_bit(agaw, &sagaw))
681                         break;
682         }
683
684         return agaw;
685 }
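
/*
 * For example, hardware that supports only 4-level tables reports
 * cap_sagaw() == 0x4 (bit 2 set). Asked for the default 48-bit width,
 * the loop above starts at width_to_agaw(48) == 2, finds bit 2 set and
 * returns agaw 2; hardware that additionally supports 5-level tables
 * (bit 3) still returns 2 here, because the search starts at the
 * requested width and only ever falls back to smaller widths.
 */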
686
687 /*
688  * Calculate max SAGAW for each iommu.
689  */
690 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
691 {
692         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
693 }
694
695 /*
696  * Calculate agaw for each iommu.
697  * "SAGAW" may be different across iommus; use the default agaw, and
698  * fall back to a smaller supported agaw for iommus that don't support it.
699  */
700 int iommu_calculate_agaw(struct intel_iommu *iommu)
701 {
702         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
703 }
704
705 /* This function only returns a single iommu in a domain */
706 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
707 {
708         int iommu_id;
709
710         /* si_domain and vm domain should not get here. */
711         BUG_ON(domain_type_is_vm_or_si(domain));
712         for_each_domain_iommu(iommu_id, domain)
713                 break;
714
715         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
716                 return NULL;
717
718         return g_iommus[iommu_id];
719 }
720
721 static void domain_update_iommu_coherency(struct dmar_domain *domain)
722 {
723         struct dmar_drhd_unit *drhd;
724         struct intel_iommu *iommu;
725         bool found = false;
726         int i;
727
728         domain->iommu_coherency = 1;
729
730         for_each_domain_iommu(i, domain) {
731                 found = true;
732                 if (!ecap_coherent(g_iommus[i]->ecap)) {
733                         domain->iommu_coherency = 0;
734                         break;
735                 }
736         }
737         if (found)
738                 return;
739
740         /* No hardware attached; use lowest common denominator */
741         rcu_read_lock();
742         for_each_active_iommu(iommu, drhd) {
743                 if (!ecap_coherent(iommu->ecap)) {
744                         domain->iommu_coherency = 0;
745                         break;
746                 }
747         }
748         rcu_read_unlock();
749 }
750
751 static int domain_update_iommu_snooping(struct intel_iommu *skip)
752 {
753         struct dmar_drhd_unit *drhd;
754         struct intel_iommu *iommu;
755         int ret = 1;
756
757         rcu_read_lock();
758         for_each_active_iommu(iommu, drhd) {
759                 if (iommu != skip) {
760                         if (!ecap_sc_support(iommu->ecap)) {
761                                 ret = 0;
762                                 break;
763                         }
764                 }
765         }
766         rcu_read_unlock();
767
768         return ret;
769 }
770
771 static int domain_update_iommu_superpage(struct intel_iommu *skip)
772 {
773         struct dmar_drhd_unit *drhd;
774         struct intel_iommu *iommu;
775         int mask = 0xf;
776
777         if (!intel_iommu_superpage) {
778                 return 0;
779         }
780
781         /* set iommu_superpage to the smallest common denominator */
782         rcu_read_lock();
783         for_each_active_iommu(iommu, drhd) {
784                 if (iommu != skip) {
785                         mask &= cap_super_page_val(iommu->cap);
786                         if (!mask)
787                                 break;
788                 }
789         }
790         rcu_read_unlock();
791
792         return fls(mask);
793 }
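
/*
 * cap_super_page_val() is a bitmask: bit 0 means 2MiB superpages are
 * supported, bit 1 means 1GiB. If every active iommu reports 0x3, the
 * loop above leaves mask == 0x3 and fls(0x3) == 2, matching the
 * "2 == 1GiB" encoding of dmar_domain->iommu_superpage; if any iommu
 * reports 0, the result is 0 and only 4KiB pages are used.
 */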
794
795 /* Some capabilities may be different across iommus */
796 static void domain_update_iommu_cap(struct dmar_domain *domain)
797 {
798         domain_update_iommu_coherency(domain);
799         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
800         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
801 }
802
803 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
804                                                        u8 bus, u8 devfn, int alloc)
805 {
806         struct root_entry *root = &iommu->root_entry[bus];
807         struct context_entry *context;
808         u64 *entry;
809
810         entry = &root->lo;
811         if (ecs_enabled(iommu)) {
812                 if (devfn >= 0x80) {
813                         devfn -= 0x80;
814                         entry = &root->hi;
815                 }
816                 devfn *= 2;
817         }
818         if (*entry & 1)
819                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
820         else {
821                 unsigned long phy_addr;
822                 if (!alloc)
823                         return NULL;
824
825                 context = alloc_pgtable_page(iommu->node);
826                 if (!context)
827                         return NULL;
828
829                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
830                 phy_addr = virt_to_phys((void *)context);
831                 *entry = phy_addr | 1;
832                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
833         }
834         return &context[devfn];
835 }
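
/*
 * With ECS, extended context entries are twice the size of legacy ones
 * (256 bits vs. 128), so one 4KiB context table covers only 128 devfns:
 * devfns 0x00-0x7f are reached through root->lo and 0x80-0xff through
 * root->hi. For devfn 0x85, for instance, the code above uses root->hi
 * and slot (0x85 - 0x80) * 2 == 0x0a of that table, since each extended
 * entry spans two struct context_entry slots.
 */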
836
837 static int iommu_dummy(struct device *dev)
838 {
839         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
840 }
841
842 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
843 {
844         struct dmar_drhd_unit *drhd = NULL;
845         struct intel_iommu *iommu;
846         struct device *tmp;
847         struct pci_dev *ptmp, *pdev = NULL;
848         u16 segment = 0;
849         int i;
850
851         if (iommu_dummy(dev))
852                 return NULL;
853
854         if (dev_is_pci(dev)) {
855                 pdev = to_pci_dev(dev);
856                 segment = pci_domain_nr(pdev->bus);
857         } else if (has_acpi_companion(dev))
858                 dev = &ACPI_COMPANION(dev)->dev;
859
860         rcu_read_lock();
861         for_each_active_iommu(iommu, drhd) {
862                 if (pdev && segment != drhd->segment)
863                         continue;
864
865                 for_each_active_dev_scope(drhd->devices,
866                                           drhd->devices_cnt, i, tmp) {
867                         if (tmp == dev) {
868                                 *bus = drhd->devices[i].bus;
869                                 *devfn = drhd->devices[i].devfn;
870                                 goto out;
871                         }
872
873                         if (!pdev || !dev_is_pci(tmp))
874                                 continue;
875
876                         ptmp = to_pci_dev(tmp);
877                         if (ptmp->subordinate &&
878                             ptmp->subordinate->number <= pdev->bus->number &&
879                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
880                                 goto got_pdev;
881                 }
882
883                 if (pdev && drhd->include_all) {
884                 got_pdev:
885                         *bus = pdev->bus->number;
886                         *devfn = pdev->devfn;
887                         goto out;
888                 }
889         }
890         iommu = NULL;
891  out:
892         rcu_read_unlock();
893
894         return iommu;
895 }
896
897 static void domain_flush_cache(struct dmar_domain *domain,
898                                void *addr, int size)
899 {
900         if (!domain->iommu_coherency)
901                 clflush_cache_range(addr, size);
902 }
903
904 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
905 {
906         struct context_entry *context;
907         int ret = 0;
908         unsigned long flags;
909
910         spin_lock_irqsave(&iommu->lock, flags);
911         context = iommu_context_addr(iommu, bus, devfn, 0);
912         if (context)
913                 ret = context_present(context);
914         spin_unlock_irqrestore(&iommu->lock, flags);
915         return ret;
916 }
917
918 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
919 {
920         struct context_entry *context;
921         unsigned long flags;
922
923         spin_lock_irqsave(&iommu->lock, flags);
924         context = iommu_context_addr(iommu, bus, devfn, 0);
925         if (context) {
926                 context_clear_entry(context);
927                 __iommu_flush_cache(iommu, context, sizeof(*context));
928         }
929         spin_unlock_irqrestore(&iommu->lock, flags);
930 }
931
932 static void free_context_table(struct intel_iommu *iommu)
933 {
934         int i;
935         unsigned long flags;
936         struct context_entry *context;
937
938         spin_lock_irqsave(&iommu->lock, flags);
939         if (!iommu->root_entry) {
940                 goto out;
941         }
942         for (i = 0; i < ROOT_ENTRY_NR; i++) {
943                 context = iommu_context_addr(iommu, i, 0, 0);
944                 if (context)
945                         free_pgtable_page(context);
946
947                 if (!ecs_enabled(iommu))
948                         continue;
949
950                 context = iommu_context_addr(iommu, i, 0x80, 0);
951                 if (context)
952                         free_pgtable_page(context);
953
954         }
955         free_pgtable_page(iommu->root_entry);
956         iommu->root_entry = NULL;
957 out:
958         spin_unlock_irqrestore(&iommu->lock, flags);
959 }
960
961 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
962                                       unsigned long pfn, int *target_level)
963 {
964         struct dma_pte *parent, *pte = NULL;
965         int level = agaw_to_level(domain->agaw);
966         int offset;
967
968         BUG_ON(!domain->pgd);
969
970         if (!domain_pfn_supported(domain, pfn))
971                 /* Address beyond IOMMU's addressing capabilities. */
972                 return NULL;
973
974         parent = domain->pgd;
975
976         while (1) {
977                 void *tmp_page;
978
979                 offset = pfn_level_offset(pfn, level);
980                 pte = &parent[offset];
981                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
982                         break;
983                 if (level == *target_level)
984                         break;
985
986                 if (!dma_pte_present(pte)) {
987                         uint64_t pteval;
988
989                         tmp_page = alloc_pgtable_page(domain->nid);
990
991                         if (!tmp_page)
992                                 return NULL;
993
994                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
995                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
996                         if (cmpxchg64(&pte->val, 0ULL, pteval))
997                                 /* Someone else set it while we were thinking; use theirs. */
998                                 free_pgtable_page(tmp_page);
999                         else
1000                                 domain_flush_cache(domain, pte, sizeof(*pte));
1001                 }
1002                 if (level == 1)
1003                         break;
1004
1005                 parent = phys_to_virt(dma_pte_addr(pte));
1006                 level--;
1007         }
1008
1009         if (!*target_level)
1010                 *target_level = level;
1011
1012         return pte;
1013 }
1014
1015
1016 /* return the pte for an address at a specific level */
1017 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1018                                          unsigned long pfn,
1019                                          int level, int *large_page)
1020 {
1021         struct dma_pte *parent, *pte = NULL;
1022         int total = agaw_to_level(domain->agaw);
1023         int offset;
1024
1025         parent = domain->pgd;
1026         while (level <= total) {
1027                 offset = pfn_level_offset(pfn, total);
1028                 pte = &parent[offset];
1029                 if (level == total)
1030                         return pte;
1031
1032                 if (!dma_pte_present(pte)) {
1033                         *large_page = total;
1034                         break;
1035                 }
1036
1037                 if (dma_pte_superpage(pte)) {
1038                         *large_page = total;
1039                         return pte;
1040                 }
1041
1042                 parent = phys_to_virt(dma_pte_addr(pte));
1043                 total--;
1044         }
1045         return NULL;
1046 }
1047
1048 /* clear last-level ptes; a tlb flush should follow */
1049 static void dma_pte_clear_range(struct dmar_domain *domain,
1050                                 unsigned long start_pfn,
1051                                 unsigned long last_pfn)
1052 {
1053         unsigned int large_page = 1;
1054         struct dma_pte *first_pte, *pte;
1055
1056         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1057         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1058         BUG_ON(start_pfn > last_pfn);
1059
1060         /* we don't need lock here; nobody else touches the iova range */
1061         do {
1062                 large_page = 1;
1063                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1064                 if (!pte) {
1065                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1066                         continue;
1067                 }
1068                 do {
1069                         dma_clear_pte(pte);
1070                         start_pfn += lvl_to_nr_pages(large_page);
1071                         pte++;
1072                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1073
1074                 domain_flush_cache(domain, first_pte,
1075                                    (void *)pte - (void *)first_pte);
1076
1077         } while (start_pfn && start_pfn <= last_pfn);
1078 }
1079
1080 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1081                                struct dma_pte *pte, unsigned long pfn,
1082                                unsigned long start_pfn, unsigned long last_pfn)
1083 {
1084         pfn = max(start_pfn, pfn);
1085         pte = &pte[pfn_level_offset(pfn, level)];
1086
1087         do {
1088                 unsigned long level_pfn;
1089                 struct dma_pte *level_pte;
1090
1091                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1092                         goto next;
1093
1094                 level_pfn = pfn & level_mask(level - 1);
1095                 level_pte = phys_to_virt(dma_pte_addr(pte));
1096
1097                 if (level > 2)
1098                         dma_pte_free_level(domain, level - 1, level_pte,
1099                                            level_pfn, start_pfn, last_pfn);
1100
1101                 /* If range covers entire pagetable, free it */
1102                 if (!(start_pfn > level_pfn ||
1103                       last_pfn < level_pfn + level_size(level) - 1)) {
1104                         dma_clear_pte(pte);
1105                         domain_flush_cache(domain, pte, sizeof(*pte));
1106                         free_pgtable_page(level_pte);
1107                 }
1108 next:
1109                 pfn += level_size(level);
1110         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1111 }
1112
1113 /* free page table pages. last level pte should already be cleared */
1114 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1115                                    unsigned long start_pfn,
1116                                    unsigned long last_pfn)
1117 {
1118         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1119         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1120         BUG_ON(start_pfn > last_pfn);
1121
1122         dma_pte_clear_range(domain, start_pfn, last_pfn);
1123
1124         /* We don't need lock here; nobody else touches the iova range */
1125         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1126                            domain->pgd, 0, start_pfn, last_pfn);
1127
1128         /* free pgd */
1129         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1130                 free_pgtable_page(domain->pgd);
1131                 domain->pgd = NULL;
1132         }
1133 }
1134
1135 /* When a page at a given level is being unlinked from its parent, we don't
1136    need to *modify* it at all. All we need to do is make a list of all the
1137    pages which can be freed just as soon as we've flushed the IOTLB and we
1138    know the hardware page-walk will no longer touch them.
1139    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1140    be freed. */
1141 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1142                                             int level, struct dma_pte *pte,
1143                                             struct page *freelist)
1144 {
1145         struct page *pg;
1146
1147         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1148         pg->freelist = freelist;
1149         freelist = pg;
1150
1151         if (level == 1)
1152                 return freelist;
1153
1154         pte = page_address(pg);
1155         do {
1156                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1157                         freelist = dma_pte_list_pagetables(domain, level - 1,
1158                                                            pte, freelist);
1159                 pte++;
1160         } while (!first_pte_in_page(pte));
1161
1162         return freelist;
1163 }
1164
1165 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1166                                         struct dma_pte *pte, unsigned long pfn,
1167                                         unsigned long start_pfn,
1168                                         unsigned long last_pfn,
1169                                         struct page *freelist)
1170 {
1171         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1172
1173         pfn = max(start_pfn, pfn);
1174         pte = &pte[pfn_level_offset(pfn, level)];
1175
1176         do {
1177                 unsigned long level_pfn;
1178
1179                 if (!dma_pte_present(pte))
1180                         goto next;
1181
1182                 level_pfn = pfn & level_mask(level);
1183
1184                 /* If range covers entire pagetable, free it */
1185                 if (start_pfn <= level_pfn &&
1186                     last_pfn >= level_pfn + level_size(level) - 1) {
1187                         /* These subordinate page tables are going away entirely. Don't
1188                            bother to clear them; we're just going to *free* them. */
1189                         if (level > 1 && !dma_pte_superpage(pte))
1190                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1191
1192                         dma_clear_pte(pte);
1193                         if (!first_pte)
1194                                 first_pte = pte;
1195                         last_pte = pte;
1196                 } else if (level > 1) {
1197                         /* Recurse down into a level that isn't *entirely* obsolete */
1198                         freelist = dma_pte_clear_level(domain, level - 1,
1199                                                        phys_to_virt(dma_pte_addr(pte)),
1200                                                        level_pfn, start_pfn, last_pfn,
1201                                                        freelist);
1202                 }
1203 next:
1204                 pfn += level_size(level);
1205         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1206
1207         if (first_pte)
1208                 domain_flush_cache(domain, first_pte,
1209                                    (void *)++last_pte - (void *)first_pte);
1210
1211         return freelist;
1212 }
1213
1214 /* We can't just free the pages because the IOMMU may still be walking
1215    the page tables, and may have cached the intermediate levels. The
1216    pages can only be freed after the IOTLB flush has been done. */
1217 static struct page *domain_unmap(struct dmar_domain *domain,
1218                                  unsigned long start_pfn,
1219                                  unsigned long last_pfn)
1220 {
1221         struct page *freelist = NULL;
1222
1223         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1224         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1225         BUG_ON(start_pfn > last_pfn);
1226
1227         /* we don't need lock here; nobody else touches the iova range */
1228         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1229                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1230
1231         /* free pgd */
1232         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1233                 struct page *pgd_page = virt_to_page(domain->pgd);
1234                 pgd_page->freelist = freelist;
1235                 freelist = pgd_page;
1236
1237                 domain->pgd = NULL;
1238         }
1239
1240         return freelist;
1241 }
1242
1243 static void dma_free_pagelist(struct page *freelist)
1244 {
1245         struct page *pg;
1246
1247         while ((pg = freelist)) {
1248                 freelist = pg->freelist;
1249                 free_pgtable_page(page_address(pg));
1250         }
1251 }
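
/*
 * The unmap paths later in this file pair these two helpers roughly as
 * follows (illustrative sketch):
 *
 *	freelist = domain_unmap(domain, start_pfn, last_pfn);
 *	... flush the IOTLB for [start_pfn, last_pfn] on each iommu ...
 *	dma_free_pagelist(freelist);
 *
 * so that no page-table page is returned to the allocator while the
 * hardware may still be walking it.
 */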
1252
1253 /* iommu handling */
1254 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1255 {
1256         struct root_entry *root;
1257         unsigned long flags;
1258
1259         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1260         if (!root) {
1261                 pr_err("Allocating root entry for %s failed\n",
1262                         iommu->name);
1263                 return -ENOMEM;
1264         }
1265
1266         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1267
1268         spin_lock_irqsave(&iommu->lock, flags);
1269         iommu->root_entry = root;
1270         spin_unlock_irqrestore(&iommu->lock, flags);
1271
1272         return 0;
1273 }
1274
1275 static void iommu_set_root_entry(struct intel_iommu *iommu)
1276 {
1277         u64 addr;
1278         u32 sts;
1279         unsigned long flag;
1280
1281         addr = virt_to_phys(iommu->root_entry);
1282         if (ecs_enabled(iommu))
1283                 addr |= DMA_RTADDR_RTT;
1284
1285         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1286         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1287
1288         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1289
1290         /* Make sure hardware completes it */
1291         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1292                       readl, (sts & DMA_GSTS_RTPS), sts);
1293
1294         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1295 }
1296
1297 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1298 {
1299         u32 val;
1300         unsigned long flag;
1301
1302         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1303                 return;
1304
1305         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1306         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1307
1308         /* Make sure hardware completes it */
1309         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1310                       readl, (!(val & DMA_GSTS_WBFS)), val);
1311
1312         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1313 }
1314
1315 /* return value determines if we need a write buffer flush */
1316 static void __iommu_flush_context(struct intel_iommu *iommu,
1317                                   u16 did, u16 source_id, u8 function_mask,
1318                                   u64 type)
1319 {
1320         u64 val = 0;
1321         unsigned long flag;
1322
1323         switch (type) {
1324         case DMA_CCMD_GLOBAL_INVL:
1325                 val = DMA_CCMD_GLOBAL_INVL;
1326                 break;
1327         case DMA_CCMD_DOMAIN_INVL:
1328                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1329                 break;
1330         case DMA_CCMD_DEVICE_INVL:
1331                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1332                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1333                 break;
1334         default:
1335                 BUG();
1336         }
1337         val |= DMA_CCMD_ICC;
1338
1339         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1341
1342         /* Make sure hardware completes it */
1343         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1344                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1345
1346         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1347 }
1348
1349 /* return value determines if we need a write buffer flush */
1350 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1351                                 u64 addr, unsigned int size_order, u64 type)
1352 {
1353         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1354         u64 val = 0, val_iva = 0;
1355         unsigned long flag;
1356
1357         switch (type) {
1358         case DMA_TLB_GLOBAL_FLUSH:
1359                 /* global flush doesn't need set IVA_REG */
1360                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1361                 break;
1362         case DMA_TLB_DSI_FLUSH:
1363                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1364                 break;
1365         case DMA_TLB_PSI_FLUSH:
1366                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1367                 /* IH bit is passed in as part of address */
1368                 val_iva = size_order | addr;
1369                 break;
1370         default:
1371                 BUG();
1372         }
1373         /* Note: set drain read/write */
1374 #if 0
1375         /*
1376          * This is probably only here to be extra safe. It looks like we
1377          * can ignore it without any impact.
1378          */
1379         if (cap_read_drain(iommu->cap))
1380                 val |= DMA_TLB_READ_DRAIN;
1381 #endif
1382         if (cap_write_drain(iommu->cap))
1383                 val |= DMA_TLB_WRITE_DRAIN;
1384
1385         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1386         /* Note: Only uses first TLB reg currently */
1387         if (val_iva)
1388                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1389         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1390
1391         /* Make sure hardware completes it */
1392         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1393                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1394
1395         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1396
1397         /* check IOTLB invalidation granularity */
1398         if (DMA_TLB_IAIG(val) == 0)
1399                 pr_err("Flush IOTLB failed\n");
1400         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1401                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1402                         (unsigned long long)DMA_TLB_IIRG(type),
1403                         (unsigned long long)DMA_TLB_IAIG(val));
1404 }
1405
1406 static struct device_domain_info *
1407 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1408                          u8 bus, u8 devfn)
1409 {
1410         bool found = false;
1411         struct device_domain_info *info;
1412         struct pci_dev *pdev;
1413
1414         assert_spin_locked(&device_domain_lock);
1415
1416         if (!ecap_dev_iotlb_support(iommu->ecap))
1417                 return NULL;
1418
1419         if (!iommu->qi)
1420                 return NULL;
1421
1422         list_for_each_entry(info, &domain->devices, link)
1423                 if (info->iommu == iommu && info->bus == bus &&
1424                     info->devfn == devfn) {
1425                         found = true;
1426                         break;
1427                 }
1428
1429         if (!found || !info->dev || !dev_is_pci(info->dev))
1430                 return NULL;
1431
1432         pdev = to_pci_dev(info->dev);
1433
1434         if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1435                 return NULL;
1436
1437         if (!dmar_find_matched_atsr_unit(pdev))
1438                 return NULL;
1439
1440         return info;
1441 }
1442
1443 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1444 {
1445         struct pci_dev *pdev;
1446
1447         if (!info || !dev_is_pci(info->dev))
1448                 return;
1449
1450         pdev = to_pci_dev(info->dev);
1451         if (pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1452                 return;
1453
1454         info->ats.enabled = 1;
1455         info->ats.qdep = pci_ats_queue_depth(pdev);
1456 }
1457
1458 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1459 {
1460         if (!info->ats.enabled)
1461                 return;
1462
1463         pci_disable_ats(to_pci_dev(info->dev));
1464         info->ats.enabled = 0;
1465 }
1466
1467 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1468                                   u64 addr, unsigned mask)
1469 {
1470         u16 sid, qdep;
1471         unsigned long flags;
1472         struct device_domain_info *info;
1473
1474         spin_lock_irqsave(&device_domain_lock, flags);
1475         list_for_each_entry(info, &domain->devices, link) {
1476                 if (!info->ats.enabled)
1477                         continue;
1478
1479                 sid = info->bus << 8 | info->devfn;
1480                 qdep = info->ats.qdep;
1481                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1482         }
1483         spin_unlock_irqrestore(&device_domain_lock, flags);
1484 }
1485
1486 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1487                                   struct dmar_domain *domain,
1488                                   unsigned long pfn, unsigned int pages,
1489                                   int ih, int map)
1490 {
1491         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1492         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1493         u16 did = domain->iommu_did[iommu->seq_id];
1494
1495         BUG_ON(pages == 0);
1496
1497         if (ih)
1498                 ih = 1 << 6;
1499         /*
1500          * Fall back to domain-selective flush if there is no PSI support or
1501          * the size is too big.
1502          * PSI requires the page count to be a power of two, and the base
1503          * address to be naturally aligned to that size.
1504          */
1505         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1506                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1507                                                 DMA_TLB_DSI_FLUSH);
1508         else
1509                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1510                                                 DMA_TLB_PSI_FLUSH);
1511
1512         /*
1513          * In caching mode, changes of pages from non-present to present require
1514          * flush. However, device IOTLB doesn't need to be flushed in this case.
1515          */
1516         if (!cap_caching_mode(iommu->cap) || !map)
1517                 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1518                                       addr, mask);
1519 }
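
/*
 * The mask computed above is the number of low pfn bits one invalidation
 * covers: e.g. for pages == 9, __roundup_pow_of_two(9) == 16 and
 * mask == ilog2(16) == 4, so a single PSI invalidates a naturally
 * aligned 16-page region. If cap_max_amask_val() is smaller than the
 * required mask, the code falls back to a domain-selective flush.
 */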
1520
1521 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1522 {
1523         u32 pmen;
1524         unsigned long flags;
1525
1526         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1527         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1528         pmen &= ~DMA_PMEN_EPM;
1529         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1530
1531         /* wait for the protected region status bit to clear */
1532         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1533                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1534
1535         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1536 }
1537
1538 static void iommu_enable_translation(struct intel_iommu *iommu)
1539 {
1540         u32 sts;
1541         unsigned long flags;
1542
1543         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1544         iommu->gcmd |= DMA_GCMD_TE;
1545         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1546
1547         /* Make sure hardware completes it */
1548         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1549                       readl, (sts & DMA_GSTS_TES), sts);
1550
1551         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1552 }
1553
1554 static void iommu_disable_translation(struct intel_iommu *iommu)
1555 {
1556         u32 sts;
1557         unsigned long flag;
1558
1559         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1560         iommu->gcmd &= ~DMA_GCMD_TE;
1561         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1562
1563         /* Make sure hardware completes it */
1564         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1565                       readl, (!(sts & DMA_GSTS_TES)), sts);
1566
1567         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1568 }
1569
1570
1571 static int iommu_init_domains(struct intel_iommu *iommu)
1572 {
1573         u32 ndomains, nlongs;
1574         size_t size;
1575
1576         ndomains = cap_ndoms(iommu->cap);
1577         pr_debug("%s: Number of Domains supported <%d>\n",
1578                  iommu->name, ndomains);
1579         nlongs = BITS_TO_LONGS(ndomains);
1580
1581         spin_lock_init(&iommu->lock);
1582
1583         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1584         if (!iommu->domain_ids) {
1585                 pr_err("%s: Allocating domain id array failed\n",
1586                        iommu->name);
1587                 return -ENOMEM;
1588         }
1589
1590         size = ((ndomains >> 8) + 1) * sizeof(struct dmar_domain **);
1591         iommu->domains = kzalloc(size, GFP_KERNEL);
1592
1593         if (iommu->domains) {
1594                 size = 256 * sizeof(struct dmar_domain *);
1595                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1596         }
1597
1598         if (!iommu->domains || !iommu->domains[0]) {
1599                 pr_err("%s: Allocating domain array failed\n",
1600                        iommu->name);
1601                 kfree(iommu->domain_ids);
1602                 kfree(iommu->domains);
1603                 iommu->domain_ids = NULL;
1604                 iommu->domains    = NULL;
1605                 return -ENOMEM;
1606         }
1607
1608
1609
1610         /*
1611          * If Caching mode is set, then invalid translations are tagged
1612          * with domain-id 0, hence we need to pre-allocate it. We also
1613          * use domain-id 0 as a marker for non-allocated domain-id, so
1614          * make sure it is not used for a real domain.
1615          */
1616         set_bit(0, iommu->domain_ids);
1617
1618         return 0;
1619 }
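
/*
 * Note on the layout set up above (descriptive): iommu->domains is a
 * two-level array.  Each second-level chunk holds 256 struct dmar_domain
 * pointers, and ((ndomains >> 8) + 1) first-level slots are reserved so
 * every possible domain-id can be reached.  Only chunk 0 is allocated
 * here; the remaining chunks are allocated on demand when a domain-id in
 * their range is first used.
 */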
1620
1621 static void disable_dmar_iommu(struct intel_iommu *iommu)
1622 {
1623         struct device_domain_info *info, *tmp;
1624         unsigned long flags;
1625
1626         if (!iommu->domains || !iommu->domain_ids)
1627                 return;
1628
1629         spin_lock_irqsave(&device_domain_lock, flags);
1630         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1631                 struct dmar_domain *domain;
1632
1633                 if (info->iommu != iommu)
1634                         continue;
1635
1636                 if (!info->dev || !info->domain)
1637                         continue;
1638
1639                 domain = info->domain;
1640
1641                 dmar_remove_one_dev_info(domain, info->dev);
1642
1643                 if (!domain_type_is_vm_or_si(domain))
1644                         domain_exit(domain);
1645         }
1646         spin_unlock_irqrestore(&device_domain_lock, flags);
1647
1648         if (iommu->gcmd & DMA_GCMD_TE)
1649                 iommu_disable_translation(iommu);
1650 }
1651
1652 static void free_dmar_iommu(struct intel_iommu *iommu)
1653 {
1654         if ((iommu->domains) && (iommu->domain_ids)) {
1655                 int elems = (cap_ndoms(iommu->cap) >> 8) + 1;
1656                 int i;
1657
1658                 for (i = 0; i < elems; i++)
1659                         kfree(iommu->domains[i]);
1660                 kfree(iommu->domains);
1661                 kfree(iommu->domain_ids);
1662                 iommu->domains = NULL;
1663                 iommu->domain_ids = NULL;
1664         }
1665
1666         g_iommus[iommu->seq_id] = NULL;
1667
1668         /* free context mapping */
1669         free_context_table(iommu);
1670 }
1671
1672 static struct dmar_domain *alloc_domain(int flags)
1673 {
1674         struct dmar_domain *domain;
1675
1676         domain = alloc_domain_mem();
1677         if (!domain)
1678                 return NULL;
1679
1680         memset(domain, 0, sizeof(*domain));
1681         domain->nid = -1;
1682         domain->flags = flags;
1683         INIT_LIST_HEAD(&domain->devices);
1684
1685         return domain;
1686 }
1687
1688 /* Must be called with iommu->lock */
1689 static int domain_attach_iommu(struct dmar_domain *domain,
1690                                struct intel_iommu *iommu)
1691 {
1692         unsigned long ndomains;
1693         int num;
1694
1695         assert_spin_locked(&device_domain_lock);
1696         assert_spin_locked(&iommu->lock);
1697
1698         domain->iommu_refcnt[iommu->seq_id] += 1;
1699         domain->iommu_count += 1;
1700         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1701                 ndomains = cap_ndoms(iommu->cap);
1702                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1703
1704                 if (num >= ndomains) {
1705                         pr_err("%s: No free domain ids\n", iommu->name);
1706                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1707                         domain->iommu_count -= 1;
1708                         return -ENOSPC;
1709                 }
1710
1711                 set_bit(num, iommu->domain_ids);
1712                 set_iommu_domain(iommu, num, domain);
1713
1714                 domain->iommu_did[iommu->seq_id] = num;
1715                 domain->nid                      = iommu->node;
1716
1717                 domain_update_iommu_cap(domain);
1718         }
1719
1720         return 0;
1721 }
1722
1723 static int domain_detach_iommu(struct dmar_domain *domain,
1724                                struct intel_iommu *iommu)
1725 {
1726         int num, count = INT_MAX;
1727
1728         assert_spin_locked(&device_domain_lock);
1729         assert_spin_locked(&iommu->lock);
1730
1731         domain->iommu_refcnt[iommu->seq_id] -= 1;
1732         count = --domain->iommu_count;
1733         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1734                 num = domain->iommu_did[iommu->seq_id];
1735                 clear_bit(num, iommu->domain_ids);
1736                 set_iommu_domain(iommu, num, NULL);
1737
1738                 domain_update_iommu_cap(domain);
1739                 domain->iommu_did[iommu->seq_id] = 0;
1740         }
1741
1742         return count;
1743 }
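
/*
 * Refcounting summary for the two helpers above (descriptive):
 * iommu_refcnt[seq_id] counts how many devices of the domain sit behind
 * the given IOMMU.  The first attach allocates a domain-id from
 * iommu->domain_ids and records it in iommu_did[seq_id]; the last detach
 * releases it again.  domain_detach_iommu() returns the remaining total
 * attachment count, which callers may use to tell whether the domain is
 * still referenced at all.
 */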
1744
1745 static struct iova_domain reserved_iova_list;
1746 static struct lock_class_key reserved_rbtree_key;
1747
1748 static int dmar_init_reserved_ranges(void)
1749 {
1750         struct pci_dev *pdev = NULL;
1751         struct iova *iova;
1752         int i;
1753
1754         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1755                         DMA_32BIT_PFN);
1756
1757         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1758                 &reserved_rbtree_key);
1759
1760         /* IOAPIC ranges shouldn't be accessed by DMA */
1761         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1762                 IOVA_PFN(IOAPIC_RANGE_END));
1763         if (!iova) {
1764                 pr_err("Reserve IOAPIC range failed\n");
1765                 return -ENODEV;
1766         }
1767
1768         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1769         for_each_pci_dev(pdev) {
1770                 struct resource *r;
1771
1772                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1773                         r = &pdev->resource[i];
1774                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1775                                 continue;
1776                         iova = reserve_iova(&reserved_iova_list,
1777                                             IOVA_PFN(r->start),
1778                                             IOVA_PFN(r->end));
1779                         if (!iova) {
1780                                 pr_err("Reserve iova failed\n");
1781                                 return -ENODEV;
1782                         }
1783                 }
1784         }
1785         return 0;
1786 }
1787
1788 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1789 {
1790         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1791 }
1792
1793 static inline int guestwidth_to_adjustwidth(int gaw)
1794 {
1795         int agaw;
1796         int r = (gaw - 12) % 9;
1797
1798         if (r == 0)
1799                 agaw = gaw;
1800         else
1801                 agaw = gaw + 9 - r;
1802         if (agaw > 64)
1803                 agaw = 64;
1804         return agaw;
1805 }
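
/*
 * Example of the adjustment above (illustrative): page-table levels cover
 * 12 + 9*n address bits, so a guest width of 48 maps directly to agaw 48
 * ((48 - 12) % 9 == 0), while e.g. 40 is rounded up to the next level
 * boundary: r = (40 - 12) % 9 = 1, agaw = 40 + 9 - 1 = 48.
 */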
1806
1807 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1808                        int guest_width)
1809 {
1810         int adjust_width, agaw;
1811         unsigned long sagaw;
1812
1813         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1814                         DMA_32BIT_PFN);
1815         domain_reserve_special_ranges(domain);
1816
1817         /* calculate AGAW */
1818         if (guest_width > cap_mgaw(iommu->cap))
1819                 guest_width = cap_mgaw(iommu->cap);
1820         domain->gaw = guest_width;
1821         adjust_width = guestwidth_to_adjustwidth(guest_width);
1822         agaw = width_to_agaw(adjust_width);
1823         sagaw = cap_sagaw(iommu->cap);
1824         if (!test_bit(agaw, &sagaw)) {
1825                 /* hardware doesn't support it, choose a bigger one */
1826                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1827                 agaw = find_next_bit(&sagaw, 5, agaw);
1828                 if (agaw >= 5)
1829                         return -ENODEV;
1830         }
1831         domain->agaw = agaw;
1832
1833         if (ecap_coherent(iommu->ecap))
1834                 domain->iommu_coherency = 1;
1835         else
1836                 domain->iommu_coherency = 0;
1837
1838         if (ecap_sc_support(iommu->ecap))
1839                 domain->iommu_snooping = 1;
1840         else
1841                 domain->iommu_snooping = 0;
1842
1843         if (intel_iommu_superpage)
1844                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1845         else
1846                 domain->iommu_superpage = 0;
1847
1848         domain->nid = iommu->node;
1849
1850         /* always allocate the top pgd */
1851         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1852         if (!domain->pgd)
1853                 return -ENOMEM;
1854         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1855         return 0;
1856 }
1857
1858 static void domain_exit(struct dmar_domain *domain)
1859 {
1860         struct page *freelist = NULL;
1861
1862         /* Domain 0 is reserved, so don't process it */
1863         if (!domain)
1864                 return;
1865
1866         /* Flush any lazy unmaps that may reference this domain */
1867         if (!intel_iommu_strict)
1868                 flush_unmaps_timeout(0);
1869
1870         /* Remove associated devices and clear attached or cached domains */
1871         rcu_read_lock();
1872         domain_remove_dev_info(domain);
1873         rcu_read_unlock();
1874
1875         /* destroy iovas */
1876         put_iova_domain(&domain->iovad);
1877
1878         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1879
1880         dma_free_pagelist(freelist);
1881
1882         free_domain_mem(domain);
1883 }
1884
1885 static int domain_context_mapping_one(struct dmar_domain *domain,
1886                                       struct intel_iommu *iommu,
1887                                       u8 bus, u8 devfn)
1888 {
1889         u16 did = domain->iommu_did[iommu->seq_id];
1890         int translation = CONTEXT_TT_MULTI_LEVEL;
1891         struct device_domain_info *info = NULL;
1892         struct context_entry *context;
1893         unsigned long flags;
1894         struct dma_pte *pgd;
1895         int ret, agaw;
1896
1897         WARN_ON(did == 0);
1898
1899         if (hw_pass_through && domain_type_is_si(domain))
1900                 translation = CONTEXT_TT_PASS_THROUGH;
1901
1902         pr_debug("Set context mapping for %02x:%02x.%d\n",
1903                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1904
1905         BUG_ON(!domain->pgd);
1906
1907         spin_lock_irqsave(&device_domain_lock, flags);
1908         spin_lock(&iommu->lock);
1909
1910         ret = -ENOMEM;
1911         context = iommu_context_addr(iommu, bus, devfn, 1);
1912         if (!context)
1913                 goto out_unlock;
1914
1915         ret = 0;
1916         if (context_present(context))
1917                 goto out_unlock;
1918
1919         pgd = domain->pgd;
1920
1921         context_clear_entry(context);
1922         context_set_domain_id(context, did);
1923
1924         /*
1925          * Skip top levels of page tables for an iommu which has a smaller
1926          * agaw than the default.  Unnecessary for PT mode.
1927          */
1928         if (translation != CONTEXT_TT_PASS_THROUGH) {
1929                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1930                         ret = -ENOMEM;
1931                         pgd = phys_to_virt(dma_pte_addr(pgd));
1932                         if (!dma_pte_present(pgd))
1933                                 goto out_unlock;
1934                 }
1935
1936                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1937                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1938                                      CONTEXT_TT_MULTI_LEVEL;
1939
1940                 context_set_address_root(context, virt_to_phys(pgd));
1941                 context_set_address_width(context, iommu->agaw);
1942         } else {
1943                 /*
1944                  * In pass through mode, AW must be programmed to
1945                  * indicate the largest AGAW value supported by
1946                  * hardware. And ASR is ignored by hardware.
1947                  */
1948                 context_set_address_width(context, iommu->msagaw);
1949         }
1950
1951         context_set_translation_type(context, translation);
1952         context_set_fault_enable(context);
1953         context_set_present(context);
1954         domain_flush_cache(domain, context, sizeof(*context));
1955
1956         /*
1957          * It's a non-present to present mapping. If hardware doesn't cache
1958          * non-present entries we only need to flush the write-buffer. If it
1959          * _does_ cache non-present entries, then it does so in the special
1960          * domain #0, which we have to flush:
1961          */
1962         if (cap_caching_mode(iommu->cap)) {
1963                 iommu->flush.flush_context(iommu, 0,
1964                                            (((u16)bus) << 8) | devfn,
1965                                            DMA_CCMD_MASK_NOBIT,
1966                                            DMA_CCMD_DEVICE_INVL);
1967                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1968         } else {
1969                 iommu_flush_write_buffer(iommu);
1970         }
1971         iommu_enable_dev_iotlb(info);
1972
1973         ret = 0;
1974
1975 out_unlock:
1976         spin_unlock(&iommu->lock);
1977         spin_unlock_irqrestore(&device_domain_lock, flags);
1978
1979         return ret;
1980 }
1981
1982 struct domain_context_mapping_data {
1983         struct dmar_domain *domain;
1984         struct intel_iommu *iommu;
1985 };
1986
1987 static int domain_context_mapping_cb(struct pci_dev *pdev,
1988                                      u16 alias, void *opaque)
1989 {
1990         struct domain_context_mapping_data *data = opaque;
1991
1992         return domain_context_mapping_one(data->domain, data->iommu,
1993                                           PCI_BUS_NUM(alias), alias & 0xff);
1994 }
1995
1996 static int
1997 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1998 {
1999         struct intel_iommu *iommu;
2000         u8 bus, devfn;
2001         struct domain_context_mapping_data data;
2002
2003         iommu = device_to_iommu(dev, &bus, &devfn);
2004         if (!iommu)
2005                 return -ENODEV;
2006
2007         if (!dev_is_pci(dev))
2008                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2009
2010         data.domain = domain;
2011         data.iommu = iommu;
2012
2013         return pci_for_each_dma_alias(to_pci_dev(dev),
2014                                       &domain_context_mapping_cb, &data);
2015 }
2016
2017 static int domain_context_mapped_cb(struct pci_dev *pdev,
2018                                     u16 alias, void *opaque)
2019 {
2020         struct intel_iommu *iommu = opaque;
2021
2022         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2023 }
2024
2025 static int domain_context_mapped(struct device *dev)
2026 {
2027         struct intel_iommu *iommu;
2028         u8 bus, devfn;
2029
2030         iommu = device_to_iommu(dev, &bus, &devfn);
2031         if (!iommu)
2032                 return -ENODEV;
2033
2034         if (!dev_is_pci(dev))
2035                 return device_context_mapped(iommu, bus, devfn);
2036
2037         return !pci_for_each_dma_alias(to_pci_dev(dev),
2038                                        domain_context_mapped_cb, iommu);
2039 }
2040
2041 /* Returns the number of VTD pages, but aligned to the MM page size */
2042 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2043                                             size_t size)
2044 {
2045         host_addr &= ~PAGE_MASK;
2046         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2047 }
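
/*
 * Example (illustrative, assuming 4KiB pages with PAGE_SIZE == VTD_PAGE_SIZE):
 * a buffer at offset 0x200 into a page with length 0x1800 needs
 * PAGE_ALIGN(0x200 + 0x1800) >> VTD_PAGE_SHIFT = 0x2000 >> 12 = 2 pages.
 */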
2048
2049 /* Return largest possible superpage level for a given mapping */
2050 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2051                                           unsigned long iov_pfn,
2052                                           unsigned long phy_pfn,
2053                                           unsigned long pages)
2054 {
2055         int support, level = 1;
2056         unsigned long pfnmerge;
2057
2058         support = domain->iommu_superpage;
2059
2060         /* To use a large page, the virtual *and* physical addresses
2061            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2062            of them will mean we have to use smaller pages. So just
2063            merge them and check both at once. */
2064         pfnmerge = iov_pfn | phy_pfn;
2065
2066         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2067                 pages >>= VTD_STRIDE_SHIFT;
2068                 if (!pages)
2069                         break;
2070                 pfnmerge >>= VTD_STRIDE_SHIFT;
2071                 level++;
2072                 support--;
2073         }
2074         return level;
2075 }
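
/*
 * Example (illustrative): with iov_pfn and phy_pfn both 2MiB aligned
 * (low 9 pfn bits clear), pages >= 512 and superpage support present,
 * the loop takes one step and returns level 2, i.e. a 2MiB superpage.
 * A 1GiB mapping (level 3) additionally needs 1GiB alignment,
 * pages >= 262144 and domain->iommu_superpage >= 2.
 */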
2076
2077 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2078                             struct scatterlist *sg, unsigned long phys_pfn,
2079                             unsigned long nr_pages, int prot)
2080 {
2081         struct dma_pte *first_pte = NULL, *pte = NULL;
2082         phys_addr_t uninitialized_var(pteval);
2083         unsigned long sg_res = 0;
2084         unsigned int largepage_lvl = 0;
2085         unsigned long lvl_pages = 0;
2086
2087         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2088
2089         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2090                 return -EINVAL;
2091
2092         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2093
2094         if (!sg) {
2095                 sg_res = nr_pages;
2096                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2097         }
2098
2099         while (nr_pages > 0) {
2100                 uint64_t tmp;
2101
2102                 if (!sg_res) {
2103                         sg_res = aligned_nrpages(sg->offset, sg->length);
2104                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2105                         sg->dma_length = sg->length;
2106                         pteval = (sg_phys(sg) & PAGE_MASK) | prot;
2107                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2108                 }
2109
2110                 if (!pte) {
2111                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2112
2113                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2114                         if (!pte)
2115                                 return -ENOMEM;
2116                         /* It is a large page */
2117                         if (largepage_lvl > 1) {
2118                                 unsigned long nr_superpages, end_pfn;
2119
2120                                 pteval |= DMA_PTE_LARGE_PAGE;
2121                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2122
2123                                 nr_superpages = sg_res / lvl_pages;
2124                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2125
2126                                 /*
2127                                  * Ensure that old small page tables are
2128                                  * removed to make room for superpage(s).
2129                                  */
2130                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn);
2131                         } else {
2132                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2133                         }
2134
2135                 }
2136                 /* We don't need lock here, nobody else
2137                  * touches the iova range
2138                  */
2139                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2140                 if (tmp) {
2141                         static int dumps = 5;
2142                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2143                                 iov_pfn, tmp, (unsigned long long)pteval);
2144                         if (dumps) {
2145                                 dumps--;
2146                                 debug_dma_dump_mappings(NULL);
2147                         }
2148                         WARN_ON(1);
2149                 }
2150
2151                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2152
2153                 BUG_ON(nr_pages < lvl_pages);
2154                 BUG_ON(sg_res < lvl_pages);
2155
2156                 nr_pages -= lvl_pages;
2157                 iov_pfn += lvl_pages;
2158                 phys_pfn += lvl_pages;
2159                 pteval += lvl_pages * VTD_PAGE_SIZE;
2160                 sg_res -= lvl_pages;
2161
2162                 /* If the next PTE would be the first in a new page, then we
2163                    need to flush the cache on the entries we've just written.
2164                    And then we'll need to recalculate 'pte', so clear it and
2165                    let it get set again in the if (!pte) block above.
2166
2167                    If we're done (!nr_pages) we need to flush the cache too.
2168
2169                    Also if we've been setting superpages, we may need to
2170                    recalculate 'pte' and switch back to smaller pages for the
2171                    end of the mapping, if the trailing size is not enough to
2172                    use another superpage (i.e. sg_res < lvl_pages). */
2173                 pte++;
2174                 if (!nr_pages || first_pte_in_page(pte) ||
2175                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2176                         domain_flush_cache(domain, first_pte,
2177                                            (void *)pte - (void *)first_pte);
2178                         pte = NULL;
2179                 }
2180
2181                 if (!sg_res && nr_pages)
2182                         sg = sg_next(sg);
2183         }
2184         return 0;
2185 }
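
/*
 * Rough walk-through of the loop above (illustrative): mapping 1024
 * contiguous 4KiB pages whose IOVA and physical start are both 2MiB
 * aligned, on superpage-capable hardware, takes two iterations.  The
 * first computes largepage_lvl == 2, frees any stale small-page tables
 * under the covered range and writes one 2MiB PTE (DMA_PTE_LARGE_PAGE);
 * the second writes the adjacent 2MiB PTE, and the CPU cache for the
 * written PTEs is flushed once the mapping is complete.
 */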
2186
2187 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2188                                     struct scatterlist *sg, unsigned long nr_pages,
2189                                     int prot)
2190 {
2191         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2192 }
2193
2194 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2195                                      unsigned long phys_pfn, unsigned long nr_pages,
2196                                      int prot)
2197 {
2198         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2199 }
2200
2201 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2202 {
2203         if (!iommu)
2204                 return;
2205
2206         clear_context_table(iommu, bus, devfn);
2207         iommu->flush.flush_context(iommu, 0, 0, 0,
2208                                            DMA_CCMD_GLOBAL_INVL);
2209         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2210 }
2211
2212 static inline void unlink_domain_info(struct device_domain_info *info)
2213 {
2214         assert_spin_locked(&device_domain_lock);
2215         list_del(&info->link);
2216         list_del(&info->global);
2217         if (info->dev)
2218                 info->dev->archdata.iommu = NULL;
2219 }
2220
2221 static void domain_remove_dev_info(struct dmar_domain *domain)
2222 {
2223         struct device_domain_info *info, *tmp;
2224         unsigned long flags;
2225
2226         spin_lock_irqsave(&device_domain_lock, flags);
2227         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2228                 __dmar_remove_one_dev_info(info);
2229         spin_unlock_irqrestore(&device_domain_lock, flags);
2230 }
2231
2232 /*
2233  * find_domain
2234  * Note: we use struct device->archdata.iommu to store the info
2235  */
2236 static struct dmar_domain *find_domain(struct device *dev)
2237 {
2238         struct device_domain_info *info;
2239
2240         /* No lock here, assumes no domain exit in normal case */
2241         info = dev->archdata.iommu;
2242         if (info)
2243                 return info->domain;
2244         return NULL;
2245 }
2246
2247 static inline struct device_domain_info *
2248 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2249 {
2250         struct device_domain_info *info;
2251
2252         list_for_each_entry(info, &device_domain_list, global)
2253                 if (info->iommu->segment == segment && info->bus == bus &&
2254                     info->devfn == devfn)
2255                         return info;
2256
2257         return NULL;
2258 }
2259
2260 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2261                                                     int bus, int devfn,
2262                                                     struct device *dev,
2263                                                     struct dmar_domain *domain)
2264 {
2265         struct dmar_domain *found = NULL;
2266         struct device_domain_info *info;
2267         unsigned long flags;
2268         int ret;
2269
2270         info = alloc_devinfo_mem();
2271         if (!info)
2272                 return NULL;
2273
2274         info->bus = bus;
2275         info->devfn = devfn;
2276         info->ats.enabled = 0;
2277         info->ats.qdep = 0;
2278         info->dev = dev;
2279         info->domain = domain;
2280         info->iommu = iommu;
2281
2282         spin_lock_irqsave(&device_domain_lock, flags);
2283         if (dev)
2284                 found = find_domain(dev);
2285
2286         if (!found) {
2287                 struct device_domain_info *info2;
2288                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2289                 if (info2) {
2290                         found      = info2->domain;
2291                         info2->dev = dev;
2292                 }
2293         }
2294
2295         if (found) {
2296                 spin_unlock_irqrestore(&device_domain_lock, flags);
2297                 free_devinfo_mem(info);
2298                 /* Caller must free the original domain */
2299                 return found;
2300         }
2301
2302         spin_lock(&iommu->lock);
2303         ret = domain_attach_iommu(domain, iommu);
2304         spin_unlock(&iommu->lock);
2305
2306         if (ret) {
2307                 spin_unlock_irqrestore(&device_domain_lock, flags);
2308                 return NULL;
2309         }
2310
2311         list_add(&info->link, &domain->devices);
2312         list_add(&info->global, &device_domain_list);
2313         if (dev)
2314                 dev->archdata.iommu = info;
2315         spin_unlock_irqrestore(&device_domain_lock, flags);
2316
2317         if (dev && domain_context_mapping(domain, dev)) {
2318                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2319                 dmar_remove_one_dev_info(domain, dev);
2320                 return NULL;
2321         }
2322
2323         return domain;
2324 }
2325
2326 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2327 {
2328         *(u16 *)opaque = alias;
2329         return 0;
2330 }
2331
2332 /* domain is initialized */
2333 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2334 {
2335         struct device_domain_info *info = NULL;
2336         struct dmar_domain *domain, *tmp;
2337         struct intel_iommu *iommu;
2338         u16 req_id, dma_alias;
2339         unsigned long flags;
2340         u8 bus, devfn;
2341
2342         domain = find_domain(dev);
2343         if (domain)
2344                 return domain;
2345
2346         iommu = device_to_iommu(dev, &bus, &devfn);
2347         if (!iommu)
2348                 return NULL;
2349
2350         req_id = ((u16)bus << 8) | devfn;
2351
2352         if (dev_is_pci(dev)) {
2353                 struct pci_dev *pdev = to_pci_dev(dev);
2354
2355                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2356
2357                 spin_lock_irqsave(&device_domain_lock, flags);
2358                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2359                                                       PCI_BUS_NUM(dma_alias),
2360                                                       dma_alias & 0xff);
2361                 if (info) {
2362                         iommu = info->iommu;
2363                         domain = info->domain;
2364                 }
2365                 spin_unlock_irqrestore(&device_domain_lock, flags);
2366
2367                 /* DMA alias already has a domain, use it */
2368                 if (info)
2369                         goto found_domain;
2370         }
2371
2372         /* Allocate and initialize new domain for the device */
2373         domain = alloc_domain(0);
2374         if (!domain)
2375                 return NULL;
2376         if (domain_init(domain, iommu, gaw)) {
2377                 domain_exit(domain);
2378                 return NULL;
2379         }
2380
2381         /* register PCI DMA alias device */
2382         if (req_id != dma_alias && dev_is_pci(dev)) {
2383                 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2384                                                dma_alias & 0xff, NULL, domain);
2385
2386                 if (!tmp || tmp != domain) {
2387                         domain_exit(domain);
2388                         domain = tmp;
2389                 }
2390
2391                 if (!domain)
2392                         return NULL;
2393         }
2394
2395 found_domain:
2396         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2397
2398         if (!tmp || tmp != domain) {
2399                 domain_exit(domain);
2400                 domain = tmp;
2401         }
2402
2403         return domain;
2404 }
2405
2406 static int iommu_identity_mapping;
2407 #define IDENTMAP_ALL            1
2408 #define IDENTMAP_GFX            2
2409 #define IDENTMAP_AZALIA         4
2410
2411 static int iommu_domain_identity_map(struct dmar_domain *domain,
2412                                      unsigned long long start,
2413                                      unsigned long long end)
2414 {
2415         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2416         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2417
2418         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2419                           dma_to_mm_pfn(last_vpfn))) {
2420                 pr_err("Reserving iova failed\n");
2421                 return -ENOMEM;
2422         }
2423
2424         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2425         /*
2426          * RMRR range might have overlap with physical memory range,
2427          * clear it first
2428          */
2429         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2430
2431         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2432                                   last_vpfn - first_vpfn + 1,
2433                                   DMA_PTE_READ|DMA_PTE_WRITE);
2434 }
2435
2436 static int iommu_prepare_identity_map(struct device *dev,
2437                                       unsigned long long start,
2438                                       unsigned long long end)
2439 {
2440         struct dmar_domain *domain;
2441         int ret;
2442
2443         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2444         if (!domain)
2445                 return -ENOMEM;
2446
2447         /* For _hardware_ passthrough, don't bother. But for software
2448            passthrough, we do it anyway -- it may indicate a memory
2449            range which is reserved in E820, and so didn't get set
2450            up to start with in si_domain */
2451         if (domain == si_domain && hw_pass_through) {
2452                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2453                         dev_name(dev), start, end);
2454                 return 0;
2455         }
2456
2457         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2458                 dev_name(dev), start, end);
2459
2460         if (end < start) {
2461                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2462                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2463                         dmi_get_system_info(DMI_BIOS_VENDOR),
2464                         dmi_get_system_info(DMI_BIOS_VERSION),
2465                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2466                 ret = -EIO;
2467                 goto error;
2468         }
2469
2470         if (end >> agaw_to_width(domain->agaw)) {
2471                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2472                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2473                      agaw_to_width(domain->agaw),
2474                      dmi_get_system_info(DMI_BIOS_VENDOR),
2475                      dmi_get_system_info(DMI_BIOS_VERSION),
2476                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2477                 ret = -EIO;
2478                 goto error;
2479         }
2480
2481         ret = iommu_domain_identity_map(domain, start, end);
2482         if (ret)
2483                 goto error;
2484
2485         return 0;
2486
2487  error:
2488         domain_exit(domain);
2489         return ret;
2490 }
2491
2492 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2493                                          struct device *dev)
2494 {
2495         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2496                 return 0;
2497         return iommu_prepare_identity_map(dev, rmrr->base_address,
2498                                           rmrr->end_address);
2499 }
2500
2501 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2502 static inline void iommu_prepare_isa(void)
2503 {
2504         struct pci_dev *pdev;
2505         int ret;
2506
2507         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2508         if (!pdev)
2509                 return;
2510
2511         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2512         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2513
2514         if (ret)
2515                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2516
2517         pci_dev_put(pdev);
2518 }
2519 #else
2520 static inline void iommu_prepare_isa(void)
2521 {
2522         return;
2523 }
2524 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2525
2526 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2527
2528 static int __init si_domain_init(int hw)
2529 {
2530         int nid, ret = 0;
2531
2532         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2533         if (!si_domain)
2534                 return -EFAULT;
2535
2536         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2537                 domain_exit(si_domain);
2538                 return -EFAULT;
2539         }
2540
2541         pr_debug("Identity mapping domain allocated\n");
2542
2543         if (hw)
2544                 return 0;
2545
2546         for_each_online_node(nid) {
2547                 unsigned long start_pfn, end_pfn;
2548                 int i;
2549
2550                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2551                         ret = iommu_domain_identity_map(si_domain,
2552                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2553                         if (ret)
2554                                 return ret;
2555                 }
2556         }
2557
2558         return 0;
2559 }
2560
2561 static int identity_mapping(struct device *dev)
2562 {
2563         struct device_domain_info *info;
2564
2565         if (likely(!iommu_identity_mapping))
2566                 return 0;
2567
2568         info = dev->archdata.iommu;
2569         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2570                 return (info->domain == si_domain);
2571
2572         return 0;
2573 }
2574
2575 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2576 {
2577         struct dmar_domain *ndomain;
2578         struct intel_iommu *iommu;
2579         u8 bus, devfn;
2580
2581         iommu = device_to_iommu(dev, &bus, &devfn);
2582         if (!iommu)
2583                 return -ENODEV;
2584
2585         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2586         if (ndomain != domain)
2587                 return -EBUSY;
2588
2589         return 0;
2590 }
2591
2592 static bool device_has_rmrr(struct device *dev)
2593 {
2594         struct dmar_rmrr_unit *rmrr;
2595         struct device *tmp;
2596         int i;
2597
2598         rcu_read_lock();
2599         for_each_rmrr_units(rmrr) {
2600                 /*
2601                  * Return TRUE if this RMRR contains the device that
2602                  * is passed in.
2603                  */
2604                 for_each_active_dev_scope(rmrr->devices,
2605                                           rmrr->devices_cnt, i, tmp)
2606                         if (tmp == dev) {
2607                                 rcu_read_unlock();
2608                                 return true;
2609                         }
2610         }
2611         rcu_read_unlock();
2612         return false;
2613 }
2614
2615 /*
2616  * There are a couple cases where we need to restrict the functionality of
2617  * devices associated with RMRRs.  The first is when evaluating a device for
2618  * identity mapping because problems exist when devices are moved in and out
2619  * of domains and their respective RMRR information is lost.  This means that
2620  * a device with associated RMRRs will never be in a "passthrough" domain.
2621  * The second is use of the device through the IOMMU API.  This interface
2622  * expects to have full control of the IOVA space for the device.  We cannot
2623  * satisfy both the requirement that RMRR access is maintained and have an
2624  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2625  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2626  * We therefore prevent devices associated with an RMRR from participating in
2627  * the IOMMU API, which eliminates them from device assignment.
2628  *
2629  * In both cases we assume that PCI USB devices with RMRRs have them largely
2630  * for historical reasons and that the RMRR space is not actively used post
2631  * boot.  This exclusion may change if vendors begin to abuse it.
2632  *
2633  * The same exception is made for graphics devices, with the requirement that
2634  * any use of the RMRR regions will be torn down before assigning the device
2635  * to a guest.
2636  */
2637 static bool device_is_rmrr_locked(struct device *dev)
2638 {
2639         if (!device_has_rmrr(dev))
2640                 return false;
2641
2642         if (dev_is_pci(dev)) {
2643                 struct pci_dev *pdev = to_pci_dev(dev);
2644
2645                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2646                         return false;
2647         }
2648
2649         return true;
2650 }
2651
2652 static int iommu_should_identity_map(struct device *dev, int startup)
2653 {
2654
2655         if (dev_is_pci(dev)) {
2656                 struct pci_dev *pdev = to_pci_dev(dev);
2657
2658                 if (device_is_rmrr_locked(dev))
2659                         return 0;
2660
2661                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2662                         return 1;
2663
2664                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2665                         return 1;
2666
2667                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2668                         return 0;
2669
2670                 /*
2671                  * We want to start off with all devices in the 1:1 domain, and
2672                  * take them out later if we find they can't access all of memory.
2673                  *
2674                  * However, we can't do this for PCI devices behind bridges,
2675                  * because all PCI devices behind the same bridge will end up
2676                  * with the same source-id on their transactions.
2677                  *
2678                  * Practically speaking, we can't change things around for these
2679                  * devices at run-time, because we can't be sure there'll be no
2680                  * DMA transactions in flight for any of their siblings.
2681                  *
2682                  * So PCI devices (unless they're on the root bus) as well as
2683                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2684                  * the 1:1 domain, just in _case_ one of their siblings turns out
2685                  * not to be able to map all of memory.
2686                  */
2687                 if (!pci_is_pcie(pdev)) {
2688                         if (!pci_is_root_bus(pdev->bus))
2689                                 return 0;
2690                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2691                                 return 0;
2692                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2693                         return 0;
2694         } else {
2695                 if (device_has_rmrr(dev))
2696                         return 0;
2697         }
2698
2699         /*
2700          * At boot time, we don't yet know if devices will be 64-bit capable.
2701          * Assume that they will — if they turn out not to be, then we can
2702          * take them out of the 1:1 domain later.
2703          */
2704         if (!startup) {
2705                 /*
2706                  * If the device's dma_mask is less than the system's memory
2707                  * size then this is not a candidate for identity mapping.
2708                  */
2709                 u64 dma_mask = *dev->dma_mask;
2710
2711                 if (dev->coherent_dma_mask &&
2712                     dev->coherent_dma_mask < dma_mask)
2713                         dma_mask = dev->coherent_dma_mask;
2714
2715                 return dma_mask >= dma_get_required_mask(dev);
2716         }
2717
2718         return 1;
2719 }
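
/*
 * Example of the run-time check above (illustrative): a device with a
 * 32-bit DMA mask on a machine that has memory above 4GiB sees
 * dma_mask < dma_get_required_mask(), so this helper returns 0 and the
 * caller can move the device out of the 1:1 identity domain.
 */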
2720
2721 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2722 {
2723         int ret;
2724
2725         if (!iommu_should_identity_map(dev, 1))
2726                 return 0;
2727
2728         ret = domain_add_dev_info(si_domain, dev);
2729         if (!ret)
2730                 pr_info("%s identity mapping for device %s\n",
2731                         hw ? "Hardware" : "Software", dev_name(dev));
2732         else if (ret == -ENODEV)
2733                 /* device not associated with an iommu */
2734                 ret = 0;
2735
2736         return ret;
2737 }
2738
2739
2740 static int __init iommu_prepare_static_identity_mapping(int hw)
2741 {
2742         struct pci_dev *pdev = NULL;
2743         struct dmar_drhd_unit *drhd;
2744         struct intel_iommu *iommu;
2745         struct device *dev;
2746         int i;
2747         int ret = 0;
2748
2749         for_each_pci_dev(pdev) {
2750                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2751                 if (ret)
2752                         return ret;
2753         }
2754
2755         for_each_active_iommu(iommu, drhd)
2756                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2757                         struct acpi_device_physical_node *pn;
2758                         struct acpi_device *adev;
2759
2760                         if (dev->bus != &acpi_bus_type)
2761                                 continue;
2762
2763                         adev = to_acpi_device(dev);
2764                         mutex_lock(&adev->physical_node_lock);
2765                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2766                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2767                                 if (ret)
2768                                         break;
2769                         }
2770                         mutex_unlock(&adev->physical_node_lock);
2771                         if (ret)
2772                                 return ret;
2773                 }
2774
2775         return 0;
2776 }
2777
2778 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2779 {
2780         /*
2781          * Start from a sane iommu hardware state.
2782          * If queued invalidation was already initialized by us
2783          * (for example, while enabling interrupt-remapping) then
2784          * things are already rolling from a sane state.
2785          */
2786         if (!iommu->qi) {
2787                 /*
2788                  * Clear any previous faults.
2789                  */
2790                 dmar_fault(-1, iommu);
2791                 /*
2792                  * Disable queued invalidation if supported and already enabled
2793                  * before OS handover.
2794                  */
2795                 dmar_disable_qi(iommu);
2796         }
2797
2798         if (dmar_enable_qi(iommu)) {
2799                 /*
2800                  * Queued Invalidate not enabled, use Register Based Invalidate
2801                  */
2802                 iommu->flush.flush_context = __iommu_flush_context;
2803                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2804                 pr_info("%s: Using Register based invalidation\n",
2805                         iommu->name);
2806         } else {
2807                 iommu->flush.flush_context = qi_flush_context;
2808                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2809                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2810         }
2811 }
2812
2813 static int copy_context_table(struct intel_iommu *iommu,
2814                               struct root_entry __iomem *old_re,
2815                               struct context_entry **tbl,
2816                               int bus, bool ext)
2817 {
2818         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2819         struct context_entry __iomem *old_ce = NULL;
2820         struct context_entry *new_ce = NULL, ce;
2821         struct root_entry re;
2822         phys_addr_t old_ce_phys;
2823
2824         tbl_idx = ext ? bus * 2 : bus;
2825         memcpy_fromio(&re, old_re, sizeof(re));
2826
2827         for (devfn = 0; devfn < 256; devfn++) {
2828                 /* First calculate the correct index */
2829                 idx = (ext ? devfn * 2 : devfn) % 256;
2830
2831                 if (idx == 0) {
2832                         /* First save what we may have and clean up */
2833                         if (new_ce) {
2834                                 tbl[tbl_idx] = new_ce;
2835                                 __iommu_flush_cache(iommu, new_ce,
2836                                                     VTD_PAGE_SIZE);
2837                                 pos = 1;
2838                         }
2839
2840                         if (old_ce)
2841                                 iounmap(old_ce);
2842
2843                         ret = 0;
2844                         if (devfn < 0x80)
2845                                 old_ce_phys = root_entry_lctp(&re);
2846                         else
2847                                 old_ce_phys = root_entry_uctp(&re);
2848
2849                         if (!old_ce_phys) {
2850                                 if (ext && devfn == 0) {
2851                                         /* No LCTP, try UCTP */
2852                                         devfn = 0x7f;
2853                                         continue;
2854                                 } else {
2855                                         goto out;
2856                                 }
2857                         }
2858
2859                         ret = -ENOMEM;
2860                         old_ce = ioremap_cache(old_ce_phys, PAGE_SIZE);
2861                         if (!old_ce)
2862                                 goto out;
2863
2864                         new_ce = alloc_pgtable_page(iommu->node);
2865                         if (!new_ce)
2866                                 goto out_unmap;
2867
2868                         ret = 0;
2869                 }
2870
2871                 /* Now copy the context entry */
2872                 memcpy_fromio(&ce, old_ce + idx, sizeof(ce));
2873
2874                 if (!__context_present(&ce))
2875                         continue;
2876
2877                 did = context_domain_id(&ce);
2878                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2879                         set_bit(did, iommu->domain_ids);
2880
2881                 /*
2882                  * We need a marker for copied context entries. This
2883                  * marker needs to work for the old format as well as
2884                  * for extended context entries.
2885                  *
2886                  * Bit 67 of the context entry is used. In the old
2887                  * format this bit is available to software, in the
2888                  * extended format it is the PGE bit, but PGE is ignored
2889                  * by HW if PASIDs are disabled (and thus still
2890                  * available).
2891                  *
2892                  * So disable PASIDs first and then mark the entry
2893                  * copied. This means that we don't copy PASID
2894                  * translations from the old kernel, but this is fine as
2895                  * faults there are not fatal.
2896                  */
2897                 context_clear_pasid_enable(&ce);
2898                 context_set_copied(&ce);
2899
2900                 new_ce[idx] = ce;
2901         }
2902
2903         tbl[tbl_idx + pos] = new_ce;
2904
2905         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2906
2907 out_unmap:
2908         iounmap(old_ce);
2909
2910 out:
2911         return ret;
2912 }
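
/*
 * Layout note for the copy above (descriptive): extended context entries
 * are twice the size of legacy ones, so each bus needs two 4KiB context
 * tables.  That is why tbl_idx is bus * 2, the entry index is
 * (devfn * 2) % 256, and devfns below 0x80 are read via the lower
 * context-table pointer (LCTP) while the rest come from the upper one
 * (UCTP).
 */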
2913
2914 static int copy_translation_tables(struct intel_iommu *iommu)
2915 {
2916         struct root_entry __iomem *old_rt;
2917         struct context_entry **ctxt_tbls;
2918         phys_addr_t old_rt_phys;
2919         int ctxt_table_entries;
2920         unsigned long flags;
2921         u64 rtaddr_reg;
2922         int bus, ret;
2923         bool new_ext, ext;
2924
2925         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2926         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2927         new_ext    = !!ecap_ecs(iommu->ecap);
2928
2929         /*
2930          * The RTT bit can only be changed when translation is disabled,
2931          * but disabling translation means to open a window for data
2932          * corruption. So bail out and don't copy anything if we would
2933          * have to change the bit.
2934          */
2935         if (new_ext != ext)
2936                 return -EINVAL;
2937
2938         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2939         if (!old_rt_phys)
2940                 return -EINVAL;
2941
2942         old_rt = ioremap_cache(old_rt_phys, PAGE_SIZE);
2943         if (!old_rt)
2944                 return -ENOMEM;
2945
2946         /* This is too big for the stack - allocate it from slab */
2947         ctxt_table_entries = ext ? 512 : 256;
2948         ret = -ENOMEM;
2949         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
2950         if (!ctxt_tbls)
2951                 goto out_unmap;
2952
2953         for (bus = 0; bus < 256; bus++) {
2954                 ret = copy_context_table(iommu, &old_rt[bus],
2955                                          ctxt_tbls, bus, ext);
2956                 if (ret) {
2957                         pr_err("%s: Failed to copy context table for bus %d\n",
2958                                 iommu->name, bus);
2959                         continue;
2960                 }
2961         }
2962
2963         spin_lock_irqsave(&iommu->lock, flags);
2964
2965         /* Context tables are copied, now write them to the root_entry table */
2966         for (bus = 0; bus < 256; bus++) {
2967                 int idx = ext ? bus * 2 : bus;
2968                 u64 val;
2969
2970                 if (ctxt_tbls[idx]) {
2971                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2972                         iommu->root_entry[bus].lo = val;
2973                 }
2974
2975                 if (!ext || !ctxt_tbls[idx + 1])
2976                         continue;
2977
2978                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2979                 iommu->root_entry[bus].hi = val;
2980         }
2981
2982         spin_unlock_irqrestore(&iommu->lock, flags);
2983
2984         kfree(ctxt_tbls);
2985
2986         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2987
2988         ret = 0;
2989
2990 out_unmap:
2991         iounmap(old_rt);
2992
2993         return ret;
2994 }
2995
2996 static int __init init_dmars(void)
2997 {
2998         struct dmar_drhd_unit *drhd;
2999         struct dmar_rmrr_unit *rmrr;
3000         bool copied_tables = false;
3001         struct device *dev;
3002         struct intel_iommu *iommu;
3003         int i, ret;
3004
3005         /*
3006          * for each drhd
3007          *    allocate root
3008          *    initialize and program root entry to not present
3009          * endfor
3010          */
3011         for_each_drhd_unit(drhd) {
3012                 /*
3013                  * lock not needed as this is only incremented in the single
3014                  * threaded kernel __init code path; all other accesses are
3015                  * read only
3016                  */
3017                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3018                         g_num_of_iommus++;
3019                         continue;
3020                 }
3021                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3022         }
3023
3024         /* Preallocate enough resources for IOMMU hot-addition */
3025         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3026                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3027
3028         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3029                         GFP_KERNEL);
3030         if (!g_iommus) {
3031                 pr_err("Allocating global iommu array failed\n");
3032                 ret = -ENOMEM;
3033                 goto error;
3034         }
3035
3036         deferred_flush = kzalloc(g_num_of_iommus *
3037                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
3038         if (!deferred_flush) {
3039                 ret = -ENOMEM;
3040                 goto free_g_iommus;
3041         }
3042
3043         for_each_active_iommu(iommu, drhd) {
3044                 g_iommus[iommu->seq_id] = iommu;
3045
3046                 intel_iommu_init_qi(iommu);
3047
3048                 ret = iommu_init_domains(iommu);
3049                 if (ret)
3050                         goto free_iommu;
3051
3052                 init_translation_status(iommu);
3053
3054                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3055                         iommu_disable_translation(iommu);
3056                         clear_translation_pre_enabled(iommu);
3057                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3058                                 iommu->name);
3059                 }
3060
3061                 /*
3062                  * TBD:
3063                  * we could share the same root & context tables
3064                  * among all IOMMUs; split this out later.
3065                  */
3066                 ret = iommu_alloc_root_entry(iommu);
3067                 if (ret)
3068                         goto free_iommu;
3069
3070                 if (translation_pre_enabled(iommu)) {
3071                         pr_info("Translation already enabled - trying to copy translation structures\n");
3072
3073                         ret = copy_translation_tables(iommu);
3074                         if (ret) {
3075                                 /*
3076                                  * We found the IOMMU with translation
3077                                  * enabled - but failed to copy over the
3078                                  * old root-entry table. Try to proceed
3079                                  * by disabling translation now and
3080                                  * allocating a clean root-entry table.
3081                                  * This might cause DMAR faults, but
3082                                  * probably the dump will still succeed.
3083                                  */
3084                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3085                                        iommu->name);
3086                                 iommu_disable_translation(iommu);
3087                                 clear_translation_pre_enabled(iommu);
3088                         } else {
3089                                 pr_info("Copied translation tables from previous kernel for %s\n",
3090                                         iommu->name);
3091                                 copied_tables = true;
3092                         }
3093                 }
3094
3095                 iommu_flush_write_buffer(iommu);
3096                 iommu_set_root_entry(iommu);
3097                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3098                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3099
3100                 if (!ecap_pass_through(iommu->ecap))
3101                         hw_pass_through = 0;
3102         }
3103
3104         if (iommu_pass_through)
3105                 iommu_identity_mapping |= IDENTMAP_ALL;
3106
3107 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3108         iommu_identity_mapping |= IDENTMAP_GFX;
3109 #endif
3110
3111         if (iommu_identity_mapping) {
3112                 ret = si_domain_init(hw_pass_through);
3113                 if (ret)
3114                         goto free_iommu;
3115         }
3116
3117         check_tylersburg_isoch();
3118
3119         /*
3120          * If we copied translations from a previous kernel in the kdump
3121          * case, we cannot assign the devices to domains now, as that
3122          * would eliminate the old mappings. So skip this part and defer
3123          * the assignment to device driver initialization time.
3124          */
3125         if (copied_tables)
3126                 goto domains_done;
3127
3128         /*
3129          * If pass-through is not set or not enabled, set up context entries
3130          * for identity mappings for RMRR, graphics and ISA devices, falling
3131          * back to the static identity mapping if iommu_identity_mapping is set.
3132          */
3133         if (iommu_identity_mapping) {
3134                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3135                 if (ret) {
3136                         pr_crit("Failed to setup IOMMU pass-through\n");
3137                         goto free_iommu;
3138                 }
3139         }
3140         /*
3141          * For each rmrr
3142          *   for each dev attached to rmrr
3143          *   do
3144          *     locate drhd for dev, alloc domain for dev
3145          *     allocate free domain
3146          *     allocate page table entries for rmrr
3147          *     if context not allocated for bus
3148          *           allocate and init context
3149          *           set present in root table for this bus
3150          *     init context with domain, translation etc
3151          *    endfor
3152          * endfor
3153          */
3154         pr_info("Setting RMRR:\n");
3155         for_each_rmrr_units(rmrr) {
3156                 /* Some BIOSes list non-existent devices in the DMAR table. */
3157                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3158                                           i, dev) {
3159                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3160                         if (ret)
3161                                 pr_err("Mapping reserved region failed\n");
3162                 }
3163         }
3164
3165         iommu_prepare_isa();
3166
3167 domains_done:
3168
3169         /*
3170          * for each drhd
3171          *   enable fault log
3172          *   global invalidate context cache
3173          *   global invalidate iotlb
3174          *   enable translation
3175          */
3176         for_each_iommu(iommu, drhd) {
3177                 if (drhd->ignored) {
3178                         /*
3179                          * we always have to disable PMRs or DMA may fail on
3180                          * this device
3181                          */
3182                         if (force_on)
3183                                 iommu_disable_protect_mem_regions(iommu);
3184                         continue;
3185                 }
3186
3187                 iommu_flush_write_buffer(iommu);
3188
3189                 ret = dmar_set_interrupt(iommu);
3190                 if (ret)
3191                         goto free_iommu;
3192
3193                 if (!translation_pre_enabled(iommu))
3194                         iommu_enable_translation(iommu);
3195
3196                 iommu_disable_protect_mem_regions(iommu);
3197         }
3198
3199         return 0;
3200
3201 free_iommu:
3202         for_each_active_iommu(iommu, drhd) {
3203                 disable_dmar_iommu(iommu);
3204                 free_dmar_iommu(iommu);
3205         }
3206         kfree(deferred_flush);
3207 free_g_iommus:
3208         kfree(g_iommus);
3209 error:
3210         return ret;
3211 }
3212
3213 /* This takes a number of _MM_ pages, not VTD pages */
3214 static struct iova *intel_alloc_iova(struct device *dev,
3215                                      struct dmar_domain *domain,
3216                                      unsigned long nrpages, uint64_t dma_mask)
3217 {
3218         struct iova *iova = NULL;
3219
3220         /* Restrict dma_mask to the width that the iommu can handle */
3221         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3222         /* Ensure we reserve the whole size-aligned region */
3223         nrpages = __roundup_pow_of_two(nrpages);
3224
3225         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3226                 /*
3227                  * First try to allocate an I/O virtual address below
3228                  * DMA_BIT_MASK(32); if that fails, try allocating from
3229                  * the higher range.
3230                  */
3231                 iova = alloc_iova(&domain->iovad, nrpages,
3232                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
3233                 if (iova)
3234                         return iova;
3235         }
3236         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
3237         if (unlikely(!iova)) {
3238                 pr_err("Allocating %ld-page iova for %s failed\n",
3239                        nrpages, dev_name(dev));
3240                 return NULL;
3241         }
3242
3243         return iova;
3244 }
3245
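/*
 * Slow path of get_valid_domain_for_dev(): find or allocate a DMA domain
 * for @dev with the default address width.  Returns NULL on failure.
 */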
3246 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3247 {
3248         struct dmar_domain *domain;
3249
3250         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3251         if (!domain) {
3252                 pr_err("Allocating domain for %s failed\n",
3253                        dev_name(dev));
3254                 return NULL;
3255         }
3256
3257         return domain;
3258 }
3259
3260 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3261 {
3262         struct device_domain_info *info;
3263
3264         /* No lock here, assumes no domain exit in normal case */
3265         info = dev->archdata.iommu;
3266         if (likely(info))
3267                 return info->domain;
3268
3269         return __get_valid_domain_for_dev(dev);
3270 }
3271
3272 /* Check if the dev needs to go through the non-identity map and unmap process. */
3273 static int iommu_no_mapping(struct device *dev)
3274 {
3275         int found;
3276
3277         if (iommu_dummy(dev))
3278                 return 1;
3279
3280         if (!iommu_identity_mapping)
3281                 return 0;
3282
3283         found = identity_mapping(dev);
3284         if (found) {
3285                 if (iommu_should_identity_map(dev, 0))
3286                         return 1;
3287                 else {
3288                         /*
3289                          * Remove the 32-bit DMA device from si_domain and
3290                          * fall back to non-identity mapping.
3291                          */
3292                         dmar_remove_one_dev_info(si_domain, dev);
3293                         pr_info("32bit %s uses non-identity mapping\n",
3294                                 dev_name(dev));
3295                         return 0;
3296                 }
3297         } else {
3298                 /*
3299                  * A 64-bit DMA device that has been detached from a VM is
3300                  * put back into si_domain for identity mapping.
3301                  */
3302                 if (iommu_should_identity_map(dev, 0)) {
3303                         int ret;
3304                         ret = domain_add_dev_info(si_domain, dev);
3305                         if (!ret) {
3306                                 pr_info("64bit %s uses identity mapping\n",
3307                                         dev_name(dev));
3308                                 return 1;
3309                         }
3310                 }
3311         }
3312
3313         return 0;
3314 }
3315
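/*
 * Map the physical range @paddr..@paddr+@size for streaming DMA on @dev:
 * allocate an IOVA below @dma_mask, build the page-table entries and flush
 * the IOTLB (caching mode) or the write buffer.  Identity-mapped devices
 * simply get @paddr back; 0 is returned on failure.
 */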
3316 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3317                                      size_t size, int dir, u64 dma_mask)
3318 {
3319         struct dmar_domain *domain;
3320         phys_addr_t start_paddr;
3321         struct iova *iova;
3322         int prot = 0;
3323         int ret;
3324         struct intel_iommu *iommu;
3325         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3326
3327         BUG_ON(dir == DMA_NONE);
3328
3329         if (iommu_no_mapping(dev))
3330                 return paddr;
3331
3332         domain = get_valid_domain_for_dev(dev);
3333         if (!domain)
3334                 return 0;
3335
3336         iommu = domain_get_iommu(domain);
3337         size = aligned_nrpages(paddr, size);
3338
3339         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3340         if (!iova)
3341                 goto error;
3342
3343         /*
3344          * Check if DMAR supports zero-length reads on write-only
3345          * mappings.
3346          */
3347         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3348                         !cap_zlr(iommu->cap))
3349                 prot |= DMA_PTE_READ;
3350         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3351                 prot |= DMA_PTE_WRITE;
3352         /*
3353          * paddr to (paddr + size) might cover only part of a page, but we
3354          * should map the whole page.  Note: if two parts of one page are
3355          * mapped separately, we might end up with two guest addresses
3356          * mapping to the same host paddr, but this is not a big problem.
3357          */
3358         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3359                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3360         if (ret)
3361                 goto error;
3362
3363         /* It's a non-present to present mapping; only flush if caching mode is set */
3364         if (cap_caching_mode(iommu->cap))
3365                 iommu_flush_iotlb_psi(iommu, domain,
3366                                       mm_to_dma_pfn(iova->pfn_lo),
3367                                       size, 0, 1);
3368         else
3369                 iommu_flush_write_buffer(iommu);
3370
3371         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3372         start_paddr += paddr & ~PAGE_MASK;
3373         return start_paddr;
3374
3375 error:
3376         if (iova)
3377                 __free_iova(&domain->iovad, iova);
3378         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3379                 dev_name(dev), size, (unsigned long long)paddr, dir);
3380         return 0;
3381 }
3382
3383 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3384                                  unsigned long offset, size_t size,
3385                                  enum dma_data_direction dir,
3386                                  struct dma_attrs *attrs)
3387 {
3388         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3389                                   dir, *dev->dma_mask);
3390 }
3391
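/*
 * Drain the deferred-unmap queues of all IOMMUs: invalidate the IOTLB
 * (globally, or page-selectively in caching mode), free the queued IOVAs
 * and page freelists, and reset each queue.  Called with
 * async_umap_flush_lock held.
 */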
3392 static void flush_unmaps(void)
3393 {
3394         int i, j;
3395
3396         timer_on = 0;
3397
3398         /* just flush them all */
3399         for (i = 0; i < g_num_of_iommus; i++) {
3400                 struct intel_iommu *iommu = g_iommus[i];
3401                 if (!iommu)
3402                         continue;
3403
3404                 if (!deferred_flush[i].next)
3405                         continue;
3406
3407                 /* In caching mode, global flushes make emulation expensive */
3408                 if (!cap_caching_mode(iommu->cap))
3409                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3410                                          DMA_TLB_GLOBAL_FLUSH);
3411                 for (j = 0; j < deferred_flush[i].next; j++) {
3412                         unsigned long mask;
3413                         struct iova *iova = deferred_flush[i].iova[j];
3414                         struct dmar_domain *domain = deferred_flush[i].domain[j];
3415
3416                         /* On real hardware multiple invalidations are expensive */
3417                         if (cap_caching_mode(iommu->cap))
3418                                 iommu_flush_iotlb_psi(iommu, domain,
3419                                         iova->pfn_lo, iova_size(iova),
3420                                         !deferred_flush[i].freelist[j], 0);
3421                         else {
3422                                 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3423                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3424                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3425                         }
3426                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3427                         if (deferred_flush[i].freelist[j])
3428                                 dma_free_pagelist(deferred_flush[i].freelist[j]);
3429                 }
3430                 deferred_flush[i].next = 0;
3431         }
3432
3433         list_size = 0;
3434 }
3435
3436 static void flush_unmaps_timeout(unsigned long data)
3437 {
3438         unsigned long flags;
3439
3440         spin_lock_irqsave(&async_umap_flush_lock, flags);
3441         flush_unmaps();
3442         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3443 }
3444
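/*
 * Queue (@dom, @iova, @freelist) on the owning IOMMU's deferred-flush list.
 * The queue is drained immediately once HIGH_WATER_MARK entries are pending,
 * otherwise a 10ms timer is armed to call flush_unmaps_timeout().
 */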
3445 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3446 {
3447         unsigned long flags;
3448         int next, iommu_id;
3449         struct intel_iommu *iommu;
3450
3451         spin_lock_irqsave(&async_umap_flush_lock, flags);
3452         if (list_size == HIGH_WATER_MARK)
3453                 flush_unmaps();
3454
3455         iommu = domain_get_iommu(dom);
3456         iommu_id = iommu->seq_id;
3457
3458         next = deferred_flush[iommu_id].next;
3459         deferred_flush[iommu_id].domain[next] = dom;
3460         deferred_flush[iommu_id].iova[next] = iova;
3461         deferred_flush[iommu_id].freelist[next] = freelist;
3462         deferred_flush[iommu_id].next++;
3463
3464         if (!timer_on) {
3465                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3466                 timer_on = 1;
3467         }
3468         list_size++;
3469         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3470 }
3471
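/*
 * Tear down the translation backing @dev_addr: look up the IOVA, unmap the
 * page-table range, and either flush the IOTLB and free everything right
 * away (intel_iommu_strict) or defer the flush via add_unmap().
 */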
3472 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3473 {
3474         struct dmar_domain *domain;
3475         unsigned long start_pfn, last_pfn;
3476         struct iova *iova;
3477         struct intel_iommu *iommu;
3478         struct page *freelist;
3479
3480         if (iommu_no_mapping(dev))
3481                 return;
3482
3483         domain = find_domain(dev);
3484         BUG_ON(!domain);
3485
3486         iommu = domain_get_iommu(domain);
3487
3488         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3489         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3490                       (unsigned long long)dev_addr))
3491                 return;
3492
3493         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3494         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3495
3496         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3497                  dev_name(dev), start_pfn, last_pfn);
3498
3499         freelist = domain_unmap(domain, start_pfn, last_pfn);
3500
3501         if (intel_iommu_strict) {
3502                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3503                                       last_pfn - start_pfn + 1, !freelist, 0);
3504                 /* free iova */
3505                 __free_iova(&domain->iovad, iova);
3506                 dma_free_pagelist(freelist);
3507         } else {
3508                 add_unmap(domain, iova, freelist);
3509                 /*
3510                  * Queue up the release of the unmap to save the roughly
3511                  * 1/6th of the CPU used up by the IOTLB flush operation...
3512                  */
3513         }
3514 }
3515
3516 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3517                              size_t size, enum dma_data_direction dir,
3518                              struct dma_attrs *attrs)
3519 {
3520         intel_unmap(dev, dev_addr);
3521 }
3522
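/*
 * Allocate a zeroed DMA-coherent buffer: try the contiguous (CMA) allocator
 * for blocking requests, fall back to alloc_pages(), then map the buffer
 * bidirectionally through __intel_map_single().
 */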
3523 static void *intel_alloc_coherent(struct device *dev, size_t size,
3524                                   dma_addr_t *dma_handle, gfp_t flags,
3525                                   struct dma_attrs *attrs)
3526 {
3527         struct page *page = NULL;
3528         int order;
3529
3530         size = PAGE_ALIGN(size);
3531         order = get_order(size);
3532
3533         if (!iommu_no_mapping(dev))
3534                 flags &= ~(GFP_DMA | GFP_DMA32);
3535         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3536                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3537                         flags |= GFP_DMA;
3538                 else
3539                         flags |= GFP_DMA32;
3540         }
3541
3542         if (flags & __GFP_WAIT) {
3543                 unsigned int count = size >> PAGE_SHIFT;
3544
3545                 page = dma_alloc_from_contiguous(dev, count, order);
3546                 if (page && iommu_no_mapping(dev) &&
3547                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3548                         dma_release_from_contiguous(dev, page, count);
3549                         page = NULL;
3550                 }
3551         }
3552
3553         if (!page)
3554                 page = alloc_pages(flags, order);
3555         if (!page)
3556                 return NULL;
3557         memset(page_address(page), 0, size);
3558
3559         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3560                                          DMA_BIDIRECTIONAL,
3561                                          dev->coherent_dma_mask);
3562         if (*dma_handle)
3563                 return page_address(page);
3564         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3565                 __free_pages(page, order);
3566
3567         return NULL;
3568 }
3569
3570 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3571                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3572 {
3573         int order;
3574         struct page *page = virt_to_page(vaddr);
3575
3576         size = PAGE_ALIGN(size);
3577         order = get_order(size);
3578
3579         intel_unmap(dev, dma_handle);
3580         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3581                 __free_pages(page, order);
3582 }
3583
3584 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3585                            int nelems, enum dma_data_direction dir,
3586                            struct dma_attrs *attrs)
3587 {
3588         intel_unmap(dev, sglist[0].dma_address);
3589 }
3590
3591 static int intel_nontranslate_map_sg(struct device *hddev,
3592         struct scatterlist *sglist, int nelems, int dir)
3593 {
3594         int i;
3595         struct scatterlist *sg;
3596
3597         for_each_sg(sglist, sg, nelems, i) {
3598                 BUG_ON(!sg_page(sg));
3599                 sg->dma_address = sg_phys(sg);
3600                 sg->dma_length = sg->length;
3601         }
3602         return nelems;
3603 }
3604
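/*
 * Map a scatterlist: reserve one IOVA range large enough for all segments,
 * build the page tables with domain_sg_mapping() and flush as needed.
 * Devices that bypass the IOMMU get a 1:1 physical mapping instead.
 */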
3605 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3606                         enum dma_data_direction dir, struct dma_attrs *attrs)
3607 {
3608         int i;
3609         struct dmar_domain *domain;
3610         size_t size = 0;
3611         int prot = 0;
3612         struct iova *iova = NULL;
3613         int ret;
3614         struct scatterlist *sg;
3615         unsigned long start_vpfn;
3616         struct intel_iommu *iommu;
3617
3618         BUG_ON(dir == DMA_NONE);
3619         if (iommu_no_mapping(dev))
3620                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3621
3622         domain = get_valid_domain_for_dev(dev);
3623         if (!domain)
3624                 return 0;
3625
3626         iommu = domain_get_iommu(domain);
3627
3628         for_each_sg(sglist, sg, nelems, i)
3629                 size += aligned_nrpages(sg->offset, sg->length);
3630
3631         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3632                                 *dev->dma_mask);
3633         if (!iova) {
3634                 sglist->dma_length = 0;
3635                 return 0;
3636         }
3637
3638         /*
3639          * Check if DMAR supports zero-length reads on write-only
3640          * mappings.
3641          */
3642         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3643                         !cap_zlr(iommu->cap))
3644                 prot |= DMA_PTE_READ;
3645         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3646                 prot |= DMA_PTE_WRITE;
3647
3648         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3649
3650         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3651         if (unlikely(ret)) {
3652                 dma_pte_free_pagetable(domain, start_vpfn,
3653                                        start_vpfn + size - 1);
3654                 __free_iova(&domain->iovad, iova);
3655                 return 0;
3656         }
3657
3658         /* It's a non-present to present mapping; only flush if caching mode is set */
3659         if (cap_caching_mode(iommu->cap))
3660                 iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3661         else
3662                 iommu_flush_write_buffer(iommu);
3663
3664         return nelems;
3665 }
3666
3667 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3668 {
3669         return !dma_addr;
3670 }
3671
3672 struct dma_map_ops intel_dma_ops = {
3673         .alloc = intel_alloc_coherent,
3674         .free = intel_free_coherent,
3675         .map_sg = intel_map_sg,
3676         .unmap_sg = intel_unmap_sg,
3677         .map_page = intel_map_page,
3678         .unmap_page = intel_unmap_page,
3679         .mapping_error = intel_mapping_error,
3680 };
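/*
 * Illustrative example (not part of this driver): once dma_ops points at
 * intel_dma_ops, a PCI driver's streaming mapping such as
 *
 *	dma_addr_t handle = dma_map_single(&pdev->dev, buf, len,
 *					   DMA_TO_DEVICE);
 *
 * ends up in intel_map_page()/__intel_map_single() above, and the matching
 * dma_unmap_single() call reaches intel_unmap_page().  Here pdev, buf and
 * len are placeholder names.
 */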
3681
3682 static inline int iommu_domain_cache_init(void)
3683 {
3684         int ret = 0;
3685
3686         iommu_domain_cache = kmem_cache_create("iommu_domain",
3687                                          sizeof(struct dmar_domain),
3688                                          0,
3689                                          SLAB_HWCACHE_ALIGN,
3691                                          NULL);
3692         if (!iommu_domain_cache) {
3693                 pr_err("Couldn't create iommu_domain cache\n");
3694                 ret = -ENOMEM;
3695         }
3696
3697         return ret;
3698 }
3699
3700 static inline int iommu_devinfo_cache_init(void)
3701 {
3702         int ret = 0;
3703
3704         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3705                                          sizeof(struct device_domain_info),
3706                                          0,
3707                                          SLAB_HWCACHE_ALIGN,
3708                                          NULL);
3709         if (!iommu_devinfo_cache) {
3710                 pr_err("Couldn't create devinfo cache\n");
3711                 ret = -ENOMEM;
3712         }
3713
3714         return ret;
3715 }
3716
3717 static int __init iommu_init_mempool(void)
3718 {
3719         int ret;
3720         ret = iova_cache_get();
3721         if (ret)
3722                 return ret;
3723
3724         ret = iommu_domain_cache_init();
3725         if (ret)
3726                 goto domain_error;
3727
3728         ret = iommu_devinfo_cache_init();
3729         if (!ret)
3730                 return ret;
3731
3732         kmem_cache_destroy(iommu_domain_cache);
3733 domain_error:
3734         iova_cache_put();
3735
3736         return -ENOMEM;
3737 }
3738
3739 static void __init iommu_exit_mempool(void)
3740 {
3741         kmem_cache_destroy(iommu_devinfo_cache);
3742         kmem_cache_destroy(iommu_domain_cache);
3743         iova_cache_put();
3744 }
3745
3746 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3747 {
3748         struct dmar_drhd_unit *drhd;
3749         u32 vtbar;
3750         int rc;
3751
3752         /* We know that this device on this chipset has its own IOMMU.
3753          * If we find it under a different IOMMU, then the BIOS is lying
3754          * to us. Hope that the IOMMU for this device is actually
3755          * disabled, and it needs no translation...
3756          */
3757         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3758         if (rc) {
3759                 /* "can't" happen */
3760                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3761                 return;
3762         }
3763         vtbar &= 0xffff0000;
3764
3765         /* we know that this iommu should be at offset 0xa000 from the vtbar */
3766         drhd = dmar_find_matched_drhd_unit(pdev);
3767         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3768                             TAINT_FIRMWARE_WORKAROUND,
3769                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3770                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3771 }
3772 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3773
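/*
 * Mark DRHD units whose device scope is empty as ignored, and handle units
 * that cover only graphics devices: either record that graphics is behind
 * the IOMMU or, when dmar_map_gfx is clear, bypass the unit and give its
 * devices dummy domain info.
 */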
3774 static void __init init_no_remapping_devices(void)
3775 {
3776         struct dmar_drhd_unit *drhd;
3777         struct device *dev;
3778         int i;
3779
3780         for_each_drhd_unit(drhd) {
3781                 if (!drhd->include_all) {
3782                         for_each_active_dev_scope(drhd->devices,
3783                                                   drhd->devices_cnt, i, dev)
3784                                 break;
3785                         /* ignore DMAR unit if no devices exist */
3786                         if (i == drhd->devices_cnt)
3787                                 drhd->ignored = 1;
3788                 }
3789         }
3790
3791         for_each_active_drhd_unit(drhd) {
3792                 if (drhd->include_all)
3793                         continue;
3794
3795                 for_each_active_dev_scope(drhd->devices,
3796                                           drhd->devices_cnt, i, dev)
3797                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3798                                 break;
3799                 if (i < drhd->devices_cnt)
3800                         continue;
3801
3802                 /* This IOMMU has *only* gfx devices. Either bypass it or
3803                    set the gfx_mapped flag, as appropriate */
3804                 if (dmar_map_gfx) {
3805                         intel_iommu_gfx_mapped = 1;
3806                 } else {
3807                         drhd->ignored = 1;
3808                         for_each_active_dev_scope(drhd->devices,
3809                                                   drhd->devices_cnt, i, dev)
3810                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3811                 }
3812         }
3813 }
3814
3815 #ifdef CONFIG_SUSPEND
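/*
 * Reprogram the hardware after resume: re-enable queued invalidation,
 * reinstall the root entry, perform global context/IOTLB invalidation and
 * turn translation back on.  Ignored units only get their PMRs disabled.
 */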
3816 static int init_iommu_hw(void)
3817 {
3818         struct dmar_drhd_unit *drhd;
3819         struct intel_iommu *iommu = NULL;
3820
3821         for_each_active_iommu(iommu, drhd)
3822                 if (iommu->qi)
3823                         dmar_reenable_qi(iommu);
3824
3825         for_each_iommu(iommu, drhd) {
3826                 if (drhd->ignored) {
3827                         /*
3828                          * we always have to disable PMRs or DMA may fail on
3829                          * this device
3830                          */
3831                         if (force_on)
3832                                 iommu_disable_protect_mem_regions(iommu);
3833                         continue;
3834                 }
3835
3836                 iommu_flush_write_buffer(iommu);
3837
3838                 iommu_set_root_entry(iommu);
3839
3840                 iommu->flush.flush_context(iommu, 0, 0, 0,
3841                                            DMA_CCMD_GLOBAL_INVL);
3842                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3843                 iommu_enable_translation(iommu);
3844                 iommu_disable_protect_mem_regions(iommu);
3845         }
3846
3847         return 0;
3848 }
3849
3850 static void iommu_flush_all(void)
3851 {
3852         struct dmar_drhd_unit *drhd;
3853         struct intel_iommu *iommu;
3854
3855         for_each_active_iommu(iommu, drhd) {
3856                 iommu->flush.flush_context(iommu, 0, 0, 0,
3857                                            DMA_CCMD_GLOBAL_INVL);
3858                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3859                                          DMA_TLB_GLOBAL_FLUSH);
3860         }
3861 }
3862
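/*
 * Save the fault-event registers (FECTL/FEDATA/FEADDR/FEUADDR) of every
 * active IOMMU and disable translation before suspend; iommu_resume()
 * restores the saved state.
 */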
3863 static int iommu_suspend(void)
3864 {
3865         struct dmar_drhd_unit *drhd;
3866         struct intel_iommu *iommu = NULL;
3867         unsigned long flag;
3868
3869         for_each_active_iommu(iommu, drhd) {
3870                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3871                                                  GFP_ATOMIC);
3872                 if (!iommu->iommu_state)
3873                         goto nomem;
3874         }
3875
3876         iommu_flush_all();
3877
3878         for_each_active_iommu(iommu, drhd) {
3879                 iommu_disable_translation(iommu);
3880
3881                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3882
3883                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3884                         readl(iommu->reg + DMAR_FECTL_REG);
3885                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3886                         readl(iommu->reg + DMAR_FEDATA_REG);
3887                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3888                         readl(iommu->reg + DMAR_FEADDR_REG);
3889                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3890                         readl(iommu->reg + DMAR_FEUADDR_REG);
3891
3892                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3893         }
3894         return 0;
3895
3896 nomem:
3897         for_each_active_iommu(iommu, drhd)
3898                 kfree(iommu->iommu_state);
3899
3900         return -ENOMEM;
3901 }
3902
3903 static void iommu_resume(void)
3904 {
3905         struct dmar_drhd_unit *drhd;
3906         struct intel_iommu *iommu = NULL;
3907         unsigned long flag;
3908
3909         if (init_iommu_hw()) {
3910                 if (force_on)
3911                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3912                 else
3913                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3914                 return;
3915         }
3916
3917         for_each_active_iommu(iommu, drhd) {
3918
3919                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3920
3921                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3922                         iommu->reg + DMAR_FECTL_REG);
3923                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3924                         iommu->reg + DMAR_FEDATA_REG);
3925                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3926                         iommu->reg + DMAR_FEADDR_REG);
3927                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3928                         iommu->reg + DMAR_FEUADDR_REG);
3929
3930                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3931         }
3932
3933         for_each_active_iommu(iommu, drhd)
3934                 kfree(iommu->iommu_state);
3935 }
3936
3937 static struct syscore_ops iommu_syscore_ops = {
3938         .resume         = iommu_resume,
3939         .suspend        = iommu_suspend,
3940 };
3941
3942 static void __init init_iommu_pm_ops(void)
3943 {
3944         register_syscore_ops(&iommu_syscore_ops);
3945 }
3946
3947 #else
3948 static inline void init_iommu_pm_ops(void) {}
3949 #endif  /* CONFIG_SUSPEND */
3950
3951
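/*
 * Parse one Reserved Memory Region Reporting (RMRR) structure from the
 * DMAR table, record its address range and device scope, and add it to
 * dmar_rmrr_units.
 */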
3952 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3953 {
3954         struct acpi_dmar_reserved_memory *rmrr;
3955         struct dmar_rmrr_unit *rmrru;
3956
3957         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3958         if (!rmrru)
3959                 return -ENOMEM;
3960
3961         rmrru->hdr = header;
3962         rmrr = (struct acpi_dmar_reserved_memory *)header;
3963         rmrru->base_address = rmrr->base_address;
3964         rmrru->end_address = rmrr->end_address;
3965         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3966                                 ((void *)rmrr) + rmrr->header.length,
3967                                 &rmrru->devices_cnt);
3968         if (rmrru->devices_cnt && rmrru->devices == NULL) {
3969                 kfree(rmrru);
3970                 return -ENOMEM;
3971         }
3972
3973         list_add(&rmrru->list, &dmar_rmrr_units);
3974
3975         return 0;
3976 }
3977
3978 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3979 {
3980         struct dmar_atsr_unit *atsru;
3981         struct acpi_dmar_atsr *tmp;
3982
3983         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3984                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3985                 if (atsr->segment != tmp->segment)
3986                         continue;
3987                 if (atsr->header.length != tmp->header.length)
3988                         continue;
3989                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3990                         return atsru;
3991         }
3992
3993         return NULL;
3994 }
3995
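/*
 * Register one ATS Reporting (ATSR) structure: copy the header (the ACPI
 * buffer may be freed on return), record its device scope and add it to
 * dmar_atsr_units.  Duplicates are silently ignored.
 */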
3996 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3997 {
3998         struct acpi_dmar_atsr *atsr;
3999         struct dmar_atsr_unit *atsru;
4000
4001         if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
4002                 return 0;
4003
4004         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4005         atsru = dmar_find_atsr(atsr);
4006         if (atsru)
4007                 return 0;
4008
4009         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4010         if (!atsru)
4011                 return -ENOMEM;
4012
4013         /*
4014          * If the memory was allocated from the slab by the ACPI _DSM method,
4015          * we need to copy its content because the buffer will be freed on
4016          * return.
4017          */
4018         atsru->hdr = (void *)(atsru + 1);
4019         memcpy(atsru->hdr, hdr, hdr->length);
4020         atsru->include_all = atsr->flags & 0x1;
4021         if (!atsru->include_all) {
4022                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4023                                 (void *)atsr + atsr->header.length,
4024                                 &atsru->devices_cnt);
4025                 if (atsru->devices_cnt && atsru->devices == NULL) {
4026                         kfree(atsru);
4027                         return -ENOMEM;
4028                 }
4029         }
4030
4031         list_add_rcu(&atsru->list, &dmar_atsr_units);
4032
4033         return 0;
4034 }
4035
4036 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4037 {
4038         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4039         kfree(atsru);
4040 }
4041
4042 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4043 {
4044         struct acpi_dmar_atsr *atsr;
4045         struct dmar_atsr_unit *atsru;
4046
4047         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4048         atsru = dmar_find_atsr(atsr);
4049         if (atsru) {
4050                 list_del_rcu(&atsru->list);
4051                 synchronize_rcu();
4052                 intel_iommu_free_atsr(atsru);
4053         }
4054
4055         return 0;
4056 }
4057
4058 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4059 {
4060         int i;
4061         struct device *dev;
4062         struct acpi_dmar_atsr *atsr;
4063         struct dmar_atsr_unit *atsru;
4064
4065         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4066         atsru = dmar_find_atsr(atsr);
4067         if (!atsru)
4068                 return 0;
4069
4070         if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
4071                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4072                                           i, dev)
4073                         return -EBUSY;
4074
4075         return 0;
4076 }
4077
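/*
 * Bring up a hot-added DMAR unit: check that its capabilities (pass-through,
 * snooping, superpage) are compatible with the running configuration,
 * allocate domains and a root entry, and enable translation unless the unit
 * is ignored.
 */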
4078 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4079 {
4080         int sp, ret = 0;
4081         struct intel_iommu *iommu = dmaru->iommu;
4082
4083         if (g_iommus[iommu->seq_id])
4084                 return 0;
4085
4086         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4087                 pr_warn("%s: Doesn't support hardware pass through.\n",
4088                         iommu->name);
4089                 return -ENXIO;
4090         }
4091         if (!ecap_sc_support(iommu->ecap) &&
4092             domain_update_iommu_snooping(iommu)) {
4093                 pr_warn("%s: Doesn't support snooping.\n",
4094                         iommu->name);
4095                 return -ENXIO;
4096         }
4097         sp = domain_update_iommu_superpage(iommu) - 1;
4098         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4099                 pr_warn("%s: Doesn't support large page.\n",
4100                         iommu->name);
4101                 return -ENXIO;
4102         }
4103
4104         /*
4105          * Disable translation if already enabled prior to OS handover.
4106          */
4107         if (iommu->gcmd & DMA_GCMD_TE)
4108                 iommu_disable_translation(iommu);
4109
4110         g_iommus[iommu->seq_id] = iommu;
4111         ret = iommu_init_domains(iommu);
4112         if (ret == 0)
4113                 ret = iommu_alloc_root_entry(iommu);
4114         if (ret)
4115                 goto out;
4116
4117         if (dmaru->ignored) {
4118                 /*
4119                  * we always have to disable PMRs or DMA may fail on this device
4120                  */
4121                 if (force_on)
4122                         iommu_disable_protect_mem_regions(iommu);
4123                 return 0;
4124         }
4125
4126         intel_iommu_init_qi(iommu);
4127         iommu_flush_write_buffer(iommu);
4128         ret = dmar_set_interrupt(iommu);
4129         if (ret)
4130                 goto disable_iommu;
4131
4132         iommu_set_root_entry(iommu);
4133         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4134         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4135         iommu_enable_translation(iommu);
4136
4137         iommu_disable_protect_mem_regions(iommu);
4138         return 0;
4139
4140 disable_iommu:
4141         disable_dmar_iommu(iommu);
4142 out:
4143         free_dmar_iommu(iommu);
4144         return ret;
4145 }
4146
4147 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4148 {
4149         int ret = 0;
4150         struct intel_iommu *iommu = dmaru->iommu;
4151
4152         if (!intel_iommu_enabled)
4153                 return 0;
4154         if (iommu == NULL)
4155                 return -EINVAL;
4156
4157         if (insert) {
4158                 ret = intel_iommu_add(dmaru);
4159         } else {
4160                 disable_dmar_iommu(iommu);
4161                 free_dmar_iommu(iommu);
4162         }
4163
4164         return ret;
4165 }
4166
4167 static void intel_iommu_free_dmars(void)
4168 {
4169         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4170         struct dmar_atsr_unit *atsru, *atsr_n;
4171
4172         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4173                 list_del(&rmrru->list);
4174                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4175                 kfree(rmrru);
4176         }
4177
4178         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4179                 list_del(&atsru->list);
4180                 intel_iommu_free_atsr(atsru);
4181         }
4182 }
4183
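/*
 * Return 1 if the PCIe root port above @dev is covered by an ATSR unit
 * (i.e. ATS is permitted for the device), 0 otherwise.
 */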
4184 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4185 {
4186         int i, ret = 1;
4187         struct pci_bus *bus;
4188         struct pci_dev *bridge = NULL;
4189         struct device *tmp;
4190         struct acpi_dmar_atsr *atsr;
4191         struct dmar_atsr_unit *atsru;
4192
4193         dev = pci_physfn(dev);
4194         for (bus = dev->bus; bus; bus = bus->parent) {
4195                 bridge = bus->self;
4196                 if (!bridge || !pci_is_pcie(bridge) ||
4197                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4198                         return 0;
4199                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4200                         break;
4201         }
4202         if (!bridge)
4203                 return 0;
4204
4205         rcu_read_lock();
4206         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4207                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4208                 if (atsr->segment != pci_domain_nr(dev->bus))
4209                         continue;
4210
4211                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4212                         if (tmp == &bridge->dev)
4213                                 goto out;
4214
4215                 if (atsru->include_all)
4216                         goto out;
4217         }
4218         ret = 0;
4219 out:
4220         rcu_read_unlock();
4221
4222         return ret;
4223 }
4224
4225 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4226 {
4227         int ret = 0;
4228         struct dmar_rmrr_unit *rmrru;
4229         struct dmar_atsr_unit *atsru;
4230         struct acpi_dmar_atsr *atsr;
4231         struct acpi_dmar_reserved_memory *rmrr;
4232
4233         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4234                 return 0;
4235
4236         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4237                 rmrr = container_of(rmrru->hdr,
4238                                     struct acpi_dmar_reserved_memory, header);
4239                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4240                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4241                                 ((void *)rmrr) + rmrr->header.length,
4242                                 rmrr->segment, rmrru->devices,
4243                                 rmrru->devices_cnt);
4244                         if (ret < 0)
4245                                 return ret;
4246                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4247                         dmar_remove_dev_scope(info, rmrr->segment,
4248                                 rmrru->devices, rmrru->devices_cnt);
4249                 }
4250         }
4251
4252         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4253                 if (atsru->include_all)
4254                         continue;
4255
4256                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4257                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4258                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4259                                         (void *)atsr + atsr->header.length,
4260                                         atsr->segment, atsru->devices,
4261                                         atsru->devices_cnt);
4262                         if (ret > 0)
4263                                 break;
4264                         else if (ret < 0)
4265                                 return ret;
4266                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4267                         if (dmar_remove_dev_scope(info, atsr->segment,
4268                                         atsru->devices, atsru->devices_cnt))
4269                                 break;
4270                 }
4271         }
4272
4273         return 0;
4274 }
4275
4276 /*
4277  * Here we only respond to a device being unbound from its driver.
4278  *
4279  * A newly added device is not attached to its DMAR domain here yet; that
4280  * will happen when the device is mapped to an IOVA.
4281  */
4282 static int device_notifier(struct notifier_block *nb,
4283                                   unsigned long action, void *data)
4284 {
4285         struct device *dev = data;
4286         struct dmar_domain *domain;
4287
4288         if (iommu_dummy(dev))
4289                 return 0;
4290
4291         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4292                 return 0;
4293
4294         domain = find_domain(dev);
4295         if (!domain)
4296                 return 0;
4297
4298         dmar_remove_one_dev_info(domain, dev);
4299         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4300                 domain_exit(domain);
4301
4302         return 0;
4303 }
4304
4305 static struct notifier_block device_nb = {
4306         .notifier_call = device_notifier,
4307 };
4308
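/*
 * Memory hotplug callback for the static identity domain: extend the 1:1
 * mapping when memory goes online, and unmap and flush the corresponding
 * IOVA range when it goes offline.
 */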
4309 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4310                                        unsigned long val, void *v)
4311 {
4312         struct memory_notify *mhp = v;
4313         unsigned long long start, end;
4314         unsigned long start_vpfn, last_vpfn;
4315
4316         switch (val) {
4317         case MEM_GOING_ONLINE:
4318                 start = mhp->start_pfn << PAGE_SHIFT;
4319                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4320                 if (iommu_domain_identity_map(si_domain, start, end)) {
4321                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4322                                 start, end);
4323                         return NOTIFY_BAD;
4324                 }
4325                 break;
4326
4327         case MEM_OFFLINE:
4328         case MEM_CANCEL_ONLINE:
4329                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4330                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4331                 while (start_vpfn <= last_vpfn) {
4332                         struct iova *iova;
4333                         struct dmar_drhd_unit *drhd;
4334                         struct intel_iommu *iommu;
4335                         struct page *freelist;
4336
4337                         iova = find_iova(&si_domain->iovad, start_vpfn);
4338                         if (iova == NULL) {
4339                                 pr_debug("Failed get IOVA for PFN %lx\n",
4340                                          start_vpfn);
4341                                 break;
4342                         }
4343
4344                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4345                                                      start_vpfn, last_vpfn);
4346                         if (iova == NULL) {
4347                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4348                                         start_vpfn, last_vpfn);
4349                                 return NOTIFY_BAD;
4350                         }
4351
4352                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4353                                                iova->pfn_hi);
4354
4355                         rcu_read_lock();
4356                         for_each_active_iommu(iommu, drhd)
4357                                 iommu_flush_iotlb_psi(iommu, si_domain,
4358                                         iova->pfn_lo, iova_size(iova),
4359                                         !freelist, 0);
4360                         rcu_read_unlock();
4361                         dma_free_pagelist(freelist);
4362
4363                         start_vpfn = iova->pfn_hi + 1;
4364                         free_iova_mem(iova);
4365                 }
4366                 break;
4367         }
4368
4369         return NOTIFY_OK;
4370 }
4371
4372 static struct notifier_block intel_iommu_memory_nb = {
4373         .notifier_call = intel_iommu_memory_notifier,
4374         .priority = 0
4375 };
4376
4377
4378 static ssize_t intel_iommu_show_version(struct device *dev,
4379                                         struct device_attribute *attr,
4380                                         char *buf)
4381 {
4382         struct intel_iommu *iommu = dev_get_drvdata(dev);
4383         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4384         return sprintf(buf, "%d:%d\n",
4385                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4386 }
4387 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4388
4389 static ssize_t intel_iommu_show_address(struct device *dev,
4390                                         struct device_attribute *attr,
4391                                         char *buf)
4392 {
4393         struct intel_iommu *iommu = dev_get_drvdata(dev);
4394         return sprintf(buf, "%llx\n", iommu->reg_phys);
4395 }
4396 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4397
4398 static ssize_t intel_iommu_show_cap(struct device *dev,
4399                                     struct device_attribute *attr,
4400                                     char *buf)
4401 {
4402         struct intel_iommu *iommu = dev_get_drvdata(dev);
4403         return sprintf(buf, "%llx\n", iommu->cap);
4404 }
4405 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4406
4407 static ssize_t intel_iommu_show_ecap(struct device *dev,
4408                                     struct device_attribute *attr,
4409                                     char *buf)
4410 {
4411         struct intel_iommu *iommu = dev_get_drvdata(dev);
4412         return sprintf(buf, "%llx\n", iommu->ecap);
4413 }
4414 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4415
4416 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4417                                       struct device_attribute *attr,
4418                                       char *buf)
4419 {
4420         struct intel_iommu *iommu = dev_get_drvdata(dev);
4421         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4422 }
4423 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4424
4425 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4426                                            struct device_attribute *attr,
4427                                            char *buf)
4428 {
4429         struct intel_iommu *iommu = dev_get_drvdata(dev);
4430         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4431                                                   cap_ndoms(iommu->cap)));
4432 }
4433 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4434
4435 static struct attribute *intel_iommu_attrs[] = {
4436         &dev_attr_version.attr,
4437         &dev_attr_address.attr,
4438         &dev_attr_cap.attr,
4439         &dev_attr_ecap.attr,
4440         &dev_attr_domains_supported.attr,
4441         &dev_attr_domains_used.attr,
4442         NULL,
4443 };
4444
4445 static struct attribute_group intel_iommu_group = {
4446         .name = "intel-iommu",
4447         .attrs = intel_iommu_attrs,
4448 };
4449
4450 const struct attribute_group *intel_iommu_groups[] = {
4451         &intel_iommu_group,
4452         NULL,
4453 };
4454
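/*
 * Main VT-d initialization: parse the DMAR table, set up every IOMMU via
 * init_dmars(), install intel_dma_ops as the DMA API backend and register
 * the bus/memory notifiers and sysfs groups.
 */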
4455 int __init intel_iommu_init(void)
4456 {
4457         int ret = -ENODEV;
4458         struct dmar_drhd_unit *drhd;
4459         struct intel_iommu *iommu;
4460
4461         /* VT-d is required for a TXT/tboot launch, so enforce that */
4462         force_on = tboot_force_iommu();
4463
4464         if (iommu_init_mempool()) {
4465                 if (force_on)
4466                         panic("tboot: Failed to initialize iommu memory\n");
4467                 return -ENOMEM;
4468         }
4469
4470         down_write(&dmar_global_lock);
4471         if (dmar_table_init()) {
4472                 if (force_on)
4473                         panic("tboot: Failed to initialize DMAR table\n");
4474                 goto out_free_dmar;
4475         }
4476
4477         if (dmar_dev_scope_init() < 0) {
4478                 if (force_on)
4479                         panic("tboot: Failed to initialize DMAR device scope\n");
4480                 goto out_free_dmar;
4481         }
4482
4483         if (no_iommu || dmar_disabled)
4484                 goto out_free_dmar;
4485
4486         if (list_empty(&dmar_rmrr_units))
4487                 pr_info("No RMRR found\n");
4488
4489         if (list_empty(&dmar_atsr_units))
4490                 pr_info("No ATSR found\n");
4491
4492         if (dmar_init_reserved_ranges()) {
4493                 if (force_on)
4494                         panic("tboot: Failed to reserve iommu ranges\n");
4495                 goto out_free_reserved_range;
4496         }
4497
4498         init_no_remapping_devices();
4499
4500         ret = init_dmars();
4501         if (ret) {
4502                 if (force_on)
4503                         panic("tboot: Failed to initialize DMARs\n");
4504                 pr_err("Initialization failed\n");
4505                 goto out_free_reserved_range;
4506         }
4507         up_write(&dmar_global_lock);
4508         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4509
4510         init_timer(&unmap_timer);
4511 #ifdef CONFIG_SWIOTLB
4512         swiotlb = 0;
4513 #endif
4514         dma_ops = &intel_dma_ops;
4515
4516         init_iommu_pm_ops();
4517
4518         for_each_active_iommu(iommu, drhd)
4519                 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4520                                                        intel_iommu_groups,
4521                                                        "%s", iommu->name);
4522
4523         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4524         bus_register_notifier(&pci_bus_type, &device_nb);
4525         if (si_domain && !hw_pass_through)
4526                 register_memory_notifier(&intel_iommu_memory_nb);
4527
4528         intel_iommu_enabled = 1;
4529
4530         return 0;
4531
4532 out_free_reserved_range:
4533         put_iova_domain(&reserved_iova_list);
4534 out_free_dmar:
4535         intel_iommu_free_dmars();
4536         up_write(&dmar_global_lock);
4537         iommu_exit_mempool();
4538         return ret;
4539 }
4540
4541 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4542 {
4543         struct intel_iommu *iommu = opaque;
4544
4545         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4546         return 0;
4547 }
4548
4549 /*
4550  * NB - intel-iommu lacks any sort of reference counting for the users of
4551  * dependent devices.  If multiple endpoints have intersecting dependent
4552  * devices, unbinding the driver from any one of them may leave the
4553  * others unable to operate.
4554  */
4555 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4556 {
4557         if (!iommu || !dev || !dev_is_pci(dev))
4558                 return;
4559
4560         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4561 }
4562
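/*
 * Tear down everything that ties @info's device to its domain: the
 * dev-IOTLB/ATS state, the context entries for all DMA aliases, the
 * per-domain bookkeeping and finally the info structure itself.
 * Caller must hold device_domain_lock (asserted below).
 */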
4563 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4564 {
4565         struct intel_iommu *iommu;
4566         unsigned long flags;
4567
4568         assert_spin_locked(&device_domain_lock);
4569
4570         if (WARN_ON(!info))
4571                 return;
4572
4573         iommu = info->iommu;
4574
4575         if (info->dev) {
4576                 iommu_disable_dev_iotlb(info);
4577                 domain_context_clear(iommu, info->dev);
4578         }
4579
4580         unlink_domain_info(info);
4581
4582         spin_lock_irqsave(&iommu->lock, flags);
4583         domain_detach_iommu(info->domain, iommu);
4584         spin_unlock_irqrestore(&iommu->lock, flags);
4585
4586         free_devinfo_mem(info);
4587 }
4588
4589 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4590                                      struct device *dev)
4591 {
4592         struct device_domain_info *info;
4593         unsigned long flags;
4594
4595         spin_lock_irqsave(&device_domain_lock, flags);
4596         info = dev->archdata.iommu;
4597         __dmar_remove_one_dev_info(info);
4598         spin_unlock_irqrestore(&device_domain_lock, flags);
4599 }
4600
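/*
 * Minimal setup for a domain allocated through the IOMMU API: reserve
 * the special IOVA ranges, derive the AGAW from the requested guest
 * address width and allocate the top-level page directory.
 */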
4601 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4602 {
4603         int adjust_width;
4604
4605         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4606                         DMA_32BIT_PFN);
4607         domain_reserve_special_ranges(domain);
4608
4609         /* calculate AGAW */
4610         domain->gaw = guest_width;
4611         adjust_width = guestwidth_to_adjustwidth(guest_width);
4612         domain->agaw = width_to_agaw(adjust_width);
4613
4614         domain->iommu_coherency = 0;
4615         domain->iommu_snooping = 0;
4616         domain->iommu_superpage = 0;
4617         domain->max_addr = 0;
4618
4619         /* always allocate the top pgd */
4620         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4621         if (!domain->pgd)
4622                 return -ENOMEM;
4623         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4624         return 0;
4625 }
4626
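/*
 * iommu_ops->domain_alloc callback.  Only IOMMU_DOMAIN_UNMANAGED
 * domains are supported; they are backed by a "virtual machine" style
 * dmar_domain with the default 48-bit address width, and the aperture
 * reported to the IOMMU core is sized accordingly.
 */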
4627 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4628 {
4629         struct dmar_domain *dmar_domain;
4630         struct iommu_domain *domain;
4631
4632         if (type != IOMMU_DOMAIN_UNMANAGED)
4633                 return NULL;
4634
4635         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4636         if (!dmar_domain) {
4637                 pr_err("Can't allocate dmar_domain\n");
4638                 return NULL;
4639         }
4640         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4641                 pr_err("Domain initialization failed\n");
4642                 domain_exit(dmar_domain);
4643                 return NULL;
4644         }
4645         domain_update_iommu_cap(dmar_domain);
4646
4647         domain = &dmar_domain->domain;
4648         domain->geometry.aperture_start = 0;
4649         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4650         domain->geometry.force_aperture = true;
4651
4652         return domain;
4653 }
4654
4655 static void intel_iommu_domain_free(struct iommu_domain *domain)
4656 {
4657         domain_exit(to_dmar_domain(domain));
4658 }
4659
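/*
 * iommu_ops->attach_dev callback.  Devices with RMRR mappings are
 * refused (their identity mappings must stay intact); any previous
 * domain the device was attached to is torn down first.  The domain's
 * address width is then clamped to what this IOMMU can actually walk,
 * dropping surplus page-table levels if necessary, before the device
 * is finally added to the domain.
 */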
4660 static int intel_iommu_attach_device(struct iommu_domain *domain,
4661                                      struct device *dev)
4662 {
4663         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4664         struct intel_iommu *iommu;
4665         int addr_width;
4666         u8 bus, devfn;
4667
4668         if (device_is_rmrr_locked(dev)) {
4669                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4670                 return -EPERM;
4671         }
4672
4673         /* normally dev is not mapped */
4674         if (unlikely(domain_context_mapped(dev))) {
4675                 struct dmar_domain *old_domain;
4676
4677                 old_domain = find_domain(dev);
4678                 if (old_domain) {
4679                         rcu_read_lock();
4680                         dmar_remove_one_dev_info(old_domain, dev);
4681                         rcu_read_unlock();
4682
4683                         if (!domain_type_is_vm_or_si(old_domain) &&
4684                              list_empty(&old_domain->devices))
4685                                 domain_exit(old_domain);
4686                 }
4687         }
4688
4689         iommu = device_to_iommu(dev, &bus, &devfn);
4690         if (!iommu)
4691                 return -ENODEV;
4692
4693         /* check if this iommu agaw is sufficient for max mapped address */
4694         addr_width = agaw_to_width(iommu->agaw);
4695         if (addr_width > cap_mgaw(iommu->cap))
4696                 addr_width = cap_mgaw(iommu->cap);
4697
4698         if (dmar_domain->max_addr > (1LL << addr_width)) {
4699                 pr_err("%s: iommu width (%d) is not "
4700                        "sufficient for the mapped address (%llx)\n",
4701                        __func__, addr_width, dmar_domain->max_addr);
4702                 return -EFAULT;
4703         }
4704         dmar_domain->gaw = addr_width;
4705
4706         /*
4707          * Knock out extra levels of page tables if necessary
4708          */
4709         while (iommu->agaw < dmar_domain->agaw) {
4710                 struct dma_pte *pte;
4711
4712                 pte = dmar_domain->pgd;
4713                 if (dma_pte_present(pte)) {
4714                         dmar_domain->pgd = (struct dma_pte *)
4715                                 phys_to_virt(dma_pte_addr(pte));
4716                         free_pgtable_page(pte);
4717                 }
4718                 dmar_domain->agaw--;
4719         }
4720
4721         return domain_add_dev_info(dmar_domain, dev);
4722 }
4723
4724 static void intel_iommu_detach_device(struct iommu_domain *domain,
4725                                       struct device *dev)
4726 {
4727         dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
4728 }
4729
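/*
 * iommu_ops->map callback.  IOMMU_READ/WRITE/CACHE are translated into
 * DMA_PTE_* bits (snooping only if the attached hardware supports it),
 * the domain's max_addr watermark is grown after checking that the
 * mapping still fits inside the guest address width, and the range is
 * then mapped in units of VTD_PAGE_SIZE.
 */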
4730 static int intel_iommu_map(struct iommu_domain *domain,
4731                            unsigned long iova, phys_addr_t hpa,
4732                            size_t size, int iommu_prot)
4733 {
4734         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4735         u64 max_addr;
4736         int prot = 0;
4737         int ret;
4738
4739         if (iommu_prot & IOMMU_READ)
4740                 prot |= DMA_PTE_READ;
4741         if (iommu_prot & IOMMU_WRITE)
4742                 prot |= DMA_PTE_WRITE;
4743         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4744                 prot |= DMA_PTE_SNP;
4745
4746         max_addr = iova + size;
4747         if (dmar_domain->max_addr < max_addr) {
4748                 u64 end;
4749
4750                 /* check if minimum agaw is sufficient for mapped address */
4751                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4752                 if (end < max_addr) {
4753                         pr_err("%s: iommu width (%d) is not "
4754                                "sufficient for the mapped address (%llx)\n",
4755                                __func__, dmar_domain->gaw, max_addr);
4756                         return -EFAULT;
4757                 }
4758                 dmar_domain->max_addr = max_addr;
4759         }
4760         /* Convert size into the number of VT-d pages needed to cover the
4761            mapping; the low bits of hpa may push it onto an extra page. */
4762         size = aligned_nrpages(hpa, size);
4763         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4764                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4765         return ret;
4766 }
4767
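/*
 * iommu_ops->unmap callback.  The range is widened to cover a whole
 * large page if the first PTE turns out to be a superpage mapping, the
 * page tables are torn down, and the IOTLB of every IOMMU this domain
 * is attached to is flushed before the freed page-table pages are
 * released.
 */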
4768 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4769                                 unsigned long iova, size_t size)
4770 {
4771         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4772         struct page *freelist = NULL;
4773         struct intel_iommu *iommu;
4774         unsigned long start_pfn, last_pfn;
4775         unsigned int npages;
4776         int iommu_id, level = 0;
4777
4778         /* Cope with horrid API which requires us to unmap more than the
4779            size argument if it happens to be a large-page mapping. */
4780         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4781
4782         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4783                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4784
4785         start_pfn = iova >> VTD_PAGE_SHIFT;
4786         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4787
4788         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4789
4790         npages = last_pfn - start_pfn + 1;
4791
4792         for_each_domain_iommu(iommu_id, dmar_domain) {
4793                 iommu = g_iommus[iommu_id];
4794
4795                 iommu_flush_iotlb_psi(iommu, dmar_domain,
4796                                       start_pfn, npages, !freelist, 0);
4797         }
4798
4799         dma_free_pagelist(freelist);
4800
4801         if (dmar_domain->max_addr == iova + size)
4802                 dmar_domain->max_addr = iova;
4803
4804         return size;
4805 }
4806
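/*
 * iommu_ops->iova_to_phys callback.  Note that dma_pte_addr() yields
 * the page-aligned address stored in the leaf PTE; the offset of @iova
 * within that (super)page is not added back, and 0 is returned when no
 * translation exists.
 */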
4807 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4808                                             dma_addr_t iova)
4809 {
4810         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4811         struct dma_pte *pte;
4812         int level = 0;
4813         u64 phys = 0;
4814
4815         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4816         if (pte)
4817                 phys = dma_pte_addr(pte);
4818
4819         return phys;
4820 }
4821
4822 static bool intel_iommu_capable(enum iommu_cap cap)
4823 {
4824         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4825                 return domain_update_iommu_snooping(NULL) == 1;
4826         if (cap == IOMMU_CAP_INTR_REMAP)
4827                 return irq_remapping_enabled == 1;
4828
4829         return false;
4830 }
4831
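/*
 * iommu_ops->add_device callback, run for every device the IOMMU core
 * discovers on the PCI bus: link the device to its remapping unit in
 * sysfs and place it in (or create) its IOMMU group.
 */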
4832 static int intel_iommu_add_device(struct device *dev)
4833 {
4834         struct intel_iommu *iommu;
4835         struct iommu_group *group;
4836         u8 bus, devfn;
4837
4838         iommu = device_to_iommu(dev, &bus, &devfn);
4839         if (!iommu)
4840                 return -ENODEV;
4841
4842         iommu_device_link(iommu->iommu_dev, dev);
4843
4844         group = iommu_group_get_for_dev(dev);
4845
4846         if (IS_ERR(group))
4847                 return PTR_ERR(group);
4848
4849         iommu_group_put(group);
4850         return 0;
4851 }
4852
4853 static void intel_iommu_remove_device(struct device *dev)
4854 {
4855         struct intel_iommu *iommu;
4856         u8 bus, devfn;
4857
4858         iommu = device_to_iommu(dev, &bus, &devfn);
4859         if (!iommu)
4860                 return;
4861
4862         iommu_group_remove_device(dev);
4863
4864         iommu_device_unlink(iommu->iommu_dev, dev);
4865 }
4866
4867 static const struct iommu_ops intel_iommu_ops = {
4868         .capable        = intel_iommu_capable,
4869         .domain_alloc   = intel_iommu_domain_alloc,
4870         .domain_free    = intel_iommu_domain_free,
4871         .attach_dev     = intel_iommu_attach_device,
4872         .detach_dev     = intel_iommu_detach_device,
4873         .map            = intel_iommu_map,
4874         .unmap          = intel_iommu_unmap,
4875         .map_sg         = default_iommu_map_sg,
4876         .iova_to_phys   = intel_iommu_iova_to_phys,
4877         .add_device     = intel_iommu_add_device,
4878         .remove_device  = intel_iommu_remove_device,
4879         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4880 };
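/*
 * Illustrative sketch (not part of this driver): roughly how a kernel
 * user such as VFIO exercises the callbacks above through the generic
 * IOMMU API.  Error handling is omitted and the iova/page/pdev names
 * are made up for the example.
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	iommu_attach_device(dom, &pdev->dev);	 // -> intel_iommu_attach_device()
 *	iommu_map(dom, iova, page_to_phys(page), PAGE_SIZE,
 *		  IOMMU_READ | IOMMU_WRITE);	 // -> intel_iommu_map()
 *	...
 *	iommu_unmap(dom, iova, PAGE_SIZE);	 // -> intel_iommu_unmap()
 *	iommu_detach_device(dom, &pdev->dev);
 *	iommu_domain_free(dom);
 */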
4881
4882 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4883 {
4884         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4885         pr_info("Disabling IOMMU for graphics on this chipset\n");
4886         dmar_map_gfx = 0;
4887 }
4888
4889 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4890 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4891 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4892 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4893 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4894 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4895 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4896
4897 static void quirk_iommu_rwbf(struct pci_dev *dev)
4898 {
4899         /*
4900          * Mobile 4 Series Chipset neglects to set RWBF capability,
4901          * but needs it. Same seems to hold for the desktop versions.
4902          */
4903         pr_info("Forcing write-buffer flush capability\n");
4904         rwbf_quirk = 1;
4905 }
4906
4907 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4908 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4909 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4910 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4911 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4912 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4913 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4914
4915 #define GGC 0x52
4916 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4917 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4918 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4919 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4920 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4921 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4922 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4923 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4924
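/*
 * Ironlake/Calpella: the BIOS has to allocate extra stolen memory for
 * the shadow GTT before graphics DMA can be remapped at all.  If the
 * GGC register shows no VT-d enabled allocation, give up on translating
 * graphics; otherwise force strict IOTLB flushing, because batched
 * flushes are only safe while the GPU is idle (see below).
 */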
4925 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4926 {
4927         unsigned short ggc;
4928
4929         if (pci_read_config_word(dev, GGC, &ggc))
4930                 return;
4931
4932         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4933                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4934                 dmar_map_gfx = 0;
4935         } else if (dmar_map_gfx) {
4936                 /* we have to ensure the gfx device is idle before we flush */
4937                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
4938                 intel_iommu_strict = 1;
4939         }
4940 }
4941 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4942 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4943 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4944 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4945
4946 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4947    ISOCH DMAR unit for the Azalia sound device, but not give it any
4948    TLB entries, which causes it to deadlock. Check for that.  We do
4949    this in a function called from init_dmars(), instead of in a PCI
4950    quirk, because we don't want to print the obnoxious "BIOS broken"
4951    message if VT-d is actually disabled.
4952 */
4953 static void __init check_tylersburg_isoch(void)
4954 {
4955         struct pci_dev *pdev;
4956         uint32_t vtisochctrl;
4957
4958         /* If there's no Azalia in the system anyway, forget it. */
4959         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4960         if (!pdev)
4961                 return;
4962         pci_dev_put(pdev);
4963
4964         /* System Management Registers. Might be hidden, in which case
4965            we can't do the sanity check. But that's OK, because the
4966            known-broken BIOSes _don't_ actually hide it, so far. */
4967         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4968         if (!pdev)
4969                 return;
4970
4971         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4972                 pci_dev_put(pdev);
4973                 return;
4974         }
4975
4976         pci_dev_put(pdev);
4977
4978         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4979         if (vtisochctrl & 1)
4980                 return;
4981
4982         /* Drop all bits other than the number of TLB entries */
4983         vtisochctrl &= 0x1c;
4984
4985         /* If we have the recommended number of TLB entries (16), fine. */
4986         if (vtisochctrl == 0x10)
4987                 return;
4988
4989         /* Zero TLB entries? Warn and keep Azalia identity mapped. */
4990         if (!vtisochctrl) {
4991                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4992                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4993                      dmi_get_system_info(DMI_BIOS_VENDOR),
4994                      dmi_get_system_info(DMI_BIOS_VERSION),
4995                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4996                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4997                 return;
4998         }
4999
5000         pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
5001                 vtisochctrl);
5002 }