1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22 #define dev_fmt(fmt)    pr_fmt(fmt)
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/memory.h>
37 #include <linux/cpu.h>
38 #include <linux/timer.h>
39 #include <linux/io.h>
40 #include <linux/iova.h>
41 #include <linux/iommu.h>
42 #include <linux/intel-iommu.h>
43 #include <linux/syscore_ops.h>
44 #include <linux/tboot.h>
45 #include <linux/dmi.h>
46 #include <linux/pci-ats.h>
47 #include <linux/memblock.h>
48 #include <linux/dma-contiguous.h>
49 #include <linux/dma-direct.h>
50 #include <linux/crash_dump.h>
51 #include <linux/numa.h>
52 #include <asm/irq_remapping.h>
53 #include <asm/cacheflush.h>
54 #include <asm/iommu.h>
55
56 #include "irq_remapping.h"
57 #include "intel-pasid.h"
58
59 #define ROOT_SIZE               VTD_PAGE_SIZE
60 #define CONTEXT_SIZE            VTD_PAGE_SIZE
61
62 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
63 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
64 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
65 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
66
67 #define IOAPIC_RANGE_START      (0xfee00000)
68 #define IOAPIC_RANGE_END        (0xfeefffff)
69 #define IOVA_START_ADDR         (0x1000)
70
71 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
72
73 #define MAX_AGAW_WIDTH 64
74 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
75
76 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
77 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
78
79 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
80    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
81 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
82                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
83 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
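/*
 * Worked example (illustrative): for a 48-bit guest address width
 * (4-level table), __DOMAIN_MAX_PFN(48) = 2^36 - 1 and
 * DOMAIN_MAX_ADDR(48) = 2^48 - 4KiB; on 32-bit kernels DOMAIN_MAX_PFN()
 * clamps the pfn to ULONG_MAX so 'unsigned long' pfn arithmetic stays safe.
 */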
84
85 /* IO virtual address start page frame number */
86 #define IOVA_START_PFN          (1)
87
88 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
89
90 /* page table handling */
91 #define LEVEL_STRIDE            (9)
92 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
93
94 /*
95  * This bitmap is used to advertise the page sizes our hardware supports
96  * to the IOMMU core, which will then use this information to split
97  * physically contiguous memory regions it is mapping into page sizes
98  * that we support.
99  *
100  * Traditionally the IOMMU core just handed us the mappings directly,
101  * after making sure the size is a power-of-two multiple of 4KiB and that the
102  * mapping has natural alignment.
103  *
104  * To retain this behavior, we currently advertise that we support
105  * all page sizes that are a power-of-two multiple of 4KiB.
106  *
107  * If at some point we'd like to utilize the IOMMU core's new behavior,
108  * we could change this to advertise the real page sizes we support.
109  */
110 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
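/*
 * Illustrative sketch (not part of the driver): how a caller could test
 * whether a particular page size falls within the advertised bitmap.
 * 'example_pgsize_advertised' is a hypothetical helper, shown only to
 * make the bitmap semantics concrete.
 */
#if 0
static inline bool example_pgsize_advertised(unsigned long pgsize)
{
	/* Only exact powers of two are meaningful page sizes. */
	return is_power_of_2(pgsize) && (pgsize & INTEL_IOMMU_PGSIZES);
}
#endif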
111
112 static inline int agaw_to_level(int agaw)
113 {
114         return agaw + 2;
115 }
116
117 static inline int agaw_to_width(int agaw)
118 {
119         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
120 }
121
122 static inline int width_to_agaw(int width)
123 {
124         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
125 }
126
127 static inline unsigned int level_to_offset_bits(int level)
128 {
129         return (level - 1) * LEVEL_STRIDE;
130 }
131
132 static inline int pfn_level_offset(unsigned long pfn, int level)
133 {
134         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
135 }
136
137 static inline unsigned long level_mask(int level)
138 {
139         return -1UL << level_to_offset_bits(level);
140 }
141
142 static inline unsigned long level_size(int level)
143 {
144         return 1UL << level_to_offset_bits(level);
145 }
146
147 static inline unsigned long align_to_level(unsigned long pfn, int level)
148 {
149         return (pfn + level_size(level) - 1) & level_mask(level);
150 }
151
152 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
153 {
154         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
155 }
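/*
 * Worked example (illustrative): for a 48-bit address width,
 * width_to_agaw(48) = 2 and agaw_to_level(2) = 4, i.e. a 4-level page
 * table. At level 2, level_to_offset_bits(2) = 9, so
 * pfn_level_offset(pfn, 2) selects bits 9..17 of the DMA pfn and
 * level_size(2) = 512 pages (2MiB of IOVA per level-2 entry).
 */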
156
157 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
158    are never going to work. */
159 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
160 {
161         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 }
163
164 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
165 {
166         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
167 }
168 static inline unsigned long page_to_dma_pfn(struct page *pg)
169 {
170         return mm_to_dma_pfn(page_to_pfn(pg));
171 }
172 static inline unsigned long virt_to_dma_pfn(void *p)
173 {
174         return page_to_dma_pfn(virt_to_page(p));
175 }
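/*
 * Note (illustrative): VTD_PAGE_SHIFT is always 12, so with 4KiB MM pages
 * these conversions are an identity; with larger MM pages each MM pfn
 * corresponds to PAGE_SIZE/VTD_PAGE_SIZE DMA pfns.
 */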
176
177 /* global iommu list, set NULL for ignored DMAR units */
178 static struct intel_iommu **g_iommus;
179
180 static void __init check_tylersburg_isoch(void);
181 static int rwbf_quirk;
182
183 /*
184  * Set to 1 to panic the kernel if VT-d cannot be successfully enabled
185  * (used when the kernel is launched with TXT)
186  */
187 static int force_on = 0;
188 int intel_iommu_tboot_noforce;
189 static int no_platform_optin;
190
191 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
192
193 /*
194  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
195  * if marked present.
196  */
197 static phys_addr_t root_entry_lctp(struct root_entry *re)
198 {
199         if (!(re->lo & 1))
200                 return 0;
201
202         return re->lo & VTD_PAGE_MASK;
203 }
204
205 /*
206  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
207  * if marked present.
208  */
209 static phys_addr_t root_entry_uctp(struct root_entry *re)
210 {
211         if (!(re->hi & 1))
212                 return 0;
213
214         return re->hi & VTD_PAGE_MASK;
215 }
216
217 static inline void context_clear_pasid_enable(struct context_entry *context)
218 {
219         context->lo &= ~(1ULL << 11);
220 }
221
222 static inline bool context_pasid_enabled(struct context_entry *context)
223 {
224         return !!(context->lo & (1ULL << 11));
225 }
226
227 static inline void context_set_copied(struct context_entry *context)
228 {
229         context->hi |= (1ull << 3);
230 }
231
232 static inline bool context_copied(struct context_entry *context)
233 {
234         return !!(context->hi & (1ULL << 3));
235 }
236
237 static inline bool __context_present(struct context_entry *context)
238 {
239         return (context->lo & 1);
240 }
241
242 bool context_present(struct context_entry *context)
243 {
244         return context_pasid_enabled(context) ?
245              __context_present(context) :
246              __context_present(context) && !context_copied(context);
247 }
248
249 static inline void context_set_present(struct context_entry *context)
250 {
251         context->lo |= 1;
252 }
253
254 static inline void context_set_fault_enable(struct context_entry *context)
255 {
256         context->lo &= (((u64)-1) << 2) | 1;
257 }
258
259 static inline void context_set_translation_type(struct context_entry *context,
260                                                 unsigned long value)
261 {
262         context->lo &= (((u64)-1) << 4) | 3;
263         context->lo |= (value & 3) << 2;
264 }
265
266 static inline void context_set_address_root(struct context_entry *context,
267                                             unsigned long value)
268 {
269         context->lo &= ~VTD_PAGE_MASK;
270         context->lo |= value & VTD_PAGE_MASK;
271 }
272
273 static inline void context_set_address_width(struct context_entry *context,
274                                              unsigned long value)
275 {
276         context->hi |= value & 7;
277 }
278
279 static inline void context_set_domain_id(struct context_entry *context,
280                                          unsigned long value)
281 {
282         context->hi |= (value & ((1 << 16) - 1)) << 8;
283 }
284
285 static inline int context_domain_id(struct context_entry *c)
286 {
287         return((c->hi >> 8) & 0xffff);
288 }
289
290 static inline void context_clear_entry(struct context_entry *context)
291 {
292         context->lo = 0;
293         context->hi = 0;
294 }
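/*
 * Illustrative sketch (an assumption, not the driver's actual mapping
 * path): how the bit-field helpers above compose into a legacy-mode
 * context entry. Translation type 0 corresponds to ordinary multi-level
 * translation in the VT-d spec.
 */
#if 0
static void example_build_context(struct context_entry *ce, u16 did,
				  unsigned long pgd_phys, int agaw)
{
	context_clear_entry(ce);
	context_set_domain_id(ce, did);
	context_set_address_width(ce, agaw);
	context_set_address_root(ce, pgd_phys);
	context_set_translation_type(ce, 0);	/* multi-level translation */
	context_set_fault_enable(ce);
	context_set_present(ce);
}
#endif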
295
296 /*
297  * This domain is a static identity mapping domain.
298  *      1. This domain creates a static 1:1 mapping of all usable memory.
299  *      2. It maps to each iommu if successful.
300  *      3. Each iommu maps to this domain if successful.
301  */
302 static struct dmar_domain *si_domain;
303 static int hw_pass_through = 1;
304
305 /*
306  * Domain represents a virtual machine; more than one device
307  * across iommus may be owned by one domain, e.g. a kvm guest.
308  */
309 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
310
311 /* si_domain contains multiple devices */
312 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
313
314 #define for_each_domain_iommu(idx, domain)                      \
315         for (idx = 0; idx < g_num_of_iommus; idx++)             \
316                 if (domain->iommu_refcnt[idx])
317
318 struct dmar_rmrr_unit {
319         struct list_head list;          /* list of rmrr units   */
320         struct acpi_dmar_header *hdr;   /* ACPI header          */
321         u64     base_address;           /* reserved base address*/
322         u64     end_address;            /* reserved end address */
323         struct dmar_dev_scope *devices; /* target devices */
324         int     devices_cnt;            /* target device count */
325         struct iommu_resv_region *resv; /* reserved region handle */
326 };
327
328 struct dmar_atsr_unit {
329         struct list_head list;          /* list of ATSR units */
330         struct acpi_dmar_header *hdr;   /* ACPI header */
331         struct dmar_dev_scope *devices; /* target devices */
332         int devices_cnt;                /* target device count */
333         u8 include_all:1;               /* include all ports */
334 };
335
336 static LIST_HEAD(dmar_atsr_units);
337 static LIST_HEAD(dmar_rmrr_units);
338
339 #define for_each_rmrr_units(rmrr) \
340         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
341
342 /* number of registered IOMMUs, used to size and index g_iommus */
343 static int g_num_of_iommus;
344
345 static void domain_exit(struct dmar_domain *domain);
346 static void domain_remove_dev_info(struct dmar_domain *domain);
347 static void dmar_remove_one_dev_info(struct device *dev);
348 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
349 static void domain_context_clear(struct intel_iommu *iommu,
350                                  struct device *dev);
351 static int domain_detach_iommu(struct dmar_domain *domain,
352                                struct intel_iommu *iommu);
353
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
356 #else
357 int dmar_disabled = 1;
358 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
359
360 int intel_iommu_enabled = 0;
361 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
362
363 static int dmar_map_gfx = 1;
364 static int dmar_forcedac;
365 static int intel_iommu_strict;
366 static int intel_iommu_superpage = 1;
367 static int intel_iommu_sm;
368 static int iommu_identity_mapping;
369
370 #define IDENTMAP_ALL            1
371 #define IDENTMAP_GFX            2
372 #define IDENTMAP_AZALIA         4
373
374 #define sm_supported(iommu)     (intel_iommu_sm && ecap_smts((iommu)->ecap))
375 #define pasid_supported(iommu)  (sm_supported(iommu) &&                 \
376                                  ecap_pasid((iommu)->ecap))
377
378 int intel_iommu_gfx_mapped;
379 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
380
381 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
382 static DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
384
385 /*
386  * Iterate over elements in device_domain_list and call the specified
387  * callback @fn against each element.
388  */
389 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
390                                      void *data), void *data)
391 {
392         int ret = 0;
393         unsigned long flags;
394         struct device_domain_info *info;
395
396         spin_lock_irqsave(&device_domain_lock, flags);
397         list_for_each_entry(info, &device_domain_list, global) {
398                 ret = fn(info, data);
399                 if (ret) {
400                         spin_unlock_irqrestore(&device_domain_lock, flags);
401                         return ret;
402                 }
403         }
404         spin_unlock_irqrestore(&device_domain_lock, flags);
405
406         return 0;
407 }
408
409 const struct iommu_ops intel_iommu_ops;
410
411 static bool translation_pre_enabled(struct intel_iommu *iommu)
412 {
413         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
414 }
415
416 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
417 {
418         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
419 }
420
421 static void init_translation_status(struct intel_iommu *iommu)
422 {
423         u32 gsts;
424
425         gsts = readl(iommu->reg + DMAR_GSTS_REG);
426         if (gsts & DMA_GSTS_TES)
427                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
428 }
429
430 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
431 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
432 {
433         return container_of(dom, struct dmar_domain, domain);
434 }
435
436 static int __init intel_iommu_setup(char *str)
437 {
438         if (!str)
439                 return -EINVAL;
440         while (*str) {
441                 if (!strncmp(str, "on", 2)) {
442                         dmar_disabled = 0;
443                         pr_info("IOMMU enabled\n");
444                 } else if (!strncmp(str, "off", 3)) {
445                         dmar_disabled = 1;
446                         no_platform_optin = 1;
447                         pr_info("IOMMU disabled\n");
448                 } else if (!strncmp(str, "igfx_off", 8)) {
449                         dmar_map_gfx = 0;
450                         pr_info("Disable GFX device mapping\n");
451                 } else if (!strncmp(str, "forcedac", 8)) {
452                         pr_info("Forcing DAC for PCI devices\n");
453                         dmar_forcedac = 1;
454                 } else if (!strncmp(str, "strict", 6)) {
455                         pr_info("Disable batched IOTLB flush\n");
456                         intel_iommu_strict = 1;
457                 } else if (!strncmp(str, "sp_off", 6)) {
458                         pr_info("Disable supported super page\n");
459                         intel_iommu_superpage = 0;
460                 } else if (!strncmp(str, "sm_on", 5)) {
461                         pr_info("Intel-IOMMU: scalable mode supported\n");
462                         intel_iommu_sm = 1;
463                 } else if (!strncmp(str, "tboot_noforce", 13)) {
464                         printk(KERN_INFO
465                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
466                         intel_iommu_tboot_noforce = 1;
467                 }
468
469                 str += strcspn(str, ",");
470                 while (*str == ',')
471                         str++;
472         }
473         return 0;
474 }
475 __setup("intel_iommu=", intel_iommu_setup);
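/*
 * Example (illustrative) kernel command line use of the options parsed
 * above; options are comma separated:
 *
 *	intel_iommu=on,sm_on,strict
 */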
476
477 static struct kmem_cache *iommu_domain_cache;
478 static struct kmem_cache *iommu_devinfo_cache;
479
480 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
481 {
482         struct dmar_domain **domains;
483         int idx = did >> 8;
484
485         domains = iommu->domains[idx];
486         if (!domains)
487                 return NULL;
488
489         return domains[did & 0xff];
490 }
491
492 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
493                              struct dmar_domain *domain)
494 {
495         struct dmar_domain **domains;
496         int idx = did >> 8;
497
498         if (!iommu->domains[idx]) {
499                 size_t size = 256 * sizeof(struct dmar_domain *);
500                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
501         }
502
503         domains = iommu->domains[idx];
504         if (WARN_ON(!domains))
505                 return;
506
507         domains[did & 0xff] = domain;
508 }
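/*
 * Worked example (illustrative): the domains array is a two-level table
 * indexed by domain id. For did 0x1234, idx = 0x12 selects the
 * second-level array (allocated on demand) and slot 0x34 within it holds
 * the dmar_domain pointer.
 */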
509
510 void *alloc_pgtable_page(int node)
511 {
512         struct page *page;
513         void *vaddr = NULL;
514
515         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
516         if (page)
517                 vaddr = page_address(page);
518         return vaddr;
519 }
520
521 void free_pgtable_page(void *vaddr)
522 {
523         free_page((unsigned long)vaddr);
524 }
525
526 static inline void *alloc_domain_mem(void)
527 {
528         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
529 }
530
531 static void free_domain_mem(void *vaddr)
532 {
533         kmem_cache_free(iommu_domain_cache, vaddr);
534 }
535
536 static inline void *alloc_devinfo_mem(void)
537 {
538         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
539 }
540
541 static inline void free_devinfo_mem(void *vaddr)
542 {
543         kmem_cache_free(iommu_devinfo_cache, vaddr);
544 }
545
546 static inline int domain_type_is_vm(struct dmar_domain *domain)
547 {
548         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
549 }
550
551 static inline int domain_type_is_si(struct dmar_domain *domain)
552 {
553         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
554 }
555
556 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
557 {
558         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
559                                 DOMAIN_FLAG_STATIC_IDENTITY);
560 }
561
562 static inline int domain_pfn_supported(struct dmar_domain *domain,
563                                        unsigned long pfn)
564 {
565         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
566
567         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
568 }
569
570 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
571 {
572         unsigned long sagaw;
573         int agaw = -1;
574
575         sagaw = cap_sagaw(iommu->cap);
576         for (agaw = width_to_agaw(max_gaw);
577              agaw >= 0; agaw--) {
578                 if (test_bit(agaw, &sagaw))
579                         break;
580         }
581
582         return agaw;
583 }
584
585 /*
586  * Calculate max SAGAW for each iommu.
587  */
588 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
589 {
590         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
591 }
592
593 /*
594  * Calculate the agaw for each iommu.
595  * "SAGAW" may differ across iommus; use a default agaw, and fall back
596  * to a smaller supported agaw for iommus that don't support the default.
597  */
598 int iommu_calculate_agaw(struct intel_iommu *iommu)
599 {
600         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
601 }
602
603 /* This function only returns a single iommu in a domain */
604 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
605 {
606         int iommu_id;
607
608         /* si_domain and vm domain should not get here. */
609         BUG_ON(domain_type_is_vm_or_si(domain));
610         for_each_domain_iommu(iommu_id, domain)
611                 break;
612
613         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
614                 return NULL;
615
616         return g_iommus[iommu_id];
617 }
618
619 static void domain_update_iommu_coherency(struct dmar_domain *domain)
620 {
621         struct dmar_drhd_unit *drhd;
622         struct intel_iommu *iommu;
623         bool found = false;
624         int i;
625
626         domain->iommu_coherency = 1;
627
628         for_each_domain_iommu(i, domain) {
629                 found = true;
630                 if (!ecap_coherent(g_iommus[i]->ecap)) {
631                         domain->iommu_coherency = 0;
632                         break;
633                 }
634         }
635         if (found)
636                 return;
637
638         /* No hardware attached; use lowest common denominator */
639         rcu_read_lock();
640         for_each_active_iommu(iommu, drhd) {
641                 if (!ecap_coherent(iommu->ecap)) {
642                         domain->iommu_coherency = 0;
643                         break;
644                 }
645         }
646         rcu_read_unlock();
647 }
648
649 static int domain_update_iommu_snooping(struct intel_iommu *skip)
650 {
651         struct dmar_drhd_unit *drhd;
652         struct intel_iommu *iommu;
653         int ret = 1;
654
655         rcu_read_lock();
656         for_each_active_iommu(iommu, drhd) {
657                 if (iommu != skip) {
658                         if (!ecap_sc_support(iommu->ecap)) {
659                                 ret = 0;
660                                 break;
661                         }
662                 }
663         }
664         rcu_read_unlock();
665
666         return ret;
667 }
668
669 static int domain_update_iommu_superpage(struct intel_iommu *skip)
670 {
671         struct dmar_drhd_unit *drhd;
672         struct intel_iommu *iommu;
673         int mask = 0xf;
674
675         if (!intel_iommu_superpage) {
676                 return 0;
677         }
678
679         /* set iommu_superpage to the smallest common denominator */
680         rcu_read_lock();
681         for_each_active_iommu(iommu, drhd) {
682                 if (iommu != skip) {
683                         mask &= cap_super_page_val(iommu->cap);
684                         if (!mask)
685                                 break;
686                 }
687         }
688         rcu_read_unlock();
689
690         return fls(mask);
691 }
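/*
 * Illustrative reading of the return value: cap_super_page_val() is a
 * bitmap of supported super-page sizes (bit 0 = 2MiB, bit 1 = 1GiB per
 * the VT-d spec), so a common mask of 0x1 yields fls() = 1 (2MiB only)
 * and 0x3 yields 2 (2MiB and 1GiB).
 */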
692
693 /* Some capabilities may be different across iommus */
694 static void domain_update_iommu_cap(struct dmar_domain *domain)
695 {
696         domain_update_iommu_coherency(domain);
697         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
698         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
699 }
700
701 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
702                                          u8 devfn, int alloc)
703 {
704         struct root_entry *root = &iommu->root_entry[bus];
705         struct context_entry *context;
706         u64 *entry;
707
708         entry = &root->lo;
709         if (sm_supported(iommu)) {
710                 if (devfn >= 0x80) {
711                         devfn -= 0x80;
712                         entry = &root->hi;
713                 }
714                 devfn *= 2;
715         }
716         if (*entry & 1)
717                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
718         else {
719                 unsigned long phy_addr;
720                 if (!alloc)
721                         return NULL;
722
723                 context = alloc_pgtable_page(iommu->node);
724                 if (!context)
725                         return NULL;
726
727                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
728                 phy_addr = virt_to_phys((void *)context);
729                 *entry = phy_addr | 1;
730                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
731         }
732         return &context[devfn];
733 }
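/*
 * Layout note (illustrative): in legacy mode each root entry points to a
 * single 256-entry context table indexed by devfn. In scalable mode the
 * code above uses root->lo for devfn 0-127 and root->hi for devfn
 * 128-255, and doubles devfn because scalable-mode context entries are
 * twice the size, so each table holds 128 of them.
 */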
734
735 static int iommu_dummy(struct device *dev)
736 {
737         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
738 }
739
740 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
741 {
742         struct dmar_drhd_unit *drhd = NULL;
743         struct intel_iommu *iommu;
744         struct device *tmp;
745         struct pci_dev *ptmp, *pdev = NULL;
746         u16 segment = 0;
747         int i;
748
749         if (iommu_dummy(dev))
750                 return NULL;
751
752         if (dev_is_pci(dev)) {
753                 struct pci_dev *pf_pdev;
754
755                 pdev = to_pci_dev(dev);
756
757 #ifdef CONFIG_X86
758                 /* VMD child devices currently cannot be handled individually */
759                 if (is_vmd(pdev->bus))
760                         return NULL;
761 #endif
762
763                 /* VFs aren't listed in scope tables; we need to look up
764                  * the PF instead to find the IOMMU. */
765                 pf_pdev = pci_physfn(pdev);
766                 dev = &pf_pdev->dev;
767                 segment = pci_domain_nr(pdev->bus);
768         } else if (has_acpi_companion(dev))
769                 dev = &ACPI_COMPANION(dev)->dev;
770
771         rcu_read_lock();
772         for_each_active_iommu(iommu, drhd) {
773                 if (pdev && segment != drhd->segment)
774                         continue;
775
776                 for_each_active_dev_scope(drhd->devices,
777                                           drhd->devices_cnt, i, tmp) {
778                         if (tmp == dev) {
779                                 /* For a VF use its original BDF# not that of the PF
780                                  * which we used for the IOMMU lookup. Strictly speaking
781                                  * we could do this for all PCI devices; we only need to
782                                  * get the BDF# from the scope table for ACPI matches. */
783                                 if (pdev && pdev->is_virtfn)
784                                         goto got_pdev;
785
786                                 *bus = drhd->devices[i].bus;
787                                 *devfn = drhd->devices[i].devfn;
788                                 goto out;
789                         }
790
791                         if (!pdev || !dev_is_pci(tmp))
792                                 continue;
793
794                         ptmp = to_pci_dev(tmp);
795                         if (ptmp->subordinate &&
796                             ptmp->subordinate->number <= pdev->bus->number &&
797                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
798                                 goto got_pdev;
799                 }
800
801                 if (pdev && drhd->include_all) {
802                 got_pdev:
803                         *bus = pdev->bus->number;
804                         *devfn = pdev->devfn;
805                         goto out;
806                 }
807         }
808         iommu = NULL;
809  out:
810         rcu_read_unlock();
811
812         return iommu;
813 }
814
815 static void domain_flush_cache(struct dmar_domain *domain,
816                                void *addr, int size)
817 {
818         if (!domain->iommu_coherency)
819                 clflush_cache_range(addr, size);
820 }
821
822 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
823 {
824         struct context_entry *context;
825         int ret = 0;
826         unsigned long flags;
827
828         spin_lock_irqsave(&iommu->lock, flags);
829         context = iommu_context_addr(iommu, bus, devfn, 0);
830         if (context)
831                 ret = context_present(context);
832         spin_unlock_irqrestore(&iommu->lock, flags);
833         return ret;
834 }
835
836 static void free_context_table(struct intel_iommu *iommu)
837 {
838         int i;
839         unsigned long flags;
840         struct context_entry *context;
841
842         spin_lock_irqsave(&iommu->lock, flags);
843         if (!iommu->root_entry) {
844                 goto out;
845         }
846         for (i = 0; i < ROOT_ENTRY_NR; i++) {
847                 context = iommu_context_addr(iommu, i, 0, 0);
848                 if (context)
849                         free_pgtable_page(context);
850
851                 if (!sm_supported(iommu))
852                         continue;
853
854                 context = iommu_context_addr(iommu, i, 0x80, 0);
855                 if (context)
856                         free_pgtable_page(context);
857
858         }
859         free_pgtable_page(iommu->root_entry);
860         iommu->root_entry = NULL;
861 out:
862         spin_unlock_irqrestore(&iommu->lock, flags);
863 }
864
865 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
866                                       unsigned long pfn, int *target_level)
867 {
868         struct dma_pte *parent, *pte;
869         int level = agaw_to_level(domain->agaw);
870         int offset;
871
872         BUG_ON(!domain->pgd);
873
874         if (!domain_pfn_supported(domain, pfn))
875                 /* Address beyond IOMMU's addressing capabilities. */
876                 return NULL;
877
878         parent = domain->pgd;
879
880         while (1) {
881                 void *tmp_page;
882
883                 offset = pfn_level_offset(pfn, level);
884                 pte = &parent[offset];
885                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
886                         break;
887                 if (level == *target_level)
888                         break;
889
890                 if (!dma_pte_present(pte)) {
891                         uint64_t pteval;
892
893                         tmp_page = alloc_pgtable_page(domain->nid);
894
895                         if (!tmp_page)
896                                 return NULL;
897
898                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
899                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
900                         if (cmpxchg64(&pte->val, 0ULL, pteval))
901                                 /* Someone else set it while we were thinking; use theirs. */
902                                 free_pgtable_page(tmp_page);
903                         else
904                                 domain_flush_cache(domain, pte, sizeof(*pte));
905                 }
906                 if (level == 1)
907                         break;
908
909                 parent = phys_to_virt(dma_pte_addr(pte));
910                 level--;
911         }
912
913         if (!*target_level)
914                 *target_level = level;
915
916         return pte;
917 }
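/*
 * Usage sketch (an assumption about a typical caller, not lifted from the
 * mapping path): *target_level selects the level at which a pte is
 * wanted; 0 means "whatever leaf already exists" and does not allocate
 * missing levels, while 1 allocates tables all the way down to a 4KiB leaf.
 */
#if 0
static struct dma_pte *example_get_4k_pte(struct dmar_domain *domain,
					  unsigned long pfn)
{
	int level = 1;	/* request a 4KiB leaf, allocating tables as needed */

	return pfn_to_dma_pte(domain, pfn, &level);
}
#endif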
918
919
920 /* Return the address's pte at the specified level */
921 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
922                                          unsigned long pfn,
923                                          int level, int *large_page)
924 {
925         struct dma_pte *parent, *pte;
926         int total = agaw_to_level(domain->agaw);
927         int offset;
928
929         parent = domain->pgd;
930         while (level <= total) {
931                 offset = pfn_level_offset(pfn, total);
932                 pte = &parent[offset];
933                 if (level == total)
934                         return pte;
935
936                 if (!dma_pte_present(pte)) {
937                         *large_page = total;
938                         break;
939                 }
940
941                 if (dma_pte_superpage(pte)) {
942                         *large_page = total;
943                         return pte;
944                 }
945
946                 parent = phys_to_virt(dma_pte_addr(pte));
947                 total--;
948         }
949         return NULL;
950 }
951
952 /* Clear last-level (leaf) ptes; an IOTLB flush should follow */
953 static void dma_pte_clear_range(struct dmar_domain *domain,
954                                 unsigned long start_pfn,
955                                 unsigned long last_pfn)
956 {
957         unsigned int large_page;
958         struct dma_pte *first_pte, *pte;
959
960         BUG_ON(!domain_pfn_supported(domain, start_pfn));
961         BUG_ON(!domain_pfn_supported(domain, last_pfn));
962         BUG_ON(start_pfn > last_pfn);
963
964         /* we don't need lock here; nobody else touches the iova range */
965         do {
966                 large_page = 1;
967                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
968                 if (!pte) {
969                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
970                         continue;
971                 }
972                 do {
973                         dma_clear_pte(pte);
974                         start_pfn += lvl_to_nr_pages(large_page);
975                         pte++;
976                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
977
978                 domain_flush_cache(domain, first_pte,
979                                    (void *)pte - (void *)first_pte);
980
981         } while (start_pfn && start_pfn <= last_pfn);
982 }
983
984 static void dma_pte_free_level(struct dmar_domain *domain, int level,
985                                int retain_level, struct dma_pte *pte,
986                                unsigned long pfn, unsigned long start_pfn,
987                                unsigned long last_pfn)
988 {
989         pfn = max(start_pfn, pfn);
990         pte = &pte[pfn_level_offset(pfn, level)];
991
992         do {
993                 unsigned long level_pfn;
994                 struct dma_pte *level_pte;
995
996                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
997                         goto next;
998
999                 level_pfn = pfn & level_mask(level);
1000                 level_pte = phys_to_virt(dma_pte_addr(pte));
1001
1002                 if (level > 2) {
1003                         dma_pte_free_level(domain, level - 1, retain_level,
1004                                            level_pte, level_pfn, start_pfn,
1005                                            last_pfn);
1006                 }
1007
1008                 /*
1009                  * Free the page table if we're below the level we want to
1010                  * retain and the range covers the entire table.
1011                  */
1012                 if (level < retain_level && !(start_pfn > level_pfn ||
1013                       last_pfn < level_pfn + level_size(level) - 1)) {
1014                         dma_clear_pte(pte);
1015                         domain_flush_cache(domain, pte, sizeof(*pte));
1016                         free_pgtable_page(level_pte);
1017                 }
1018 next:
1019                 pfn += level_size(level);
1020         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1021 }
1022
1023 /*
1024  * clear last level (leaf) ptes and free page table pages below the
1025  * level we wish to keep intact.
1026  */
1027 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1028                                    unsigned long start_pfn,
1029                                    unsigned long last_pfn,
1030                                    int retain_level)
1031 {
1032         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1033         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1034         BUG_ON(start_pfn > last_pfn);
1035
1036         dma_pte_clear_range(domain, start_pfn, last_pfn);
1037
1038         /* We don't need lock here; nobody else touches the iova range */
1039         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1040                            domain->pgd, 0, start_pfn, last_pfn);
1041
1042         /* free pgd */
1043         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1044                 free_pgtable_page(domain->pgd);
1045                 domain->pgd = NULL;
1046         }
1047 }
1048
1049 /* When a page at a given level is being unlinked from its parent, we don't
1050    need to *modify* it at all. All we need to do is make a list of all the
1051    pages which can be freed just as soon as we've flushed the IOTLB and we
1052    know the hardware page-walk will no longer touch them.
1053    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1054    be freed. */
1055 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1056                                             int level, struct dma_pte *pte,
1057                                             struct page *freelist)
1058 {
1059         struct page *pg;
1060
1061         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1062         pg->freelist = freelist;
1063         freelist = pg;
1064
1065         if (level == 1)
1066                 return freelist;
1067
1068         pte = page_address(pg);
1069         do {
1070                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1071                         freelist = dma_pte_list_pagetables(domain, level - 1,
1072                                                            pte, freelist);
1073                 pte++;
1074         } while (!first_pte_in_page(pte));
1075
1076         return freelist;
1077 }
1078
1079 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1080                                         struct dma_pte *pte, unsigned long pfn,
1081                                         unsigned long start_pfn,
1082                                         unsigned long last_pfn,
1083                                         struct page *freelist)
1084 {
1085         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1086
1087         pfn = max(start_pfn, pfn);
1088         pte = &pte[pfn_level_offset(pfn, level)];
1089
1090         do {
1091                 unsigned long level_pfn;
1092
1093                 if (!dma_pte_present(pte))
1094                         goto next;
1095
1096                 level_pfn = pfn & level_mask(level);
1097
1098                 /* If range covers entire pagetable, free it */
1099                 if (start_pfn <= level_pfn &&
1100                     last_pfn >= level_pfn + level_size(level) - 1) {
1101                         /* These subordinate page tables are going away entirely. Don't
1102                            bother to clear them; we're just going to *free* them. */
1103                         if (level > 1 && !dma_pte_superpage(pte))
1104                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1105
1106                         dma_clear_pte(pte);
1107                         if (!first_pte)
1108                                 first_pte = pte;
1109                         last_pte = pte;
1110                 } else if (level > 1) {
1111                         /* Recurse down into a level that isn't *entirely* obsolete */
1112                         freelist = dma_pte_clear_level(domain, level - 1,
1113                                                        phys_to_virt(dma_pte_addr(pte)),
1114                                                        level_pfn, start_pfn, last_pfn,
1115                                                        freelist);
1116                 }
1117 next:
1118                 pfn += level_size(level);
1119         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1120
1121         if (first_pte)
1122                 domain_flush_cache(domain, first_pte,
1123                                    (void *)++last_pte - (void *)first_pte);
1124
1125         return freelist;
1126 }
1127
1128 /* We can't just free the pages because the IOMMU may still be walking
1129    the page tables, and may have cached the intermediate levels. The
1130    pages can only be freed after the IOTLB flush has been done. */
1131 static struct page *domain_unmap(struct dmar_domain *domain,
1132                                  unsigned long start_pfn,
1133                                  unsigned long last_pfn)
1134 {
1135         struct page *freelist;
1136
1137         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1138         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1139         BUG_ON(start_pfn > last_pfn);
1140
1141         /* we don't need lock here; nobody else touches the iova range */
1142         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1143                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1144
1145         /* free pgd */
1146         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1147                 struct page *pgd_page = virt_to_page(domain->pgd);
1148                 pgd_page->freelist = freelist;
1149                 freelist = pgd_page;
1150
1151                 domain->pgd = NULL;
1152         }
1153
1154         return freelist;
1155 }
1156
1157 static void dma_free_pagelist(struct page *freelist)
1158 {
1159         struct page *pg;
1160
1161         while ((pg = freelist)) {
1162                 freelist = pg->freelist;
1163                 free_pgtable_page(page_address(pg));
1164         }
1165 }
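/*
 * Lifecycle sketch (an assumption about the intended calling pattern,
 * based on the comment above domain_unmap()): pages on the freelist may
 * only be released after the IOTLB has been flushed.
 */
#if 0
static void example_unmap_range(struct dmar_domain *domain,
				struct intel_iommu *iommu,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	struct page *freelist;

	freelist = domain_unmap(domain, start_pfn, last_pfn);
	iommu_flush_iotlb_psi(iommu, domain, start_pfn,
			      last_pfn - start_pfn + 1, 0, 0);
	dma_free_pagelist(freelist);
}
#endif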
1166
1167 static void iova_entry_free(unsigned long data)
1168 {
1169         struct page *freelist = (struct page *)data;
1170
1171         dma_free_pagelist(freelist);
1172 }
1173
1174 /* iommu handling */
1175 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1176 {
1177         struct root_entry *root;
1178         unsigned long flags;
1179
1180         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1181         if (!root) {
1182                 pr_err("Allocating root entry for %s failed\n",
1183                         iommu->name);
1184                 return -ENOMEM;
1185         }
1186
1187         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1188
1189         spin_lock_irqsave(&iommu->lock, flags);
1190         iommu->root_entry = root;
1191         spin_unlock_irqrestore(&iommu->lock, flags);
1192
1193         return 0;
1194 }
1195
1196 static void iommu_set_root_entry(struct intel_iommu *iommu)
1197 {
1198         u64 addr;
1199         u32 sts;
1200         unsigned long flag;
1201
1202         addr = virt_to_phys(iommu->root_entry);
1203         if (sm_supported(iommu))
1204                 addr |= DMA_RTADDR_SMT;
1205
1206         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1207         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1208
1209         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1210
1211         /* Make sure hardware complete it */
1212         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1213                       readl, (sts & DMA_GSTS_RTPS), sts);
1214
1215         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1216 }
1217
1218 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1219 {
1220         u32 val;
1221         unsigned long flag;
1222
1223         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1224                 return;
1225
1226         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1227         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1228
1229         /* Make sure hardware complete it */
1230         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1231                       readl, (!(val & DMA_GSTS_WBFS)), val);
1232
1233         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1234 }
1235
1236 /* The return value determines whether we need a write buffer flush */
1237 static void __iommu_flush_context(struct intel_iommu *iommu,
1238                                   u16 did, u16 source_id, u8 function_mask,
1239                                   u64 type)
1240 {
1241         u64 val = 0;
1242         unsigned long flag;
1243
1244         switch (type) {
1245         case DMA_CCMD_GLOBAL_INVL:
1246                 val = DMA_CCMD_GLOBAL_INVL;
1247                 break;
1248         case DMA_CCMD_DOMAIN_INVL:
1249                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1250                 break;
1251         case DMA_CCMD_DEVICE_INVL:
1252                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1253                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1254                 break;
1255         default:
1256                 BUG();
1257         }
1258         val |= DMA_CCMD_ICC;
1259
1260         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1261         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1262
1263         /* Make sure hardware complete it */
1264         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1265                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1266
1267         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1268 }
1269
1270 /* The return value determines whether we need a write buffer flush */
1271 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1272                                 u64 addr, unsigned int size_order, u64 type)
1273 {
1274         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1275         u64 val = 0, val_iva = 0;
1276         unsigned long flag;
1277
1278         switch (type) {
1279         case DMA_TLB_GLOBAL_FLUSH:
1280                 /* global flush doesn't need set IVA_REG */
1281                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1282                 break;
1283         case DMA_TLB_DSI_FLUSH:
1284                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1285                 break;
1286         case DMA_TLB_PSI_FLUSH:
1287                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1288                 /* IH bit is passed in as part of address */
1289                 val_iva = size_order | addr;
1290                 break;
1291         default:
1292                 BUG();
1293         }
1294         /* Note: set drain read/write */
1295 #if 0
1296         /*
1297          * This is probably meant to be extra safe. It looks like we can
1298          * ignore it without any impact.
1299          */
1300         if (cap_read_drain(iommu->cap))
1301                 val |= DMA_TLB_READ_DRAIN;
1302 #endif
1303         if (cap_write_drain(iommu->cap))
1304                 val |= DMA_TLB_WRITE_DRAIN;
1305
1306         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1307         /* Note: Only uses first TLB reg currently */
1308         if (val_iva)
1309                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1310         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1311
1312         /* Make sure hardware complete it */
1313         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1314                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1315
1316         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1317
1318         /* check IOTLB invalidation granularity */
1319         if (DMA_TLB_IAIG(val) == 0)
1320                 pr_err("Flush IOTLB failed\n");
1321         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1322                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1323                         (unsigned long long)DMA_TLB_IIRG(type),
1324                         (unsigned long long)DMA_TLB_IAIG(val));
1325 }
1326
1327 static struct device_domain_info *
1328 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1329                          u8 bus, u8 devfn)
1330 {
1331         struct device_domain_info *info;
1332
1333         assert_spin_locked(&device_domain_lock);
1334
1335         if (!iommu->qi)
1336                 return NULL;
1337
1338         list_for_each_entry(info, &domain->devices, link)
1339                 if (info->iommu == iommu && info->bus == bus &&
1340                     info->devfn == devfn) {
1341                         if (info->ats_supported && info->dev)
1342                                 return info;
1343                         break;
1344                 }
1345
1346         return NULL;
1347 }
1348
1349 static void domain_update_iotlb(struct dmar_domain *domain)
1350 {
1351         struct device_domain_info *info;
1352         bool has_iotlb_device = false;
1353
1354         assert_spin_locked(&device_domain_lock);
1355
1356         list_for_each_entry(info, &domain->devices, link) {
1357                 struct pci_dev *pdev;
1358
1359                 if (!info->dev || !dev_is_pci(info->dev))
1360                         continue;
1361
1362                 pdev = to_pci_dev(info->dev);
1363                 if (pdev->ats_enabled) {
1364                         has_iotlb_device = true;
1365                         break;
1366                 }
1367         }
1368
1369         domain->has_iotlb_device = has_iotlb_device;
1370 }
1371
1372 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1373 {
1374         struct pci_dev *pdev;
1375
1376         assert_spin_locked(&device_domain_lock);
1377
1378         if (!info || !dev_is_pci(info->dev))
1379                 return;
1380
1381         pdev = to_pci_dev(info->dev);
1382         /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1383          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1384          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1385          * reserved, which should be set to 0.
1386          */
1387         if (!ecap_dit(info->iommu->ecap))
1388                 info->pfsid = 0;
1389         else {
1390                 struct pci_dev *pf_pdev;
1391
1392                 /* pdev will be returned if device is not a vf */
1393                 pf_pdev = pci_physfn(pdev);
1394                 info->pfsid = pci_dev_id(pf_pdev);
1395         }
1396
1397 #ifdef CONFIG_INTEL_IOMMU_SVM
1398         /* The PCIe spec, in its wisdom, declares that the behaviour of
1399            the device if you enable PASID support after ATS support is
1400            undefined. So always enable PASID support on devices which
1401            have it, even if we can't yet know if we're ever going to
1402            use it. */
1403         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1404                 info->pasid_enabled = 1;
1405
1406         if (info->pri_supported &&
1407             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1408             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1409                 info->pri_enabled = 1;
1410 #endif
1411         if (!pdev->untrusted && info->ats_supported &&
1412             pci_ats_page_aligned(pdev) &&
1413             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1414                 info->ats_enabled = 1;
1415                 domain_update_iotlb(info->domain);
1416                 info->ats_qdep = pci_ats_queue_depth(pdev);
1417         }
1418 }
1419
1420 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1421 {
1422         struct pci_dev *pdev;
1423
1424         assert_spin_locked(&device_domain_lock);
1425
1426         if (!dev_is_pci(info->dev))
1427                 return;
1428
1429         pdev = to_pci_dev(info->dev);
1430
1431         if (info->ats_enabled) {
1432                 pci_disable_ats(pdev);
1433                 info->ats_enabled = 0;
1434                 domain_update_iotlb(info->domain);
1435         }
1436 #ifdef CONFIG_INTEL_IOMMU_SVM
1437         if (info->pri_enabled) {
1438                 pci_disable_pri(pdev);
1439                 info->pri_enabled = 0;
1440         }
1441         if (info->pasid_enabled) {
1442                 pci_disable_pasid(pdev);
1443                 info->pasid_enabled = 0;
1444         }
1445 #endif
1446 }
1447
1448 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1449                                   u64 addr, unsigned mask)
1450 {
1451         u16 sid, qdep;
1452         unsigned long flags;
1453         struct device_domain_info *info;
1454
1455         if (!domain->has_iotlb_device)
1456                 return;
1457
1458         spin_lock_irqsave(&device_domain_lock, flags);
1459         list_for_each_entry(info, &domain->devices, link) {
1460                 if (!info->ats_enabled)
1461                         continue;
1462
1463                 sid = info->bus << 8 | info->devfn;
1464                 qdep = info->ats_qdep;
1465                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1466                                 qdep, addr, mask);
1467         }
1468         spin_unlock_irqrestore(&device_domain_lock, flags);
1469 }
1470
1471 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1472                                   struct dmar_domain *domain,
1473                                   unsigned long pfn, unsigned int pages,
1474                                   int ih, int map)
1475 {
1476         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1477         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1478         u16 did = domain->iommu_did[iommu->seq_id];
1479
1480         BUG_ON(pages == 0);
1481
1482         if (ih)
1483                 ih = 1 << 6;
1484         /*
1485          * Fall back to a domain-selective flush if there is no PSI support or
1486          * the size is too big.
1487          * PSI requires the page size to be 2 ^ x, and the base address to be
1488          * naturally aligned to the size.
1489          */
1490         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1491                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1492                                                 DMA_TLB_DSI_FLUSH);
1493         else
1494                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1495                                                 DMA_TLB_PSI_FLUSH);
1496
1497         /*
1498          * In caching mode, changes of pages from non-present to present require
1499          * flush. However, device IOTLB doesn't need to be flushed in this case.
1500          */
1501         if (!cap_caching_mode(iommu->cap) || !map)
1502                 iommu_flush_dev_iotlb(domain, addr, mask);
1503 }
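/*
 * Worked example (illustrative): flushing 3 pages rounds up to 4, so
 * mask = ilog2(4) = 2 and the hardware invalidates a 4-page (16KiB)
 * region whose base must be naturally aligned to that size, as noted in
 * the comment above.
 */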
1504
1505 /* Notification for newly created mappings */
1506 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1507                                         struct dmar_domain *domain,
1508                                         unsigned long pfn, unsigned int pages)
1509 {
1510         /* It's a non-present to present mapping. Only flush if caching mode */
1511         if (cap_caching_mode(iommu->cap))
1512                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1513         else
1514                 iommu_flush_write_buffer(iommu);
1515 }
1516
1517 static void iommu_flush_iova(struct iova_domain *iovad)
1518 {
1519         struct dmar_domain *domain;
1520         int idx;
1521
1522         domain = container_of(iovad, struct dmar_domain, iovad);
1523
1524         for_each_domain_iommu(idx, domain) {
1525                 struct intel_iommu *iommu = g_iommus[idx];
1526                 u16 did = domain->iommu_did[iommu->seq_id];
1527
1528                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1529
1530                 if (!cap_caching_mode(iommu->cap))
1531                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1532                                               0, MAX_AGAW_PFN_WIDTH);
1533         }
1534 }
1535
1536 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1537 {
1538         u32 pmen;
1539         unsigned long flags;
1540
1541         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1542         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1543         pmen &= ~DMA_PMEN_EPM;
1544         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1545
1546         /* wait for the protected region status bit to clear */
1547         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1548                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1549
1550         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1551 }
1552
1553 static void iommu_enable_translation(struct intel_iommu *iommu)
1554 {
1555         u32 sts;
1556         unsigned long flags;
1557
1558         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1559         iommu->gcmd |= DMA_GCMD_TE;
1560         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1561
1562         /* Make sure hardware completes it */
1563         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1564                       readl, (sts & DMA_GSTS_TES), sts);
1565
1566         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1567 }
1568
1569 static void iommu_disable_translation(struct intel_iommu *iommu)
1570 {
1571         u32 sts;
1572         unsigned long flag;
1573
1574         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1575         iommu->gcmd &= ~DMA_GCMD_TE;
1576         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1577
1578         /* Make sure hardware completes it */
1579         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1580                       readl, (!(sts & DMA_GSTS_TES)), sts);
1581
1582         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1583 }
1584
1586 static int iommu_init_domains(struct intel_iommu *iommu)
1587 {
1588         u32 ndomains, nlongs;
1589         size_t size;
1590
1591         ndomains = cap_ndoms(iommu->cap);
1592         pr_debug("%s: Number of Domains supported <%d>\n",
1593                  iommu->name, ndomains);
1594         nlongs = BITS_TO_LONGS(ndomains);
1595
1596         spin_lock_init(&iommu->lock);
1597
1598         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1599         if (!iommu->domain_ids) {
1600                 pr_err("%s: Allocating domain id array failed\n",
1601                        iommu->name);
1602                 return -ENOMEM;
1603         }
1604
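        /*
         * iommu->domains is a two-level table: one first-level slot per
         * 256 domain ids, each pointing to a lazily allocated array of
         * 256 dmar_domain pointers. Only the first array is allocated
         * up front.
         */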
1605         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1606         iommu->domains = kzalloc(size, GFP_KERNEL);
1607
1608         if (iommu->domains) {
1609                 size = 256 * sizeof(struct dmar_domain *);
1610                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1611         }
1612
1613         if (!iommu->domains || !iommu->domains[0]) {
1614                 pr_err("%s: Allocating domain array failed\n",
1615                        iommu->name);
1616                 kfree(iommu->domain_ids);
1617                 kfree(iommu->domains);
1618                 iommu->domain_ids = NULL;
1619                 iommu->domains    = NULL;
1620                 return -ENOMEM;
1621         }
1622
1625         /*
1626          * If Caching mode is set, then invalid translations are tagged
1627          * with domain-id 0, hence we need to pre-allocate it. We also
1628          * use domain-id 0 as a marker for non-allocated domain-id, so
1629          * make sure it is not used for a real domain.
1630          */
1631         set_bit(0, iommu->domain_ids);
1632
1633         /*
1634          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1635          * entry for first-level or pass-through translation modes should
1636          * be programmed with a domain id different from those used for
1637          * second-level or nested translation. We reserve a domain id for
1638          * this purpose.
1639          */
1640         if (sm_supported(iommu))
1641                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1642
1643         return 0;
1644 }
1645
1646 static void disable_dmar_iommu(struct intel_iommu *iommu)
1647 {
1648         struct device_domain_info *info, *tmp;
1649         unsigned long flags;
1650
1651         if (!iommu->domains || !iommu->domain_ids)
1652                 return;
1653
1654 again:
1655         spin_lock_irqsave(&device_domain_lock, flags);
1656         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1657                 struct dmar_domain *domain;
1658
1659                 if (info->iommu != iommu)
1660                         continue;
1661
1662                 if (!info->dev || !info->domain)
1663                         continue;
1664
1665                 domain = info->domain;
1666
1667                 __dmar_remove_one_dev_info(info);
1668
1669                 if (!domain_type_is_vm_or_si(domain)) {
1670                         /*
1671                          * The domain_exit() function can't be called under
1672                          * device_domain_lock, as it takes this lock itself.
1673                          * So release the lock here and re-run the loop
1674                          * afterwards.
1675                          */
1676                         spin_unlock_irqrestore(&device_domain_lock, flags);
1677                         domain_exit(domain);
1678                         goto again;
1679                 }
1680         }
1681         spin_unlock_irqrestore(&device_domain_lock, flags);
1682
1683         if (iommu->gcmd & DMA_GCMD_TE)
1684                 iommu_disable_translation(iommu);
1685 }
1686
1687 static void free_dmar_iommu(struct intel_iommu *iommu)
1688 {
1689         if ((iommu->domains) && (iommu->domain_ids)) {
1690                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1691                 int i;
1692
1693                 for (i = 0; i < elems; i++)
1694                         kfree(iommu->domains[i]);
1695                 kfree(iommu->domains);
1696                 kfree(iommu->domain_ids);
1697                 iommu->domains = NULL;
1698                 iommu->domain_ids = NULL;
1699         }
1700
1701         g_iommus[iommu->seq_id] = NULL;
1702
1703         /* free context mapping */
1704         free_context_table(iommu);
1705
1706 #ifdef CONFIG_INTEL_IOMMU_SVM
1707         if (pasid_supported(iommu)) {
1708                 if (ecap_prs(iommu->ecap))
1709                         intel_svm_finish_prq(iommu);
1710         }
1711 #endif
1712 }
1713
1714 static struct dmar_domain *alloc_domain(int flags)
1715 {
1716         struct dmar_domain *domain;
1717
1718         domain = alloc_domain_mem();
1719         if (!domain)
1720                 return NULL;
1721
1722         memset(domain, 0, sizeof(*domain));
1723         domain->nid = NUMA_NO_NODE;
1724         domain->flags = flags;
1725         domain->has_iotlb_device = false;
1726         INIT_LIST_HEAD(&domain->devices);
1727
1728         return domain;
1729 }
1730
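/*
 * Take a reference on @domain for @iommu. On the first attach to this
 * IOMMU, also allocate a free domain id from iommu->domain_ids and
 * record it in domain->iommu_did[].
 */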
1731 /* Must be called with device_domain_lock and iommu->lock held */
1732 static int domain_attach_iommu(struct dmar_domain *domain,
1733                                struct intel_iommu *iommu)
1734 {
1735         unsigned long ndomains;
1736         int num;
1737
1738         assert_spin_locked(&device_domain_lock);
1739         assert_spin_locked(&iommu->lock);
1740
1741         domain->iommu_refcnt[iommu->seq_id] += 1;
1742         domain->iommu_count += 1;
1743         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1744                 ndomains = cap_ndoms(iommu->cap);
1745                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1746
1747                 if (num >= ndomains) {
1748                         pr_err("%s: No free domain ids\n", iommu->name);
1749                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1750                         domain->iommu_count -= 1;
1751                         return -ENOSPC;
1752                 }
1753
1754                 set_bit(num, iommu->domain_ids);
1755                 set_iommu_domain(iommu, num, domain);
1756
1757                 domain->iommu_did[iommu->seq_id] = num;
1758                 domain->nid                      = iommu->node;
1759
1760                 domain_update_iommu_cap(domain);
1761         }
1762
1763         return 0;
1764 }
1765
1766 static int domain_detach_iommu(struct dmar_domain *domain,
1767                                struct intel_iommu *iommu)
1768 {
1769         int num, count;
1770
1771         assert_spin_locked(&device_domain_lock);
1772         assert_spin_locked(&iommu->lock);
1773
1774         domain->iommu_refcnt[iommu->seq_id] -= 1;
1775         count = --domain->iommu_count;
1776         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1777                 num = domain->iommu_did[iommu->seq_id];
1778                 clear_bit(num, iommu->domain_ids);
1779                 set_iommu_domain(iommu, num, NULL);
1780
1781                 domain_update_iommu_cap(domain);
1782                 domain->iommu_did[iommu->seq_id] = 0;
1783         }
1784
1785         return count;
1786 }
1787
1788 static struct iova_domain reserved_iova_list;
1789 static struct lock_class_key reserved_rbtree_key;
1790
1791 static int dmar_init_reserved_ranges(void)
1792 {
1793         struct pci_dev *pdev = NULL;
1794         struct iova *iova;
1795         int i;
1796
1797         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1798
1799         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1800                 &reserved_rbtree_key);
1801
1802         /* IOAPIC ranges shouldn't be accessed by DMA */
1803         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1804                 IOVA_PFN(IOAPIC_RANGE_END));
1805         if (!iova) {
1806                 pr_err("Reserve IOAPIC range failed\n");
1807                 return -ENODEV;
1808         }
1809
1810         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1811         for_each_pci_dev(pdev) {
1812                 struct resource *r;
1813
1814                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1815                         r = &pdev->resource[i];
1816                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1817                                 continue;
1818                         iova = reserve_iova(&reserved_iova_list,
1819                                             IOVA_PFN(r->start),
1820                                             IOVA_PFN(r->end));
1821                         if (!iova) {
1822                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1823                                 return -ENODEV;
1824                         }
1825                 }
1826         }
1827         return 0;
1828 }
1829
1830 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1831 {
1832         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1833 }
1834
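/*
 * Round the guest address width up to the next value the page-table
 * levels can express (12 + a multiple of 9 bits), capped at 64.
 * Illustrative examples: gaw 39 and 48 map to themselves, gaw 50 rounds
 * up to 57, and gaw 62 would round to 66 but is clamped to 64.
 */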
1835 static inline int guestwidth_to_adjustwidth(int gaw)
1836 {
1837         int agaw;
1838         int r = (gaw - 12) % 9;
1839
1840         if (r == 0)
1841                 agaw = gaw;
1842         else
1843                 agaw = gaw + 9 - r;
1844         if (agaw > 64)
1845                 agaw = 64;
1846         return agaw;
1847 }
1848
1849 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1850                        int guest_width)
1851 {
1852         int adjust_width, agaw;
1853         unsigned long sagaw;
1854         int err;
1855
1856         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1857
1858         err = init_iova_flush_queue(&domain->iovad,
1859                                     iommu_flush_iova, iova_entry_free);
1860         if (err)
1861                 return err;
1862
1863         domain_reserve_special_ranges(domain);
1864
1865         /* calculate AGAW */
1866         if (guest_width > cap_mgaw(iommu->cap))
1867                 guest_width = cap_mgaw(iommu->cap);
1868         domain->gaw = guest_width;
1869         adjust_width = guestwidth_to_adjustwidth(guest_width);
1870         agaw = width_to_agaw(adjust_width);
1871         sagaw = cap_sagaw(iommu->cap);
1872         if (!test_bit(agaw, &sagaw)) {
1873                 /* hardware doesn't support it, choose a bigger one */
1874                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1875                 agaw = find_next_bit(&sagaw, 5, agaw);
1876                 if (agaw >= 5)
1877                         return -ENODEV;
1878         }
1879         domain->agaw = agaw;
1880
1881         if (ecap_coherent(iommu->ecap))
1882                 domain->iommu_coherency = 1;
1883         else
1884                 domain->iommu_coherency = 0;
1885
1886         if (ecap_sc_support(iommu->ecap))
1887                 domain->iommu_snooping = 1;
1888         else
1889                 domain->iommu_snooping = 0;
1890
1891         if (intel_iommu_superpage)
1892                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1893         else
1894                 domain->iommu_superpage = 0;
1895
1896         domain->nid = iommu->node;
1897
1898         /* always allocate the top pgd */
1899         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1900         if (!domain->pgd)
1901                 return -ENOMEM;
1902         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1903         return 0;
1904 }
1905
1906 static void domain_exit(struct dmar_domain *domain)
1907 {
1908         struct page *freelist;
1909
1910         /* Remove associated devices and clear attached or cached domains */
1911         rcu_read_lock();
1912         domain_remove_dev_info(domain);
1913         rcu_read_unlock();
1914
1915         /* destroy iovas */
1916         put_iova_domain(&domain->iovad);
1917
1918         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1919
1920         dma_free_pagelist(freelist);
1921
1922         free_domain_mem(domain);
1923 }
1924
1925 /*
1926  * Get the PASID directory size for a scalable mode context entry.
1927  * A value of X in the PDTS field of a scalable mode context entry
1928  * indicates a PASID directory with 2^(X + 7) entries.
1929  */
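/*
 * Illustrative example: a PASID space needing 2^14 directory entries
 * gives pds = 14 - 7 = 7, i.e. PDTS is programmed as 7; anything that
 * needs fewer than 2^7 entries is encoded as 0.
 */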
1930 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1931 {
1932         int pds, max_pde;
1933
1934         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1935         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1936         if (pds < 7)
1937                 return 0;
1938
1939         return pds - 7;
1940 }
1941
1942 /*
1943  * Set the RID_PASID field of a scalable mode context entry. The
1944  * IOMMU hardware will use the PASID value set in this field when
1945  * translating DMA requests that arrive without a PASID.
1946  */
1947 static inline void
1948 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1949 {
1950         context->hi |= pasid & ((1 << 20) - 1);
1951         context->hi |= (1 << 20);
1952 }
1953
1954 /*
1955  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1956  * entry.
1957  */
1958 static inline void context_set_sm_dte(struct context_entry *context)
1959 {
1960         context->lo |= (1 << 2);
1961 }
1962
1963 /*
1964  * Set the PRE(Page Request Enable) field of a scalable mode context
1965  * entry.
1966  */
1967 static inline void context_set_sm_pre(struct context_entry *context)
1968 {
1969         context->lo |= (1 << 4);
1970 }
1971
1972 /* Convert value to context PASID directory size field coding. */
1973 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1974
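/*
 * Install the context entry for (bus, devfn) on @iommu: in scalable
 * mode point it at the PASID directory described by @table, otherwise
 * point it at the domain's second-level page table, then flush the
 * context and IOTLB caches as required by the Caching Mode capability.
 */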
1975 static int domain_context_mapping_one(struct dmar_domain *domain,
1976                                       struct intel_iommu *iommu,
1977                                       struct pasid_table *table,
1978                                       u8 bus, u8 devfn)
1979 {
1980         u16 did = domain->iommu_did[iommu->seq_id];
1981         int translation = CONTEXT_TT_MULTI_LEVEL;
1982         struct device_domain_info *info = NULL;
1983         struct context_entry *context;
1984         unsigned long flags;
1985         int ret;
1986
1987         WARN_ON(did == 0);
1988
1989         if (hw_pass_through && domain_type_is_si(domain))
1990                 translation = CONTEXT_TT_PASS_THROUGH;
1991
1992         pr_debug("Set context mapping for %02x:%02x.%d\n",
1993                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1994
1995         BUG_ON(!domain->pgd);
1996
1997         spin_lock_irqsave(&device_domain_lock, flags);
1998         spin_lock(&iommu->lock);
1999
2000         ret = -ENOMEM;
2001         context = iommu_context_addr(iommu, bus, devfn, 1);
2002         if (!context)
2003                 goto out_unlock;
2004
2005         ret = 0;
2006         if (context_present(context))
2007                 goto out_unlock;
2008
2009         /*
2010          * For kdump cases, old valid entries may be cached due to the
2011          * in-flight DMA and copied pgtable, but there is no unmapping
2012          * behaviour for them, thus we need an explicit cache flush for
2013          * the newly-mapped device. For kdump, at this point, the device
2014          * is supposed to have finished its reset at driver probe stage,
2015          * so no in-flight DMA will exist, and we don't need to worry
2016          * about it hereafter.
2017          */
2018         if (context_copied(context)) {
2019                 u16 did_old = context_domain_id(context);
2020
2021                 if (did_old < cap_ndoms(iommu->cap)) {
2022                         iommu->flush.flush_context(iommu, did_old,
2023                                                    (((u16)bus) << 8) | devfn,
2024                                                    DMA_CCMD_MASK_NOBIT,
2025                                                    DMA_CCMD_DEVICE_INVL);
2026                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2027                                                  DMA_TLB_DSI_FLUSH);
2028                 }
2029         }
2030
2031         context_clear_entry(context);
2032
2033         if (sm_supported(iommu)) {
2034                 unsigned long pds;
2035
2036                 WARN_ON(!table);
2037
2038                 /* Set up the PASID directory pointer: */
2039                 pds = context_get_sm_pds(table);
2040                 context->lo = (u64)virt_to_phys(table->table) |
2041                                 context_pdts(pds);
2042
2043                 /* Set up the RID_PASID field: */
2044                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2045
2046                 /*
2047                  * Set up the Device-TLB Enable bit and the Page
2048                  * Request Enable bit:
2049                  */
2050                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2051                 if (info && info->ats_supported)
2052                         context_set_sm_dte(context);
2053                 if (info && info->pri_supported)
2054                         context_set_sm_pre(context);
2055         } else {
2056                 struct dma_pte *pgd = domain->pgd;
2057                 int agaw;
2058
2059                 context_set_domain_id(context, did);
2060
2061                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2062                         /*
2063                          * Skip the top levels of the page tables for an iommu with
2064                          * a smaller agaw than the default. Unnecessary for PT mode.
2065                          */
2066                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2067                                 ret = -ENOMEM;
2068                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2069                                 if (!dma_pte_present(pgd))
2070                                         goto out_unlock;
2071                         }
2072
2073                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2074                         if (info && info->ats_supported)
2075                                 translation = CONTEXT_TT_DEV_IOTLB;
2076                         else
2077                                 translation = CONTEXT_TT_MULTI_LEVEL;
2078
2079                         context_set_address_root(context, virt_to_phys(pgd));
2080                         context_set_address_width(context, agaw);
2081                 } else {
2082                         /*
2083                          * In pass-through mode, AW must be programmed to
2084                          * indicate the largest AGAW value supported by
2085                          * hardware, and ASR is ignored by hardware.
2086                          */
2087                         context_set_address_width(context, iommu->msagaw);
2088                 }
2089
2090                 context_set_translation_type(context, translation);
2091         }
2092
2093         context_set_fault_enable(context);
2094         context_set_present(context);
2095         domain_flush_cache(domain, context, sizeof(*context));
2096
2097         /*
2098          * It's a non-present to present mapping. If hardware doesn't cache
2099          * non-present entries we only need to flush the write-buffer. If it
2100          * _does_ cache non-present entries, then it does so in the special
2101          * domain #0, which we have to flush:
2102          */
2103         if (cap_caching_mode(iommu->cap)) {
2104                 iommu->flush.flush_context(iommu, 0,
2105                                            (((u16)bus) << 8) | devfn,
2106                                            DMA_CCMD_MASK_NOBIT,
2107                                            DMA_CCMD_DEVICE_INVL);
2108                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2109         } else {
2110                 iommu_flush_write_buffer(iommu);
2111         }
2112         iommu_enable_dev_iotlb(info);
2113
2114         ret = 0;
2115
2116 out_unlock:
2117         spin_unlock(&iommu->lock);
2118         spin_unlock_irqrestore(&device_domain_lock, flags);
2119
2120         return ret;
2121 }
2122
2123 struct domain_context_mapping_data {
2124         struct dmar_domain *domain;
2125         struct intel_iommu *iommu;
2126         struct pasid_table *table;
2127 };
2128
2129 static int domain_context_mapping_cb(struct pci_dev *pdev,
2130                                      u16 alias, void *opaque)
2131 {
2132         struct domain_context_mapping_data *data = opaque;
2133
2134         return domain_context_mapping_one(data->domain, data->iommu,
2135                                           data->table, PCI_BUS_NUM(alias),
2136                                           alias & 0xff);
2137 }
2138
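/*
 * Program context entries for @dev on its IOMMU. For PCI devices this
 * walks every DMA alias of the device (such as the requester ID used by
 * a PCIe-to-PCI bridge), so each alias gets a context entry too.
 */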
2139 static int
2140 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2141 {
2142         struct domain_context_mapping_data data;
2143         struct pasid_table *table;
2144         struct intel_iommu *iommu;
2145         u8 bus, devfn;
2146
2147         iommu = device_to_iommu(dev, &bus, &devfn);
2148         if (!iommu)
2149                 return -ENODEV;
2150
2151         table = intel_pasid_get_table(dev);
2152
2153         if (!dev_is_pci(dev))
2154                 return domain_context_mapping_one(domain, iommu, table,
2155                                                   bus, devfn);
2156
2157         data.domain = domain;
2158         data.iommu = iommu;
2159         data.table = table;
2160
2161         return pci_for_each_dma_alias(to_pci_dev(dev),
2162                                       &domain_context_mapping_cb, &data);
2163 }
2164
2165 static int domain_context_mapped_cb(struct pci_dev *pdev,
2166                                     u16 alias, void *opaque)
2167 {
2168         struct intel_iommu *iommu = opaque;
2169
2170         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2171 }
2172
2173 static int domain_context_mapped(struct device *dev)
2174 {
2175         struct intel_iommu *iommu;
2176         u8 bus, devfn;
2177
2178         iommu = device_to_iommu(dev, &bus, &devfn);
2179         if (!iommu)
2180                 return -ENODEV;
2181
2182         if (!dev_is_pci(dev))
2183                 return device_context_mapped(iommu, bus, devfn);
2184
2185         return !pci_for_each_dma_alias(to_pci_dev(dev),
2186                                        domain_context_mapped_cb, iommu);
2187 }
2188
2189 /* Returns a number of VTD pages, but aligned to MM page size */
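/*
 * Illustrative example with 4KiB pages: a 0x1000-byte buffer starting
 * at page offset 0x200 straddles two pages, so this returns 2.
 */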
2190 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2191                                             size_t size)
2192 {
2193         host_addr &= ~PAGE_MASK;
2194         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2195 }
2196
2197 /* Return largest possible superpage level for a given mapping */
2198 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2199                                           unsigned long iov_pfn,
2200                                           unsigned long phy_pfn,
2201                                           unsigned long pages)
2202 {
2203         int support, level = 1;
2204         unsigned long pfnmerge;
2205
2206         support = domain->iommu_superpage;
2207
2208         /* To use a large page, the virtual *and* physical addresses
2209            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2210            of them will mean we have to use smaller pages. So just
2211            merge them and check both at once. */
2212         pfnmerge = iov_pfn | phy_pfn;
2213
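        /*
         * Illustrative example: with 4KiB base pages, both PFNs 2MiB
         * aligned (low 9 bits zero) and pages >= 512, this returns
         * level 2 (a 2MiB superpage), provided the domain supports at
         * least one superpage level.
         */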
2214         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2215                 pages >>= VTD_STRIDE_SHIFT;
2216                 if (!pages)
2217                         break;
2218                 pfnmerge >>= VTD_STRIDE_SHIFT;
2219                 level++;
2220                 support--;
2221         }
2222         return level;
2223 }
2224
2225 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2226                             struct scatterlist *sg, unsigned long phys_pfn,
2227                             unsigned long nr_pages, int prot)
2228 {
2229         struct dma_pte *first_pte = NULL, *pte = NULL;
2230         phys_addr_t uninitialized_var(pteval);
2231         unsigned long sg_res = 0;
2232         unsigned int largepage_lvl = 0;
2233         unsigned long lvl_pages = 0;
2234
2235         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2236
2237         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2238                 return -EINVAL;
2239
2240         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2241
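        /* No scatterlist: map a single physically contiguous range. */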
2242         if (!sg) {
2243                 sg_res = nr_pages;
2244                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2245         }
2246
2247         while (nr_pages > 0) {
2248                 uint64_t tmp;
2249
2250                 if (!sg_res) {
2251                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2252
2253                         sg_res = aligned_nrpages(sg->offset, sg->length);
2254                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2255                         sg->dma_length = sg->length;
2256                         pteval = (sg_phys(sg) - pgoff) | prot;
2257                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2258                 }
2259
2260                 if (!pte) {
2261                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2262
2263                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2264                         if (!pte)
2265                                 return -ENOMEM;
2266                         /* It is a large page */
2267                         if (largepage_lvl > 1) {
2268                                 unsigned long nr_superpages, end_pfn;
2269
2270                                 pteval |= DMA_PTE_LARGE_PAGE;
2271                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2272
2273                                 nr_superpages = sg_res / lvl_pages;
2274                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2275
2276                                 /*
2277                                  * Ensure that old small page tables are
2278                                  * removed to make room for superpage(s).
2279                                  * We're adding new large pages, so make sure
2280                                  * we don't remove their parent tables.
2281                                  */
2282                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2283                                                        largepage_lvl + 1);
2284                         } else {
2285                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2286                         }
2287
2288                 }
2289                 /* We don't need a lock here; nobody else
2290                  * touches the iova range.
2291                  */
2292                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2293                 if (tmp) {
2294                         static int dumps = 5;
2295                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2296                                 iov_pfn, tmp, (unsigned long long)pteval);
2297                         if (dumps) {
2298                                 dumps--;
2299                                 debug_dma_dump_mappings(NULL);
2300                         }
2301                         WARN_ON(1);
2302                 }
2303
2304                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2305
2306                 BUG_ON(nr_pages < lvl_pages);
2307                 BUG_ON(sg_res < lvl_pages);
2308
2309                 nr_pages -= lvl_pages;
2310                 iov_pfn += lvl_pages;
2311                 phys_pfn += lvl_pages;
2312                 pteval += lvl_pages * VTD_PAGE_SIZE;
2313                 sg_res -= lvl_pages;
2314
2315                 /* If the next PTE would be the first in a new page, then we
2316                    need to flush the cache on the entries we've just written.
2317                    And then we'll need to recalculate 'pte', so clear it and
2318                    let it get set again in the if (!pte) block above.
2319
2320                    If we're done (!nr_pages) we need to flush the cache too.
2321
2322                    Also if we've been setting superpages, we may need to
2323                    recalculate 'pte' and switch back to smaller pages for the
2324                    end of the mapping, if the trailing size is not enough to
2325                    use another superpage (i.e. sg_res < lvl_pages). */
2326                 pte++;
2327                 if (!nr_pages || first_pte_in_page(pte) ||
2328                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2329                         domain_flush_cache(domain, first_pte,
2330                                            (void *)pte - (void *)first_pte);
2331                         pte = NULL;
2332                 }
2333
2334                 if (!sg_res && nr_pages)
2335                         sg = sg_next(sg);
2336         }
2337         return 0;
2338 }
2339
2340 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2341                          struct scatterlist *sg, unsigned long phys_pfn,
2342                          unsigned long nr_pages, int prot)
2343 {
2344        int ret;
2345        struct intel_iommu *iommu;
2346
2347        /* Do the real mapping first */
2348        ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2349        if (ret)
2350                return ret;
2351
2352        /* Notify about the new mapping */
2353        if (domain_type_is_vm(domain)) {
2354                /* VM-typed domains can have more than one IOMMU */
2355                int iommu_id;
2356                for_each_domain_iommu(iommu_id, domain) {
2357                        iommu = g_iommus[iommu_id];
2358                        __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2359                }
2360        } else {
2361                /* General domains only have one IOMMU */
2362                iommu = domain_get_iommu(domain);
2363                __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2364        }
2365
2366        return 0;
2367 }
2368
2369 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2370                                     struct scatterlist *sg, unsigned long nr_pages,
2371                                     int prot)
2372 {
2373         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2374 }
2375
2376 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2377                                      unsigned long phys_pfn, unsigned long nr_pages,
2378                                      int prot)
2379 {
2380         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2381 }
2382
2383 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2384 {
2385         unsigned long flags;
2386         struct context_entry *context;
2387         u16 did_old;
2388
2389         if (!iommu)
2390                 return;
2391
2392         spin_lock_irqsave(&iommu->lock, flags);
2393         context = iommu_context_addr(iommu, bus, devfn, 0);
2394         if (!context) {
2395                 spin_unlock_irqrestore(&iommu->lock, flags);
2396                 return;
2397         }
2398         did_old = context_domain_id(context);
2399         context_clear_entry(context);
2400         __iommu_flush_cache(iommu, context, sizeof(*context));
2401         spin_unlock_irqrestore(&iommu->lock, flags);
2402         iommu->flush.flush_context(iommu,
2403                                    did_old,
2404                                    (((u16)bus) << 8) | devfn,
2405                                    DMA_CCMD_MASK_NOBIT,
2406                                    DMA_CCMD_DEVICE_INVL);
2407         iommu->flush.flush_iotlb(iommu,
2408                                  did_old,
2409                                  0,
2410                                  0,
2411                                  DMA_TLB_DSI_FLUSH);
2412 }
2413
2414 static inline void unlink_domain_info(struct device_domain_info *info)
2415 {
2416         assert_spin_locked(&device_domain_lock);
2417         list_del(&info->link);
2418         list_del(&info->global);
2419         if (info->dev)
2420                 info->dev->archdata.iommu = NULL;
2421 }
2422
2423 static void domain_remove_dev_info(struct dmar_domain *domain)
2424 {
2425         struct device_domain_info *info, *tmp;
2426         unsigned long flags;
2427
2428         spin_lock_irqsave(&device_domain_lock, flags);
2429         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2430                 __dmar_remove_one_dev_info(info);
2431         spin_unlock_irqrestore(&device_domain_lock, flags);
2432 }
2433
2434 /*
2435  * find_domain
2436  * Note: we use struct device->archdata.iommu to store the domain info
2437  */
2438 static struct dmar_domain *find_domain(struct device *dev)
2439 {
2440         struct device_domain_info *info;
2441
2442         /* No lock here, assumes no domain exit in normal case */
2443         info = dev->archdata.iommu;
2444         if (likely(info))
2445                 return info->domain;
2446         return NULL;
2447 }
2448
2449 static inline struct device_domain_info *
2450 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2451 {
2452         struct device_domain_info *info;
2453
2454         list_for_each_entry(info, &device_domain_list, global)
2455                 if (info->iommu->segment == segment && info->bus == bus &&
2456                     info->devfn == devfn)
2457                         return info;
2458
2459         return NULL;
2460 }
2461
2462 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2463                                                     int bus, int devfn,
2464                                                     struct device *dev,
2465                                                     struct dmar_domain *domain)
2466 {
2467         struct dmar_domain *found = NULL;
2468         struct device_domain_info *info;
2469         unsigned long flags;
2470         int ret;
2471
2472         info = alloc_devinfo_mem();
2473         if (!info)
2474                 return NULL;
2475
2476         info->bus = bus;
2477         info->devfn = devfn;
2478         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2479         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2480         info->ats_qdep = 0;
2481         info->dev = dev;
2482         info->domain = domain;
2483         info->iommu = iommu;
2484         info->pasid_table = NULL;
2485
2486         if (dev && dev_is_pci(dev)) {
2487                 struct pci_dev *pdev = to_pci_dev(info->dev);
2488
2489                 if (!pdev->untrusted &&
2490                     !pci_ats_disabled() &&
2491                     ecap_dev_iotlb_support(iommu->ecap) &&
2492                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2493                     dmar_find_matched_atsr_unit(pdev))
2494                         info->ats_supported = 1;
2495
2496                 if (sm_supported(iommu)) {
2497                         if (pasid_supported(iommu)) {
2498                                 int features = pci_pasid_features(pdev);
2499                                 if (features >= 0)
2500                                         info->pasid_supported = features | 1;
2501                         }
2502
2503                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2504                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2505                                 info->pri_supported = 1;
2506                 }
2507         }
2508
2509         spin_lock_irqsave(&device_domain_lock, flags);
2510         if (dev)
2511                 found = find_domain(dev);
2512
2513         if (!found) {
2514                 struct device_domain_info *info2;
2515                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2516                 if (info2) {
2517                         found      = info2->domain;
2518                         info2->dev = dev;
2519                 }
2520         }
2521
2522         if (found) {
2523                 spin_unlock_irqrestore(&device_domain_lock, flags);
2524                 free_devinfo_mem(info);
2525                 /* Caller must free the original domain */
2526                 return found;
2527         }
2528
2529         spin_lock(&iommu->lock);
2530         ret = domain_attach_iommu(domain, iommu);
2531         spin_unlock(&iommu->lock);
2532
2533         if (ret) {
2534                 spin_unlock_irqrestore(&device_domain_lock, flags);
2535                 free_devinfo_mem(info);
2536                 return NULL;
2537         }
2538
2539         list_add(&info->link, &domain->devices);
2540         list_add(&info->global, &device_domain_list);
2541         if (dev)
2542                 dev->archdata.iommu = info;
2543         spin_unlock_irqrestore(&device_domain_lock, flags);
2544
2545         /* PASID table is mandatory for a PCI device in scalable mode. */
2546         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2547                 ret = intel_pasid_alloc_table(dev);
2548                 if (ret) {
2549                         dev_err(dev, "PASID table allocation failed\n");
2550                         dmar_remove_one_dev_info(dev);
2551                         return NULL;
2552                 }
2553
2554                 /* Setup the PASID entry for requests without PASID: */
2555                 spin_lock(&iommu->lock);
2556                 if (hw_pass_through && domain_type_is_si(domain))
2557                         ret = intel_pasid_setup_pass_through(iommu, domain,
2558                                         dev, PASID_RID2PASID);
2559                 else
2560                         ret = intel_pasid_setup_second_level(iommu, domain,
2561                                         dev, PASID_RID2PASID);
2562                 spin_unlock(&iommu->lock);
2563                 if (ret) {
2564                         dev_err(dev, "Setup RID2PASID failed\n");
2565                         dmar_remove_one_dev_info(dev);
2566                         return NULL;
2567                 }
2568         }
2569
2570         if (dev && domain_context_mapping(domain, dev)) {
2571                 dev_err(dev, "Domain context map failed\n");
2572                 dmar_remove_one_dev_info(dev);
2573                 return NULL;
2574         }
2575
2576         return domain;
2577 }
2578
2579 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2580 {
2581         *(u16 *)opaque = alias;
2582         return 0;
2583 }
2584
2585 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2586 {
2587         struct device_domain_info *info;
2588         struct dmar_domain *domain = NULL;
2589         struct intel_iommu *iommu;
2590         u16 dma_alias;
2591         unsigned long flags;
2592         u8 bus, devfn;
2593
2594         iommu = device_to_iommu(dev, &bus, &devfn);
2595         if (!iommu)
2596                 return NULL;
2597
2598         if (dev_is_pci(dev)) {
2599                 struct pci_dev *pdev = to_pci_dev(dev);
2600
2601                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2602
2603                 spin_lock_irqsave(&device_domain_lock, flags);
2604                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2605                                                       PCI_BUS_NUM(dma_alias),
2606                                                       dma_alias & 0xff);
2607                 if (info) {
2608                         iommu = info->iommu;
2609                         domain = info->domain;
2610                 }
2611                 spin_unlock_irqrestore(&device_domain_lock, flags);
2612
2613                 /* DMA alias already has a domain, use it */
2614                 if (info)
2615                         goto out;
2616         }
2617
2618         /* Allocate and initialize new domain for the device */
2619         domain = alloc_domain(0);
2620         if (!domain)
2621                 return NULL;
2622         if (domain_init(domain, iommu, gaw)) {
2623                 domain_exit(domain);
2624                 return NULL;
2625         }
2626
2627 out:
2628
2629         return domain;
2630 }
2631
2632 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2633                                               struct dmar_domain *domain)
2634 {
2635         struct intel_iommu *iommu;
2636         struct dmar_domain *tmp;
2637         u16 req_id, dma_alias;
2638         u8 bus, devfn;
2639
2640         iommu = device_to_iommu(dev, &bus, &devfn);
2641         if (!iommu)
2642                 return NULL;
2643
2644         req_id = ((u16)bus << 8) | devfn;
2645
2646         if (dev_is_pci(dev)) {
2647                 struct pci_dev *pdev = to_pci_dev(dev);
2648
2649                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2650
2651                 /* register PCI DMA alias device */
2652                 if (req_id != dma_alias) {
2653                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2654                                         dma_alias & 0xff, NULL, domain);
2655
2656                         if (!tmp || tmp != domain)
2657                                 return tmp;
2658                 }
2659         }
2660
2661         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2662         if (!tmp || tmp != domain)
2663                 return tmp;
2664
2665         return domain;
2666 }
2667
2668 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2669 {
2670         struct dmar_domain *domain, *tmp;
2671
2672         domain = find_domain(dev);
2673         if (domain)
2674                 goto out;
2675
2676         domain = find_or_alloc_domain(dev, gaw);
2677         if (!domain)
2678                 goto out;
2679
2680         tmp = set_domain_for_dev(dev, domain);
2681         if (!tmp || domain != tmp) {
2682                 domain_exit(domain);
2683                 domain = tmp;
2684         }
2685
2686 out:
2687
2688         return domain;
2689 }
2690
2691 static int iommu_domain_identity_map(struct dmar_domain *domain,
2692                                      unsigned long long start,
2693                                      unsigned long long end)
2694 {
2695         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2696         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2697
2698         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2699                           dma_to_mm_pfn(last_vpfn))) {
2700                 pr_err("Reserving iova failed\n");
2701                 return -ENOMEM;
2702         }
2703
2704         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2705         /*
2706          * RMRR range might have overlap with physical memory range,
2707          * clear it first
2708          */
2709         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2710
2711         return __domain_mapping(domain, first_vpfn, NULL,
2712                                 first_vpfn, last_vpfn - first_vpfn + 1,
2713                                 DMA_PTE_READ|DMA_PTE_WRITE);
2714 }
2715
2716 static int domain_prepare_identity_map(struct device *dev,
2717                                        struct dmar_domain *domain,
2718                                        unsigned long long start,
2719                                        unsigned long long end)
2720 {
2721         /* For _hardware_ passthrough, don't bother. But for software
2722            passthrough, we do it anyway -- it may indicate a memory
2723            range which is reserved in E820 and so didn't get set up
2724            in si_domain to start with. */
2725         if (domain == si_domain && hw_pass_through) {
2726                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2727                          start, end);
2728                 return 0;
2729         }
2730
2731         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2732
2733         if (end < start) {
2734                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2735                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2736                         dmi_get_system_info(DMI_BIOS_VENDOR),
2737                         dmi_get_system_info(DMI_BIOS_VERSION),
2738                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2739                 return -EIO;
2740         }
2741
2742         if (end >> agaw_to_width(domain->agaw)) {
2743                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2744                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2745                      agaw_to_width(domain->agaw),
2746                      dmi_get_system_info(DMI_BIOS_VENDOR),
2747                      dmi_get_system_info(DMI_BIOS_VERSION),
2748                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2749                 return -EIO;
2750         }
2751
2752         return iommu_domain_identity_map(domain, start, end);
2753 }
2754
2755 static int iommu_prepare_identity_map(struct device *dev,
2756                                       unsigned long long start,
2757                                       unsigned long long end)
2758 {
2759         struct dmar_domain *domain;
2760         int ret;
2761
2762         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2763         if (!domain)
2764                 return -ENOMEM;
2765
2766         ret = domain_prepare_identity_map(dev, domain, start, end);
2767         if (ret)
2768                 domain_exit(domain);
2769
2770         return ret;
2771 }
2772
2773 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2774                                          struct device *dev)
2775 {
2776         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2777                 return 0;
2778         return iommu_prepare_identity_map(dev, rmrr->base_address,
2779                                           rmrr->end_address);
2780 }
2781
2782 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2783 static inline void iommu_prepare_isa(void)
2784 {
2785         struct pci_dev *pdev;
2786         int ret;
2787
2788         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2789         if (!pdev)
2790                 return;
2791
2792         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2793         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2794
2795         if (ret)
2796                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2797
2798         pci_dev_put(pdev);
2799 }
2800 #else
2801 static inline void iommu_prepare_isa(void)
2802 {
2803         return;
2804 }
2805 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2806
2807 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2808
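/*
 * Build the static identity (si) domain. For software passthrough
 * (hw == 0) every usable RAM range reported by memblock is also 1:1
 * mapped into it; for hardware passthrough the page tables are not
 * needed and the mapping step is skipped.
 */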
2809 static int __init si_domain_init(int hw)
2810 {
2811         int nid, ret;
2812
2813         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2814         if (!si_domain)
2815                 return -EFAULT;
2816
2817         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2818                 domain_exit(si_domain);
2819                 return -EFAULT;
2820         }
2821
2822         pr_debug("Identity mapping domain allocated\n");
2823
2824         if (hw)
2825                 return 0;
2826
2827         for_each_online_node(nid) {
2828                 unsigned long start_pfn, end_pfn;
2829                 int i;
2830
2831                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2832                         ret = iommu_domain_identity_map(si_domain,
2833                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2834                         if (ret)
2835                                 return ret;
2836                 }
2837         }
2838
2839         return 0;
2840 }
2841
2842 static int identity_mapping(struct device *dev)
2843 {
2844         struct device_domain_info *info;
2845
2846         if (likely(!iommu_identity_mapping))
2847                 return 0;
2848
2849         info = dev->archdata.iommu;
2850         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2851                 return (info->domain == si_domain);
2852
2853         return 0;
2854 }
2855
2856 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2857 {
2858         struct dmar_domain *ndomain;
2859         struct intel_iommu *iommu;
2860         u8 bus, devfn;
2861
2862         iommu = device_to_iommu(dev, &bus, &devfn);
2863         if (!iommu)
2864                 return -ENODEV;
2865
2866         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2867         if (ndomain != domain)
2868                 return -EBUSY;
2869
2870         return 0;
2871 }
2872
2873 static bool device_has_rmrr(struct device *dev)
2874 {
2875         struct dmar_rmrr_unit *rmrr;
2876         struct device *tmp;
2877         int i;
2878
2879         rcu_read_lock();
2880         for_each_rmrr_units(rmrr) {
2881                 /*
2882                  * Return TRUE if this RMRR contains the device that
2883                  * is passed in.
2884                  */
2885                 for_each_active_dev_scope(rmrr->devices,
2886                                           rmrr->devices_cnt, i, tmp)
2887                         if (tmp == dev) {
2888                                 rcu_read_unlock();
2889                                 return true;
2890                         }
2891         }
2892         rcu_read_unlock();
2893         return false;
2894 }
2895
2896 /*
2897  * There are a couple of cases where we need to restrict the functionality of
2898  * devices associated with RMRRs.  The first is when evaluating a device for
2899  * identity mapping because problems exist when devices are moved in and out
2900  * of domains and their respective RMRR information is lost.  This means that
2901  * a device with associated RMRRs will never be in a "passthrough" domain.
2902  * The second is use of the device through the IOMMU API.  This interface
2903  * expects to have full control of the IOVA space for the device.  We cannot
2904  * satisfy both the requirement that RMRR access is maintained and have an
2905  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2906  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2907  * We therefore prevent devices associated with an RMRR from participating in
2908  * the IOMMU API, which eliminates them from device assignment.
2909  *
2910  * In both cases we assume that PCI USB devices with RMRRs have them largely
2911  * for historical reasons and that the RMRR space is not actively used post
2912  * boot.  This exclusion may change if vendors begin to abuse it.
2913  *
2914  * The same exception is made for graphics devices, with the requirement that
2915  * any use of the RMRR regions will be torn down before assigning the device
2916  * to a guest.
2917  */
2918 static bool device_is_rmrr_locked(struct device *dev)
2919 {
2920         if (!device_has_rmrr(dev))
2921                 return false;
2922
2923         if (dev_is_pci(dev)) {
2924                 struct pci_dev *pdev = to_pci_dev(dev);
2925
2926                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2927                         return false;
2928         }
2929
2930         return true;
2931 }
2932
2933 static int iommu_should_identity_map(struct device *dev, int startup)
2934 {
2935         if (dev_is_pci(dev)) {
2936                 struct pci_dev *pdev = to_pci_dev(dev);
2937
2938                 if (device_is_rmrr_locked(dev))
2939                         return 0;
2940
2941                 /*
2942                  * Prevent any device marked as untrusted from getting
2943                  * placed into the static identity mapping domain.
2944                  */
2945                 if (pdev->untrusted)
2946                         return 0;
2947
2948                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2949                         return 1;
2950
2951                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2952                         return 1;
2953
2954                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2955                         return 0;
2956
2957                 /*
2958                  * We want to start off with all devices in the 1:1 domain, and
2959                  * take them out later if we find they can't access all of memory.
2960                  *
2961                  * However, we can't do this for PCI devices behind bridges,
2962                  * because all PCI devices behind the same bridge will end up
2963                  * with the same source-id on their transactions.
2964                  *
2965                  * Practically speaking, we can't change things around for these
2966                  * devices at run-time, because we can't be sure there'll be no
2967                  * DMA transactions in flight for any of their siblings.
2968                  *
2969                  * So PCI devices (unless they're on the root bus) as well as
2970                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2971                  * the 1:1 domain, just in _case_ one of their siblings turns out
2972                  * not to be able to map all of memory.
2973                  */
2974                 if (!pci_is_pcie(pdev)) {
2975                         if (!pci_is_root_bus(pdev->bus))
2976                                 return 0;
2977                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2978                                 return 0;
2979                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2980                         return 0;
2981         } else {
2982                 if (device_has_rmrr(dev))
2983                         return 0;
2984         }
2985
2986         /*
2987          * At boot time, we don't yet know if devices will be 64-bit capable.
2988          * Assume that they will — if they turn out not to be, then we can
2989          * take them out of the 1:1 domain later.
2990          */
2991         if (!startup) {
2992                 /*
2993                  * If the device's dma_mask is less than the system's memory
2994                  * size then this is not a candidate for identity mapping.
2995                  */
2996                 u64 dma_mask = *dev->dma_mask;
2997
2998                 if (dev->coherent_dma_mask &&
2999                     dev->coherent_dma_mask < dma_mask)
3000                         dma_mask = dev->coherent_dma_mask;
3001
3002                 return dma_mask >= dma_get_required_mask(dev);
3003         }
3004
3005         return 1;
3006 }
3007
3008 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
3009 {
3010         int ret;
3011
3012         if (!iommu_should_identity_map(dev, 1))
3013                 return 0;
3014
3015         ret = domain_add_dev_info(si_domain, dev);
3016         if (!ret)
3017                 dev_info(dev, "%s identity mapping\n",
3018                          hw ? "Hardware" : "Software");
3019         else if (ret == -ENODEV)
3020                 /* device not associated with an iommu */
3021                 ret = 0;
3022
3023         return ret;
3024 }
3025
3026
3027 static int __init iommu_prepare_static_identity_mapping(int hw)
3028 {
3029         struct pci_dev *pdev = NULL;
3030         struct dmar_drhd_unit *drhd;
3031         struct intel_iommu *iommu;
3032         struct device *dev;
3033         int i;
3034         int ret = 0;
3035
3036         for_each_pci_dev(pdev) {
3037                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
3038                 if (ret)
3039                         return ret;
3040         }
3041
3042         for_each_active_iommu(iommu, drhd)
3043                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
3044                         struct acpi_device_physical_node *pn;
3045                         struct acpi_device *adev;
3046
3047                         if (dev->bus != &acpi_bus_type)
3048                                 continue;
3049
3050                         adev = to_acpi_device(dev);
3051                         mutex_lock(&adev->physical_node_lock);
3052                         list_for_each_entry(pn, &adev->physical_node_list, node) {
3053                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3054                                 if (ret)
3055                                         break;
3056                         }
3057                         mutex_unlock(&adev->physical_node_lock);
3058                         if (ret)
3059                                 return ret;
3060                 }
3061
3062         return 0;
3063 }
3064
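/*
 * Select the invalidation method for this IOMMU: prefer queued
 * invalidation and fall back to register-based invalidation when
 * queued invalidation cannot be enabled.
 */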
3065 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3066 {
3067         /*
3068          * Start from a sane IOMMU hardware state.
3069          * If queued invalidation was already initialized by us
3070          * (for example, while enabling interrupt remapping), then
3071          * things are already rolling from a sane state.
3072          */
3073         if (!iommu->qi) {
3074                 /*
3075                  * Clear any previous faults.
3076                  */
3077                 dmar_fault(-1, iommu);
3078                 /*
3079                  * Disable queued invalidation if supported and already enabled
3080                  * before OS handover.
3081                  */
3082                 dmar_disable_qi(iommu);
3083         }
3084
3085         if (dmar_enable_qi(iommu)) {
3086                 /*
3087                  * Queued Invalidate not enabled, use Register Based Invalidate
3088                  */
3089                 iommu->flush.flush_context = __iommu_flush_context;
3090                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3091                 pr_info("%s: Using Register based invalidation\n",
3092                         iommu->name);
3093         } else {
3094                 iommu->flush.flush_context = qi_flush_context;
3095                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3096                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3097         }
3098 }
3099
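/*
 * Copy the context table(s) for one bus from the old kernel's root
 * entry into freshly allocated pages.  Present entries get their
 * domain-id reserved and are marked as copied; PASID translations are
 * deliberately not carried over.
 */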
3100 static int copy_context_table(struct intel_iommu *iommu,
3101                               struct root_entry *old_re,
3102                               struct context_entry **tbl,
3103                               int bus, bool ext)
3104 {
3105         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3106         struct context_entry *new_ce = NULL, ce;
3107         struct context_entry *old_ce = NULL;
3108         struct root_entry re;
3109         phys_addr_t old_ce_phys;
3110
3111         tbl_idx = ext ? bus * 2 : bus;
3112         memcpy(&re, old_re, sizeof(re));
3113
3114         for (devfn = 0; devfn < 256; devfn++) {
3115                 /* First calculate the correct index */
3116                 idx = (ext ? devfn * 2 : devfn) % 256;
3117
3118                 if (idx == 0) {
3119                         /* First save what we may have and clean up */
3120                         if (new_ce) {
3121                                 tbl[tbl_idx] = new_ce;
3122                                 __iommu_flush_cache(iommu, new_ce,
3123                                                     VTD_PAGE_SIZE);
3124                                 pos = 1;
3125                         }
3126
3127                         if (old_ce)
3128                                 memunmap(old_ce);
3129
3130                         ret = 0;
3131                         if (devfn < 0x80)
3132                                 old_ce_phys = root_entry_lctp(&re);
3133                         else
3134                                 old_ce_phys = root_entry_uctp(&re);
3135
3136                         if (!old_ce_phys) {
3137                                 if (ext && devfn == 0) {
3138                                         /* No LCTP, try UCTP */
3139                                         devfn = 0x7f;
3140                                         continue;
3141                                 } else {
3142                                         goto out;
3143                                 }
3144                         }
3145
3146                         ret = -ENOMEM;
3147                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3148                                         MEMREMAP_WB);
3149                         if (!old_ce)
3150                                 goto out;
3151
3152                         new_ce = alloc_pgtable_page(iommu->node);
3153                         if (!new_ce)
3154                                 goto out_unmap;
3155
3156                         ret = 0;
3157                 }
3158
3159                 /* Now copy the context entry */
3160                 memcpy(&ce, old_ce + idx, sizeof(ce));
3161
3162                 if (!__context_present(&ce))
3163                         continue;
3164
3165                 did = context_domain_id(&ce);
3166                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3167                         set_bit(did, iommu->domain_ids);
3168
3169                 /*
3170                  * We need a marker for copied context entries. This
3171                  * marker needs to work for the old format as well as
3172                  * for extended context entries.
3173                  *
3174                  * Bit 67 of the context entry is used. In the old
3175                  * format this bit is available to software, in the
3176                  * extended format it is the PGE bit, but PGE is ignored
3177                  * by HW if PASIDs are disabled (and thus still
3178                  * available).
3179                  *
3180                  * So disable PASIDs first and then mark the entry
3181                  * copied. This means that we don't copy PASID
3182                  * translations from the old kernel, but this is fine as
3183                  * faults there are not fatal.
3184                  */
3185                 context_clear_pasid_enable(&ce);
3186                 context_set_copied(&ce);
3187
3188                 new_ce[idx] = ce;
3189         }
3190
3191         tbl[tbl_idx + pos] = new_ce;
3192
3193         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3194
3195 out_unmap:
3196         memunmap(old_ce);
3197
3198 out:
3199         return ret;
3200 }
3201
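/*
 * Reuse the DMA translations left behind by the previous (crashed)
 * kernel: read the old root table address from DMAR_RTADDR_REG, copy
 * each per-bus context table and hook the copies into this kernel's
 * root entry table.
 */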
3202 static int copy_translation_tables(struct intel_iommu *iommu)
3203 {
3204         struct context_entry **ctxt_tbls;
3205         struct root_entry *old_rt;
3206         phys_addr_t old_rt_phys;
3207         int ctxt_table_entries;
3208         unsigned long flags;
3209         u64 rtaddr_reg;
3210         int bus, ret;
3211         bool new_ext, ext;
3212
3213         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3214         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3215         new_ext    = !!ecap_ecs(iommu->ecap);
3216
3217         /*
3218          * The RTT bit can only be changed when translation is disabled,
3219          * but disabling translation means opening a window for data
3220          * corruption. So bail out and don't copy anything if we would
3221          * have to change the bit.
3222          */
3223         if (new_ext != ext)
3224                 return -EINVAL;
3225
3226         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3227         if (!old_rt_phys)
3228                 return -EINVAL;
3229
3230         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3231         if (!old_rt)
3232                 return -ENOMEM;
3233
3234         /* This is too big for the stack - allocate it from slab */
3235         ctxt_table_entries = ext ? 512 : 256;
3236         ret = -ENOMEM;
3237         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3238         if (!ctxt_tbls)
3239                 goto out_unmap;
3240
3241         for (bus = 0; bus < 256; bus++) {
3242                 ret = copy_context_table(iommu, &old_rt[bus],
3243                                          ctxt_tbls, bus, ext);
3244                 if (ret) {
3245                         pr_err("%s: Failed to copy context table for bus %d\n",
3246                                 iommu->name, bus);
3247                         continue;
3248                 }
3249         }
3250
3251         spin_lock_irqsave(&iommu->lock, flags);
3252
3253         /* Context tables are copied, now write them to the root_entry table */
3254         for (bus = 0; bus < 256; bus++) {
3255                 int idx = ext ? bus * 2 : bus;
3256                 u64 val;
3257
3258                 if (ctxt_tbls[idx]) {
3259                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3260                         iommu->root_entry[bus].lo = val;
3261                 }
3262
3263                 if (!ext || !ctxt_tbls[idx + 1])
3264                         continue;
3265
3266                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3267                 iommu->root_entry[bus].hi = val;
3268         }
3269
3270         spin_unlock_irqrestore(&iommu->lock, flags);
3271
3272         kfree(ctxt_tbls);
3273
3274         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3275
3276         ret = 0;
3277
3278 out_unmap:
3279         memunmap(old_rt);
3280
3281         return ret;
3282 }
3283
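/*
 * Boot-time DMAR initialization: allocate the global IOMMU array, set
 * up queued invalidation, domain ids and root entries for each unit,
 * copy old translation tables in the kdump case, establish the static
 * identity and RMRR mappings, and finally enable translation.
 */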
3284 static int __init init_dmars(void)
3285 {
3286         struct dmar_drhd_unit *drhd;
3287         struct dmar_rmrr_unit *rmrr;
3288         bool copied_tables = false;
3289         struct device *dev;
3290         struct intel_iommu *iommu;
3291         int i, ret;
3292
3293         /*
3294          * for each drhd
3295          *    allocate root
3296          *    initialize and program root entry to not present
3297          * endfor
3298          */
3299         for_each_drhd_unit(drhd) {
3300                 /*
3301                  * Lock not needed: this is only incremented in the
3302                  * single-threaded kernel __init code path; all other
3303                  * accesses are read-only.
3304                  */
3305                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3306                         g_num_of_iommus++;
3307                         continue;
3308                 }
3309                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3310         }
3311
3312         /* Preallocate enough resources for IOMMU hot-addition */
3313         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3314                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3315
3316         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3317                         GFP_KERNEL);
3318         if (!g_iommus) {
3319                 pr_err("Allocating global iommu array failed\n");
3320                 ret = -ENOMEM;
3321                 goto error;
3322         }
3323
3324         for_each_active_iommu(iommu, drhd) {
3325                 /*
3326                  * Find the max pasid size of all IOMMUs in the system.
3327                  * We need to ensure the system pasid table is no bigger
3328                  * than the smallest supported.
3329                  */
3330                 if (pasid_supported(iommu)) {
3331                         u32 temp = 2 << ecap_pss(iommu->ecap);
3332
3333                         intel_pasid_max_id = min_t(u32, temp,
3334                                                    intel_pasid_max_id);
3335                 }
3336
3337                 g_iommus[iommu->seq_id] = iommu;
3338
3339                 intel_iommu_init_qi(iommu);
3340
3341                 ret = iommu_init_domains(iommu);
3342                 if (ret)
3343                         goto free_iommu;
3344
3345                 init_translation_status(iommu);
3346
3347                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3348                         iommu_disable_translation(iommu);
3349                         clear_translation_pre_enabled(iommu);
3350                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3351                                 iommu->name);
3352                 }
3353
3354                 /*
3355                  * TBD:
3356                  * we could share the same root & context tables
3357                  * among all IOMMUs. Need to split this later.
3358                  */
3359                 ret = iommu_alloc_root_entry(iommu);
3360                 if (ret)
3361                         goto free_iommu;
3362
3363                 if (translation_pre_enabled(iommu)) {
3364                         pr_info("Translation already enabled - trying to copy translation structures\n");
3365
3366                         ret = copy_translation_tables(iommu);
3367                         if (ret) {
3368                                 /*
3369                                  * We found the IOMMU with translation
3370                                  * enabled - but failed to copy over the
3371                                  * old root-entry table. Try to proceed
3372                                  * by disabling translation now and
3373                                  * allocating a clean root-entry table.
3374                                  * This might cause DMAR faults, but
3375                                  * probably the dump will still succeed.
3376                                  */
3377                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3378                                        iommu->name);
3379                                 iommu_disable_translation(iommu);
3380                                 clear_translation_pre_enabled(iommu);
3381                         } else {
3382                                 pr_info("Copied translation tables from previous kernel for %s\n",
3383                                         iommu->name);
3384                                 copied_tables = true;
3385                         }
3386                 }
3387
3388                 if (!ecap_pass_through(iommu->ecap))
3389                         hw_pass_through = 0;
3390 #ifdef CONFIG_INTEL_IOMMU_SVM
3391                 if (pasid_supported(iommu))
3392                         intel_svm_init(iommu);
3393 #endif
3394         }
3395
3396         /*
3397          * Now that qi is enabled on all iommus, set the root entry and flush
3398          * caches. This is required on some Intel X58 chipsets, otherwise the
3399          * flush_context function will loop forever and the boot hangs.
3400          */
3401         for_each_active_iommu(iommu, drhd) {
3402                 iommu_flush_write_buffer(iommu);
3403                 iommu_set_root_entry(iommu);
3404                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3405                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3406         }
3407
3408         if (iommu_pass_through)
3409                 iommu_identity_mapping |= IDENTMAP_ALL;
3410
3411 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3412         iommu_identity_mapping |= IDENTMAP_GFX;
3413 #endif
3414
3415         check_tylersburg_isoch();
3416
3417         if (iommu_identity_mapping) {
3418                 ret = si_domain_init(hw_pass_through);
3419                 if (ret)
3420                         goto free_iommu;
3421         }
3422
3423
3424         /*
3425          * If we copied translations from a previous kernel in the kdump
3426          * case, we can not assign the devices to domains now, as that
3427          * would eliminate the old mappings. So skip this part and defer
3428          * the assignment to device driver initialization time.
3429          */
3430         if (copied_tables)
3431                 goto domains_done;
3432
3433         /*
3434          * If identity mapping is requested (pass-through/"map all", gfx or
3435          * Azalia), set up the static 1:1 context entries for the qualifying
3436          * devices now.
3437          */
3438         if (iommu_identity_mapping) {
3439                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3440                 if (ret) {
3441                         pr_crit("Failed to setup IOMMU pass-through\n");
3442                         goto free_iommu;
3443                 }
3444         }
3445         /*
3446          * For each rmrr
3447          *   for each dev attached to rmrr
3448          *   do
3449          *     locate drhd for dev, alloc domain for dev
3450          *     allocate free domain
3451          *     allocate page table entries for rmrr
3452          *     if context not allocated for bus
3453          *           allocate and init context
3454          *           set present in root table for this bus
3455          *     init context with domain, translation etc
3456          *    endfor
3457          * endfor
3458          */
3459         pr_info("Setting RMRR:\n");
3460         for_each_rmrr_units(rmrr) {
3461                 /* Some BIOSes list nonexistent devices in the DMAR table. */
3462                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3463                                           i, dev) {
3464                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3465                         if (ret)
3466                                 pr_err("Mapping reserved region failed\n");
3467                 }
3468         }
3469
3470         iommu_prepare_isa();
3471
3472 domains_done:
3473
3474         /*
3475          * for each drhd
3476          *   enable fault log
3477          *   global invalidate context cache
3478          *   global invalidate iotlb
3479          *   enable translation
3480          */
3481         for_each_iommu(iommu, drhd) {
3482                 if (drhd->ignored) {
3483                         /*
3484                          * we always have to disable PMRs or DMA may fail on
3485                          * this device
3486                          */
3487                         if (force_on)
3488                                 iommu_disable_protect_mem_regions(iommu);
3489                         continue;
3490                 }
3491
3492                 iommu_flush_write_buffer(iommu);
3493
3494 #ifdef CONFIG_INTEL_IOMMU_SVM
3495                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3496                         ret = intel_svm_enable_prq(iommu);
3497                         if (ret)
3498                                 goto free_iommu;
3499                 }
3500 #endif
3501                 ret = dmar_set_interrupt(iommu);
3502                 if (ret)
3503                         goto free_iommu;
3504
3505                 if (!translation_pre_enabled(iommu))
3506                         iommu_enable_translation(iommu);
3507
3508                 iommu_disable_protect_mem_regions(iommu);
3509         }
3510
3511         return 0;
3512
3513 free_iommu:
3514         for_each_active_iommu(iommu, drhd) {
3515                 disable_dmar_iommu(iommu);
3516                 free_dmar_iommu(iommu);
3517         }
3518
3519         kfree(g_iommus);
3520
3521 error:
3522         return ret;
3523 }
3524
3525 /* This takes a number of _MM_ pages, not VTD pages */
3526 static unsigned long intel_alloc_iova(struct device *dev,
3527                                      struct dmar_domain *domain,
3528                                      unsigned long nrpages, uint64_t dma_mask)
3529 {
3530         unsigned long iova_pfn;
3531
3532         /* Restrict dma_mask to the width that the iommu can handle */
3533         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3534         /* Ensure we reserve the whole size-aligned region */
3535         nrpages = __roundup_pow_of_two(nrpages);
3536
3537         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3538                 /*
3539                  * First try to allocate an io virtual address in
3540                  * DMA_BIT_MASK(32) and if that fails then try allocating
3541                  * from higher range
3542                  */
3543                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3544                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3545                 if (iova_pfn)
3546                         return iova_pfn;
3547         }
3548         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3549                                    IOVA_PFN(dma_mask), true);
3550         if (unlikely(!iova_pfn)) {
3551                 dev_err(dev, "Allocating %lu-page iova failed\n", nrpages);
3552                 return 0;
3553         }
3554
3555         return iova_pfn;
3556 }
3557
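/*
 * Return the DMA domain for @dev, allocating one on first use and
 * (re)creating any RMRR identity mappings that target the device.
 */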
3558 struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3559 {
3560         struct dmar_domain *domain, *tmp;
3561         struct dmar_rmrr_unit *rmrr;
3562         struct device *i_dev;
3563         int i, ret;
3564
3565         domain = find_domain(dev);
3566         if (domain)
3567                 goto out;
3568
3569         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3570         if (!domain)
3571                 goto out;
3572
3573         /* We have a new domain - set up possible RMRRs for the device */
3574         rcu_read_lock();
3575         for_each_rmrr_units(rmrr) {
3576                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3577                                           i, i_dev) {
3578                         if (i_dev != dev)
3579                                 continue;
3580
3581                         ret = domain_prepare_identity_map(dev, domain,
3582                                                           rmrr->base_address,
3583                                                           rmrr->end_address);
3584                         if (ret)
3585                                 dev_err(dev, "Mapping reserved region failed\n");
3586                 }
3587         }
3588         rcu_read_unlock();
3589
3590         tmp = set_domain_for_dev(dev, domain);
3591         if (!tmp || domain != tmp) {
3592                 domain_exit(domain);
3593                 domain = tmp;
3594         }
3595
3596 out:
3597
3598         if (!domain)
3599                 dev_err(dev, "Allocating domain failed\n");
3600
3601
3602         return domain;
3603 }
3604
3605 /* Check if the dev needs to go through the non-identity map and unmap path. */
3606 static int iommu_no_mapping(struct device *dev)
3607 {
3608         int found;
3609
3610         if (iommu_dummy(dev))
3611                 return 1;
3612
3613         if (!iommu_identity_mapping)
3614                 return 0;
3615
3616         found = identity_mapping(dev);
3617         if (found) {
3618                 if (iommu_should_identity_map(dev, 0))
3619                         return 1;
3620                 else {
3621                         /*
3622                          * The 32-bit DMA device is removed from si_domain and
3623                          * falls back to non-identity mapping.
3624                          */
3625                         dmar_remove_one_dev_info(dev);
3626                         dev_info(dev, "32bit DMA uses non-identity mapping\n");
3627                         return 0;
3628                 }
3629         } else {
3630                 /*
3631                  * If a 64-bit DMA device was detached from a VM, the device
3632                  * is put back into si_domain for identity mapping.
3633                  */
3634                 if (iommu_should_identity_map(dev, 0)) {
3635                         int ret;
3636                         ret = domain_add_dev_info(si_domain, dev);
3637                         if (!ret) {
3638                                 dev_info(dev, "64bit DMA uses identity mapping\n");
3639                                 return 1;
3640                         }
3641                 }
3642         }
3643
3644         return 0;
3645 }
3646
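/*
 * Map a physically contiguous buffer for DMA: allocate an IOVA range
 * below @dma_mask, install page table entries covering the whole
 * pages and return the resulting DMA address.  Identity-mapped
 * devices simply get the physical address back.
 */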
3647 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3648                                      size_t size, int dir, u64 dma_mask)
3649 {
3650         struct dmar_domain *domain;
3651         phys_addr_t start_paddr;
3652         unsigned long iova_pfn;
3653         int prot = 0;
3654         int ret;
3655         struct intel_iommu *iommu;
3656         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3657
3658         BUG_ON(dir == DMA_NONE);
3659
3660         if (iommu_no_mapping(dev))
3661                 return paddr;
3662
3663         domain = get_valid_domain_for_dev(dev);
3664         if (!domain)
3665                 return DMA_MAPPING_ERROR;
3666
3667         iommu = domain_get_iommu(domain);
3668         size = aligned_nrpages(paddr, size);
3669
3670         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3671         if (!iova_pfn)
3672                 goto error;
3673
3674         /*
3675          * Check if DMAR supports zero-length reads on write-only
3676          * mappings.
3677          */
3678         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3679                         !cap_zlr(iommu->cap))
3680                 prot |= DMA_PTE_READ;
3681         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3682                 prot |= DMA_PTE_WRITE;
3683         /*
3684          * The range paddr .. paddr + size might cover only part of a page, so
3685          * we should map the whole page.  Note: if two parts of one page are
3686          * separately mapped, we might have two guest_addr mappings to the same
3687          * host paddr, but this is not a big problem.
3688          */
3689         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3690                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3691         if (ret)
3692                 goto error;
3693
3694         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3695         start_paddr += paddr & ~PAGE_MASK;
3696         return start_paddr;
3697
3698 error:
3699         if (iova_pfn)
3700                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3701         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3702                 size, (unsigned long long)paddr, dir);
3703         return DMA_MAPPING_ERROR;
3704 }
3705
3706 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3707                                  unsigned long offset, size_t size,
3708                                  enum dma_data_direction dir,
3709                                  unsigned long attrs)
3710 {
3711         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3712                                   dir, *dev->dma_mask);
3713 }
3714
3715 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3716                                      size_t size, enum dma_data_direction dir,
3717                                      unsigned long attrs)
3718 {
3719         return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3720 }
3721
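/*
 * Tear down a DMA mapping.  In strict mode the IOTLB is flushed and
 * the IOVA freed immediately; otherwise both are deferred to the
 * flush queue to amortize the cost of the invalidation.
 */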
3722 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3723 {
3724         struct dmar_domain *domain;
3725         unsigned long start_pfn, last_pfn;
3726         unsigned long nrpages;
3727         unsigned long iova_pfn;
3728         struct intel_iommu *iommu;
3729         struct page *freelist;
3730
3731         if (iommu_no_mapping(dev))
3732                 return;
3733
3734         domain = find_domain(dev);
3735         BUG_ON(!domain);
3736
3737         iommu = domain_get_iommu(domain);
3738
3739         iova_pfn = IOVA_PFN(dev_addr);
3740
3741         nrpages = aligned_nrpages(dev_addr, size);
3742         start_pfn = mm_to_dma_pfn(iova_pfn);
3743         last_pfn = start_pfn + nrpages - 1;
3744
3745         dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3746
3747         freelist = domain_unmap(domain, start_pfn, last_pfn);
3748
3749         if (intel_iommu_strict) {
3750                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3751                                       nrpages, !freelist, 0);
3752                 /* free iova */
3753                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3754                 dma_free_pagelist(freelist);
3755         } else {
3756                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3757                            (unsigned long)freelist);
3758                 /*
3759                  * Queue up the release of the unmap to save roughly 1/6th of
3760                  * the CPU time used up by the iotlb flush operation.
3761                  */
3762         }
3763 }
3764
3765 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3766                              size_t size, enum dma_data_direction dir,
3767                              unsigned long attrs)
3768 {
3769         intel_unmap(dev, dev_addr, size);
3770 }
3771
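/*
 * Allocate a coherent DMA buffer, preferring the contiguous allocator
 * when blocking is allowed, and map it bidirectionally through the
 * IOMMU.  For identity-mapped devices the GFP zone flags are used to
 * honor the coherent DMA mask instead.
 */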
3772 static void *intel_alloc_coherent(struct device *dev, size_t size,
3773                                   dma_addr_t *dma_handle, gfp_t flags,
3774                                   unsigned long attrs)
3775 {
3776         struct page *page = NULL;
3777         int order;
3778
3779         size = PAGE_ALIGN(size);
3780         order = get_order(size);
3781
3782         if (!iommu_no_mapping(dev))
3783                 flags &= ~(GFP_DMA | GFP_DMA32);
3784         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3785                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3786                         flags |= GFP_DMA;
3787                 else
3788                         flags |= GFP_DMA32;
3789         }
3790
3791         if (gfpflags_allow_blocking(flags)) {
3792                 unsigned int count = size >> PAGE_SHIFT;
3793
3794                 page = dma_alloc_from_contiguous(dev, count, order,
3795                                                  flags & __GFP_NOWARN);
3796                 if (page && iommu_no_mapping(dev) &&
3797                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3798                         dma_release_from_contiguous(dev, page, count);
3799                         page = NULL;
3800                 }
3801         }
3802
3803         if (!page)
3804                 page = alloc_pages(flags, order);
3805         if (!page)
3806                 return NULL;
3807         memset(page_address(page), 0, size);
3808
3809         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3810                                          DMA_BIDIRECTIONAL,
3811                                          dev->coherent_dma_mask);
3812         if (*dma_handle != DMA_MAPPING_ERROR)
3813                 return page_address(page);
3814         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3815                 __free_pages(page, order);
3816
3817         return NULL;
3818 }
3819
3820 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3821                                 dma_addr_t dma_handle, unsigned long attrs)
3822 {
3823         int order;
3824         struct page *page = virt_to_page(vaddr);
3825
3826         size = PAGE_ALIGN(size);
3827         order = get_order(size);
3828
3829         intel_unmap(dev, dma_handle, size);
3830         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3831                 __free_pages(page, order);
3832 }
3833
3834 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3835                            int nelems, enum dma_data_direction dir,
3836                            unsigned long attrs)
3837 {
3838         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3839         unsigned long nrpages = 0;
3840         struct scatterlist *sg;
3841         int i;
3842
3843         for_each_sg(sglist, sg, nelems, i) {
3844                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3845         }
3846
3847         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3848 }
3849
3850 static int intel_nontranslate_map_sg(struct device *hddev,
3851         struct scatterlist *sglist, int nelems, int dir)
3852 {
3853         int i;
3854         struct scatterlist *sg;
3855
3856         for_each_sg(sglist, sg, nelems, i) {
3857                 BUG_ON(!sg_page(sg));
3858                 sg->dma_address = sg_phys(sg);
3859                 sg->dma_length = sg->length;
3860         }
3861         return nelems;
3862 }
3863
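/*
 * Map a scatterlist: allocate a single IOVA range large enough for
 * all segments and map them back to back.  Identity-mapped devices
 * just get the physical address of each segment.
 */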
3864 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3865                         enum dma_data_direction dir, unsigned long attrs)
3866 {
3867         int i;
3868         struct dmar_domain *domain;
3869         size_t size = 0;
3870         int prot = 0;
3871         unsigned long iova_pfn;
3872         int ret;
3873         struct scatterlist *sg;
3874         unsigned long start_vpfn;
3875         struct intel_iommu *iommu;
3876
3877         BUG_ON(dir == DMA_NONE);
3878         if (iommu_no_mapping(dev))
3879                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3880
3881         domain = get_valid_domain_for_dev(dev);
3882         if (!domain)
3883                 return 0;
3884
3885         iommu = domain_get_iommu(domain);
3886
3887         for_each_sg(sglist, sg, nelems, i)
3888                 size += aligned_nrpages(sg->offset, sg->length);
3889
3890         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3891                                 *dev->dma_mask);
3892         if (!iova_pfn) {
3893                 sglist->dma_length = 0;
3894                 return 0;
3895         }
3896
3897         /*
3898          * Check if DMAR supports zero-length reads on write-only
3899          * mappings.
3900          */
3901         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3902                         !cap_zlr(iommu->cap))
3903                 prot |= DMA_PTE_READ;
3904         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3905                 prot |= DMA_PTE_WRITE;
3906
3907         start_vpfn = mm_to_dma_pfn(iova_pfn);
3908
3909         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3910         if (unlikely(ret)) {
3911                 dma_pte_free_pagetable(domain, start_vpfn,
3912                                        start_vpfn + size - 1,
3913                                        agaw_to_level(domain->agaw) + 1);
3914                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3915                 return 0;
3916         }
3917
3918         return nelems;
3919 }
3920
3921 static const struct dma_map_ops intel_dma_ops = {
3922         .alloc = intel_alloc_coherent,
3923         .free = intel_free_coherent,
3924         .map_sg = intel_map_sg,
3925         .unmap_sg = intel_unmap_sg,
3926         .map_page = intel_map_page,
3927         .unmap_page = intel_unmap_page,
3928         .map_resource = intel_map_resource,
3929         .unmap_resource = intel_unmap_page,
3930         .dma_supported = dma_direct_supported,
3931 };
3932
3933 static inline int iommu_domain_cache_init(void)
3934 {
3935         int ret = 0;
3936
3937         iommu_domain_cache = kmem_cache_create("iommu_domain",
3938                                          sizeof(struct dmar_domain),
3939                                          0,
3940                                          SLAB_HWCACHE_ALIGN,
3941                                          NULL);
3942
3943         if (!iommu_domain_cache) {
3944                 pr_err("Couldn't create iommu_domain cache\n");
3945                 ret = -ENOMEM;
3946         }
3947
3948         return ret;
3949 }
3950
3951 static inline int iommu_devinfo_cache_init(void)
3952 {
3953         int ret = 0;
3954
3955         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3956                                          sizeof(struct device_domain_info),
3957                                          0,
3958                                          SLAB_HWCACHE_ALIGN,
3959                                          NULL);
3960         if (!iommu_devinfo_cache) {
3961                 pr_err("Couldn't create devinfo cache\n");
3962                 ret = -ENOMEM;
3963         }
3964
3965         return ret;
3966 }
3967
3968 static int __init iommu_init_mempool(void)
3969 {
3970         int ret;
3971         ret = iova_cache_get();
3972         if (ret)
3973                 return ret;
3974
3975         ret = iommu_domain_cache_init();
3976         if (ret)
3977                 goto domain_error;
3978
3979         ret = iommu_devinfo_cache_init();
3980         if (!ret)
3981                 return ret;
3982
3983         kmem_cache_destroy(iommu_domain_cache);
3984 domain_error:
3985         iova_cache_put();
3986
3987         return -ENOMEM;
3988 }
3989
3990 static void __init iommu_exit_mempool(void)
3991 {
3992         kmem_cache_destroy(iommu_devinfo_cache);
3993         kmem_cache_destroy(iommu_domain_cache);
3994         iova_cache_put();
3995 }
3996
3997 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3998 {
3999         struct dmar_drhd_unit *drhd;
4000         u32 vtbar;
4001         int rc;
4002
4003         /* We know that this device on this chipset has its own IOMMU.
4004          * If we find it under a different IOMMU, then the BIOS is lying
4005          * to us. Hope that the IOMMU for this device is actually
4006          * disabled, and it needs no translation...
4007          */
4008         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4009         if (rc) {
4010                 /* "can't" happen */
4011                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4012                 return;
4013         }
4014         vtbar &= 0xffff0000;
4015
4016         /* we know that this iommu should be at offset 0xa000 from vtbar */
4017         drhd = dmar_find_matched_drhd_unit(pdev);
4018         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4019                             TAINT_FIRMWARE_WORKAROUND,
4020                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4021                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4022 }
4023 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4024
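/*
 * Mark DMAR units that can be ignored: units with an empty device
 * scope, and graphics-only units when gfx mapping is disabled (their
 * devices then get the dummy domain and bypass the IOMMU).
 */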
4025 static void __init init_no_remapping_devices(void)
4026 {
4027         struct dmar_drhd_unit *drhd;
4028         struct device *dev;
4029         int i;
4030
4031         for_each_drhd_unit(drhd) {
4032                 if (!drhd->include_all) {
4033                         for_each_active_dev_scope(drhd->devices,
4034                                                   drhd->devices_cnt, i, dev)
4035                                 break;
4036                         /* ignore DMAR unit if no devices exist */
4037                         if (i == drhd->devices_cnt)
4038                                 drhd->ignored = 1;
4039                 }
4040         }
4041
4042         for_each_active_drhd_unit(drhd) {
4043                 if (drhd->include_all)
4044                         continue;
4045
4046                 for_each_active_dev_scope(drhd->devices,
4047                                           drhd->devices_cnt, i, dev)
4048                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4049                                 break;
4050                 if (i < drhd->devices_cnt)
4051                         continue;
4052
4053                 /* This IOMMU has *only* gfx devices. Either bypass it or
4054                    set the gfx_mapped flag, as appropriate */
4055                 if (dmar_map_gfx) {
4056                         intel_iommu_gfx_mapped = 1;
4057                 } else {
4058                         drhd->ignored = 1;
4059                         for_each_active_dev_scope(drhd->devices,
4060                                                   drhd->devices_cnt, i, dev)
4061                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4062                 }
4063         }
4064 }
4065
4066 #ifdef CONFIG_SUSPEND
4067 static int init_iommu_hw(void)
4068 {
4069         struct dmar_drhd_unit *drhd;
4070         struct intel_iommu *iommu = NULL;
4071
4072         for_each_active_iommu(iommu, drhd)
4073                 if (iommu->qi)
4074                         dmar_reenable_qi(iommu);
4075
4076         for_each_iommu(iommu, drhd) {
4077                 if (drhd->ignored) {
4078                         /*
4079                          * we always have to disable PMRs or DMA may fail on
4080                          * this device
4081                          */
4082                         if (force_on)
4083                                 iommu_disable_protect_mem_regions(iommu);
4084                         continue;
4085                 }
4086
4087                 iommu_flush_write_buffer(iommu);
4088
4089                 iommu_set_root_entry(iommu);
4090
4091                 iommu->flush.flush_context(iommu, 0, 0, 0,
4092                                            DMA_CCMD_GLOBAL_INVL);
4093                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4094                 iommu_enable_translation(iommu);
4095                 iommu_disable_protect_mem_regions(iommu);
4096         }
4097
4098         return 0;
4099 }
4100
4101 static void iommu_flush_all(void)
4102 {
4103         struct dmar_drhd_unit *drhd;
4104         struct intel_iommu *iommu;
4105
4106         for_each_active_iommu(iommu, drhd) {
4107                 iommu->flush.flush_context(iommu, 0, 0, 0,
4108                                            DMA_CCMD_GLOBAL_INVL);
4109                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4110                                          DMA_TLB_GLOBAL_FLUSH);
4111         }
4112 }
4113
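/*
 * System suspend: flush all caches, disable translation and save the
 * fault-event registers so they can be restored on resume.
 */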
4114 static int iommu_suspend(void)
4115 {
4116         struct dmar_drhd_unit *drhd;
4117         struct intel_iommu *iommu = NULL;
4118         unsigned long flag;
4119
4120         for_each_active_iommu(iommu, drhd) {
4121                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4122                                                  GFP_ATOMIC);
4123                 if (!iommu->iommu_state)
4124                         goto nomem;
4125         }
4126
4127         iommu_flush_all();
4128
4129         for_each_active_iommu(iommu, drhd) {
4130                 iommu_disable_translation(iommu);
4131
4132                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4133
4134                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4135                         readl(iommu->reg + DMAR_FECTL_REG);
4136                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4137                         readl(iommu->reg + DMAR_FEDATA_REG);
4138                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4139                         readl(iommu->reg + DMAR_FEADDR_REG);
4140                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4141                         readl(iommu->reg + DMAR_FEUADDR_REG);
4142
4143                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4144         }
4145         return 0;
4146
4147 nomem:
4148         for_each_active_iommu(iommu, drhd)
4149                 kfree(iommu->iommu_state);
4150
4151         return -ENOMEM;
4152 }
4153
4154 static void iommu_resume(void)
4155 {
4156         struct dmar_drhd_unit *drhd;
4157         struct intel_iommu *iommu = NULL;
4158         unsigned long flag;
4159
4160         if (init_iommu_hw()) {
4161                 if (force_on)
4162                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4163                 else
4164                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4165                 return;
4166         }
4167
4168         for_each_active_iommu(iommu, drhd) {
4169
4170                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4171
4172                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4173                         iommu->reg + DMAR_FECTL_REG);
4174                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4175                         iommu->reg + DMAR_FEDATA_REG);
4176                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4177                         iommu->reg + DMAR_FEADDR_REG);
4178                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4179                         iommu->reg + DMAR_FEUADDR_REG);
4180
4181                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4182         }
4183
4184         for_each_active_iommu(iommu, drhd)
4185                 kfree(iommu->iommu_state);
4186 }
4187
4188 static struct syscore_ops iommu_syscore_ops = {
4189         .resume         = iommu_resume,
4190         .suspend        = iommu_suspend,
4191 };
4192
4193 static void __init init_iommu_pm_ops(void)
4194 {
4195         register_syscore_ops(&iommu_syscore_ops);
4196 }
4197
4198 #else
4199 static inline void init_iommu_pm_ops(void) {}
4200 #endif  /* CONFIG_SUSPEND */
4201
4202
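/*
 * Parse one RMRR structure from the DMAR table into a dmar_rmrr_unit,
 * including its reserved IOMMU region and the device scope it covers.
 */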
4203 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4204 {
4205         struct acpi_dmar_reserved_memory *rmrr;
4206         int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4207         struct dmar_rmrr_unit *rmrru;
4208         size_t length;
4209
4210         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4211         if (!rmrru)
4212                 goto out;
4213
4214         rmrru->hdr = header;
4215         rmrr = (struct acpi_dmar_reserved_memory *)header;
4216         rmrru->base_address = rmrr->base_address;
4217         rmrru->end_address = rmrr->end_address;
4218
4219         length = rmrr->end_address - rmrr->base_address + 1;
4220         rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4221                                               IOMMU_RESV_DIRECT);
4222         if (!rmrru->resv)
4223                 goto free_rmrru;
4224
4225         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4226                                 ((void *)rmrr) + rmrr->header.length,
4227                                 &rmrru->devices_cnt);
4228         if (rmrru->devices_cnt && rmrru->devices == NULL)
4229                 goto free_all;
4230
4231         list_add(&rmrru->list, &dmar_rmrr_units);
4232
4233         return 0;
4234 free_all:
4235         kfree(rmrru->resv);
4236 free_rmrru:
4237         kfree(rmrru);
4238 out:
4239         return -ENOMEM;
4240 }
4241
4242 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4243 {
4244         struct dmar_atsr_unit *atsru;
4245         struct acpi_dmar_atsr *tmp;
4246
4247         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4248                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4249                 if (atsr->segment != tmp->segment)
4250                         continue;
4251                 if (atsr->header.length != tmp->header.length)
4252                         continue;
4253                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4254                         return atsru;
4255         }
4256
4257         return NULL;
4258 }
4259
4260 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4261 {
4262         struct acpi_dmar_atsr *atsr;
4263         struct dmar_atsr_unit *atsru;
4264
4265         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4266                 return 0;
4267
4268         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4269         atsru = dmar_find_atsr(atsr);
4270         if (atsru)
4271                 return 0;
4272
4273         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4274         if (!atsru)
4275                 return -ENOMEM;
4276
4277         /*
4278          * If memory is allocated from slab by ACPI _DSM method, we need to
4279          * copy the memory content because the memory buffer will be freed
4280          * on return.
4281          */
4282         atsru->hdr = (void *)(atsru + 1);
4283         memcpy(atsru->hdr, hdr, hdr->length);
4284         atsru->include_all = atsr->flags & 0x1;
4285         if (!atsru->include_all) {
4286                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4287                                 (void *)atsr + atsr->header.length,
4288                                 &atsru->devices_cnt);
4289                 if (atsru->devices_cnt && atsru->devices == NULL) {
4290                         kfree(atsru);
4291                         return -ENOMEM;
4292                 }
4293         }
4294
4295         list_add_rcu(&atsru->list, &dmar_atsr_units);
4296
4297         return 0;
4298 }
4299
4300 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4301 {
4302         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4303         kfree(atsru);
4304 }
4305
4306 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4307 {
4308         struct acpi_dmar_atsr *atsr;
4309         struct dmar_atsr_unit *atsru;
4310
4311         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4312         atsru = dmar_find_atsr(atsr);
4313         if (atsru) {
4314                 list_del_rcu(&atsru->list);
4315                 synchronize_rcu();
4316                 intel_iommu_free_atsr(atsru);
4317         }
4318
4319         return 0;
4320 }
4321
4322 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4323 {
4324         int i;
4325         struct device *dev;
4326         struct acpi_dmar_atsr *atsr;
4327         struct dmar_atsr_unit *atsru;
4328
4329         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4330         atsru = dmar_find_atsr(atsr);
4331         if (!atsru)
4332                 return 0;
4333
4334         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4335                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4336                                           i, dev)
4337                         return -EBUSY;
4338         }
4339
4340         return 0;
4341 }
4342
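/*
 * Bring a hot-added DMAR unit up to the same state as the boot-time
 * units: check feature compatibility, allocate domain ids and a root
 * entry, then enable queued invalidation, interrupts and translation.
 */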
4343 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4344 {
4345         int sp, ret;
4346         struct intel_iommu *iommu = dmaru->iommu;
4347
4348         if (g_iommus[iommu->seq_id])
4349                 return 0;
4350
4351         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4352                 pr_warn("%s: Doesn't support hardware pass through.\n",
4353                         iommu->name);
4354                 return -ENXIO;
4355         }
4356         if (!ecap_sc_support(iommu->ecap) &&
4357             domain_update_iommu_snooping(iommu)) {
4358                 pr_warn("%s: Doesn't support snooping.\n",
4359                         iommu->name);
4360                 return -ENXIO;
4361         }
4362         sp = domain_update_iommu_superpage(iommu) - 1;
4363         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4364                 pr_warn("%s: Doesn't support large page.\n",
4365                         iommu->name);
4366                 return -ENXIO;
4367         }
4368
4369         /*
4370          * Disable translation if already enabled prior to OS handover.
4371          */
4372         if (iommu->gcmd & DMA_GCMD_TE)
4373                 iommu_disable_translation(iommu);
4374
4375         g_iommus[iommu->seq_id] = iommu;
4376         ret = iommu_init_domains(iommu);
4377         if (ret == 0)
4378                 ret = iommu_alloc_root_entry(iommu);
4379         if (ret)
4380                 goto out;
4381
4382 #ifdef CONFIG_INTEL_IOMMU_SVM
4383         if (pasid_supported(iommu))
4384                 intel_svm_init(iommu);
4385 #endif
4386
4387         if (dmaru->ignored) {
4388                 /*
4389                  * we always have to disable PMRs or DMA may fail on this device
4390                  */
4391                 if (force_on)
4392                         iommu_disable_protect_mem_regions(iommu);
4393                 return 0;
4394         }
4395
4396         intel_iommu_init_qi(iommu);
4397         iommu_flush_write_buffer(iommu);
4398
4399 #ifdef CONFIG_INTEL_IOMMU_SVM
4400         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4401                 ret = intel_svm_enable_prq(iommu);
4402                 if (ret)
4403                         goto disable_iommu;
4404         }
4405 #endif
4406         ret = dmar_set_interrupt(iommu);
4407         if (ret)
4408                 goto disable_iommu;
4409
4410         iommu_set_root_entry(iommu);
4411         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4412         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4413         iommu_enable_translation(iommu);
4414
4415         iommu_disable_protect_mem_regions(iommu);
4416         return 0;
4417
4418 disable_iommu:
4419         disable_dmar_iommu(iommu);
4420 out:
4421         free_dmar_iommu(iommu);
4422         return ret;
4423 }
4424
4425 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4426 {
4427         int ret = 0;
4428         struct intel_iommu *iommu = dmaru->iommu;
4429
4430         if (!intel_iommu_enabled)
4431                 return 0;
4432         if (iommu == NULL)
4433                 return -EINVAL;
4434
4435         if (insert) {
4436                 ret = intel_iommu_add(dmaru);
4437         } else {
4438                 disable_dmar_iommu(iommu);
4439                 free_dmar_iommu(iommu);
4440         }
4441
4442         return ret;
4443 }
4444
4445 static void intel_iommu_free_dmars(void)
4446 {
4447         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4448         struct dmar_atsr_unit *atsru, *atsr_n;
4449
4450         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4451                 list_del(&rmrru->list);
4452                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4453                 kfree(rmrru->resv);
4454                 kfree(rmrru);
4455         }
4456
4457         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4458                 list_del(&atsru->list);
4459                 intel_iommu_free_atsr(atsru);
4460         }
4461 }
4462
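/*
 * Return 1 if ATS is allowed for @dev: either the device is root
 * complex integrated, or its root port is covered by an ATSR
 * structure (explicitly or via an ATSR that covers all ports) for
 * the segment.
 */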
4463 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4464 {
4465         int i, ret = 1;
4466         struct pci_bus *bus;
4467         struct pci_dev *bridge = NULL;
4468         struct device *tmp;
4469         struct acpi_dmar_atsr *atsr;
4470         struct dmar_atsr_unit *atsru;
4471
4472         dev = pci_physfn(dev);
4473         for (bus = dev->bus; bus; bus = bus->parent) {
4474                 bridge = bus->self;
4475                 /* If it's an integrated device, allow ATS */
4476                 if (!bridge)
4477                         return 1;
4478                 /* Connected via non-PCIe: no ATS */
4479                 if (!pci_is_pcie(bridge) ||
4480                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4481                         return 0;
4482                 /* If we found the root port, look it up in the ATSR */
4483                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4484                         break;
4485         }
4486
4487         rcu_read_lock();
4488         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4489                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4490                 if (atsr->segment != pci_domain_nr(dev->bus))
4491                         continue;
4492
4493                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4494                         if (tmp == &bridge->dev)
4495                                 goto out;
4496
4497                 if (atsru->include_all)
4498                         goto out;
4499         }
4500         ret = 0;
4501 out:
4502         rcu_read_unlock();
4503
4504         return ret;
4505 }
4506
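/*
 * Keep the RMRR and ATSR device scope lists in sync with PCI device
 * add/remove notifications.
 */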
4507 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4508 {
4509         int ret;
4510         struct dmar_rmrr_unit *rmrru;
4511         struct dmar_atsr_unit *atsru;
4512         struct acpi_dmar_atsr *atsr;
4513         struct acpi_dmar_reserved_memory *rmrr;
4514
4515         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4516                 return 0;
4517
4518         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4519                 rmrr = container_of(rmrru->hdr,
4520                                     struct acpi_dmar_reserved_memory, header);
4521                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4522                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4523                                 ((void *)rmrr) + rmrr->header.length,
4524                                 rmrr->segment, rmrru->devices,
4525                                 rmrru->devices_cnt);
4526                         if (ret < 0)
4527                                 return ret;
4528                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4529                         dmar_remove_dev_scope(info, rmrr->segment,
4530                                 rmrru->devices, rmrru->devices_cnt);
4531                 }
4532         }
4533
4534         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4535                 if (atsru->include_all)
4536                         continue;
4537
4538                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4539                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4540                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4541                                         (void *)atsr + atsr->header.length,
4542                                         atsr->segment, atsru->devices,
4543                                         atsru->devices_cnt);
4544                         if (ret > 0)
4545                                 break;
4546                         else if (ret < 0)
4547                                 return ret;
4548                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4549                         if (dmar_remove_dev_scope(info, atsr->segment,
4550                                         atsru->devices, atsru->devices_cnt))
4551                                 break;
4552                 }
4553         }
4554
4555         return 0;
4556 }
4557
4558 /*
4559  * Here we only respond to a device being unbound from its driver.
4560  *
4561  * A newly added device is not attached to its DMAR domain here yet; that
4562  * happens when the device is first mapped to an IOVA.
4563  */
4564 static int device_notifier(struct notifier_block *nb,
4565                                   unsigned long action, void *data)
4566 {
4567         struct device *dev = data;
4568         struct dmar_domain *domain;
4569
4570         if (iommu_dummy(dev))
4571                 return 0;
4572
4573         if (action == BUS_NOTIFY_REMOVED_DEVICE) {
4574                 domain = find_domain(dev);
4575                 if (!domain)
4576                         return 0;
4577
4578                 dmar_remove_one_dev_info(dev);
4579                 if (!domain_type_is_vm_or_si(domain) &&
4580                     list_empty(&domain->devices))
4581                         domain_exit(domain);
4582         } else if (action == BUS_NOTIFY_ADD_DEVICE) {
4583                 if (iommu_should_identity_map(dev, 1))
4584                         domain_add_dev_info(si_domain, dev);
4585         }
4586
4587         return 0;
4588 }
4589
4590 static struct notifier_block device_nb = {
4591         .notifier_call = device_notifier,
4592 };
4593
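/*
 * Memory hotplug support for the static identity (si) domain: extend the
 * identity map when memory goes online, and unmap and flush the corresponding
 * IOVA range when it goes offline again.
 */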
4594 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4595                                        unsigned long val, void *v)
4596 {
4597         struct memory_notify *mhp = v;
4598         unsigned long long start, end;
4599         unsigned long start_vpfn, last_vpfn;
4600
4601         switch (val) {
4602         case MEM_GOING_ONLINE:
4603                 start = mhp->start_pfn << PAGE_SHIFT;
4604                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4605                 if (iommu_domain_identity_map(si_domain, start, end)) {
4606                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4607                                 start, end);
4608                         return NOTIFY_BAD;
4609                 }
4610                 break;
4611
4612         case MEM_OFFLINE:
4613         case MEM_CANCEL_ONLINE:
4614                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4615                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4616                 while (start_vpfn <= last_vpfn) {
4617                         struct iova *iova;
4618                         struct dmar_drhd_unit *drhd;
4619                         struct intel_iommu *iommu;
4620                         struct page *freelist;
4621
4622                         iova = find_iova(&si_domain->iovad, start_vpfn);
4623                         if (iova == NULL) {
4624                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4625                                          start_vpfn);
4626                                 break;
4627                         }
4628
4629                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4630                                                      start_vpfn, last_vpfn);
4631                         if (iova == NULL) {
4632                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4633                                         start_vpfn, last_vpfn);
4634                                 return NOTIFY_BAD;
4635                         }
4636
4637                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4638                                                iova->pfn_hi);
4639
4640                         rcu_read_lock();
4641                         for_each_active_iommu(iommu, drhd)
4642                                 iommu_flush_iotlb_psi(iommu, si_domain,
4643                                         iova->pfn_lo, iova_size(iova),
4644                                         !freelist, 0);
4645                         rcu_read_unlock();
4646                         dma_free_pagelist(freelist);
4647
4648                         start_vpfn = iova->pfn_hi + 1;
4649                         free_iova_mem(iova);
4650                 }
4651                 break;
4652         }
4653
4654         return NOTIFY_OK;
4655 }
4656
4657 static struct notifier_block intel_iommu_memory_nb = {
4658         .notifier_call = intel_iommu_memory_notifier,
4659         .priority = 0
4660 };
4661
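/*
 * Free the IOVAs cached on @cpu for every domain on every active IOMMU;
 * called from the CPU hotplug "dead" callback below.
 */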
4662 static void free_all_cpu_cached_iovas(unsigned int cpu)
4663 {
4664         int i;
4665
4666         for (i = 0; i < g_num_of_iommus; i++) {
4667                 struct intel_iommu *iommu = g_iommus[i];
4668                 struct dmar_domain *domain;
4669                 int did;
4670
4671                 if (!iommu)
4672                         continue;
4673
4674                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4675                         domain = get_iommu_domain(iommu, (u16)did);
4676
4677                         if (!domain)
4678                                 continue;
4679                         free_cpu_cached_iovas(cpu, &domain->iovad);
4680                 }
4681         }
4682 }
4683
4684 static int intel_iommu_cpu_dead(unsigned int cpu)
4685 {
4686         free_all_cpu_cached_iovas(cpu);
4687         return 0;
4688 }
4689
4690 static void intel_disable_iommus(void)
4691 {
4692         struct intel_iommu *iommu = NULL;
4693         struct dmar_drhd_unit *drhd;
4694
4695         for_each_iommu(iommu, drhd)
4696                 iommu_disable_translation(iommu);
4697 }
4698
4699 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4700 {
4701         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4702
4703         return container_of(iommu_dev, struct intel_iommu, iommu);
4704 }
4705
4706 static ssize_t intel_iommu_show_version(struct device *dev,
4707                                         struct device_attribute *attr,
4708                                         char *buf)
4709 {
4710         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4711         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4712         return sprintf(buf, "%d:%d\n",
4713                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4714 }
4715 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4716
4717 static ssize_t intel_iommu_show_address(struct device *dev,
4718                                         struct device_attribute *attr,
4719                                         char *buf)
4720 {
4721         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4722         return sprintf(buf, "%llx\n", iommu->reg_phys);
4723 }
4724 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4725
4726 static ssize_t intel_iommu_show_cap(struct device *dev,
4727                                     struct device_attribute *attr,
4728                                     char *buf)
4729 {
4730         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4731         return sprintf(buf, "%llx\n", iommu->cap);
4732 }
4733 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4734
4735 static ssize_t intel_iommu_show_ecap(struct device *dev,
4736                                     struct device_attribute *attr,
4737                                     char *buf)
4738 {
4739         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4740         return sprintf(buf, "%llx\n", iommu->ecap);
4741 }
4742 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4743
4744 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4745                                       struct device_attribute *attr,
4746                                       char *buf)
4747 {
4748         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4749         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4750 }
4751 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4752
4753 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4754                                            struct device_attribute *attr,
4755                                            char *buf)
4756 {
4757         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4758         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4759                                                   cap_ndoms(iommu->cap)));
4760 }
4761 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4762
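/*
 * Per-IOMMU sysfs attributes. Assuming the usual sysfs layout, these show up
 * under /sys/class/iommu/<iommu>/intel-iommu/, e.g.
 * /sys/class/iommu/dmar0/intel-iommu/cap.
 */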
4763 static struct attribute *intel_iommu_attrs[] = {
4764         &dev_attr_version.attr,
4765         &dev_attr_address.attr,
4766         &dev_attr_cap.attr,
4767         &dev_attr_ecap.attr,
4768         &dev_attr_domains_supported.attr,
4769         &dev_attr_domains_used.attr,
4770         NULL,
4771 };
4772
4773 static struct attribute_group intel_iommu_group = {
4774         .name = "intel-iommu",
4775         .attrs = intel_iommu_attrs,
4776 };
4777
4778 const struct attribute_group *intel_iommu_groups[] = {
4779         &intel_iommu_group,
4780         NULL,
4781 };
4782
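/*
 * If the platform firmware has opted in to DMA protection (and this is not
 * overridden with no_platform_optin) and an untrusted PCI device
 * (pdev->untrusted) is present, force the IOMMU on even when it was disabled
 * on the command line. Returns 1 if the IOMMU was force enabled, 0 otherwise.
 */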
4783 static int __init platform_optin_force_iommu(void)
4784 {
4785         struct pci_dev *pdev = NULL;
4786         bool has_untrusted_dev = false;
4787
4788         if (!dmar_platform_optin() || no_platform_optin)
4789                 return 0;
4790
4791         for_each_pci_dev(pdev) {
4792                 if (pdev->untrusted) {
4793                         has_untrusted_dev = true;
4794                         break;
4795                 }
4796         }
4797
4798         if (!has_untrusted_dev)
4799                 return 0;
4800
4801         if (no_iommu || dmar_disabled)
4802                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4803
4804         /*
4805          * If Intel-IOMMU is disabled by default, we will apply an identity
4806          * map to all devices except those marked as being untrusted.
4807          */
4808         if (dmar_disabled)
4809                 iommu_identity_mapping |= IDENTMAP_ALL;
4810
4811         dmar_disabled = 0;
4812 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4813         swiotlb = 0;
4814 #endif
4815         no_iommu = 0;
4816
4817         return 1;
4818 }
4819
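/*
 * Main VT-d initialization entry point: parse the DMAR tables, bring up each
 * IOMMU, install the Intel DMA ops and register with the generic IOMMU core.
 */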
4820 int __init intel_iommu_init(void)
4821 {
4822         int ret = -ENODEV;
4823         struct dmar_drhd_unit *drhd;
4824         struct intel_iommu *iommu;
4825
4826         /*
4827          * Intel IOMMU is required for a TXT/tboot launch or platform
4828          * opt in, so enforce that.
4829          */
4830         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4831
4832         if (iommu_init_mempool()) {
4833                 if (force_on)
4834                         panic("tboot: Failed to initialize iommu memory\n");
4835                 return -ENOMEM;
4836         }
4837
4838         down_write(&dmar_global_lock);
4839         if (dmar_table_init()) {
4840                 if (force_on)
4841                         panic("tboot: Failed to initialize DMAR table\n");
4842                 goto out_free_dmar;
4843         }
4844
4845         if (dmar_dev_scope_init() < 0) {
4846                 if (force_on)
4847                         panic("tboot: Failed to initialize DMAR device scope\n");
4848                 goto out_free_dmar;
4849         }
4850
4851         up_write(&dmar_global_lock);
4852
4853         /*
4854          * The bus notifier takes the dmar_global_lock, so lockdep will
4855          * complain later when we register it under the lock.
4856          */
4857         dmar_register_bus_notifier();
4858
4859         down_write(&dmar_global_lock);
4860
4861         if (no_iommu || dmar_disabled) {
4862                 /*
4863                  * We exit the function here to ensure IOMMU's remapping and
4864                  * mempool aren't set up, which means that the IOMMU's PMRs
4865                  * won't be disabled via the call to init_dmars(). So disable
4866                  * them explicitly here. The PMRs were set up by tboot prior to
4867                  * calling SENTER, but the kernel is expected to reset/tear
4868                  * down the PMRs.
4869                  */
4870                 if (intel_iommu_tboot_noforce) {
4871                         for_each_iommu(iommu, drhd)
4872                                 iommu_disable_protect_mem_regions(iommu);
4873                 }
4874
4875                 /*
4876                  * Make sure the IOMMUs are switched off, even when we
4877                  * boot into a kexec kernel and the previous kernel left
4878                  * them enabled.
4879                  */
4880                 intel_disable_iommus();
4881                 goto out_free_dmar;
4882         }
4883
4884         if (list_empty(&dmar_rmrr_units))
4885                 pr_info("No RMRR found\n");
4886
4887         if (list_empty(&dmar_atsr_units))
4888                 pr_info("No ATSR found\n");
4889
4890         if (dmar_init_reserved_ranges()) {
4891                 if (force_on)
4892                         panic("tboot: Failed to reserve iommu ranges\n");
4893                 goto out_free_reserved_range;
4894         }
4895
4896         init_no_remapping_devices();
4897
4898         ret = init_dmars();
4899         if (ret) {
4900                 if (force_on)
4901                         panic("tboot: Failed to initialize DMARs\n");
4902                 pr_err("Initialization failed\n");
4903                 goto out_free_reserved_range;
4904         }
4905         up_write(&dmar_global_lock);
4906         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4907
4908 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4909         swiotlb = 0;
4910 #endif
4911         dma_ops = &intel_dma_ops;
4912
4913         init_iommu_pm_ops();
4914
4915         for_each_active_iommu(iommu, drhd) {
4916                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4917                                        intel_iommu_groups,
4918                                        "%s", iommu->name);
4919                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4920                 iommu_device_register(&iommu->iommu);
4921         }
4922
4923         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4924         bus_register_notifier(&pci_bus_type, &device_nb);
4925         if (si_domain && !hw_pass_through)
4926                 register_memory_notifier(&intel_iommu_memory_nb);
4927         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4928                           intel_iommu_cpu_dead);
4929         intel_iommu_enabled = 1;
4930         intel_iommu_debugfs_init();
4931
4932         return 0;
4933
4934 out_free_reserved_range:
4935         put_iova_domain(&reserved_iova_list);
4936 out_free_dmar:
4937         intel_iommu_free_dmars();
4938         up_write(&dmar_global_lock);
4939         iommu_exit_mempool();
4940         return ret;
4941 }
4942
4943 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4944 {
4945         struct intel_iommu *iommu = opaque;
4946
4947         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4948         return 0;
4949 }
4950
4951 /*
4952  * NB - intel-iommu lacks any sort of reference counting for the users of
4953  * dependent devices.  If multiple endpoints have intersecting dependent
4954  * devices, unbinding the driver from any one of them will possibly leave
4955  * the others unable to operate.
4956  */
4957 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4958 {
4959         if (!iommu || !dev || !dev_is_pci(dev))
4960                 return;
4961
4962         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4963 }
4964
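/*
 * Tear down all state attached to a device_domain_info: PASID state, device
 * IOTLB, context entries and the domain's reference on the IOMMU, where
 * applicable. The caller must hold device_domain_lock.
 */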
4965 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4966 {
4967         struct intel_iommu *iommu;
4968         unsigned long flags;
4969
4970         assert_spin_locked(&device_domain_lock);
4971
4972         if (WARN_ON(!info))
4973                 return;
4974
4975         iommu = info->iommu;
4976
4977         if (info->dev) {
4978                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4979                         intel_pasid_tear_down_entry(iommu, info->dev,
4980                                         PASID_RID2PASID);
4981
4982                 iommu_disable_dev_iotlb(info);
4983                 domain_context_clear(iommu, info->dev);
4984                 intel_pasid_free_table(info->dev);
4985         }
4986
4987         unlink_domain_info(info);
4988
4989         spin_lock_irqsave(&iommu->lock, flags);
4990         domain_detach_iommu(info->domain, iommu);
4991         spin_unlock_irqrestore(&iommu->lock, flags);
4992
4993         free_devinfo_mem(info);
4994 }
4995
4996 static void dmar_remove_one_dev_info(struct device *dev)
4997 {
4998         struct device_domain_info *info;
4999         unsigned long flags;
5000
5001         spin_lock_irqsave(&device_domain_lock, flags);
5002         info = dev->archdata.iommu;
5003         __dmar_remove_one_dev_info(info);
5004         spin_unlock_irqrestore(&device_domain_lock, flags);
5005 }
5006
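/*
 * Initialize a dmar_domain for use through the generic IOMMU API: set up its
 * IOVA allocator, derive the (adjusted) address width and allocate the
 * top-level page table.
 */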
5007 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5008 {
5009         int adjust_width;
5010
5011         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5012         domain_reserve_special_ranges(domain);
5013
5014         /* calculate AGAW */
5015         domain->gaw = guest_width;
5016         adjust_width = guestwidth_to_adjustwidth(guest_width);
5017         domain->agaw = width_to_agaw(adjust_width);
5018
5019         domain->iommu_coherency = 0;
5020         domain->iommu_snooping = 0;
5021         domain->iommu_superpage = 0;
5022         domain->max_addr = 0;
5023
5024         /* always allocate the top pgd */
5025         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5026         if (!domain->pgd)
5027                 return -ENOMEM;
5028         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5029         return 0;
5030 }
5031
5032 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5033 {
5034         struct dmar_domain *dmar_domain;
5035         struct iommu_domain *domain;
5036
5037         if (type != IOMMU_DOMAIN_UNMANAGED)
5038                 return NULL;
5039
5040         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
5041         if (!dmar_domain) {
5042                 pr_err("Can't allocate dmar_domain\n");
5043                 return NULL;
5044         }
5045         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5046                 pr_err("Domain initialization failed\n");
5047                 domain_exit(dmar_domain);
5048                 return NULL;
5049         }
5050         domain_update_iommu_cap(dmar_domain);
5051
5052         domain = &dmar_domain->domain;
5053         domain->geometry.aperture_start = 0;
5054         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5055         domain->geometry.force_aperture = true;
5056
5057         return domain;
5058 }
5059
5060 static void intel_iommu_domain_free(struct iommu_domain *domain)
5061 {
5062         domain_exit(to_dmar_domain(domain));
5063 }
5064
5065 static int intel_iommu_attach_device(struct iommu_domain *domain,
5066                                      struct device *dev)
5067 {
5068         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5069         struct intel_iommu *iommu;
5070         int addr_width;
5071         u8 bus, devfn;
5072
5073         if (device_is_rmrr_locked(dev)) {
5074                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5075                 return -EPERM;
5076         }
5077
5078         /* normally dev is not mapped */
5079         if (unlikely(domain_context_mapped(dev))) {
5080                 struct dmar_domain *old_domain;
5081
5082                 old_domain = find_domain(dev);
5083                 if (old_domain) {
5084                         rcu_read_lock();
5085                         dmar_remove_one_dev_info(dev);
5086                         rcu_read_unlock();
5087
5088                         if (!domain_type_is_vm_or_si(old_domain) &&
5089                              list_empty(&old_domain->devices))
5090                                 domain_exit(old_domain);
5091                 }
5092         }
5093
5094         iommu = device_to_iommu(dev, &bus, &devfn);
5095         if (!iommu)
5096                 return -ENODEV;
5097
5098         /* check if this iommu agaw is sufficient for max mapped address */
5099         addr_width = agaw_to_width(iommu->agaw);
5100         if (addr_width > cap_mgaw(iommu->cap))
5101                 addr_width = cap_mgaw(iommu->cap);
5102
5103         if (dmar_domain->max_addr > (1LL << addr_width)) {
5104                 dev_err(dev,
5105                         "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5106                         __func__, addr_width, dmar_domain->max_addr);
5107                 return -EFAULT;
5108         }
5109         dmar_domain->gaw = addr_width;
5110
5111         /*
5112          * Knock out extra levels of page tables if necessary
5113          */
5114         while (iommu->agaw < dmar_domain->agaw) {
5115                 struct dma_pte *pte;
5116
5117                 pte = dmar_domain->pgd;
5118                 if (dma_pte_present(pte)) {
5119                         dmar_domain->pgd = (struct dma_pte *)
5120                                 phys_to_virt(dma_pte_addr(pte));
5121                         free_pgtable_page(pte);
5122                 }
5123                 dmar_domain->agaw--;
5124         }
5125
5126         return domain_add_dev_info(dmar_domain, dev);
5127 }
5128
5129 static void intel_iommu_detach_device(struct iommu_domain *domain,
5130                                       struct device *dev)
5131 {
5132         dmar_remove_one_dev_info(dev);
5133 }
5134
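/*
 * iommu_ops->map callback: map [iova, iova + size) to the physical range at
 * @hpa with the requested read/write/snoop permissions, growing the domain's
 * max_addr bookkeeping when needed.
 */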
5135 static int intel_iommu_map(struct iommu_domain *domain,
5136                            unsigned long iova, phys_addr_t hpa,
5137                            size_t size, int iommu_prot)
5138 {
5139         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5140         u64 max_addr;
5141         int prot = 0;
5142         int ret;
5143
5144         if (iommu_prot & IOMMU_READ)
5145                 prot |= DMA_PTE_READ;
5146         if (iommu_prot & IOMMU_WRITE)
5147                 prot |= DMA_PTE_WRITE;
5148         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5149                 prot |= DMA_PTE_SNP;
5150
5151         max_addr = iova + size;
5152         if (dmar_domain->max_addr < max_addr) {
5153                 u64 end;
5154
5155                 /* check if minimum agaw is sufficient for mapped address */
5156                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5157                 if (end < max_addr) {
5158                         pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5159                                __func__, dmar_domain->gaw,
5160                                max_addr);
5161                         return -EFAULT;
5162                 }
5163                 dmar_domain->max_addr = max_addr;
5164         }
5165         /* Round up size to next multiple of PAGE_SIZE, if it and
5166            the low bits of hpa would take us onto the next page */
5167         size = aligned_nrpages(hpa, size);
5168         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5169                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5170         return ret;
5171 }
5172
5173 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5174                                 unsigned long iova, size_t size)
5175 {
5176         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5177         struct page *freelist = NULL;
5178         unsigned long start_pfn, last_pfn;
5179         unsigned int npages;
5180         int iommu_id, level = 0;
5181
5182         /* Cope with horrid API which requires us to unmap more than the
5183            size argument if it happens to be a large-page mapping. */
5184         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5185
5186         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5187                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5188
5189         start_pfn = iova >> VTD_PAGE_SHIFT;
5190         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5191
5192         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5193
5194         npages = last_pfn - start_pfn + 1;
5195
5196         for_each_domain_iommu(iommu_id, dmar_domain)
5197                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5198                                       start_pfn, npages, !freelist, 0);
5199
5200         dma_free_pagelist(freelist);
5201
5202         if (dmar_domain->max_addr == iova + size)
5203                 dmar_domain->max_addr = iova;
5204
5205         return size;
5206 }
5207
5208 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5209                                             dma_addr_t iova)
5210 {
5211         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5212         struct dma_pte *pte;
5213         int level = 0;
5214         u64 phys = 0;
5215
5216         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5217         if (pte)
5218                 phys = dma_pte_addr(pte);
5219
5220         return phys;
5221 }
5222
5223 static bool intel_iommu_capable(enum iommu_cap cap)
5224 {
5225         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5226                 return domain_update_iommu_snooping(NULL) == 1;
5227         if (cap == IOMMU_CAP_INTR_REMAP)
5228                 return irq_remapping_enabled == 1;
5229
5230         return false;
5231 }
5232
5233 static int intel_iommu_add_device(struct device *dev)
5234 {
5235         struct intel_iommu *iommu;
5236         struct iommu_group *group;
5237         u8 bus, devfn;
5238
5239         iommu = device_to_iommu(dev, &bus, &devfn);
5240         if (!iommu)
5241                 return -ENODEV;
5242
5243         iommu_device_link(&iommu->iommu, dev);
5244
5245         group = iommu_group_get_for_dev(dev);
5246
5247         if (IS_ERR(group))
5248                 return PTR_ERR(group);
5249
5250         iommu_group_put(group);
5251         return 0;
5252 }
5253
5254 static void intel_iommu_remove_device(struct device *dev)
5255 {
5256         struct intel_iommu *iommu;
5257         u8 bus, devfn;
5258
5259         iommu = device_to_iommu(dev, &bus, &devfn);
5260         if (!iommu)
5261                 return;
5262
5263         iommu_group_remove_device(dev);
5264
5265         iommu_device_unlink(&iommu->iommu, dev);
5266 }
5267
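/*
 * Report the reserved regions of @device: any RMRR ranges whose device scope
 * names it, plus the common IOAPIC/MSI window.
 */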
5268 static void intel_iommu_get_resv_regions(struct device *device,
5269                                          struct list_head *head)
5270 {
5271         struct iommu_resv_region *reg;
5272         struct dmar_rmrr_unit *rmrr;
5273         struct device *i_dev;
5274         int i;
5275
5276         rcu_read_lock();
5277         for_each_rmrr_units(rmrr) {
5278                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5279                                           i, i_dev) {
5280                         if (i_dev != device)
5281                                 continue;
5282
5283                         list_add_tail(&rmrr->resv->list, head);
5284                 }
5285         }
5286         rcu_read_unlock();
5287
5288         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5289                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5290                                       0, IOMMU_RESV_MSI);
5291         if (!reg)
5292                 return;
5293         list_add_tail(&reg->list, head);
5294 }
5295
5296 static void intel_iommu_put_resv_regions(struct device *dev,
5297                                          struct list_head *head)
5298 {
5299         struct iommu_resv_region *entry, *next;
5300
5301         list_for_each_entry_safe(entry, next, head, list) {
5302                 if (entry->type == IOMMU_RESV_MSI)
5303                         kfree(entry);
5304         }
5305 }
5306
5307 #ifdef CONFIG_INTEL_IOMMU_SVM
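/*
 * Prepare the context entry of an SVM-capable device for PASID use: set
 * CONTEXT_PASIDE if needed, flush the context cache, and record the device's
 * source-id, domain-id and ATS queue depth in @sdev.
 */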
5308 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5309 {
5310         struct device_domain_info *info;
5311         struct context_entry *context;
5312         struct dmar_domain *domain;
5313         unsigned long flags;
5314         u64 ctx_lo;
5315         int ret;
5316
5317         domain = get_valid_domain_for_dev(sdev->dev);
5318         if (!domain)
5319                 return -EINVAL;
5320
5321         spin_lock_irqsave(&device_domain_lock, flags);
5322         spin_lock(&iommu->lock);
5323
5324         ret = -EINVAL;
5325         info = sdev->dev->archdata.iommu;
5326         if (!info || !info->pasid_supported)
5327                 goto out;
5328
5329         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5330         if (WARN_ON(!context))
5331                 goto out;
5332
5333         ctx_lo = context[0].lo;
5334
5335         sdev->did = domain->iommu_did[iommu->seq_id];
5336         sdev->sid = PCI_DEVID(info->bus, info->devfn);
5337
5338         if (!(ctx_lo & CONTEXT_PASIDE)) {
5339                 ctx_lo |= CONTEXT_PASIDE;
5340                 context[0].lo = ctx_lo;
5341                 wmb();
5342                 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5343                                            DMA_CCMD_MASK_NOBIT,
5344                                            DMA_CCMD_DEVICE_INVL);
5345         }
5346
5347         /* Enable PASID support in the device, if it wasn't already */
5348         if (!info->pasid_enabled)
5349                 iommu_enable_dev_iotlb(info);
5350
5351         if (info->ats_enabled) {
5352                 sdev->dev_iotlb = 1;
5353                 sdev->qdep = info->ats_qdep;
5354                 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5355                         sdev->qdep = 0;
5356         }
5357         ret = 0;
5358
5359  out:
5360         spin_unlock(&iommu->lock);
5361         spin_unlock_irqrestore(&device_domain_lock, flags);
5362
5363         return ret;
5364 }
5365
5366 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5367 {
5368         struct intel_iommu *iommu;
5369         u8 bus, devfn;
5370
5371         if (iommu_dummy(dev)) {
5372                 dev_warn(dev,
5373                          "No IOMMU translation for device; cannot enable SVM\n");
5374                 return NULL;
5375         }
5376
5377         iommu = device_to_iommu(dev, &bus, &devfn);
5378         if (!iommu) {
5379                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5380                 return NULL;
5381         }
5382
5383         return iommu;
5384 }
5385 #endif /* CONFIG_INTEL_IOMMU_SVM */
5386
5387 const struct iommu_ops intel_iommu_ops = {
5388         .capable                = intel_iommu_capable,
5389         .domain_alloc           = intel_iommu_domain_alloc,
5390         .domain_free            = intel_iommu_domain_free,
5391         .attach_dev             = intel_iommu_attach_device,
5392         .detach_dev             = intel_iommu_detach_device,
5393         .map                    = intel_iommu_map,
5394         .unmap                  = intel_iommu_unmap,
5395         .iova_to_phys           = intel_iommu_iova_to_phys,
5396         .add_device             = intel_iommu_add_device,
5397         .remove_device          = intel_iommu_remove_device,
5398         .get_resv_regions       = intel_iommu_get_resv_regions,
5399         .put_resv_regions       = intel_iommu_put_resv_regions,
5400         .device_group           = pci_device_group,
5401         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5402 };
5403
5404 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5405 {
5406         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5407         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5408         dmar_map_gfx = 0;
5409 }
5410
5411 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5412 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5413 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5414 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5415 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5416 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5417 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5418
5419 static void quirk_iommu_rwbf(struct pci_dev *dev)
5420 {
5421         /*
5422          * Mobile 4 Series Chipset neglects to set RWBF capability,
5423          * but needs it. Same seems to hold for the desktop versions.
5424          */
5425         pci_info(dev, "Forcing write-buffer flush capability\n");
5426         rwbf_quirk = 1;
5427 }
5428
5429 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5430 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5431 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5432 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5433 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5434 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5435 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5436
5437 #define GGC 0x52
5438 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5439 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5440 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5441 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5442 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5443 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5444 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5445 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5446
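/*
 * On these Ironlake/Calpella host bridges the BIOS must allocate a shadow GTT
 * for VT-d use, reported via the GGC register. If it has not
 * (GGC_MEMORY_VT_ENABLED clear), disable translation for graphics; otherwise
 * fall back to strict (unbatched) IOTLB flushing, since the GPU must be idle
 * before a flush.
 */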
5447 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5448 {
5449         unsigned short ggc;
5450
5451         if (pci_read_config_word(dev, GGC, &ggc))
5452                 return;
5453
5454         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5455                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5456                 dmar_map_gfx = 0;
5457         } else if (dmar_map_gfx) {
5458                 /* We have to ensure the gfx device is idle before we flush */
5459                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5460                 intel_iommu_strict = 1;
5461         }
5462 }
5463 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5464 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5465 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5466 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5467
5468 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5469    ISOCH DMAR unit for the Azalia sound device, but not give it any
5470    TLB entries, which causes it to deadlock. Check for that.  We do
5471    this in a function called from init_dmars(), instead of in a PCI
5472    quirk, because we don't want to print the obnoxious "BIOS broken"
5473    message if VT-d is actually disabled.
5474 */
5475 static void __init check_tylersburg_isoch(void)
5476 {
5477         struct pci_dev *pdev;
5478         uint32_t vtisochctrl;
5479
5480         /* If there's no Azalia in the system anyway, forget it. */
5481         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5482         if (!pdev)
5483                 return;
5484         pci_dev_put(pdev);
5485
5486         /* System Management Registers. Might be hidden, in which case
5487            we can't do the sanity check. But that's OK, because the
5488            known-broken BIOSes _don't_ actually hide it, so far. */
5489         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5490         if (!pdev)
5491                 return;
5492
5493         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5494                 pci_dev_put(pdev);
5495                 return;
5496         }
5497
5498         pci_dev_put(pdev);
5499
5500         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5501         if (vtisochctrl & 1)
5502                 return;
5503
5504         /* Drop all bits other than the number of TLB entries */
5505         vtisochctrl &= 0x1c;
5506
5507         /* If we have the recommended number of TLB entries (16), fine. */
5508         if (vtisochctrl == 0x10)
5509                 return;
5510
5511         /* Zero TLB entries? The BIOS is broken; warn and identity-map Azalia to cope. */
5512         if (!vtisochctrl) {
5513                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5514                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5515                      dmi_get_system_info(DMI_BIOS_VENDOR),
5516                      dmi_get_system_info(DMI_BIOS_VERSION),
5517                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5518                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5519                 return;
5520         }
5521
5522         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5523                vtisochctrl);
5524 }