iommu/vt-d: Remove segment from struct device_domain_info()
drivers/iommu/intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  */
19
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <asm/irq_remapping.h>
43 #include <asm/cacheflush.h>
44 #include <asm/iommu.h>
45
46 #include "irq_remapping.h"
47 #include "pci.h"
48
49 #define ROOT_SIZE               VTD_PAGE_SIZE
50 #define CONTEXT_SIZE            VTD_PAGE_SIZE
51
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
64
65 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
66 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
67
68 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
69    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
70 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
71                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
72 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
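/*
 * Worked example: with the default 48-bit guest address width,
 * __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1, i.e. a domain can map up
 * to 2^36 4KiB pages (256TiB of IOVA space), and DOMAIN_MAX_ADDR(48)
 * is that PFN shifted back up by VTD_PAGE_SHIFT.
 */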
73
74 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
75 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
76 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
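/*
 * With 4KiB pages IOVA_PFN(DMA_BIT_MASK(32)) == 0xfffff, so
 * DMA_32BIT_PFN is the last page frame reachable with a 32-bit DMA
 * mask; it is used below as the upper bound when initializing IOVA
 * domains.
 */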
77
78 /* page table handling */
79 #define LEVEL_STRIDE            (9)
80 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
81
82 /*
83  * This bitmap is used to advertise the page sizes our hardware supports
84  * to the IOMMU core, which will then use this information to split
85  * physically contiguous memory regions it is mapping into page sizes
86  * that we support.
87  *
88  * Traditionally the IOMMU core just handed us the mappings directly,
89  * after making sure the size is a power-of-two multiple of 4KiB and
90  * that the mapping has natural alignment.
91  *
92  * To retain this behavior, we currently advertise that we support
93  * all page sizes that are a power-of-two multiple of 4KiB.
94  *
95  * If at some point we'd like to utilize the IOMMU core's new behavior,
96  * we could change this to advertise the real page sizes we support.
97  */
98 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
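/*
 * ~0xFFFUL sets every bit from 12 upwards, i.e. 4KiB, 8KiB, 16KiB, ...
 * are all advertised as supported, even though the page tables below
 * only ever use 4KiB entries plus the 2MiB/1GiB/... superpages
 * described in struct dmar_domain.
 */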
99
100 static inline int agaw_to_level(int agaw)
101 {
102         return agaw + 2;
103 }
104
105 static inline int agaw_to_width(int agaw)
106 {
107         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
108 }
109
110 static inline int width_to_agaw(int width)
111 {
112         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
113 }
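/*
 * Example: the default 48-bit domain width gives agaw 2
 * (width_to_agaw(48) == DIV_ROUND_UP(18, 9) == 2), which corresponds
 * to a 4-level page table (agaw_to_level(2) == 4) and
 * agaw_to_width(2) == 48; agaw 1 would mean a 3-level, 39-bit table.
 */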
114
115 static inline unsigned int level_to_offset_bits(int level)
116 {
117         return (level - 1) * LEVEL_STRIDE;
118 }
119
120 static inline int pfn_level_offset(unsigned long pfn, int level)
121 {
122         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
123 }
124
125 static inline unsigned long level_mask(int level)
126 {
127         return -1UL << level_to_offset_bits(level);
128 }
129
130 static inline unsigned long level_size(int level)
131 {
132         return 1UL << level_to_offset_bits(level);
133 }
134
135 static inline unsigned long align_to_level(unsigned long pfn, int level)
136 {
137         return (pfn + level_size(level) - 1) & level_mask(level);
138 }
139
140 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
141 {
142         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
143 }
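/*
 * Worked example of the level arithmetic: at level 2 the offset field
 * starts at bit 9 of the PFN, so pfn_level_offset(pfn, 2) ==
 * (pfn >> 9) & 0x1ff picks one of 512 entries, and level_size(2) ==
 * lvl_to_nr_pages(2) == 512, i.e. a level-2 (2MiB) superpage covers
 * 512 4KiB VT-d pages.
 */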
144
145 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
146    are never going to work. */
147 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
148 {
149         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
150 }
151
152 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
153 {
154         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
155 }
156 static inline unsigned long page_to_dma_pfn(struct page *pg)
157 {
158         return mm_to_dma_pfn(page_to_pfn(pg));
159 }
160 static inline unsigned long virt_to_dma_pfn(void *p)
161 {
162         return page_to_dma_pfn(virt_to_page(p));
163 }
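/*
 * On x86 PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so the conversions above
 * are identity operations; on an architecture with, say, 64KiB MM
 * pages each MM pfn would map to 16 consecutive VT-d pfns.
 */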
164
165 /* global iommu list, set NULL for ignored DMAR units */
166 static struct intel_iommu **g_iommus;
167
168 static void __init check_tylersburg_isoch(void);
169 static int rwbf_quirk;
170
171 /*
172  * set to 1 to panic kernel if can't successfully enable VT-d
173  * (used when kernel is launched w/ TXT)
174  */
175 static int force_on = 0;
176
177 /*
178  * 0: Present
179  * 1-11: Reserved
180  * 12-63: Context Ptr (12 - (haw-1))
181  * 64-127: Reserved
182  */
183 struct root_entry {
184         u64     val;
185         u64     rsvd1;
186 };
187 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
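/*
 * With 4KiB pages and 16-byte root entries, ROOT_ENTRY_NR is 256:
 * one root entry per possible PCI bus number.
 */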
188 static inline bool root_present(struct root_entry *root)
189 {
190         return (root->val & 1);
191 }
192 static inline void set_root_present(struct root_entry *root)
193 {
194         root->val |= 1;
195 }
196 static inline void set_root_value(struct root_entry *root, unsigned long value)
197 {
198         root->val |= value & VTD_PAGE_MASK;
199 }
200
201 static inline struct context_entry *
202 get_context_addr_from_root(struct root_entry *root)
203 {
204         return (struct context_entry *)
205                 (root_present(root) ?
206                  phys_to_virt(root->val & VTD_PAGE_MASK) :
207                  NULL);
208 }
209
210 /*
211  * low 64 bits:
212  * 0: present
213  * 1: fault processing disable
214  * 2-3: translation type
215  * 12-63: address space root
216  * high 64 bits:
217  * 0-2: address width
218  * 3-6: available
219  * 8-23: domain id
220  */
221 struct context_entry {
222         u64 lo;
223         u64 hi;
224 };
225
226 static inline bool context_present(struct context_entry *context)
227 {
228         return (context->lo & 1);
229 }
230 static inline void context_set_present(struct context_entry *context)
231 {
232         context->lo |= 1;
233 }
234
235 static inline void context_set_fault_enable(struct context_entry *context)
236 {
237         context->lo &= (((u64)-1) << 2) | 1;
238 }
239
240 static inline void context_set_translation_type(struct context_entry *context,
241                                                 unsigned long value)
242 {
243         context->lo &= (((u64)-1) << 4) | 3;
244         context->lo |= (value & 3) << 2;
245 }
246
247 static inline void context_set_address_root(struct context_entry *context,
248                                             unsigned long value)
249 {
250         context->lo |= value & VTD_PAGE_MASK;
251 }
252
253 static inline void context_set_address_width(struct context_entry *context,
254                                              unsigned long value)
255 {
256         context->hi |= value & 7;
257 }
258
259 static inline void context_set_domain_id(struct context_entry *context,
260                                          unsigned long value)
261 {
262         context->hi |= (value & ((1 << 16) - 1)) << 8;
263 }
264
265 static inline void context_clear_entry(struct context_entry *context)
266 {
267         context->lo = 0;
268         context->hi = 0;
269 }
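/*
 * The setters above are used together when a context entry is
 * programmed for a device; a rough sketch (the actual sequence lives
 * further down in this file, in domain_context_mapping_one(), and the
 * CONTEXT_TT_* constants come from <linux/intel-iommu.h>):
 *
 *	context_set_domain_id(context, id);
 *	context_set_address_width(context, domain->agaw);
 *	context_set_address_root(context, virt_to_phys(domain->pgd));
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 */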
270
271 /*
272  * 0: readable
273  * 1: writable
274  * 2-6: reserved
275  * 7: super page
276  * 8-10: available
277  * 11: snoop behavior
278  * 12-63: Host physical address
279  */
280 struct dma_pte {
281         u64 val;
282 };
283
284 static inline void dma_clear_pte(struct dma_pte *pte)
285 {
286         pte->val = 0;
287 }
288
289 static inline u64 dma_pte_addr(struct dma_pte *pte)
290 {
291 #ifdef CONFIG_64BIT
292         return pte->val & VTD_PAGE_MASK;
293 #else
294         /* Must have a full atomic 64-bit read */
295         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
296 #endif
297 }
298
299 static inline bool dma_pte_present(struct dma_pte *pte)
300 {
301         return (pte->val & 3) != 0;
302 }
303
304 static inline bool dma_pte_superpage(struct dma_pte *pte)
305 {
306         return (pte->val & (1 << 7));
307 }
308
309 static inline int first_pte_in_page(struct dma_pte *pte)
310 {
311         return !((unsigned long)pte & ~VTD_PAGE_MASK);
312 }
313
314 /*
315  * This domain is a static identity mapping domain.
316  *      1. This domain creates a static 1:1 mapping to all usable memory.
317  *      2. It maps to each iommu if successful.
318  *      3. Each iommu maps to this domain if successful.
319  */
320 static struct dmar_domain *si_domain;
321 static int hw_pass_through = 1;
322
323 /* devices under the same p2p bridge are owned in one domain */
324 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
325
326 /* domain represents a virtual machine; more than one device
327  * across iommus may be owned by one domain, e.g. a kvm guest.
328  */
329 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
330
331 /* si_domain contains multiple devices */
332 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
333
334 /* define the limit of IOMMUs supported in each domain */
335 #ifdef  CONFIG_X86
336 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
337 #else
338 # define        IOMMU_UNITS_SUPPORTED   64
339 #endif
340
341 struct dmar_domain {
342         int     id;                     /* domain id */
343         int     nid;                    /* node id */
344         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
345                                         /* bitmap of iommus this domain uses*/
346
347         struct list_head devices;       /* all devices' list */
348         struct iova_domain iovad;       /* iova's that belong to this domain */
349
350         struct dma_pte  *pgd;           /* virtual address */
351         int             gaw;            /* max guest address width */
352
353         /* adjusted guest address width, 0 is level 2 30-bit */
354         int             agaw;
355
356         int             flags;          /* flags to find out type of domain */
357
358         int             iommu_coherency;/* indicate coherency of iommu access */
359         int             iommu_snooping; /* indicate snooping control feature*/
360         int             iommu_count;    /* reference count of iommu */
361         int             iommu_superpage;/* Level of superpages supported:
362                                            0 == 4KiB (no superpages), 1 == 2MiB,
363                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
364         spinlock_t      iommu_lock;     /* protect iommu set in domain */
365         u64             max_addr;       /* maximum mapped address */
366 };
367
368 /* PCI domain-device relationship */
369 struct device_domain_info {
370         struct list_head link;  /* link to domain siblings */
371         struct list_head global; /* link to global list */
372         u8 bus;                 /* PCI bus number */
373         u8 devfn;               /* PCI devfn number */
374         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
375         struct intel_iommu *iommu; /* IOMMU used by this device */
376         struct dmar_domain *domain; /* pointer to domain */
377 };
378
379 struct dmar_rmrr_unit {
380         struct list_head list;          /* list of rmrr units   */
381         struct acpi_dmar_header *hdr;   /* ACPI header          */
382         u64     base_address;           /* reserved base address*/
383         u64     end_address;            /* reserved end address */
384         struct dmar_dev_scope *devices; /* target devices */
385         int     devices_cnt;            /* target device count */
386 };
387
388 struct dmar_atsr_unit {
389         struct list_head list;          /* list of ATSR units */
390         struct acpi_dmar_header *hdr;   /* ACPI header */
391         struct dmar_dev_scope *devices; /* target devices */
392         int devices_cnt;                /* target device count */
393         u8 include_all:1;               /* include all ports */
394 };
395
396 static LIST_HEAD(dmar_atsr_units);
397 static LIST_HEAD(dmar_rmrr_units);
398
399 #define for_each_rmrr_units(rmrr) \
400         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
401
402 static void flush_unmaps_timeout(unsigned long data);
403
404 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
405
406 #define HIGH_WATER_MARK 250
407 struct deferred_flush_tables {
408         int next;
409         struct iova *iova[HIGH_WATER_MARK];
410         struct dmar_domain *domain[HIGH_WATER_MARK];
411         struct page *freelist[HIGH_WATER_MARK];
412 };
413
414 static struct deferred_flush_tables *deferred_flush;
415
416 /* number of registered IOMMUs, used to size and index g_iommus */
417 static int g_num_of_iommus;
418
419 static DEFINE_SPINLOCK(async_umap_flush_lock);
420 static LIST_HEAD(unmaps_to_do);
421
422 static int timer_on;
423 static long list_size;
424
425 static void domain_exit(struct dmar_domain *domain);
426 static void domain_remove_dev_info(struct dmar_domain *domain);
427 static void domain_remove_one_dev_info(struct dmar_domain *domain,
428                                        struct pci_dev *pdev);
429 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
430                                            struct device *dev);
431
432 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
433 int dmar_disabled = 0;
434 #else
435 int dmar_disabled = 1;
436 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
437
438 int intel_iommu_enabled = 0;
439 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
440
441 static int dmar_map_gfx = 1;
442 static int dmar_forcedac;
443 static int intel_iommu_strict;
444 static int intel_iommu_superpage = 1;
445
446 int intel_iommu_gfx_mapped;
447 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
448
449 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
450 static DEFINE_SPINLOCK(device_domain_lock);
451 static LIST_HEAD(device_domain_list);
452
453 static struct iommu_ops intel_iommu_ops;
454
455 static int __init intel_iommu_setup(char *str)
456 {
457         if (!str)
458                 return -EINVAL;
459         while (*str) {
460                 if (!strncmp(str, "on", 2)) {
461                         dmar_disabled = 0;
462                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
463                 } else if (!strncmp(str, "off", 3)) {
464                         dmar_disabled = 1;
465                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
466                 } else if (!strncmp(str, "igfx_off", 8)) {
467                         dmar_map_gfx = 0;
468                         printk(KERN_INFO
469                                 "Intel-IOMMU: disable GFX device mapping\n");
470                 } else if (!strncmp(str, "forcedac", 8)) {
471                         printk(KERN_INFO
472                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
473                         dmar_forcedac = 1;
474                 } else if (!strncmp(str, "strict", 6)) {
475                         printk(KERN_INFO
476                                 "Intel-IOMMU: disable batched IOTLB flush\n");
477                         intel_iommu_strict = 1;
478                 } else if (!strncmp(str, "sp_off", 6)) {
479                         printk(KERN_INFO
480                                 "Intel-IOMMU: disable supported super page\n");
481                         intel_iommu_superpage = 0;
482                 }
483
484                 str += strcspn(str, ",");
485                 while (*str == ',')
486                         str++;
487         }
488         return 0;
489 }
490 __setup("intel_iommu=", intel_iommu_setup);
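/*
 * The parser above takes a comma-separated list, e.g. booting with
 * "intel_iommu=on,strict,sp_off" enables the IOMMU, disables batched
 * IOTLB flushing and disables superpage use in one go.
 */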
491
492 static struct kmem_cache *iommu_domain_cache;
493 static struct kmem_cache *iommu_devinfo_cache;
494 static struct kmem_cache *iommu_iova_cache;
495
496 static inline void *alloc_pgtable_page(int node)
497 {
498         struct page *page;
499         void *vaddr = NULL;
500
501         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
502         if (page)
503                 vaddr = page_address(page);
504         return vaddr;
505 }
506
507 static inline void free_pgtable_page(void *vaddr)
508 {
509         free_page((unsigned long)vaddr);
510 }
511
512 static inline void *alloc_domain_mem(void)
513 {
514         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
515 }
516
517 static void free_domain_mem(void *vaddr)
518 {
519         kmem_cache_free(iommu_domain_cache, vaddr);
520 }
521
522 static inline void * alloc_devinfo_mem(void)
523 {
524         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
525 }
526
527 static inline void free_devinfo_mem(void *vaddr)
528 {
529         kmem_cache_free(iommu_devinfo_cache, vaddr);
530 }
531
532 struct iova *alloc_iova_mem(void)
533 {
534         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
535 }
536
537 void free_iova_mem(struct iova *iova)
538 {
539         kmem_cache_free(iommu_iova_cache, iova);
540 }
541
542
543 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
544 {
545         unsigned long sagaw;
546         int agaw = -1;
547
548         sagaw = cap_sagaw(iommu->cap);
549         for (agaw = width_to_agaw(max_gaw);
550              agaw >= 0; agaw--) {
551                 if (test_bit(agaw, &sagaw))
552                         break;
553         }
554
555         return agaw;
556 }
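/*
 * Example: with max_gaw == 48 the search starts at agaw 2 (a 4-level,
 * 48-bit table). If cap_sagaw() has bit 2 set, 2 is returned; if the
 * hardware only advertises 39-bit tables (bit 1), the loop falls back
 * to agaw 1; -1 means no supported width fits under max_gaw.
 */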
557
558 /*
559  * Calculate max SAGAW for each iommu.
560  */
561 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
562 {
563         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
564 }
565
566 /*
567  * Calculate agaw for each iommu.
568  * "SAGAW" may differ across iommus, so use a default agaw and fall
569  * back to a smaller supported agaw for iommus that don't support the default.
570  */
571 int iommu_calculate_agaw(struct intel_iommu *iommu)
572 {
573         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
574 }
575
576 /* This function only returns a single iommu in a domain */
577 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
578 {
579         int iommu_id;
580
581         /* si_domain and vm domain should not get here. */
582         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
583         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
584
585         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
586         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
587                 return NULL;
588
589         return g_iommus[iommu_id];
590 }
591
592 static void domain_update_iommu_coherency(struct dmar_domain *domain)
593 {
594         struct dmar_drhd_unit *drhd;
595         struct intel_iommu *iommu;
596         int i, found = 0;
597
598         domain->iommu_coherency = 1;
599
600         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
601                 found = 1;
602                 if (!ecap_coherent(g_iommus[i]->ecap)) {
603                         domain->iommu_coherency = 0;
604                         break;
605                 }
606         }
607         if (found)
608                 return;
609
610         /* No hardware attached; use lowest common denominator */
611         rcu_read_lock();
612         for_each_active_iommu(iommu, drhd) {
613                 if (!ecap_coherent(iommu->ecap)) {
614                         domain->iommu_coherency = 0;
615                         break;
616                 }
617         }
618         rcu_read_unlock();
619 }
620
621 static void domain_update_iommu_snooping(struct dmar_domain *domain)
622 {
623         int i;
624
625         domain->iommu_snooping = 1;
626
627         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
628                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
629                         domain->iommu_snooping = 0;
630                         break;
631                 }
632         }
633 }
634
635 static void domain_update_iommu_superpage(struct dmar_domain *domain)
636 {
637         struct dmar_drhd_unit *drhd;
638         struct intel_iommu *iommu = NULL;
639         int mask = 0xf;
640
641         if (!intel_iommu_superpage) {
642                 domain->iommu_superpage = 0;
643                 return;
644         }
645
646         /* set iommu_superpage to the smallest common denominator */
647         rcu_read_lock();
648         for_each_active_iommu(iommu, drhd) {
649                 mask &= cap_super_page_val(iommu->cap);
650                 if (!mask) {
651                         break;
652                 }
653         }
654         rcu_read_unlock();
655
656         domain->iommu_superpage = fls(mask);
657 }
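/*
 * fls() turns the common capability mask into the encoding used by
 * struct dmar_domain: e.g. if every IOMMU supports 2MiB pages (bit 0)
 * but only some support 1GiB (bit 1), mask ends up as 0x1 and
 * iommu_superpage becomes 1 (2MiB); an empty mask yields 0 (4KiB only).
 */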
658
659 /* Some capabilities may be different across iommus */
660 static void domain_update_iommu_cap(struct dmar_domain *domain)
661 {
662         domain_update_iommu_coherency(domain);
663         domain_update_iommu_snooping(domain);
664         domain_update_iommu_superpage(domain);
665 }
666
667 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
668 {
669         struct dmar_drhd_unit *drhd = NULL;
670         struct intel_iommu *iommu;
671         struct device *dev;
672         struct pci_dev *pdev;
673         int i;
674
675         rcu_read_lock();
676         for_each_active_iommu(iommu, drhd) {
677                 if (segment != drhd->segment)
678                         continue;
679
680                 for_each_active_dev_scope(drhd->devices,
681                                           drhd->devices_cnt, i, dev) {
682                         if (!dev_is_pci(dev))
683                                 continue;
684                         pdev = to_pci_dev(dev);
685                         if (pdev->bus->number == bus && pdev->devfn == devfn)
686                                 goto out;
687                         if (pdev->subordinate &&
688                             pdev->subordinate->number <= bus &&
689                             pdev->subordinate->busn_res.end >= bus)
690                                 goto out;
691                 }
692
693                 if (drhd->include_all)
694                         goto out;
695         }
696         iommu = NULL;
697 out:
698         rcu_read_unlock();
699
700         return iommu;
701 }
702
703 static void domain_flush_cache(struct dmar_domain *domain,
704                                void *addr, int size)
705 {
706         if (!domain->iommu_coherency)
707                 clflush_cache_range(addr, size);
708 }
709
710 /* Gets context entry for a given bus and devfn */
711 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
712                 u8 bus, u8 devfn)
713 {
714         struct root_entry *root;
715         struct context_entry *context;
716         unsigned long phy_addr;
717         unsigned long flags;
718
719         spin_lock_irqsave(&iommu->lock, flags);
720         root = &iommu->root_entry[bus];
721         context = get_context_addr_from_root(root);
722         if (!context) {
723                 context = (struct context_entry *)
724                                 alloc_pgtable_page(iommu->node);
725                 if (!context) {
726                         spin_unlock_irqrestore(&iommu->lock, flags);
727                         return NULL;
728                 }
729                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
730                 phy_addr = virt_to_phys((void *)context);
731                 set_root_value(root, phy_addr);
732                 set_root_present(root);
733                 __iommu_flush_cache(iommu, root, sizeof(*root));
734         }
735         spin_unlock_irqrestore(&iommu->lock, flags);
736         return &context[devfn];
737 }
738
739 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
740 {
741         struct root_entry *root;
742         struct context_entry *context;
743         int ret;
744         unsigned long flags;
745
746         spin_lock_irqsave(&iommu->lock, flags);
747         root = &iommu->root_entry[bus];
748         context = get_context_addr_from_root(root);
749         if (!context) {
750                 ret = 0;
751                 goto out;
752         }
753         ret = context_present(&context[devfn]);
754 out:
755         spin_unlock_irqrestore(&iommu->lock, flags);
756         return ret;
757 }
758
759 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
760 {
761         struct root_entry *root;
762         struct context_entry *context;
763         unsigned long flags;
764
765         spin_lock_irqsave(&iommu->lock, flags);
766         root = &iommu->root_entry[bus];
767         context = get_context_addr_from_root(root);
768         if (context) {
769                 context_clear_entry(&context[devfn]);
770                 __iommu_flush_cache(iommu, &context[devfn],
771                                     sizeof(*context));
772         }
773         spin_unlock_irqrestore(&iommu->lock, flags);
774 }
775
776 static void free_context_table(struct intel_iommu *iommu)
777 {
778         struct root_entry *root;
779         int i;
780         unsigned long flags;
781         struct context_entry *context;
782
783         spin_lock_irqsave(&iommu->lock, flags);
784         if (!iommu->root_entry) {
785                 goto out;
786         }
787         for (i = 0; i < ROOT_ENTRY_NR; i++) {
788                 root = &iommu->root_entry[i];
789                 context = get_context_addr_from_root(root);
790                 if (context)
791                         free_pgtable_page(context);
792         }
793         free_pgtable_page(iommu->root_entry);
794         iommu->root_entry = NULL;
795 out:
796         spin_unlock_irqrestore(&iommu->lock, flags);
797 }
798
799 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
800                                       unsigned long pfn, int *target_level)
801 {
802         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
803         struct dma_pte *parent, *pte = NULL;
804         int level = agaw_to_level(domain->agaw);
805         int offset;
806
807         BUG_ON(!domain->pgd);
808
809         if (addr_width < BITS_PER_LONG && pfn >> addr_width)
810                 /* Address beyond IOMMU's addressing capabilities. */
811                 return NULL;
812
813         parent = domain->pgd;
814
815         while (1) {
816                 void *tmp_page;
817
818                 offset = pfn_level_offset(pfn, level);
819                 pte = &parent[offset];
820                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
821                         break;
822                 if (level == *target_level)
823                         break;
824
825                 if (!dma_pte_present(pte)) {
826                         uint64_t pteval;
827
828                         tmp_page = alloc_pgtable_page(domain->nid);
829
830                         if (!tmp_page)
831                                 return NULL;
832
833                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
834                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
835                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
836                                 /* Someone else set it while we were thinking; use theirs. */
837                                 free_pgtable_page(tmp_page);
838                         } else {
839                                 dma_pte_addr(pte);
840                                 domain_flush_cache(domain, pte, sizeof(*pte));
841                         }
842                 }
843                 if (level == 1)
844                         break;
845
846                 parent = phys_to_virt(dma_pte_addr(pte));
847                 level--;
848         }
849
850         if (!*target_level)
851                 *target_level = level;
852
853         return pte;
854 }
855
856
857 /* return address's pte at specific level */
858 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
859                                          unsigned long pfn,
860                                          int level, int *large_page)
861 {
862         struct dma_pte *parent, *pte = NULL;
863         int total = agaw_to_level(domain->agaw);
864         int offset;
865
866         parent = domain->pgd;
867         while (level <= total) {
868                 offset = pfn_level_offset(pfn, total);
869                 pte = &parent[offset];
870                 if (level == total)
871                         return pte;
872
873                 if (!dma_pte_present(pte)) {
874                         *large_page = total;
875                         break;
876                 }
877
878                 if (pte->val & DMA_PTE_LARGE_PAGE) {
879                         *large_page = total;
880                         return pte;
881                 }
882
883                 parent = phys_to_virt(dma_pte_addr(pte));
884                 total--;
885         }
886         return NULL;
887 }
888
889 /* clear last level pte; a TLB flush should follow */
890 static void dma_pte_clear_range(struct dmar_domain *domain,
891                                 unsigned long start_pfn,
892                                 unsigned long last_pfn)
893 {
894         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
895         unsigned int large_page = 1;
896         struct dma_pte *first_pte, *pte;
897
898         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
899         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
900         BUG_ON(start_pfn > last_pfn);
901
902         /* we don't need lock here; nobody else touches the iova range */
903         do {
904                 large_page = 1;
905                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
906                 if (!pte) {
907                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
908                         continue;
909                 }
910                 do {
911                         dma_clear_pte(pte);
912                         start_pfn += lvl_to_nr_pages(large_page);
913                         pte++;
914                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
915
916                 domain_flush_cache(domain, first_pte,
917                                    (void *)pte - (void *)first_pte);
918
919         } while (start_pfn && start_pfn <= last_pfn);
920 }
921
922 static void dma_pte_free_level(struct dmar_domain *domain, int level,
923                                struct dma_pte *pte, unsigned long pfn,
924                                unsigned long start_pfn, unsigned long last_pfn)
925 {
926         pfn = max(start_pfn, pfn);
927         pte = &pte[pfn_level_offset(pfn, level)];
928
929         do {
930                 unsigned long level_pfn;
931                 struct dma_pte *level_pte;
932
933                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
934                         goto next;
935
936                 level_pfn = pfn & level_mask(level - 1);
937                 level_pte = phys_to_virt(dma_pte_addr(pte));
938
939                 if (level > 2)
940                         dma_pte_free_level(domain, level - 1, level_pte,
941                                            level_pfn, start_pfn, last_pfn);
942
943                 /* If range covers entire pagetable, free it */
944                 if (!(start_pfn > level_pfn ||
945                       last_pfn < level_pfn + level_size(level) - 1)) {
946                         dma_clear_pte(pte);
947                         domain_flush_cache(domain, pte, sizeof(*pte));
948                         free_pgtable_page(level_pte);
949                 }
950 next:
951                 pfn += level_size(level);
952         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
953 }
954
955 /* free page table pages. last level pte should already be cleared */
956 static void dma_pte_free_pagetable(struct dmar_domain *domain,
957                                    unsigned long start_pfn,
958                                    unsigned long last_pfn)
959 {
960         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
961
962         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
963         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
964         BUG_ON(start_pfn > last_pfn);
965
966         /* We don't need lock here; nobody else touches the iova range */
967         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
968                            domain->pgd, 0, start_pfn, last_pfn);
969
970         /* free pgd */
971         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
972                 free_pgtable_page(domain->pgd);
973                 domain->pgd = NULL;
974         }
975 }
976
977 /* When a page at a given level is being unlinked from its parent, we don't
978    need to *modify* it at all. All we need to do is make a list of all the
979    pages which can be freed just as soon as we've flushed the IOTLB and we
980    know the hardware page-walk will no longer touch them.
981    The 'pte' argument is the *parent* PTE, pointing to the page that is to
982    be freed. */
983 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
984                                             int level, struct dma_pte *pte,
985                                             struct page *freelist)
986 {
987         struct page *pg;
988
989         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
990         pg->freelist = freelist;
991         freelist = pg;
992
993         if (level == 1)
994                 return freelist;
995
996         for (pte = page_address(pg); !first_pte_in_page(pte); pte++) {
997                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
998                         freelist = dma_pte_list_pagetables(domain, level - 1,
999                                                            pte, freelist);
1000         }
1001
1002         return freelist;
1003 }
1004
1005 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1006                                         struct dma_pte *pte, unsigned long pfn,
1007                                         unsigned long start_pfn,
1008                                         unsigned long last_pfn,
1009                                         struct page *freelist)
1010 {
1011         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1012
1013         pfn = max(start_pfn, pfn);
1014         pte = &pte[pfn_level_offset(pfn, level)];
1015
1016         do {
1017                 unsigned long level_pfn;
1018
1019                 if (!dma_pte_present(pte))
1020                         goto next;
1021
1022                 level_pfn = pfn & level_mask(level);
1023
1024                 /* If range covers entire pagetable, free it */
1025                 if (start_pfn <= level_pfn &&
1026                     last_pfn >= level_pfn + level_size(level) - 1) {
1027                         /* These subordinate page tables are going away entirely. Don't
1028                            bother to clear them; we're just going to *free* them. */
1029                         if (level > 1 && !dma_pte_superpage(pte))
1030                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1031
1032                         dma_clear_pte(pte);
1033                         if (!first_pte)
1034                                 first_pte = pte;
1035                         last_pte = pte;
1036                 } else if (level > 1) {
1037                         /* Recurse down into a level that isn't *entirely* obsolete */
1038                         freelist = dma_pte_clear_level(domain, level - 1,
1039                                                        phys_to_virt(dma_pte_addr(pte)),
1040                                                        level_pfn, start_pfn, last_pfn,
1041                                                        freelist);
1042                 }
1043 next:
1044                 pfn += level_size(level);
1045         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1046
1047         if (first_pte)
1048                 domain_flush_cache(domain, first_pte,
1049                                    (void *)++last_pte - (void *)first_pte);
1050
1051         return freelist;
1052 }
1053
1054 /* We can't just free the pages because the IOMMU may still be walking
1055    the page tables, and may have cached the intermediate levels. The
1056    pages can only be freed after the IOTLB flush has been done. */
1057 struct page *domain_unmap(struct dmar_domain *domain,
1058                           unsigned long start_pfn,
1059                           unsigned long last_pfn)
1060 {
1061         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1062         struct page *freelist = NULL;
1063
1064         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
1065         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
1066         BUG_ON(start_pfn > last_pfn);
1067
1068         /* we don't need lock here; nobody else touches the iova range */
1069         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1070                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1071
1072         /* free pgd */
1073         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1074                 struct page *pgd_page = virt_to_page(domain->pgd);
1075                 pgd_page->freelist = freelist;
1076                 freelist = pgd_page;
1077
1078                 domain->pgd = NULL;
1079         }
1080
1081         return freelist;
1082 }
1083
1084 void dma_free_pagelist(struct page *freelist)
1085 {
1086         struct page *pg;
1087
1088         while ((pg = freelist)) {
1089                 freelist = pg->freelist;
1090                 free_pgtable_page(page_address(pg));
1091         }
1092 }
1093
1094 /* iommu handling */
1095 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1096 {
1097         struct root_entry *root;
1098         unsigned long flags;
1099
1100         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1101         if (!root)
1102                 return -ENOMEM;
1103
1104         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1105
1106         spin_lock_irqsave(&iommu->lock, flags);
1107         iommu->root_entry = root;
1108         spin_unlock_irqrestore(&iommu->lock, flags);
1109
1110         return 0;
1111 }
1112
1113 static void iommu_set_root_entry(struct intel_iommu *iommu)
1114 {
1115         void *addr;
1116         u32 sts;
1117         unsigned long flag;
1118
1119         addr = iommu->root_entry;
1120
1121         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1122         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1123
1124         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1125
1126         /* Make sure hardware completes it */
1127         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1128                       readl, (sts & DMA_GSTS_RTPS), sts);
1129
1130         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1131 }
1132
1133 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1134 {
1135         u32 val;
1136         unsigned long flag;
1137
1138         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1139                 return;
1140
1141         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1142         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1143
1144         /* Make sure hardware completes it */
1145         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1146                       readl, (!(val & DMA_GSTS_WBFS)), val);
1147
1148         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1149 }
1150
1151 /* return value determines whether we need a write buffer flush */
1152 static void __iommu_flush_context(struct intel_iommu *iommu,
1153                                   u16 did, u16 source_id, u8 function_mask,
1154                                   u64 type)
1155 {
1156         u64 val = 0;
1157         unsigned long flag;
1158
1159         switch (type) {
1160         case DMA_CCMD_GLOBAL_INVL:
1161                 val = DMA_CCMD_GLOBAL_INVL;
1162                 break;
1163         case DMA_CCMD_DOMAIN_INVL:
1164                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1165                 break;
1166         case DMA_CCMD_DEVICE_INVL:
1167                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1168                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1169                 break;
1170         default:
1171                 BUG();
1172         }
1173         val |= DMA_CCMD_ICC;
1174
1175         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1176         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1177
1178         /* Make sure hardware completes it */
1179         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1180                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1181
1182         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1183 }
1184
1185 /* return value determines whether we need a write buffer flush */
1186 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1187                                 u64 addr, unsigned int size_order, u64 type)
1188 {
1189         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1190         u64 val = 0, val_iva = 0;
1191         unsigned long flag;
1192
1193         switch (type) {
1194         case DMA_TLB_GLOBAL_FLUSH:
1195                 /* global flush doesn't need set IVA_REG */
1196                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1197                 break;
1198         case DMA_TLB_DSI_FLUSH:
1199                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1200                 break;
1201         case DMA_TLB_PSI_FLUSH:
1202                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1203                 /* IH bit is passed in as part of address */
1204                 val_iva = size_order | addr;
1205                 break;
1206         default:
1207                 BUG();
1208         }
1209         /* Note: set drain read/write */
1210 #if 0
1211         /*
1212          * This is probably there to be extra safe. It looks like we can
1213          * ignore it without any impact.
1214          */
1215         if (cap_read_drain(iommu->cap))
1216                 val |= DMA_TLB_READ_DRAIN;
1217 #endif
1218         if (cap_write_drain(iommu->cap))
1219                 val |= DMA_TLB_WRITE_DRAIN;
1220
1221         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1222         /* Note: Only uses first TLB reg currently */
1223         if (val_iva)
1224                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1225         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1226
1227         /* Make sure hardware completes it */
1228         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1229                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1230
1231         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1232
1233         /* check IOTLB invalidation granularity */
1234         if (DMA_TLB_IAIG(val) == 0)
1235                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1236         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1237                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1238                         (unsigned long long)DMA_TLB_IIRG(type),
1239                         (unsigned long long)DMA_TLB_IAIG(val));
1240 }
1241
1242 static struct device_domain_info *
1243 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1244                          u8 bus, u8 devfn)
1245 {
1246         int found = 0;
1247         unsigned long flags;
1248         struct device_domain_info *info;
1249         struct pci_dev *pdev;
1250
1251         if (!ecap_dev_iotlb_support(iommu->ecap))
1252                 return NULL;
1253
1254         if (!iommu->qi)
1255                 return NULL;
1256
1257         spin_lock_irqsave(&device_domain_lock, flags);
1258         list_for_each_entry(info, &domain->devices, link)
1259                 if (info->bus == bus && info->devfn == devfn) {
1260                         found = 1;
1261                         break;
1262                 }
1263         spin_unlock_irqrestore(&device_domain_lock, flags);
1264
1265         if (!found || !info->dev || !dev_is_pci(info->dev))
1266                 return NULL;
1267
1268         pdev = to_pci_dev(info->dev);
1269
1270         if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1271                 return NULL;
1272
1273         if (!dmar_find_matched_atsr_unit(pdev))
1274                 return NULL;
1275
1276         return info;
1277 }
1278
1279 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1280 {
1281         if (!info || !dev_is_pci(info->dev))
1282                 return;
1283
1284         pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1285 }
1286
1287 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1288 {
1289         if (!info->dev || !dev_is_pci(info->dev) ||
1290             !pci_ats_enabled(to_pci_dev(info->dev)))
1291                 return;
1292
1293         pci_disable_ats(to_pci_dev(info->dev));
1294 }
1295
1296 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1297                                   u64 addr, unsigned mask)
1298 {
1299         u16 sid, qdep;
1300         unsigned long flags;
1301         struct device_domain_info *info;
1302
1303         spin_lock_irqsave(&device_domain_lock, flags);
1304         list_for_each_entry(info, &domain->devices, link) {
1305                 struct pci_dev *pdev;
1306                 if (!info->dev || !dev_is_pci(info->dev))
1307                         continue;
1308
1309                 pdev = to_pci_dev(info->dev);
1310                 if (!pci_ats_enabled(pdev))
1311                         continue;
1312
1313                 sid = info->bus << 8 | info->devfn;
1314                 qdep = pci_ats_queue_depth(pdev);
1315                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1316         }
1317         spin_unlock_irqrestore(&device_domain_lock, flags);
1318 }
1319
1320 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1321                                   unsigned long pfn, unsigned int pages, int ih, int map)
1322 {
1323         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1324         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1325
1326         BUG_ON(pages == 0);
1327
1328         if (ih)
1329                 ih = 1 << 6;
1330         /*
1331          * Fall back to domain-selective flush if there is no PSI support or the size is
1332          * too big.
1333          * PSI requires page size to be 2 ^ x, and the base address is naturally
1334          * aligned to the size
1335          */
1336         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1337                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1338                                                 DMA_TLB_DSI_FLUSH);
1339         else
1340                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1341                                                 DMA_TLB_PSI_FLUSH);
1342
1343         /*
1344          * In caching mode, changes of pages from non-present to present require
1345          * flush. However, device IOTLB doesn't need to be flushed in this case.
1346          */
1347         if (!cap_caching_mode(iommu->cap) || !map)
1348                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1349 }
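/*
 * Example of the mask computation above: invalidating 3 pages rounds
 * up to 4, so mask == 2 and the hardware is asked to flush a 4-page
 * (16KiB) aligned region containing the requested range.
 */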
1350
1351 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1352 {
1353         u32 pmen;
1354         unsigned long flags;
1355
1356         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1357         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1358         pmen &= ~DMA_PMEN_EPM;
1359         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1360
1361         /* wait for the protected region status bit to clear */
1362         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1363                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1364
1365         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1366 }
1367
1368 static int iommu_enable_translation(struct intel_iommu *iommu)
1369 {
1370         u32 sts;
1371         unsigned long flags;
1372
1373         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1374         iommu->gcmd |= DMA_GCMD_TE;
1375         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1376
1377         /* Make sure hardware completes it */
1378         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1379                       readl, (sts & DMA_GSTS_TES), sts);
1380
1381         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1382         return 0;
1383 }
1384
1385 static int iommu_disable_translation(struct intel_iommu *iommu)
1386 {
1387         u32 sts;
1388         unsigned long flag;
1389
1390         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1391         iommu->gcmd &= ~DMA_GCMD_TE;
1392         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1393
1394         /* Make sure hardware completes it */
1395         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1396                       readl, (!(sts & DMA_GSTS_TES)), sts);
1397
1398         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1399         return 0;
1400 }
1401
1402
1403 static int iommu_init_domains(struct intel_iommu *iommu)
1404 {
1405         unsigned long ndomains;
1406         unsigned long nlongs;
1407
1408         ndomains = cap_ndoms(iommu->cap);
1409         pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1410                  iommu->seq_id, ndomains);
1411         nlongs = BITS_TO_LONGS(ndomains);
1412
1413         spin_lock_init(&iommu->lock);
1414
1415         /* TBD: there might be 64K domains,
1416          * consider a different allocation scheme for future chips
1417          */
1418         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1419         if (!iommu->domain_ids) {
1420                 pr_err("IOMMU%d: allocating domain id array failed\n",
1421                        iommu->seq_id);
1422                 return -ENOMEM;
1423         }
1424         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1425                         GFP_KERNEL);
1426         if (!iommu->domains) {
1427                 pr_err("IOMMU%d: allocating domain array failed\n",
1428                        iommu->seq_id);
1429                 kfree(iommu->domain_ids);
1430                 iommu->domain_ids = NULL;
1431                 return -ENOMEM;
1432         }
1433
1434         /*
1435          * if Caching mode is set, then invalid translations are tagged
1436          * with domain id 0. Hence we need to pre-allocate it.
1437          */
1438         if (cap_caching_mode(iommu->cap))
1439                 set_bit(0, iommu->domain_ids);
1440         return 0;
1441 }
1442
1443 static void free_dmar_iommu(struct intel_iommu *iommu)
1444 {
1445         struct dmar_domain *domain;
1446         int i, count;
1447         unsigned long flags;
1448
1449         if ((iommu->domains) && (iommu->domain_ids)) {
1450                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1451                         /*
1452                          * Domain id 0 is reserved for invalid translation
1453                          * if hardware supports caching mode.
1454                          */
1455                         if (cap_caching_mode(iommu->cap) && i == 0)
1456                                 continue;
1457
1458                         domain = iommu->domains[i];
1459                         clear_bit(i, iommu->domain_ids);
1460
1461                         spin_lock_irqsave(&domain->iommu_lock, flags);
1462                         count = --domain->iommu_count;
1463                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1464                         if (count == 0)
1465                                 domain_exit(domain);
1466                 }
1467         }
1468
1469         if (iommu->gcmd & DMA_GCMD_TE)
1470                 iommu_disable_translation(iommu);
1471
1472         kfree(iommu->domains);
1473         kfree(iommu->domain_ids);
1474         iommu->domains = NULL;
1475         iommu->domain_ids = NULL;
1476
1477         g_iommus[iommu->seq_id] = NULL;
1478
1479         /* free context mapping */
1480         free_context_table(iommu);
1481 }
1482
1483 static struct dmar_domain *alloc_domain(bool vm)
1484 {
1485         /* domain id for virtual machine, it won't be set in context */
1486         static atomic_t vm_domid = ATOMIC_INIT(0);
1487         struct dmar_domain *domain;
1488
1489         domain = alloc_domain_mem();
1490         if (!domain)
1491                 return NULL;
1492
1493         domain->nid = -1;
1494         domain->iommu_count = 0;
1495         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1496         domain->flags = 0;
1497         spin_lock_init(&domain->iommu_lock);
1498         INIT_LIST_HEAD(&domain->devices);
1499         if (vm) {
1500                 domain->id = atomic_inc_return(&vm_domid);
1501                 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
1502         }
1503
1504         return domain;
1505 }
1506
1507 static int iommu_attach_domain(struct dmar_domain *domain,
1508                                struct intel_iommu *iommu)
1509 {
1510         int num;
1511         unsigned long ndomains;
1512         unsigned long flags;
1513
1514         ndomains = cap_ndoms(iommu->cap);
1515
1516         spin_lock_irqsave(&iommu->lock, flags);
1517
1518         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1519         if (num >= ndomains) {
1520                 spin_unlock_irqrestore(&iommu->lock, flags);
1521                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1522                 return -ENOMEM;
1523         }
1524
1525         domain->id = num;
1526         domain->iommu_count++;
1527         set_bit(num, iommu->domain_ids);
1528         set_bit(iommu->seq_id, domain->iommu_bmp);
1529         iommu->domains[num] = domain;
1530         spin_unlock_irqrestore(&iommu->lock, flags);
1531
1532         return 0;
1533 }
1534
1535 static void iommu_detach_domain(struct dmar_domain *domain,
1536                                 struct intel_iommu *iommu)
1537 {
1538         unsigned long flags;
1539         int num, ndomains;
1540
1541         spin_lock_irqsave(&iommu->lock, flags);
1542         ndomains = cap_ndoms(iommu->cap);
1543         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1544                 if (iommu->domains[num] == domain) {
1545                         clear_bit(num, iommu->domain_ids);
1546                         iommu->domains[num] = NULL;
1547                         break;
1548                 }
1549         }
1550         spin_unlock_irqrestore(&iommu->lock, flags);
1551 }
1552
1553 static struct iova_domain reserved_iova_list;
1554 static struct lock_class_key reserved_rbtree_key;
1555
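/*
 * Build the global list of IOVA ranges that must never be handed out to
 * devices: the IOAPIC MMIO window and every PCI MMIO resource (to avoid
 * DMA being interpreted as peer-to-peer traffic).  Every new domain
 * copies this list via domain_reserve_special_ranges().
 */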
1556 static int dmar_init_reserved_ranges(void)
1557 {
1558         struct pci_dev *pdev = NULL;
1559         struct iova *iova;
1560         int i;
1561
1562         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1563
1564         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1565                 &reserved_rbtree_key);
1566
1567         /* IOAPIC ranges shouldn't be accessed by DMA */
1568         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1569                 IOVA_PFN(IOAPIC_RANGE_END));
1570         if (!iova) {
1571                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1572                 return -ENODEV;
1573         }
1574
1575         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1576         for_each_pci_dev(pdev) {
1577                 struct resource *r;
1578
1579                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1580                         r = &pdev->resource[i];
1581                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1582                                 continue;
1583                         iova = reserve_iova(&reserved_iova_list,
1584                                             IOVA_PFN(r->start),
1585                                             IOVA_PFN(r->end));
1586                         if (!iova) {
1587                                 printk(KERN_ERR "Reserve iova failed\n");
1588                                 return -ENODEV;
1589                         }
1590                 }
1591         }
1592         return 0;
1593 }
1594
1595 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1596 {
1597         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1598 }
1599
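/*
 * Round a guest address width up to the next width the page-table layout
 * can express: 12 bits of page offset plus a whole number of 9-bit
 * levels, capped at 64.  For example, gaw = 40 gives r = (40 - 12) % 9 = 1
 * and therefore agaw = 40 + 9 - 1 = 48, while gaw = 48 is already aligned
 * and is returned unchanged.
 */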
1600 static inline int guestwidth_to_adjustwidth(int gaw)
1601 {
1602         int agaw;
1603         int r = (gaw - 12) % 9;
1604
1605         if (r == 0)
1606                 agaw = gaw;
1607         else
1608                 agaw = gaw + 9 - r;
1609         if (agaw > 64)
1610                 agaw = 64;
1611         return agaw;
1612 }
1613
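/*
 * First-time setup of a freshly attached domain: initialize its IOVA
 * allocator, pick an AGAW the hardware supports for the requested guest
 * width, record the IOMMU's coherency/snooping/superpage capabilities
 * and allocate the top-level page directory.
 */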
1614 static int domain_init(struct dmar_domain *domain, int guest_width)
1615 {
1616         struct intel_iommu *iommu;
1617         int adjust_width, agaw;
1618         unsigned long sagaw;
1619
1620         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1621         domain_reserve_special_ranges(domain);
1622
1623         /* calculate AGAW */
1624         iommu = domain_get_iommu(domain);
1625         if (guest_width > cap_mgaw(iommu->cap))
1626                 guest_width = cap_mgaw(iommu->cap);
1627         domain->gaw = guest_width;
1628         adjust_width = guestwidth_to_adjustwidth(guest_width);
1629         agaw = width_to_agaw(adjust_width);
1630         sagaw = cap_sagaw(iommu->cap);
1631         if (!test_bit(agaw, &sagaw)) {
1632                 /* hardware doesn't support it, choose a bigger one */
1633                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1634                 agaw = find_next_bit(&sagaw, 5, agaw);
1635                 if (agaw >= 5)
1636                         return -ENODEV;
1637         }
1638         domain->agaw = agaw;
1639
1640         if (ecap_coherent(iommu->ecap))
1641                 domain->iommu_coherency = 1;
1642         else
1643                 domain->iommu_coherency = 0;
1644
1645         if (ecap_sc_support(iommu->ecap))
1646                 domain->iommu_snooping = 1;
1647         else
1648                 domain->iommu_snooping = 0;
1649
1650         if (intel_iommu_superpage)
1651                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1652         else
1653                 domain->iommu_superpage = 0;
1654
1655         domain->nid = iommu->node;
1656
1657         /* always allocate the top pgd */
1658         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1659         if (!domain->pgd)
1660                 return -ENOMEM;
1661         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1662         return 0;
1663 }
1664
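/*
 * Tear a domain down completely: detach its devices, destroy its IOVA
 * tree, unmap and free the whole page-table hierarchy and drop the
 * domain from every IOMMU that still references it before releasing
 * the memory.
 */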
1665 static void domain_exit(struct dmar_domain *domain)
1666 {
1667         struct dmar_drhd_unit *drhd;
1668         struct intel_iommu *iommu;
1669         struct page *freelist = NULL;
1670
1671         /* Domain 0 is reserved, so don't process it */
1672         if (!domain)
1673                 return;
1674
1675         /* Flush any lazy unmaps that may reference this domain */
1676         if (!intel_iommu_strict)
1677                 flush_unmaps_timeout(0);
1678
1679         /* remove associated devices */
1680         domain_remove_dev_info(domain);
1681
1682         /* destroy iovas */
1683         put_iova_domain(&domain->iovad);
1684
1685         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1686
1687         /* clear attached or cached domains */
1688         rcu_read_lock();
1689         for_each_active_iommu(iommu, drhd)
1690                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1691                     test_bit(iommu->seq_id, domain->iommu_bmp))
1692                         iommu_detach_domain(domain, iommu);
1693         rcu_read_unlock();
1694
1695         dma_free_pagelist(freelist);
1696
1697         free_domain_mem(domain);
1698 }
1699
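/*
 * Program one context entry (@bus/@devfn on @iommu) to point at @domain's
 * page tables or to pass-through translation, picking a per-IOMMU domain
 * id for VM/static-identity domains, and issue the flushes required when
 * a context entry goes from non-present to present.
 */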
1700 static int domain_context_mapping_one(struct dmar_domain *domain,
1701                                       struct intel_iommu *iommu,
1702                                       u8 bus, u8 devfn, int translation)
1703 {
1704         struct context_entry *context;
1705         unsigned long flags;
1706         struct dma_pte *pgd;
1707         unsigned long num;
1708         unsigned long ndomains;
1709         int id;
1710         int agaw;
1711         struct device_domain_info *info = NULL;
1712
1713         pr_debug("Set context mapping for %02x:%02x.%d\n",
1714                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1715
1716         BUG_ON(!domain->pgd);
1717         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1718                translation != CONTEXT_TT_MULTI_LEVEL);
1719
1720         context = device_to_context_entry(iommu, bus, devfn);
1721         if (!context)
1722                 return -ENOMEM;
1723         spin_lock_irqsave(&iommu->lock, flags);
1724         if (context_present(context)) {
1725                 spin_unlock_irqrestore(&iommu->lock, flags);
1726                 return 0;
1727         }
1728
1729         id = domain->id;
1730         pgd = domain->pgd;
1731
1732         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1733             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1734                 int found = 0;
1735
1736                 /* find an available domain id for this device in iommu */
1737                 ndomains = cap_ndoms(iommu->cap);
1738                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1739                         if (iommu->domains[num] == domain) {
1740                                 id = num;
1741                                 found = 1;
1742                                 break;
1743                         }
1744                 }
1745
1746                 if (found == 0) {
1747                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1748                         if (num >= ndomains) {
1749                                 spin_unlock_irqrestore(&iommu->lock, flags);
1750                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1751                                 return -EFAULT;
1752                         }
1753
1754                         set_bit(num, iommu->domain_ids);
1755                         iommu->domains[num] = domain;
1756                         id = num;
1757                 }
1758
1759                 /* Skip the top levels of the page tables for an
1760                  * iommu whose agaw is smaller than the domain's agaw.
1761                  * Unnecessary for PT mode.
1762                  */
1763                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1764                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1765                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1766                                 if (!dma_pte_present(pgd)) {
1767                                         spin_unlock_irqrestore(&iommu->lock, flags);
1768                                         return -ENOMEM;
1769                                 }
1770                         }
1771                 }
1772         }
1773
1774         context_set_domain_id(context, id);
1775
1776         if (translation != CONTEXT_TT_PASS_THROUGH) {
1777                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1778                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1779                                      CONTEXT_TT_MULTI_LEVEL;
1780         }
1781         /*
1782          * In pass through mode, AW must be programmed to indicate the largest
1783          * AGAW value supported by hardware. And ASR is ignored by hardware.
1784          */
1785         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1786                 context_set_address_width(context, iommu->msagaw);
1787         else {
1788                 context_set_address_root(context, virt_to_phys(pgd));
1789                 context_set_address_width(context, iommu->agaw);
1790         }
1791
1792         context_set_translation_type(context, translation);
1793         context_set_fault_enable(context);
1794         context_set_present(context);
1795         domain_flush_cache(domain, context, sizeof(*context));
1796
1797         /*
1798          * It's a non-present to present mapping. If hardware doesn't cache
1799          * non-present entries we only need to flush the write-buffer. If it
1800          * _does_ cache non-present entries, then it does so in the special
1801          * domain #0, which we have to flush:
1802          */
1803         if (cap_caching_mode(iommu->cap)) {
1804                 iommu->flush.flush_context(iommu, 0,
1805                                            (((u16)bus) << 8) | devfn,
1806                                            DMA_CCMD_MASK_NOBIT,
1807                                            DMA_CCMD_DEVICE_INVL);
1808                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1809         } else {
1810                 iommu_flush_write_buffer(iommu);
1811         }
1812         iommu_enable_dev_iotlb(info);
1813         spin_unlock_irqrestore(&iommu->lock, flags);
1814
1815         spin_lock_irqsave(&domain->iommu_lock, flags);
1816         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1817                 domain->iommu_count++;
1818                 if (domain->iommu_count == 1)
1819                         domain->nid = iommu->node;
1820                 domain_update_iommu_cap(domain);
1821         }
1822         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1823         return 0;
1824 }
1825
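/*
 * Set up context entries for @pdev and, when it sits behind a
 * PCIe-to-PCI bridge, for every bridge on the path as well, since
 * requests from such a device may carry the bridge's source-id.
 */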
1826 static int
1827 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1828                         int translation)
1829 {
1830         int ret;
1831         struct pci_dev *tmp, *parent;
1832         struct intel_iommu *iommu;
1833
1834         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1835                                 pdev->devfn);
1836         if (!iommu)
1837                 return -ENODEV;
1838
1839         ret = domain_context_mapping_one(domain, iommu,
1840                                          pdev->bus->number, pdev->devfn,
1841                                          translation);
1842         if (ret)
1843                 return ret;
1844
1845         /* dependent device mapping */
1846         tmp = pci_find_upstream_pcie_bridge(pdev);
1847         if (!tmp)
1848                 return 0;
1849         /* Secondary interface's bus number and devfn 0 */
1850         parent = pdev->bus->self;
1851         while (parent != tmp) {
1852                 ret = domain_context_mapping_one(domain, iommu,
1853                                                  parent->bus->number,
1854                                                  parent->devfn, translation);
1855                 if (ret)
1856                         return ret;
1857                 parent = parent->bus->self;
1858         }
1859         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1860                 return domain_context_mapping_one(domain, iommu,
1861                                         tmp->subordinate->number, 0,
1862                                         translation);
1863         else /* this is a legacy PCI bridge */
1864                 return domain_context_mapping_one(domain, iommu,
1865                                                   tmp->bus->number,
1866                                                   tmp->devfn,
1867                                                   translation);
1868 }
1869
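/*
 * Check whether context entries already exist for @pdev and for every
 * bridge on the path to its upstream PCIe-to-PCI bridge, mirroring the
 * walk done by domain_context_mapping().
 */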
1870 static int domain_context_mapped(struct pci_dev *pdev)
1871 {
1872         int ret;
1873         struct pci_dev *tmp, *parent;
1874         struct intel_iommu *iommu;
1875
1876         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1877                                 pdev->devfn);
1878         if (!iommu)
1879                 return -ENODEV;
1880
1881         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1882         if (!ret)
1883                 return ret;
1884         /* dependent device mapping */
1885         tmp = pci_find_upstream_pcie_bridge(pdev);
1886         if (!tmp)
1887                 return ret;
1888         /* Secondary interface's bus number and devfn 0 */
1889         parent = pdev->bus->self;
1890         while (parent != tmp) {
1891                 ret = device_context_mapped(iommu, parent->bus->number,
1892                                             parent->devfn);
1893                 if (!ret)
1894                         return ret;
1895                 parent = parent->bus->self;
1896         }
1897         if (pci_is_pcie(tmp))
1898                 return device_context_mapped(iommu, tmp->subordinate->number,
1899                                              0);
1900         else
1901                 return device_context_mapped(iommu, tmp->bus->number,
1902                                              tmp->devfn);
1903 }
1904
1905 /* Returns a number of VTD pages, but aligned to MM page size */
1906 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1907                                             size_t size)
1908 {
1909         host_addr &= ~PAGE_MASK;
1910         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1911 }
1912
1913 /* Return largest possible superpage level for a given mapping */
1914 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1915                                           unsigned long iov_pfn,
1916                                           unsigned long phy_pfn,
1917                                           unsigned long pages)
1918 {
1919         int support, level = 1;
1920         unsigned long pfnmerge;
1921
1922         support = domain->iommu_superpage;
1923
1924         /* To use a large page, the virtual *and* physical addresses
1925            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1926            of them will mean we have to use smaller pages. So just
1927            merge them and check both at once. */
1928         pfnmerge = iov_pfn | phy_pfn;
1929
1930         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1931                 pages >>= VTD_STRIDE_SHIFT;
1932                 if (!pages)
1933                         break;
1934                 pfnmerge >>= VTD_STRIDE_SHIFT;
1935                 level++;
1936                 support--;
1937         }
1938         return level;
1939 }
1940
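/*
 * Core mapping loop shared by domain_sg_mapping() and domain_pfn_mapping():
 * walk @nr_pages of IOVA space and fill PTEs either from a scatterlist or
 * from a contiguous physical range, using superpages when addresses and
 * the remaining length are suitably aligned, and flushing each page worth
 * of PTEs as it is completed.
 */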
1941 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1942                             struct scatterlist *sg, unsigned long phys_pfn,
1943                             unsigned long nr_pages, int prot)
1944 {
1945         struct dma_pte *first_pte = NULL, *pte = NULL;
1946         phys_addr_t uninitialized_var(pteval);
1947         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1948         unsigned long sg_res;
1949         unsigned int largepage_lvl = 0;
1950         unsigned long lvl_pages = 0;
1951
1952         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1953
1954         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1955                 return -EINVAL;
1956
1957         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1958
1959         if (sg)
1960                 sg_res = 0;
1961         else {
1962                 sg_res = nr_pages + 1;
1963                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1964         }
1965
1966         while (nr_pages > 0) {
1967                 uint64_t tmp;
1968
1969                 if (!sg_res) {
1970                         sg_res = aligned_nrpages(sg->offset, sg->length);
1971                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1972                         sg->dma_length = sg->length;
1973                         pteval = page_to_phys(sg_page(sg)) | prot;
1974                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1975                 }
1976
1977                 if (!pte) {
1978                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1979
1980                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
1981                         if (!pte)
1982                                 return -ENOMEM;
1983                         /* It is a large page */
1984                         if (largepage_lvl > 1) {
1985                                 pteval |= DMA_PTE_LARGE_PAGE;
1986                                 /* Ensure that old small page tables are removed to make room
1987                                    for superpage, if they exist. */
1988                                 dma_pte_clear_range(domain, iov_pfn,
1989                                                     iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1990                                 dma_pte_free_pagetable(domain, iov_pfn,
1991                                                        iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1992                         } else {
1993                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1994                         }
1995
1996                 }
1997                 /* We don't need a lock here; nobody else
1998                  * touches the iova range
1999                  */
2000                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2001                 if (tmp) {
2002                         static int dumps = 5;
2003                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2004                                iov_pfn, tmp, (unsigned long long)pteval);
2005                         if (dumps) {
2006                                 dumps--;
2007                                 debug_dma_dump_mappings(NULL);
2008                         }
2009                         WARN_ON(1);
2010                 }
2011
2012                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2013
2014                 BUG_ON(nr_pages < lvl_pages);
2015                 BUG_ON(sg_res < lvl_pages);
2016
2017                 nr_pages -= lvl_pages;
2018                 iov_pfn += lvl_pages;
2019                 phys_pfn += lvl_pages;
2020                 pteval += lvl_pages * VTD_PAGE_SIZE;
2021                 sg_res -= lvl_pages;
2022
2023                 /* If the next PTE would be the first in a new page, then we
2024                    need to flush the cache on the entries we've just written.
2025                    And then we'll need to recalculate 'pte', so clear it and
2026                    let it get set again in the if (!pte) block above.
2027
2028                    If we're done (!nr_pages) we need to flush the cache too.
2029
2030                    Also if we've been setting superpages, we may need to
2031                    recalculate 'pte' and switch back to smaller pages for the
2032                    end of the mapping, if the trailing size is not enough to
2033                    use another superpage (i.e. sg_res < lvl_pages). */
2034                 pte++;
2035                 if (!nr_pages || first_pte_in_page(pte) ||
2036                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2037                         domain_flush_cache(domain, first_pte,
2038                                            (void *)pte - (void *)first_pte);
2039                         pte = NULL;
2040                 }
2041
2042                 if (!sg_res && nr_pages)
2043                         sg = sg_next(sg);
2044         }
2045         return 0;
2046 }
2047
2048 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2049                                     struct scatterlist *sg, unsigned long nr_pages,
2050                                     int prot)
2051 {
2052         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2053 }
2054
2055 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2056                                      unsigned long phys_pfn, unsigned long nr_pages,
2057                                      int prot)
2058 {
2059         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2060 }
2061
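/*
 * Clear the context entry for @bus/@devfn and invalidate the context and
 * IOTLB caches globally so the hardware stops translating for the device.
 */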
2062 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2063 {
2064         if (!iommu)
2065                 return;
2066
2067         clear_context_table(iommu, bus, devfn);
2068         iommu->flush.flush_context(iommu, 0, 0, 0,
2069                                            DMA_CCMD_GLOBAL_INVL);
2070         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2071 }
2072
2073 static inline void unlink_domain_info(struct device_domain_info *info)
2074 {
2075         assert_spin_locked(&device_domain_lock);
2076         list_del(&info->link);
2077         list_del(&info->global);
2078         if (info->dev)
2079                 info->dev->archdata.iommu = NULL;
2080 }
2081
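/*
 * Detach every device still linked to @domain: unlink its
 * device_domain_info, disable its device-IOTLB, clear its context entry
 * and, for VM domains, update the domain's iommu_bmp, iommu count and
 * capabilities as devices disappear.
 */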
2082 static void domain_remove_dev_info(struct dmar_domain *domain)
2083 {
2084         struct device_domain_info *info;
2085         unsigned long flags, flags2;
2086
2087         spin_lock_irqsave(&device_domain_lock, flags);
2088         while (!list_empty(&domain->devices)) {
2089                 info = list_entry(domain->devices.next,
2090                         struct device_domain_info, link);
2091                 unlink_domain_info(info);
2092                 spin_unlock_irqrestore(&device_domain_lock, flags);
2093
2094                 iommu_disable_dev_iotlb(info);
2095                 iommu_detach_dev(info->iommu, info->bus, info->devfn);
2096
2097                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
2098                         iommu_detach_dependent_devices(info->iommu, info->dev);
2099                         /* clear this iommu in iommu_bmp, update iommu count
2100                          * and capabilities
2101                          */
2102                         spin_lock_irqsave(&domain->iommu_lock, flags2);
2103                         if (test_and_clear_bit(info->iommu->seq_id,
2104                                                domain->iommu_bmp)) {
2105                                 domain->iommu_count--;
2106                                 domain_update_iommu_cap(domain);
2107                         }
2108                         spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2109                 }
2110
2111                 free_devinfo_mem(info);
2112                 spin_lock_irqsave(&device_domain_lock, flags);
2113         }
2114         spin_unlock_irqrestore(&device_domain_lock, flags);
2115 }
2116
2117 /*
2118  * find_domain
2119  * Note: we use struct device->archdata.iommu to store the info
2120  */
2121 static struct dmar_domain *find_domain(struct device *dev)
2122 {
2123         struct device_domain_info *info;
2124
2125         /* No lock here, assumes no domain exit in normal case */
2126         info = dev->archdata.iommu;
2127         if (info)
2128                 return info->domain;
2129         return NULL;
2130 }
2131
2132 static inline struct device_domain_info *
2133 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2134 {
2135         struct device_domain_info *info;
2136
2137         list_for_each_entry(info, &device_domain_list, global)
2138                 if (info->iommu->segment == segment && info->bus == bus &&
2139                     info->devfn == devfn)
2140                         return info;
2141
2142         return NULL;
2143 }
2144
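/*
 * Record that @bus/@devfn (and optionally a struct device) now belongs to
 * @domain.  If someone raced us and already attached the device, the
 * existing domain is returned and the caller must free the one it passed
 * in; otherwise the new info is linked into both the per-domain and the
 * global device lists.
 */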
2145 static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2146                                                 int bus, int devfn,
2147                                                 struct device *dev,
2148                                                 struct dmar_domain *domain)
2149 {
2150         struct dmar_domain *found = NULL;
2151         struct device_domain_info *info;
2152         unsigned long flags;
2153
2154         info = alloc_devinfo_mem();
2155         if (!info)
2156                 return NULL;
2157
2158         info->bus = bus;
2159         info->devfn = devfn;
2160         info->dev = dev;
2161         info->domain = domain;
2162         info->iommu = iommu;
2163         if (!dev)
2164                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2165
2166         spin_lock_irqsave(&device_domain_lock, flags);
2167         if (dev)
2168                 found = find_domain(dev);
2169         else {
2170                 struct device_domain_info *info2;
2171                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2172                 if (info2)
2173                         found = info2->domain;
2174         }
2175         if (found) {
2176                 spin_unlock_irqrestore(&device_domain_lock, flags);
2177                 free_devinfo_mem(info);
2178                 /* Caller must free the original domain */
2179                 return found;
2180         }
2181
2182         list_add(&info->link, &domain->devices);
2183         list_add(&info->global, &device_domain_list);
2184         if (dev)
2185                 dev->archdata.iommu = info;
2186         spin_unlock_irqrestore(&device_domain_lock, flags);
2187
2188         return domain;
2189 }
2190
2191 /* Find the existing domain for a device, or allocate and initialize a new one */
2192 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
2193 {
2194         struct dmar_domain *domain, *free = NULL;
2195         struct intel_iommu *iommu = NULL;
2196         struct device_domain_info *info;
2197         struct dmar_drhd_unit *drhd;
2198         struct pci_dev *dev_tmp;
2199         unsigned long flags;
2200         int bus = 0, devfn = 0;
2201         int segment;
2202
2203         domain = find_domain(&pdev->dev);
2204         if (domain)
2205                 return domain;
2206
2207         segment = pci_domain_nr(pdev->bus);
2208
2209         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
2210         if (dev_tmp) {
2211                 if (pci_is_pcie(dev_tmp)) {
2212                         bus = dev_tmp->subordinate->number;
2213                         devfn = 0;
2214                 } else {
2215                         bus = dev_tmp->bus->number;
2216                         devfn = dev_tmp->devfn;
2217                 }
2218                 spin_lock_irqsave(&device_domain_lock, flags);
2219                 info = dmar_search_domain_by_dev_info(segment, bus, devfn);
2220                 if (info) {
2221                         iommu = info->iommu;
2222                         domain = info->domain;
2223                 }
2224                 spin_unlock_irqrestore(&device_domain_lock, flags);
2225                 if (info)
2226                         goto found_domain;
2227         }
2228
2229         drhd = dmar_find_matched_drhd_unit(pdev);
2230         if (!drhd) {
2231                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2232                         pci_name(pdev));
2233                 return NULL;
2234         }
2235         iommu = drhd->iommu;
2236
2237         /* Allocate and initialize a new domain for the device */
2238         domain = alloc_domain(false);
2239         if (!domain)
2240                 goto error;
2241         if (iommu_attach_domain(domain, iommu)) {
2242                 free_domain_mem(domain);
2243                 goto error;
2244         }
2245         free = domain;
2246         if (domain_init(domain, gaw))
2247                 goto error;
2248
2249         /* register pcie-to-pci device */
2250         if (dev_tmp) {
2251                 domain = dmar_insert_dev_info(iommu, bus, devfn, NULL,
2252                                               domain);
2253                 if (!domain)
2254                         goto error;
2255         }
2256
2257 found_domain:
2258         domain = dmar_insert_dev_info(iommu, pdev->bus->number,
2259                                       pdev->devfn, &pdev->dev, domain);
2260 error:
2261         if (free != domain)
2262                 domain_exit(free);
2263
2264         return domain;
2265 }
2266
2267 static int iommu_identity_mapping;
2268 #define IDENTMAP_ALL            1
2269 #define IDENTMAP_GFX            2
2270 #define IDENTMAP_AZALIA         4
2271
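/*
 * Create a 1:1 mapping for [start, end] inside @domain: reserve the IOVA
 * range so the allocator never hands it out, clear any stale PTEs and
 * map each pfn onto itself with read/write permission.
 */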
2272 static int iommu_domain_identity_map(struct dmar_domain *domain,
2273                                      unsigned long long start,
2274                                      unsigned long long end)
2275 {
2276         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2277         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2278
2279         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2280                           dma_to_mm_pfn(last_vpfn))) {
2281                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2282                 return -ENOMEM;
2283         }
2284
2285         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2286                  start, end, domain->id);
2287         /*
2288          * RMRR range might have overlap with physical memory range,
2289          * clear it first
2290          */
2291         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2292
2293         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2294                                   last_vpfn - first_vpfn + 1,
2295                                   DMA_PTE_READ|DMA_PTE_WRITE);
2296 }
2297
2298 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2299                                       unsigned long long start,
2300                                       unsigned long long end)
2301 {
2302         struct dmar_domain *domain;
2303         int ret;
2304
2305         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2306         if (!domain)
2307                 return -ENOMEM;
2308
2309         /* For _hardware_ passthrough, don't bother. But for software
2310            passthrough, we do it anyway -- it may indicate a memory
2311            range which is reserved in E820, and so didn't get set
2312            up to start with in si_domain */
2313         if (domain == si_domain && hw_pass_through) {
2314                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2315                        pci_name(pdev), start, end);
2316                 return 0;
2317         }
2318
2319         printk(KERN_INFO
2320                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2321                pci_name(pdev), start, end);
2322         
2323         if (end < start) {
2324                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2325                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2326                         dmi_get_system_info(DMI_BIOS_VENDOR),
2327                         dmi_get_system_info(DMI_BIOS_VERSION),
2328                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2329                 ret = -EIO;
2330                 goto error;
2331         }
2332
2333         if (end >> agaw_to_width(domain->agaw)) {
2334                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2335                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2336                      agaw_to_width(domain->agaw),
2337                      dmi_get_system_info(DMI_BIOS_VENDOR),
2338                      dmi_get_system_info(DMI_BIOS_VERSION),
2339                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2340                 ret = -EIO;
2341                 goto error;
2342         }
2343
2344         ret = iommu_domain_identity_map(domain, start, end);
2345         if (ret)
2346                 goto error;
2347
2348         /* context entry init */
2349         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2350         if (ret)
2351                 goto error;
2352
2353         return 0;
2354
2355  error:
2356         domain_exit(domain);
2357         return ret;
2358 }
2359
2360 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2361         struct pci_dev *pdev)
2362 {
2363         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2364                 return 0;
2365         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2366                 rmrr->end_address);
2367 }
2368
2369 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2370 static inline void iommu_prepare_isa(void)
2371 {
2372         struct pci_dev *pdev;
2373         int ret;
2374
2375         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2376         if (!pdev)
2377                 return;
2378
2379         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2380         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2381
2382         if (ret)
2383                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2384                        "floppy might not work\n");
2385
2386 }
2387 #else
2388 static inline void iommu_prepare_isa(void)
2389 {
2390         return;
2391 }
2392 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2393
2394 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2395
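/*
 * Build the static identity (si) domain used for 1:1 mappings: attach it
 * to every active IOMMU and, unless hardware pass-through is in use,
 * identity-map every usable RAM range reported for each online node.
 */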
2396 static int __init si_domain_init(int hw)
2397 {
2398         struct dmar_drhd_unit *drhd;
2399         struct intel_iommu *iommu;
2400         int nid, ret = 0;
2401
2402         si_domain = alloc_domain(false);
2403         if (!si_domain)
2404                 return -EFAULT;
2405
2406         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2407
2408         for_each_active_iommu(iommu, drhd) {
2409                 ret = iommu_attach_domain(si_domain, iommu);
2410                 if (ret) {
2411                         domain_exit(si_domain);
2412                         return -EFAULT;
2413                 }
2414         }
2415
2416         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2417                 domain_exit(si_domain);
2418                 return -EFAULT;
2419         }
2420
2421         pr_debug("IOMMU: identity mapping domain is domain %d\n",
2422                  si_domain->id);
2423
2424         if (hw)
2425                 return 0;
2426
2427         for_each_online_node(nid) {
2428                 unsigned long start_pfn, end_pfn;
2429                 int i;
2430
2431                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2432                         ret = iommu_domain_identity_map(si_domain,
2433                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2434                         if (ret)
2435                                 return ret;
2436                 }
2437         }
2438
2439         return 0;
2440 }
2441
2442 static int identity_mapping(struct pci_dev *pdev)
2443 {
2444         struct device_domain_info *info;
2445
2446         if (likely(!iommu_identity_mapping))
2447                 return 0;
2448
2449         info = pdev->dev.archdata.iommu;
2450         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2451                 return (info->domain == si_domain);
2452
2453         return 0;
2454 }
2455
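/*
 * Attach @pdev to @domain: record its device_domain_info and program its
 * context entries with the requested translation type, rolling the
 * attachment back if the context mapping fails.
 */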
2456 static int domain_add_dev_info(struct dmar_domain *domain,
2457                                struct pci_dev *pdev,
2458                                int translation)
2459 {
2460         struct dmar_domain *ndomain;
2461         struct intel_iommu *iommu;
2462         int ret;
2463
2464         iommu = device_to_iommu(pci_domain_nr(pdev->bus),
2465                                 pdev->bus->number, pdev->devfn);
2466         if (!iommu)
2467                 return -ENODEV;
2468
2469         ndomain = dmar_insert_dev_info(iommu, pdev->bus->number, pdev->devfn,
2470                                        &pdev->dev, domain);
2471         if (ndomain != domain)
2472                 return -EBUSY;
2473
2474         ret = domain_context_mapping(domain, pdev, translation);
2475         if (ret) {
2476                 domain_remove_one_dev_info(domain, pdev);
2477                 return ret;
2478         }
2479
2480         return 0;
2481 }
2482
2483 static bool device_has_rmrr(struct pci_dev *dev)
2484 {
2485         struct dmar_rmrr_unit *rmrr;
2486         struct device *tmp;
2487         int i;
2488
2489         rcu_read_lock();
2490         for_each_rmrr_units(rmrr) {
2491                 /*
2492                  * Return TRUE if this RMRR contains the device that
2493                  * is passed in.
2494                  */
2495                 for_each_active_dev_scope(rmrr->devices,
2496                                           rmrr->devices_cnt, i, tmp)
2497                         if (tmp == &dev->dev) {
2498                                 rcu_read_unlock();
2499                                 return true;
2500                         }
2501         }
2502         rcu_read_unlock();
2503         return false;
2504 }
2505
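/*
 * Decide whether @pdev should live in the static identity domain.
 * Devices with RMRRs (other than USB) and conventional PCI devices
 * behind bridges are kept out; gfx and azalia devices are included when
 * the matching IDENTMAP_* flag is set; after boot, a device whose DMA
 * mask cannot reach all of memory is excluded as well.
 */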
2506 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2507 {
2508
2509         /*
2510          * We want to prevent any device associated with an RMRR from
2511          * getting placed into the SI Domain. This is done because
2512          * problems exist when devices are moved in and out of domains
2513          * and their respective RMRR info is lost. We exempt USB devices
2514          * from this process due to their usage of RMRRs that are known
2515          * to not be needed after BIOS hand-off to OS.
2516          */
2517         if (device_has_rmrr(pdev) &&
2518             (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2519                 return 0;
2520
2521         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2522                 return 1;
2523
2524         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2525                 return 1;
2526
2527         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2528                 return 0;
2529
2530         /*
2531          * We want to start off with all devices in the 1:1 domain, and
2532          * take them out later if we find they can't access all of memory.
2533          *
2534          * However, we can't do this for PCI devices behind bridges,
2535          * because all PCI devices behind the same bridge will end up
2536          * with the same source-id on their transactions.
2537          *
2538          * Practically speaking, we can't change things around for these
2539          * devices at run-time, because we can't be sure there'll be no
2540          * DMA transactions in flight for any of their siblings.
2541          * 
2542          * So PCI devices (unless they're on the root bus) as well as
2543          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2544          * the 1:1 domain, just in _case_ one of their siblings turns out
2545          * not to be able to map all of memory.
2546          */
2547         if (!pci_is_pcie(pdev)) {
2548                 if (!pci_is_root_bus(pdev->bus))
2549                         return 0;
2550                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2551                         return 0;
2552         } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2553                 return 0;
2554
2555         /* 
2556          * At boot time, we don't yet know if devices will be 64-bit capable.
2557          * Assume that they will -- if they turn out not to be, then we can 
2558          * take them out of the 1:1 domain later.
2559          */
2560         if (!startup) {
2561                 /*
2562                  * If the device's dma_mask is less than the system's memory
2563                  * size then this is not a candidate for identity mapping.
2564                  */
2565                 u64 dma_mask = pdev->dma_mask;
2566
2567                 if (pdev->dev.coherent_dma_mask &&
2568                     pdev->dev.coherent_dma_mask < dma_mask)
2569                         dma_mask = pdev->dev.coherent_dma_mask;
2570
2571                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2572         }
2573
2574         return 1;
2575 }
2576
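/*
 * Boot-time pass: create the si_domain and add every PCI device that
 * qualifies per iommu_should_identity_map(), using pass-through context
 * entries when the hardware supports them.
 */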
2577 static int __init iommu_prepare_static_identity_mapping(int hw)
2578 {
2579         struct pci_dev *pdev = NULL;
2580         int ret;
2581
2582         ret = si_domain_init(hw);
2583         if (ret)
2584                 return -EFAULT;
2585
2586         for_each_pci_dev(pdev) {
2587                 if (iommu_should_identity_map(pdev, 1)) {
2588                         ret = domain_add_dev_info(si_domain, pdev,
2589                                              hw ? CONTEXT_TT_PASS_THROUGH :
2590                                                   CONTEXT_TT_MULTI_LEVEL);
2591                         if (ret) {
2592                                 /* device not associated with an iommu */
2593                                 if (ret == -ENODEV)
2594                                         continue;
2595                                 return ret;
2596                         }
2597                         pr_info("IOMMU: %s identity mapping for device %s\n",
2598                                 hw ? "hardware" : "software", pci_name(pdev));
2599                 }
2600         }
2601
2602         return 0;
2603 }
2604
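/*
 * One-time DMA-remapping bring-up: count the DRHD units, allocate the
 * per-IOMMU bookkeeping, set up root tables and invalidation, create the
 * identity/RMRR/ISA mappings and finally enable translation on every
 * unit that is not ignored.
 */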
2605 static int __init init_dmars(void)
2606 {
2607         struct dmar_drhd_unit *drhd;
2608         struct dmar_rmrr_unit *rmrr;
2609         struct device *dev;
2610         struct intel_iommu *iommu;
2611         int i, ret;
2612
2613         /*
2614          * for each drhd
2615          *    allocate root
2616          *    initialize and program root entry to not present
2617          * endfor
2618          */
2619         for_each_drhd_unit(drhd) {
2620                 /*
2621                  * lock not needed as this is only incremented in the single
2622                  * threaded kernel __init code path all other access are read
2623                  * only
2624                  */
2625                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2626                         g_num_of_iommus++;
2627                         continue;
2628                 }
2629                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2630                           IOMMU_UNITS_SUPPORTED);
2631         }
2632
2633         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2634                         GFP_KERNEL);
2635         if (!g_iommus) {
2636                 printk(KERN_ERR "Allocating global iommu array failed\n");
2637                 ret = -ENOMEM;
2638                 goto error;
2639         }
2640
2641         deferred_flush = kzalloc(g_num_of_iommus *
2642                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2643         if (!deferred_flush) {
2644                 ret = -ENOMEM;
2645                 goto free_g_iommus;
2646         }
2647
2648         for_each_active_iommu(iommu, drhd) {
2649                 g_iommus[iommu->seq_id] = iommu;
2650
2651                 ret = iommu_init_domains(iommu);
2652                 if (ret)
2653                         goto free_iommu;
2654
2655                 /*
2656                  * TBD:
2657                  * we could share the same root & context tables
2658                  * among all IOMMUs. Need to split it later.
2659                  */
2660                 ret = iommu_alloc_root_entry(iommu);
2661                 if (ret) {
2662                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2663                         goto free_iommu;
2664                 }
2665                 if (!ecap_pass_through(iommu->ecap))
2666                         hw_pass_through = 0;
2667         }
2668
2669         /*
2670          * Start from a sane IOMMU hardware state.
2671          */
2672         for_each_active_iommu(iommu, drhd) {
2673                 /*
2674                  * If the queued invalidation is already initialized by us
2675                  * (for example, while enabling interrupt-remapping) then
2676                  * things are already rolling from a sane state.
2677                  */
2678                 if (iommu->qi)
2679                         continue;
2680
2681                 /*
2682                  * Clear any previous faults.
2683                  */
2684                 dmar_fault(-1, iommu);
2685                 /*
2686                  * Disable queued invalidation if supported and already enabled
2687                  * before OS handover.
2688                  */
2689                 dmar_disable_qi(iommu);
2690         }
2691
2692         for_each_active_iommu(iommu, drhd) {
2693                 if (dmar_enable_qi(iommu)) {
2694                         /*
2695                          * Queued Invalidate not enabled, use Register Based
2696                          * Invalidate
2697                          */
2698                         iommu->flush.flush_context = __iommu_flush_context;
2699                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2700                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2701                                "invalidation\n",
2702                                 iommu->seq_id,
2703                                (unsigned long long)drhd->reg_base_addr);
2704                 } else {
2705                         iommu->flush.flush_context = qi_flush_context;
2706                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2707                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2708                                "invalidation\n",
2709                                 iommu->seq_id,
2710                                (unsigned long long)drhd->reg_base_addr);
2711                 }
2712         }
2713
2714         if (iommu_pass_through)
2715                 iommu_identity_mapping |= IDENTMAP_ALL;
2716
2717 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2718         iommu_identity_mapping |= IDENTMAP_GFX;
2719 #endif
2720
2721         check_tylersburg_isoch();
2722
2723         /*
2724          * If any form of identity mapping was requested (all devices, gfx or
2725          * azalia), set up the static identity domain and its context entries
2726          * now; pass-through context entries are used when hardware supports it.
2727          */
2728         if (iommu_identity_mapping) {
2729                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2730                 if (ret) {
2731                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2732                         goto free_iommu;
2733                 }
2734         }
2735         /*
2736          * For each rmrr
2737          *   for each dev attached to rmrr
2738          *   do
2739          *     locate drhd for dev, alloc domain for dev
2740          *     allocate free domain
2741          *     allocate page table entries for rmrr
2742          *     if context not allocated for bus
2743          *           allocate and init context
2744          *           set present in root table for this bus
2745          *     init context with domain, translation etc
2746          *    endfor
2747          * endfor
2748          */
2749         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2750         for_each_rmrr_units(rmrr) {
2751                 /* some BIOSes list non-existent devices in the DMAR table. */
2752                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2753                                           i, dev) {
2754                         if (!dev_is_pci(dev))
2755                                 continue;
2756                         ret = iommu_prepare_rmrr_dev(rmrr, to_pci_dev(dev));
2757                         if (ret)
2758                                 printk(KERN_ERR
2759                                        "IOMMU: mapping reserved region failed\n");
2760                 }
2761         }
2762
2763         iommu_prepare_isa();
2764
2765         /*
2766          * for each drhd
2767          *   enable fault log
2768          *   global invalidate context cache
2769          *   global invalidate iotlb
2770          *   enable translation
2771          */
2772         for_each_iommu(iommu, drhd) {
2773                 if (drhd->ignored) {
2774                         /*
2775                          * we always have to disable PMRs or DMA may fail on
2776                          * this device
2777                          */
2778                         if (force_on)
2779                                 iommu_disable_protect_mem_regions(iommu);
2780                         continue;
2781                 }
2782
2783                 iommu_flush_write_buffer(iommu);
2784
2785                 ret = dmar_set_interrupt(iommu);
2786                 if (ret)
2787                         goto free_iommu;
2788
2789                 iommu_set_root_entry(iommu);
2790
2791                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2792                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2793
2794                 ret = iommu_enable_translation(iommu);
2795                 if (ret)
2796                         goto free_iommu;
2797
2798                 iommu_disable_protect_mem_regions(iommu);
2799         }
2800
2801         return 0;
2802
2803 free_iommu:
2804         for_each_active_iommu(iommu, drhd)
2805                 free_dmar_iommu(iommu);
2806         kfree(deferred_flush);
2807 free_g_iommus:
2808         kfree(g_iommus);
2809 error:
2810         return ret;
2811 }
2812
2813 /* This takes a number of _MM_ pages, not VTD pages */
2814 static struct iova *intel_alloc_iova(struct device *dev,
2815                                      struct dmar_domain *domain,
2816                                      unsigned long nrpages, uint64_t dma_mask)
2817 {
2818         struct pci_dev *pdev = to_pci_dev(dev);
2819         struct iova *iova = NULL;
2820
2821         /* Restrict dma_mask to the width that the iommu can handle */
2822         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2823
2824         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2825                 /*
2826                  * First try to allocate an io virtual address in
2827                  * DMA_BIT_MASK(32) and if that fails then try allocating
2828                  * from higher range
2829                  */
2830                 iova = alloc_iova(&domain->iovad, nrpages,
2831                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2832                 if (iova)
2833                         return iova;
2834         }
2835         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2836         if (unlikely(!iova)) {
2837                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2838                        nrpages, pci_name(pdev));
2839                 return NULL;
2840         }
2841
2842         return iova;
2843 }
2844
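/*
 * Slow path of get_valid_domain_for_dev(): look up or create the device's
 * domain and make sure its context entries are programmed before the
 * first DMA mapping is set up.
 */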
2845 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2846 {
2847         struct dmar_domain *domain;
2848         int ret;
2849
2850         domain = get_domain_for_dev(pdev,
2851                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2852         if (!domain) {
2853                 printk(KERN_ERR
2854                         "Allocating domain for %s failed", pci_name(pdev));
2855                 return NULL;
2856         }
2857
2858         /* make sure context mapping is ok */
2859         if (unlikely(!domain_context_mapped(pdev))) {
2860                 ret = domain_context_mapping(domain, pdev,
2861                                              CONTEXT_TT_MULTI_LEVEL);
2862                 if (ret) {
2863                         printk(KERN_ERR
2864                                 "Domain context map for %s failed",
2865                                 pci_name(pdev));
2866                         return NULL;
2867                 }
2868         }
2869
2870         return domain;
2871 }
2872
2873 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2874 {
2875         struct device_domain_info *info;
2876
2877         /* No lock here, assumes no domain exit in normal case */
2878         info = dev->dev.archdata.iommu;
2879         if (likely(info))
2880                 return info->domain;
2881
2882         return __get_valid_domain_for_dev(dev);
2883 }
2884
2885 static int iommu_dummy(struct device *dev)
2886 {
2887         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2888 }
2889
2890 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2891 static int iommu_no_mapping(struct device *dev)
2892 {
2893         struct pci_dev *pdev;
2894         int found;
2895
2896         if (unlikely(!dev_is_pci(dev)))
2897                 return 1;
2898
2899         if (iommu_dummy(dev))
2900                 return 1;
2901
2902         if (!iommu_identity_mapping)
2903                 return 0;
2904
2905         pdev = to_pci_dev(dev);
2906         found = identity_mapping(pdev);
2907         if (found) {
2908                 if (iommu_should_identity_map(pdev, 0))
2909                         return 1;
2910                 else {
2911                         /*
2912                          * The device is only 32 bit DMA capable: remove it from
2913                          * si_domain and fall back to non-identity mapping.
2914                          */
2915                         domain_remove_one_dev_info(si_domain, pdev);
2916                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2917                                pci_name(pdev));
2918                         return 0;
2919                 }
2920         } else {
2921                 /*
2922                  * A 64 bit DMA capable device that was detached from a VM is
2923                  * put back into si_domain for identity mapping.
2924                  */
2925                 if (iommu_should_identity_map(pdev, 0)) {
2926                         int ret;
2927                         ret = domain_add_dev_info(si_domain, pdev,
2928                                                   hw_pass_through ?
2929                                                   CONTEXT_TT_PASS_THROUGH :
2930                                                   CONTEXT_TT_MULTI_LEVEL);
2931                         if (!ret) {
2932                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2933                                        pci_name(pdev));
2934                                 return 1;
2935                         }
2936                 }
2937         }
2938
2939         return 0;
2940 }
2941
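/*
 * Map @size bytes at @paddr for DMA by @hwdev: allocate an IOVA below
 * @dma_mask, install page-table entries with permissions matching @dir
 * and flush (IOTLB in caching mode, write buffer otherwise).  Returns
 * the bus address, the physical address itself for devices that need no
 * translation, or 0 on failure.
 */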
2942 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2943                                      size_t size, int dir, u64 dma_mask)
2944 {
2945         struct pci_dev *pdev = to_pci_dev(hwdev);
2946         struct dmar_domain *domain;
2947         phys_addr_t start_paddr;
2948         struct iova *iova;
2949         int prot = 0;
2950         int ret;
2951         struct intel_iommu *iommu;
2952         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2953
2954         BUG_ON(dir == DMA_NONE);
2955
2956         if (iommu_no_mapping(hwdev))
2957                 return paddr;
2958
2959         domain = get_valid_domain_for_dev(pdev);
2960         if (!domain)
2961                 return 0;
2962
2963         iommu = domain_get_iommu(domain);
2964         size = aligned_nrpages(paddr, size);
2965
2966         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2967         if (!iova)
2968                 goto error;
2969
2970         /*
2971          * Check if DMAR supports zero-length reads on write-only
2972          * mappings.
2973          */
2974         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2975                         !cap_zlr(iommu->cap))
2976                 prot |= DMA_PTE_READ;
2977         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2978                 prot |= DMA_PTE_WRITE;
2979         /*
2980          * The range paddr .. (paddr + size) might cover partial pages, so
2981          * map whole pages.  Note: if two parts of one page are mapped
2982          * separately, we might end up with two guest addresses mapping to
2983          * the same host paddr, but this is not a big problem.
2984          */
2985         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2986                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2987         if (ret)
2988                 goto error;
2989
2990         /* it's a non-present to present mapping. Only flush if caching mode */
2991         if (cap_caching_mode(iommu->cap))
2992                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
2993         else
2994                 iommu_flush_write_buffer(iommu);
2995
2996         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2997         start_paddr += paddr & ~PAGE_MASK;
2998         return start_paddr;
2999
3000 error:
3001         if (iova)
3002                 __free_iova(&domain->iovad, iova);
3003         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
3004                 pci_name(pdev), size, (unsigned long long)paddr, dir);
3005         return 0;
3006 }
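
/*
 * Illustrative sketch, not part of this driver: once intel_dma_ops is
 * installed, an ordinary streaming mapping issued by a PCI driver ends up
 * in __intel_map_single() via intel_map_page().  "pdev", "buf" and "len"
 * below are hypothetical.
 *
 *	dma_addr_t handle;
 *
 *	handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 *	if (dma_mapping_error(&pdev->dev, handle))
 *		return -ENOMEM;
 *	... device reads from "handle" ...
 *	dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
 */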
3007
3008 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3009                                  unsigned long offset, size_t size,
3010                                  enum dma_data_direction dir,
3011                                  struct dma_attrs *attrs)
3012 {
3013         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3014                                   dir, to_pci_dev(dev)->dma_mask);
3015 }
3016
3017 static void flush_unmaps(void)
3018 {
3019         int i, j;
3020
3021         timer_on = 0;
3022
3023         /* just flush them all */
3024         for (i = 0; i < g_num_of_iommus; i++) {
3025                 struct intel_iommu *iommu = g_iommus[i];
3026                 if (!iommu)
3027                         continue;
3028
3029                 if (!deferred_flush[i].next)
3030                         continue;
3031
3032                 /* In caching mode, global flushes make emulation expensive */
3033                 if (!cap_caching_mode(iommu->cap))
3034                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3035                                          DMA_TLB_GLOBAL_FLUSH);
3036                 for (j = 0; j < deferred_flush[i].next; j++) {
3037                         unsigned long mask;
3038                         struct iova *iova = deferred_flush[i].iova[j];
3039                         struct dmar_domain *domain = deferred_flush[i].domain[j];
3040
3041                         /* On real hardware multiple invalidations are expensive */
3042                         if (cap_caching_mode(iommu->cap))
3043                                 iommu_flush_iotlb_psi(iommu, domain->id,
3044                                         iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1,
3045                                         !deferred_flush[i].freelist[j], 0);
3046                         else {
3047                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
3048                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3049                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3050                         }
3051                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3052                         if (deferred_flush[i].freelist[j])
3053                                 dma_free_pagelist(deferred_flush[i].freelist[j]);
3054                 }
3055                 deferred_flush[i].next = 0;
3056         }
3057
3058         list_size = 0;
3059 }
3060
3061 static void flush_unmaps_timeout(unsigned long data)
3062 {
3063         unsigned long flags;
3064
3065         spin_lock_irqsave(&async_umap_flush_lock, flags);
3066         flush_unmaps();
3067         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3068 }
3069
3070 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3071 {
3072         unsigned long flags;
3073         int next, iommu_id;
3074         struct intel_iommu *iommu;
3075
3076         spin_lock_irqsave(&async_umap_flush_lock, flags);
3077         if (list_size == HIGH_WATER_MARK)
3078                 flush_unmaps();
3079
3080         iommu = domain_get_iommu(dom);
3081         iommu_id = iommu->seq_id;
3082
3083         next = deferred_flush[iommu_id].next;
3084         deferred_flush[iommu_id].domain[next] = dom;
3085         deferred_flush[iommu_id].iova[next] = iova;
3086         deferred_flush[iommu_id].freelist[next] = freelist;
3087         deferred_flush[iommu_id].next++;
3088
3089         if (!timer_on) {
3090                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3091                 timer_on = 1;
3092         }
3093         list_size++;
3094         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3095 }
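
/*
 * Descriptive note (no new code): unmaps queued by add_unmap() are released
 * lazily, per IOMMU, and the IOTLB is flushed only when one of two triggers
 * fires -- the queue reaching HIGH_WATER_MARK entries, which calls
 * flush_unmaps() directly, or the 10ms unmap_timer armed above expiring and
 * running flush_unmaps_timeout().  Booting with intel_iommu=strict skips
 * this batching; see the intel_iommu_strict checks in intel_unmap_page()
 * and intel_unmap_sg() below.
 */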
3096
3097 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3098                              size_t size, enum dma_data_direction dir,
3099                              struct dma_attrs *attrs)
3100 {
3101         struct pci_dev *pdev = to_pci_dev(dev);
3102         struct dmar_domain *domain;
3103         unsigned long start_pfn, last_pfn;
3104         struct iova *iova;
3105         struct intel_iommu *iommu;
3106         struct page *freelist;
3107
3108         if (iommu_no_mapping(dev))
3109                 return;
3110
3111         domain = find_domain(dev);
3112         BUG_ON(!domain);
3113
3114         iommu = domain_get_iommu(domain);
3115
3116         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3117         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3118                       (unsigned long long)dev_addr))
3119                 return;
3120
3121         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3122         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3123
3124         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3125                  pci_name(pdev), start_pfn, last_pfn);
3126
3127         freelist = domain_unmap(domain, start_pfn, last_pfn);
3128
3129         if (intel_iommu_strict) {
3130                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3131                                       last_pfn - start_pfn + 1, !freelist, 0);
3132                 /* free iova */
3133                 __free_iova(&domain->iovad, iova);
3134                 dma_free_pagelist(freelist);
3135         } else {
3136                 add_unmap(domain, iova, freelist);
3137                 /*
3138                  * Queue up the release of the unmap to save roughly 1/6th of
3139                  * the CPU time otherwise spent on the iotlb flush operation...
3140                  */
3141         }
3142 }
3143
3144 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3145                                   dma_addr_t *dma_handle, gfp_t flags,
3146                                   struct dma_attrs *attrs)
3147 {
3148         void *vaddr;
3149         int order;
3150
3151         size = PAGE_ALIGN(size);
3152         order = get_order(size);
3153
3154         if (!iommu_no_mapping(hwdev))
3155                 flags &= ~(GFP_DMA | GFP_DMA32);
3156         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3157                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3158                         flags |= GFP_DMA;
3159                 else
3160                         flags |= GFP_DMA32;
3161         }
3162
3163         vaddr = (void *)__get_free_pages(flags, order);
3164         if (!vaddr)
3165                 return NULL;
3166         memset(vaddr, 0, size);
3167
3168         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3169                                          DMA_BIDIRECTIONAL,
3170                                          hwdev->coherent_dma_mask);
3171         if (*dma_handle)
3172                 return vaddr;
3173         free_pages((unsigned long)vaddr, order);
3174         return NULL;
3175 }
3176
3177 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3178                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3179 {
3180         int order;
3181
3182         size = PAGE_ALIGN(size);
3183         order = get_order(size);
3184
3185         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3186         free_pages((unsigned long)vaddr, order);
3187 }
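
/*
 * Illustrative sketch, not part of this driver: coherent allocations made
 * by a driver land in intel_alloc_coherent()/intel_free_coherent() above.
 * "pdev" and "size" below are hypothetical.
 *
 *	dma_addr_t dma;
 *	void *cpu;
 *
 *	cpu = dma_alloc_coherent(&pdev->dev, size, &dma, GFP_KERNEL);
 *	if (!cpu)
 *		return -ENOMEM;
 *	... CPU uses "cpu", the device is given "dma" ...
 *	dma_free_coherent(&pdev->dev, size, cpu, dma);
 */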
3188
3189 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3190                            int nelems, enum dma_data_direction dir,
3191                            struct dma_attrs *attrs)
3192 {
3193         struct dmar_domain *domain;
3194         unsigned long start_pfn, last_pfn;
3195         struct iova *iova;
3196         struct intel_iommu *iommu;
3197         struct page *freelist;
3198
3199         if (iommu_no_mapping(hwdev))
3200                 return;
3201
3202         domain = find_domain(hwdev);
3203         BUG_ON(!domain);
3204
3205         iommu = domain_get_iommu(domain);
3206
3207         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3208         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3209                       (unsigned long long)sglist[0].dma_address))
3210                 return;
3211
3212         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3213         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3214
3215         freelist = domain_unmap(domain, start_pfn, last_pfn);
3216
3217         if (intel_iommu_strict) {
3218                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3219                                       last_pfn - start_pfn + 1, !freelist, 0);
3220                 /* free iova */
3221                 __free_iova(&domain->iovad, iova);
3222                 dma_free_pagelist(freelist);
3223         } else {
3224                 add_unmap(domain, iova, freelist);
3225                 /*
3226                  * Queue up the release of the unmap to save roughly 1/6th of
3227                  * the CPU time otherwise spent on the iotlb flush operation...
3228                  */
3229         }
3230 }
3231
3232 static int intel_nontranslate_map_sg(struct device *hwdev,
3233         struct scatterlist *sglist, int nelems, int dir)
3234 {
3235         int i;
3236         struct scatterlist *sg;
3237
3238         for_each_sg(sglist, sg, nelems, i) {
3239                 BUG_ON(!sg_page(sg));
3240                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3241                 sg->dma_length = sg->length;
3242         }
3243         return nelems;
3244 }
3245
3246 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3247                         enum dma_data_direction dir, struct dma_attrs *attrs)
3248 {
3249         int i;
3250         struct pci_dev *pdev = to_pci_dev(hwdev);
3251         struct dmar_domain *domain;
3252         size_t size = 0;
3253         int prot = 0;
3254         struct iova *iova = NULL;
3255         int ret;
3256         struct scatterlist *sg;
3257         unsigned long start_vpfn;
3258         struct intel_iommu *iommu;
3259
3260         BUG_ON(dir == DMA_NONE);
3261         if (iommu_no_mapping(hwdev))
3262                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3263
3264         domain = get_valid_domain_for_dev(pdev);
3265         if (!domain)
3266                 return 0;
3267
3268         iommu = domain_get_iommu(domain);
3269
3270         for_each_sg(sglist, sg, nelems, i)
3271                 size += aligned_nrpages(sg->offset, sg->length);
3272
3273         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3274                                 pdev->dma_mask);
3275         if (!iova) {
3276                 sglist->dma_length = 0;
3277                 return 0;
3278         }
3279
3280         /*
3281          * Check if DMAR supports zero-length reads on write-only
3282          * mappings.
3283          */
3284         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3285                         !cap_zlr(iommu->cap))
3286                 prot |= DMA_PTE_READ;
3287         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3288                 prot |= DMA_PTE_WRITE;
3289
3290         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3291
3292         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3293         if (unlikely(ret)) {
3294                 /*  clear the page */
3295                 dma_pte_clear_range(domain, start_vpfn,
3296                                     start_vpfn + size - 1);
3297                 /* free page tables */
3298                 dma_pte_free_pagetable(domain, start_vpfn,
3299                                        start_vpfn + size - 1);
3300                 /* free iova */
3301                 __free_iova(&domain->iovad, iova);
3302                 return 0;
3303         }
3304
3305         /* it's a non-present to present mapping. Only flush if caching mode */
3306         if (cap_caching_mode(iommu->cap))
3307                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3308         else
3309                 iommu_flush_write_buffer(iommu);
3310
3311         return nelems;
3312 }
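
/*
 * Illustrative sketch, not part of this driver: a scatter-gather mapping as
 * a driver would issue it; intel_map_sg() above packs the entries into one
 * contiguous IOVA range.  "pdev", "sgl", "nents" and setup_descriptor() are
 * hypothetical.
 *
 *	struct scatterlist *sg;
 *	int i, mapped;
 *
 *	mapped = dma_map_sg(&pdev->dev, sgl, nents, DMA_FROM_DEVICE);
 *	if (!mapped)
 *		return -ENOMEM;
 *	for_each_sg(sgl, sg, mapped, i)
 *		setup_descriptor(sg_dma_address(sg), sg_dma_len(sg));
 *	...
 *	dma_unmap_sg(&pdev->dev, sgl, nents, DMA_FROM_DEVICE);
 */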
3313
3314 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3315 {
3316         return !dma_addr;
3317 }
3318
3319 struct dma_map_ops intel_dma_ops = {
3320         .alloc = intel_alloc_coherent,
3321         .free = intel_free_coherent,
3322         .map_sg = intel_map_sg,
3323         .unmap_sg = intel_unmap_sg,
3324         .map_page = intel_map_page,
3325         .unmap_page = intel_unmap_page,
3326         .mapping_error = intel_mapping_error,
3327 };
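
/*
 * Rough sketch of the dispatch path, for orientation only: the generic
 * dma_map_*() helpers look up the installed ops and call into the table
 * above, roughly
 *
 *	struct dma_map_ops *ops = get_dma_ops(dev);
 *	addr = ops->map_page(dev, page, offset, size, dir, NULL);
 *
 * so once intel_iommu_init() sets dma_ops = &intel_dma_ops, streaming DMA
 * on the system funnels through intel_map_page()/intel_unmap_page().
 */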
3328
3329 static inline int iommu_domain_cache_init(void)
3330 {
3331         int ret = 0;
3332
3333         iommu_domain_cache = kmem_cache_create("iommu_domain",
3334                                          sizeof(struct dmar_domain),
3335                                          0,
3336                                          SLAB_HWCACHE_ALIGN,
3338                                          NULL);
3339         if (!iommu_domain_cache) {
3340                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3341                 ret = -ENOMEM;
3342         }
3343
3344         return ret;
3345 }
3346
3347 static inline int iommu_devinfo_cache_init(void)
3348 {
3349         int ret = 0;
3350
3351         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3352                                          sizeof(struct device_domain_info),
3353                                          0,
3354                                          SLAB_HWCACHE_ALIGN,
3355                                          NULL);
3356         if (!iommu_devinfo_cache) {
3357                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3358                 ret = -ENOMEM;
3359         }
3360
3361         return ret;
3362 }
3363
3364 static inline int iommu_iova_cache_init(void)
3365 {
3366         int ret = 0;
3367
3368         iommu_iova_cache = kmem_cache_create("iommu_iova",
3369                                          sizeof(struct iova),
3370                                          0,
3371                                          SLAB_HWCACHE_ALIGN,
3372                                          NULL);
3373         if (!iommu_iova_cache) {
3374                 printk(KERN_ERR "Couldn't create iova cache\n");
3375                 ret = -ENOMEM;
3376         }
3377
3378         return ret;
3379 }
3380
3381 static int __init iommu_init_mempool(void)
3382 {
3383         int ret;
3384         ret = iommu_iova_cache_init();
3385         if (ret)
3386                 return ret;
3387
3388         ret = iommu_domain_cache_init();
3389         if (ret)
3390                 goto domain_error;
3391
3392         ret = iommu_devinfo_cache_init();
3393         if (!ret)
3394                 return ret;
3395
3396         kmem_cache_destroy(iommu_domain_cache);
3397 domain_error:
3398         kmem_cache_destroy(iommu_iova_cache);
3399
3400         return -ENOMEM;
3401 }
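
/*
 * Illustrative sketch, not part of this driver: the caches created above
 * are consumed with the usual slab calls, roughly
 *
 *	struct iova *iova = kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
 *	if (!iova)
 *		return NULL;
 *	...
 *	kmem_cache_free(iommu_iova_cache, iova);
 *
 * (in this file that happens via small wrappers, e.g. free_iova_mem()
 * used by the memory hotplug notifier below).
 */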
3402
3403 static void __init iommu_exit_mempool(void)
3404 {
3405         kmem_cache_destroy(iommu_devinfo_cache);
3406         kmem_cache_destroy(iommu_domain_cache);
3407         kmem_cache_destroy(iommu_iova_cache);
3408
3409 }
3410
3411 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3412 {
3413         struct dmar_drhd_unit *drhd;
3414         u32 vtbar;
3415         int rc;
3416
3417         /* We know that this device on this chipset has its own IOMMU.
3418          * If we find it under a different IOMMU, then the BIOS is lying
3419          * to us. Hope that the IOMMU for this device is actually
3420          * disabled, and it needs no translation...
3421          */
3422         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3423         if (rc) {
3424                 /* "can't" happen */
3425                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3426                 return;
3427         }
3428         vtbar &= 0xffff0000;
3429
3430         /* we know that this iommu should be at offset 0xa000 from vtbar */
3431         drhd = dmar_find_matched_drhd_unit(pdev);
3432         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3433                             TAINT_FIRMWARE_WORKAROUND,
3434                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3435                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3436 }
3437 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3438
3439 static void __init init_no_remapping_devices(void)
3440 {
3441         struct dmar_drhd_unit *drhd;
3442         struct device *dev;
3443         int i;
3444
3445         for_each_drhd_unit(drhd) {
3446                 if (!drhd->include_all) {
3447                         for_each_active_dev_scope(drhd->devices,
3448                                                   drhd->devices_cnt, i, dev)
3449                                 break;
3450                         /* ignore DMAR unit if no devices exist */
3451                         if (i == drhd->devices_cnt)
3452                                 drhd->ignored = 1;
3453                 }
3454         }
3455
3456         for_each_active_drhd_unit(drhd) {
3457                 if (drhd->include_all)
3458                         continue;
3459
3460                 for_each_active_dev_scope(drhd->devices,
3461                                           drhd->devices_cnt, i, dev)
3462                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3463                                 break;
3464                 if (i < drhd->devices_cnt)
3465                         continue;
3466
3467                 /* This IOMMU has *only* gfx devices. Either bypass it or
3468                    set the gfx_mapped flag, as appropriate */
3469                 if (dmar_map_gfx) {
3470                         intel_iommu_gfx_mapped = 1;
3471                 } else {
3472                         drhd->ignored = 1;
3473                         for_each_active_dev_scope(drhd->devices,
3474                                                   drhd->devices_cnt, i, dev)
3475                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3476                 }
3477         }
3478 }
3479
3480 #ifdef CONFIG_SUSPEND
3481 static int init_iommu_hw(void)
3482 {
3483         struct dmar_drhd_unit *drhd;
3484         struct intel_iommu *iommu = NULL;
3485
3486         for_each_active_iommu(iommu, drhd)
3487                 if (iommu->qi)
3488                         dmar_reenable_qi(iommu);
3489
3490         for_each_iommu(iommu, drhd) {
3491                 if (drhd->ignored) {
3492                         /*
3493                          * we always have to disable PMRs or DMA may fail on
3494                          * this device
3495                          */
3496                         if (force_on)
3497                                 iommu_disable_protect_mem_regions(iommu);
3498                         continue;
3499                 }
3500
3501                 iommu_flush_write_buffer(iommu);
3502
3503                 iommu_set_root_entry(iommu);
3504
3505                 iommu->flush.flush_context(iommu, 0, 0, 0,
3506                                            DMA_CCMD_GLOBAL_INVL);
3507                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3508                                          DMA_TLB_GLOBAL_FLUSH);
3509                 if (iommu_enable_translation(iommu))
3510                         return 1;
3511                 iommu_disable_protect_mem_regions(iommu);
3512         }
3513
3514         return 0;
3515 }
3516
3517 static void iommu_flush_all(void)
3518 {
3519         struct dmar_drhd_unit *drhd;
3520         struct intel_iommu *iommu;
3521
3522         for_each_active_iommu(iommu, drhd) {
3523                 iommu->flush.flush_context(iommu, 0, 0, 0,
3524                                            DMA_CCMD_GLOBAL_INVL);
3525                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3526                                          DMA_TLB_GLOBAL_FLUSH);
3527         }
3528 }
3529
3530 static int iommu_suspend(void)
3531 {
3532         struct dmar_drhd_unit *drhd;
3533         struct intel_iommu *iommu = NULL;
3534         unsigned long flag;
3535
3536         for_each_active_iommu(iommu, drhd) {
3537                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3538                                                  GFP_ATOMIC);
3539                 if (!iommu->iommu_state)
3540                         goto nomem;
3541         }
3542
3543         iommu_flush_all();
3544
3545         for_each_active_iommu(iommu, drhd) {
3546                 iommu_disable_translation(iommu);
3547
3548                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3549
3550                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3551                         readl(iommu->reg + DMAR_FECTL_REG);
3552                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3553                         readl(iommu->reg + DMAR_FEDATA_REG);
3554                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3555                         readl(iommu->reg + DMAR_FEADDR_REG);
3556                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3557                         readl(iommu->reg + DMAR_FEUADDR_REG);
3558
3559                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3560         }
3561         return 0;
3562
3563 nomem:
3564         for_each_active_iommu(iommu, drhd)
3565                 kfree(iommu->iommu_state);
3566
3567         return -ENOMEM;
3568 }
3569
3570 static void iommu_resume(void)
3571 {
3572         struct dmar_drhd_unit *drhd;
3573         struct intel_iommu *iommu = NULL;
3574         unsigned long flag;
3575
3576         if (init_iommu_hw()) {
3577                 if (force_on)
3578                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3579                 else
3580                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3581                 return;
3582         }
3583
3584         for_each_active_iommu(iommu, drhd) {
3585
3586                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3587
3588                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3589                         iommu->reg + DMAR_FECTL_REG);
3590                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3591                         iommu->reg + DMAR_FEDATA_REG);
3592                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3593                         iommu->reg + DMAR_FEADDR_REG);
3594                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3595                         iommu->reg + DMAR_FEUADDR_REG);
3596
3597                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3598         }
3599
3600         for_each_active_iommu(iommu, drhd)
3601                 kfree(iommu->iommu_state);
3602 }
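
/*
 * Descriptive note (no new code): across suspend only the fault-event
 * registers (FECTL/FEDATA/FEADDR/FEUADDR) are saved and restored by
 * software; everything else is rebuilt on resume by init_iommu_hw() above,
 * which reprograms the root entry, re-enables queued invalidation and
 * turns translation back on before the saved registers are written back.
 */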
3603
3604 static struct syscore_ops iommu_syscore_ops = {
3605         .resume         = iommu_resume,
3606         .suspend        = iommu_suspend,
3607 };
3608
3609 static void __init init_iommu_pm_ops(void)
3610 {
3611         register_syscore_ops(&iommu_syscore_ops);
3612 }
3613
3614 #else
3615 static inline void init_iommu_pm_ops(void) {}
3616 #endif  /* CONFIG_SUSPEND */
3617
3618
3619 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3620 {
3621         struct acpi_dmar_reserved_memory *rmrr;
3622         struct dmar_rmrr_unit *rmrru;
3623
3624         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3625         if (!rmrru)
3626                 return -ENOMEM;
3627
3628         rmrru->hdr = header;
3629         rmrr = (struct acpi_dmar_reserved_memory *)header;
3630         rmrru->base_address = rmrr->base_address;
3631         rmrru->end_address = rmrr->end_address;
3632         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3633                                 ((void *)rmrr) + rmrr->header.length,
3634                                 &rmrru->devices_cnt);
3635         if (rmrru->devices_cnt && rmrru->devices == NULL) {
3636                 kfree(rmrru);
3637                 return -ENOMEM;
3638         }
3639
3640         list_add(&rmrru->list, &dmar_rmrr_units);
3641
3642         return 0;
3643 }
3644
3645 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3646 {
3647         struct acpi_dmar_atsr *atsr;
3648         struct dmar_atsr_unit *atsru;
3649
3650         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3651         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3652         if (!atsru)
3653                 return -ENOMEM;
3654
3655         atsru->hdr = hdr;
3656         atsru->include_all = atsr->flags & 0x1;
3657         if (!atsru->include_all) {
3658                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3659                                 (void *)atsr + atsr->header.length,
3660                                 &atsru->devices_cnt);
3661                 if (atsru->devices_cnt && atsru->devices == NULL) {
3662                         kfree(atsru);
3663                         return -ENOMEM;
3664                 }
3665         }
3666
3667         list_add_rcu(&atsru->list, &dmar_atsr_units);
3668
3669         return 0;
3670 }
3671
3672 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3673 {
3674         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3675         kfree(atsru);
3676 }
3677
3678 static void intel_iommu_free_dmars(void)
3679 {
3680         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3681         struct dmar_atsr_unit *atsru, *atsr_n;
3682
3683         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3684                 list_del(&rmrru->list);
3685                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3686                 kfree(rmrru);
3687         }
3688
3689         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3690                 list_del(&atsru->list);
3691                 intel_iommu_free_atsr(atsru);
3692         }
3693 }
3694
3695 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3696 {
3697         int i, ret = 1;
3698         struct pci_bus *bus;
3699         struct pci_dev *bridge = NULL;
3700         struct device *tmp;
3701         struct acpi_dmar_atsr *atsr;
3702         struct dmar_atsr_unit *atsru;
3703
3704         dev = pci_physfn(dev);
3705         for (bus = dev->bus; bus; bus = bus->parent) {
3706                 bridge = bus->self;
3707                 if (!bridge || !pci_is_pcie(bridge) ||
3708                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3709                         return 0;
3710                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3711                         break;
3712         }
3713         if (!bridge)
3714                 return 0;
3715
3716         rcu_read_lock();
3717         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3718                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3719                 if (atsr->segment != pci_domain_nr(dev->bus))
3720                         continue;
3721
3722                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3723                         if (tmp == &bridge->dev)
3724                                 goto out;
3725
3726                 if (atsru->include_all)
3727                         goto out;
3728         }
3729         ret = 0;
3730 out:
3731         rcu_read_unlock();
3732
3733         return ret;
3734 }
3735
3736 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3737 {
3738         int ret = 0;
3739         struct dmar_rmrr_unit *rmrru;
3740         struct dmar_atsr_unit *atsru;
3741         struct acpi_dmar_atsr *atsr;
3742         struct acpi_dmar_reserved_memory *rmrr;
3743
3744         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3745                 return 0;
3746
3747         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3748                 rmrr = container_of(rmrru->hdr,
3749                                     struct acpi_dmar_reserved_memory, header);
3750                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3751                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3752                                 ((void *)rmrr) + rmrr->header.length,
3753                                 rmrr->segment, rmrru->devices,
3754                                 rmrru->devices_cnt);
3755                         if (ret > 0)
3756                                 break;
3757                         else if (ret < 0)
3758                                 return ret;
3759                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3760                         if (dmar_remove_dev_scope(info, rmrr->segment,
3761                                 rmrru->devices, rmrru->devices_cnt))
3762                                 break;
3763                 }
3764         }
3765
3766         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3767                 if (atsru->include_all)
3768                         continue;
3769
3770                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3771                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3772                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3773                                         (void *)atsr + atsr->header.length,
3774                                         atsr->segment, atsru->devices,
3775                                         atsru->devices_cnt);
3776                         if (ret > 0)
3777                                 break;
3778                         else if (ret < 0)
3779                                 return ret;
3780                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3781                         if (dmar_remove_dev_scope(info, atsr->segment,
3782                                         atsru->devices, atsru->devices_cnt))
3783                                 break;
3784                 }
3785         }
3786
3787         return 0;
3788 }
3789
3790 /*
3791  * Here we only respond to a device being unbound from its driver.
3792  *
3793  * A newly added device is not attached to its DMAR domain here yet. That
3794  * will happen when the device is mapped to an iova.
3795  */
3796 static int device_notifier(struct notifier_block *nb,
3797                                   unsigned long action, void *data)
3798 {
3799         struct device *dev = data;
3800         struct pci_dev *pdev = to_pci_dev(dev);
3801         struct dmar_domain *domain;
3802
3803         if (iommu_dummy(dev))
3804                 return 0;
3805
3806         if (action != BUS_NOTIFY_UNBOUND_DRIVER &&
3807             action != BUS_NOTIFY_DEL_DEVICE)
3808                 return 0;
3809
3810         domain = find_domain(dev);
3811         if (!domain)
3812                 return 0;
3813
3814         down_read(&dmar_global_lock);
3815         domain_remove_one_dev_info(domain, pdev);
3816         if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3817             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3818             list_empty(&domain->devices))
3819                 domain_exit(domain);
3820         up_read(&dmar_global_lock);
3821
3822         return 0;
3823 }
3824
3825 static struct notifier_block device_nb = {
3826         .notifier_call = device_notifier,
3827 };
3828
3829 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3830                                        unsigned long val, void *v)
3831 {
3832         struct memory_notify *mhp = v;
3833         unsigned long long start, end;
3834         unsigned long start_vpfn, last_vpfn;
3835
3836         switch (val) {
3837         case MEM_GOING_ONLINE:
3838                 start = mhp->start_pfn << PAGE_SHIFT;
3839                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
3840                 if (iommu_domain_identity_map(si_domain, start, end)) {
3841                         pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
3842                                 start, end);
3843                         return NOTIFY_BAD;
3844                 }
3845                 break;
3846
3847         case MEM_OFFLINE:
3848         case MEM_CANCEL_ONLINE:
3849                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3850                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
3851                 while (start_vpfn <= last_vpfn) {
3852                         struct iova *iova;
3853                         struct dmar_drhd_unit *drhd;
3854                         struct intel_iommu *iommu;
3855                         struct page *freelist;
3856
3857                         iova = find_iova(&si_domain->iovad, start_vpfn);
3858                         if (iova == NULL) {
3859                                 pr_debug("dmar: failed to get IOVA for PFN %lx\n",
3860                                          start_vpfn);
3861                                 break;
3862                         }
3863
3864                         iova = split_and_remove_iova(&si_domain->iovad, iova,
3865                                                      start_vpfn, last_vpfn);
3866                         if (iova == NULL) {
3867                                 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
3868                                         start_vpfn, last_vpfn);
3869                                 return NOTIFY_BAD;
3870                         }
3871
3872                         freelist = domain_unmap(si_domain, iova->pfn_lo,
3873                                                iova->pfn_hi);
3874
3875                         rcu_read_lock();
3876                         for_each_active_iommu(iommu, drhd)
3877                                 iommu_flush_iotlb_psi(iommu, si_domain->id,
3878                                         iova->pfn_lo,
3879                                         iova->pfn_hi - iova->pfn_lo + 1,
3880                                         !freelist, 0);
3881                         rcu_read_unlock();
3882                         dma_free_pagelist(freelist);
3883
3884                         start_vpfn = iova->pfn_hi + 1;
3885                         free_iova_mem(iova);
3886                 }
3887                 break;
3888         }
3889
3890         return NOTIFY_OK;
3891 }
3892
3893 static struct notifier_block intel_iommu_memory_nb = {
3894         .notifier_call = intel_iommu_memory_notifier,
3895         .priority = 0
3896 };
3897
3898 int __init intel_iommu_init(void)
3899 {
3900         int ret = -ENODEV;
3901         struct dmar_drhd_unit *drhd;
3902         struct intel_iommu *iommu;
3903
3904         /* VT-d is required for a TXT/tboot launch, so enforce that */
3905         force_on = tboot_force_iommu();
3906
3907         if (iommu_init_mempool()) {
3908                 if (force_on)
3909                         panic("tboot: Failed to initialize iommu memory\n");
3910                 return -ENOMEM;
3911         }
3912
3913         down_write(&dmar_global_lock);
3914         if (dmar_table_init()) {
3915                 if (force_on)
3916                         panic("tboot: Failed to initialize DMAR table\n");
3917                 goto out_free_dmar;
3918         }
3919
3920         /*
3921          * Disable translation if already enabled prior to OS handover.
3922          */
3923         for_each_active_iommu(iommu, drhd)
3924                 if (iommu->gcmd & DMA_GCMD_TE)
3925                         iommu_disable_translation(iommu);
3926
3927         if (dmar_dev_scope_init() < 0) {
3928                 if (force_on)
3929                         panic("tboot: Failed to initialize DMAR device scope\n");
3930                 goto out_free_dmar;
3931         }
3932
3933         if (no_iommu || dmar_disabled)
3934                 goto out_free_dmar;
3935
3936         if (list_empty(&dmar_rmrr_units))
3937                 printk(KERN_INFO "DMAR: No RMRR found\n");
3938
3939         if (list_empty(&dmar_atsr_units))
3940                 printk(KERN_INFO "DMAR: No ATSR found\n");
3941
3942         if (dmar_init_reserved_ranges()) {
3943                 if (force_on)
3944                         panic("tboot: Failed to reserve iommu ranges\n");
3945                 goto out_free_reserved_range;
3946         }
3947
3948         init_no_remapping_devices();
3949
3950         ret = init_dmars();
3951         if (ret) {
3952                 if (force_on)
3953                         panic("tboot: Failed to initialize DMARs\n");
3954                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3955                 goto out_free_reserved_range;
3956         }
3957         up_write(&dmar_global_lock);
3958         printk(KERN_INFO
3959         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3960
3961         init_timer(&unmap_timer);
3962 #ifdef CONFIG_SWIOTLB
3963         swiotlb = 0;
3964 #endif
3965         dma_ops = &intel_dma_ops;
3966
3967         init_iommu_pm_ops();
3968
3969         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3970         bus_register_notifier(&pci_bus_type, &device_nb);
3971         if (si_domain && !hw_pass_through)
3972                 register_memory_notifier(&intel_iommu_memory_nb);
3973
3974         intel_iommu_enabled = 1;
3975
3976         return 0;
3977
3978 out_free_reserved_range:
3979         put_iova_domain(&reserved_iova_list);
3980 out_free_dmar:
3981         intel_iommu_free_dmars();
3982         up_write(&dmar_global_lock);
3983         iommu_exit_mempool();
3984         return ret;
3985 }
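
/*
 * Descriptive note (no new code): this init path only takes effect when
 * DMAR translation is actually requested, e.g. by booting with
 * intel_iommu=on (or a kernel configured to enable the Intel IOMMU by
 * default); with intel_iommu=off the no_iommu/dmar_disabled check above
 * bails out before init_dmars() and the kernel keeps the default dma_ops
 * (typically swiotlb).
 */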
3986
3987 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3988                                            struct device *dev)
3989 {
3990         struct pci_dev *tmp, *parent, *pdev;
3991
3992         if (!iommu || !dev || !dev_is_pci(dev))
3993                 return;
3994
3995         pdev = to_pci_dev(dev);
3996
3997         /* dependent device detach */
3998         tmp = pci_find_upstream_pcie_bridge(pdev);
3999         /* Secondary interface's bus number and devfn 0 */
4000         if (tmp) {
4001                 parent = pdev->bus->self;
4002                 while (parent != tmp) {
4003                         iommu_detach_dev(iommu, parent->bus->number,
4004                                          parent->devfn);
4005                         parent = parent->bus->self;
4006                 }
4007                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
4008                         iommu_detach_dev(iommu,
4009                                 tmp->subordinate->number, 0);
4010                 else /* this is a legacy PCI bridge */
4011                         iommu_detach_dev(iommu, tmp->bus->number,
4012                                          tmp->devfn);
4013         }
4014 }
4015
4016 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4017                                           struct pci_dev *pdev)
4018 {
4019         struct device_domain_info *info, *tmp;
4020         struct intel_iommu *iommu;
4021         unsigned long flags;
4022         int found = 0;
4023
4024         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4025                                 pdev->devfn);
4026         if (!iommu)
4027                 return;
4028
4029         spin_lock_irqsave(&device_domain_lock, flags);
4030         list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4031                 if (info->iommu->segment == pci_domain_nr(pdev->bus) &&
4032                     info->bus == pdev->bus->number &&
4033                     info->devfn == pdev->devfn) {
4034                         unlink_domain_info(info);
4035                         spin_unlock_irqrestore(&device_domain_lock, flags);
4036
4037                         iommu_disable_dev_iotlb(info);
4038                         iommu_detach_dev(iommu, info->bus, info->devfn);
4039                         iommu_detach_dependent_devices(iommu, &pdev->dev);
4040                         free_devinfo_mem(info);
4041
4042                         spin_lock_irqsave(&device_domain_lock, flags);
4043
4044                         if (found)
4045                                 break;
4046                         else
4047                                 continue;
4048                 }
4049
4050                 /* if there are no other devices under the same iommu
4051                  * owned by this domain, clear this iommu in iommu_bmp and
4052                  * update the iommu count and coherency
4053                  */
4054                 if (info->iommu == iommu)
4055                         found = 1;
4056         }
4057
4058         spin_unlock_irqrestore(&device_domain_lock, flags);
4059
4060         if (found == 0) {
4061                 unsigned long tmp_flags;
4062                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
4063                 clear_bit(iommu->seq_id, domain->iommu_bmp);
4064                 domain->iommu_count--;
4065                 domain_update_iommu_cap(domain);
4066                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
4067
4068                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
4069                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
4070                         spin_lock_irqsave(&iommu->lock, tmp_flags);
4071                         clear_bit(domain->id, iommu->domain_ids);
4072                         iommu->domains[domain->id] = NULL;
4073                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
4074                 }
4075         }
4076 }
4077
4078 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4079 {
4080         int adjust_width;
4081
4082         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
4083         domain_reserve_special_ranges(domain);
4084
4085         /* calculate AGAW */
4086         domain->gaw = guest_width;
4087         adjust_width = guestwidth_to_adjustwidth(guest_width);
4088         domain->agaw = width_to_agaw(adjust_width);
4089
4090         domain->iommu_coherency = 0;
4091         domain->iommu_snooping = 0;
4092         domain->iommu_superpage = 0;
4093         domain->max_addr = 0;
4094         domain->nid = -1;
4095
4096         /* always allocate the top pgd */
4097         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4098         if (!domain->pgd)
4099                 return -ENOMEM;
4100         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4101         return 0;
4102 }
4103
4104 static int intel_iommu_domain_init(struct iommu_domain *domain)
4105 {
4106         struct dmar_domain *dmar_domain;
4107
4108         dmar_domain = alloc_domain(true);
4109         if (!dmar_domain) {
4110                 printk(KERN_ERR
4111                         "intel_iommu_domain_init: dmar_domain == NULL\n");
4112                 return -ENOMEM;
4113         }
4114         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4115                 printk(KERN_ERR
4116                         "intel_iommu_domain_init() failed\n");
4117                 domain_exit(dmar_domain);
4118                 return -ENOMEM;
4119         }
4120         domain_update_iommu_cap(dmar_domain);
4121         domain->priv = dmar_domain;
4122
4123         domain->geometry.aperture_start = 0;
4124         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4125         domain->geometry.force_aperture = true;
4126
4127         return 0;
4128 }
4129
4130 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4131 {
4132         struct dmar_domain *dmar_domain = domain->priv;
4133
4134         domain->priv = NULL;
4135         domain_exit(dmar_domain);
4136 }
4137
4138 static int intel_iommu_attach_device(struct iommu_domain *domain,
4139                                      struct device *dev)
4140 {
4141         struct dmar_domain *dmar_domain = domain->priv;
4142         struct pci_dev *pdev = to_pci_dev(dev);
4143         struct intel_iommu *iommu;
4144         int addr_width;
4145
4146         /* normally pdev is not mapped */
4147         if (unlikely(domain_context_mapped(pdev))) {
4148                 struct dmar_domain *old_domain;
4149
4150                 old_domain = find_domain(dev);
4151                 if (old_domain) {
4152                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4153                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4154                                 domain_remove_one_dev_info(old_domain, pdev);
4155                         else
4156                                 domain_remove_dev_info(old_domain);
4157                 }
4158         }
4159
4160         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4161                                 pdev->devfn);
4162         if (!iommu)
4163                 return -ENODEV;
4164
4165         /* check if this iommu agaw is sufficient for max mapped address */
4166         addr_width = agaw_to_width(iommu->agaw);
4167         if (addr_width > cap_mgaw(iommu->cap))
4168                 addr_width = cap_mgaw(iommu->cap);
4169
4170         if (dmar_domain->max_addr > (1LL << addr_width)) {
4171                 printk(KERN_ERR "%s: iommu width (%d) is not "
4172                        "sufficient for the mapped address (%llx)\n",
4173                        __func__, addr_width, dmar_domain->max_addr);
4174                 return -EFAULT;
4175         }
4176         dmar_domain->gaw = addr_width;
4177
4178         /*
4179          * Knock out extra levels of page tables if necessary
4180          */
4181         while (iommu->agaw < dmar_domain->agaw) {
4182                 struct dma_pte *pte;
4183
4184                 pte = dmar_domain->pgd;
4185                 if (dma_pte_present(pte)) {
4186                         dmar_domain->pgd = (struct dma_pte *)
4187                                 phys_to_virt(dma_pte_addr(pte));
4188                         free_pgtable_page(pte);
4189                 }
4190                 dmar_domain->agaw--;
4191         }
4192
4193         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4194 }
4195
4196 static void intel_iommu_detach_device(struct iommu_domain *domain,
4197                                       struct device *dev)
4198 {
4199         struct dmar_domain *dmar_domain = domain->priv;
4200         struct pci_dev *pdev = to_pci_dev(dev);
4201
4202         domain_remove_one_dev_info(dmar_domain, pdev);
4203 }
4204
4205 static int intel_iommu_map(struct iommu_domain *domain,
4206                            unsigned long iova, phys_addr_t hpa,
4207                            size_t size, int iommu_prot)
4208 {
4209         struct dmar_domain *dmar_domain = domain->priv;
4210         u64 max_addr;
4211         int prot = 0;
4212         int ret;
4213
4214         if (iommu_prot & IOMMU_READ)
4215                 prot |= DMA_PTE_READ;
4216         if (iommu_prot & IOMMU_WRITE)
4217                 prot |= DMA_PTE_WRITE;
4218         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4219                 prot |= DMA_PTE_SNP;
4220
4221         max_addr = iova + size;
4222         if (dmar_domain->max_addr < max_addr) {
4223                 u64 end;
4224
4225                 /* check if minimum agaw is sufficient for mapped address */
4226                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4227                 if (end < max_addr) {
4228                         printk(KERN_ERR "%s: iommu width (%d) is not "
4229                                "sufficient for the mapped address (%llx)\n",
4230                                __func__, dmar_domain->gaw, max_addr);
4231                         return -EFAULT;
4232                 }
4233                 dmar_domain->max_addr = max_addr;
4234         }
4235         /* Round up size to next multiple of PAGE_SIZE, if it and
4236            the low bits of hpa would take us onto the next page */
4237         size = aligned_nrpages(hpa, size);
4238         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4239                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4240         return ret;
4241 }
4242
4243 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4244                                 unsigned long iova, size_t size)
4245 {
4246         struct dmar_domain *dmar_domain = domain->priv;
4247         struct page *freelist = NULL;
4248         struct intel_iommu *iommu;
4249         unsigned long start_pfn, last_pfn;
4250         unsigned int npages;
4251         int iommu_id, num, ndomains, level = 0;
4252
4253         /* Cope with horrid API which requires us to unmap more than the
4254            size argument if it happens to be a large-page mapping. */
4255         if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4256                 BUG();
4257
4258         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4259                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4260
4261         start_pfn = iova >> VTD_PAGE_SHIFT;
4262         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4263
4264         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4265
4266         npages = last_pfn - start_pfn + 1;
4267
4268         for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4269                iommu = g_iommus[iommu_id];
4270
4271                /*
4272                 * find bit position of dmar_domain
4273                 */
4274                ndomains = cap_ndoms(iommu->cap);
4275                for_each_set_bit(num, iommu->domain_ids, ndomains) {
4276                        if (iommu->domains[num] == dmar_domain)
4277                                iommu_flush_iotlb_psi(iommu, num, start_pfn,
4278                                                      npages, !freelist, 0);
4279                }
4280
4281         }
4282
4283         dma_free_pagelist(freelist);
4284
4285         if (dmar_domain->max_addr == iova + size)
4286                 dmar_domain->max_addr = iova;
4287
4288         return size;
4289 }
4290
4291 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4292                                             dma_addr_t iova)
4293 {
4294         struct dmar_domain *dmar_domain = domain->priv;
4295         struct dma_pte *pte;
4296         int level = 0;
4297         u64 phys = 0;
4298
4299         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4300         if (pte)
4301                 phys = dma_pte_addr(pte);
4302
4303         return phys;
4304 }
4305
4306 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4307                                       unsigned long cap)
4308 {
4309         struct dmar_domain *dmar_domain = domain->priv;
4310
4311         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4312                 return dmar_domain->iommu_snooping;
4313         if (cap == IOMMU_CAP_INTR_REMAP)
4314                 return irq_remapping_enabled;
4315
4316         return 0;
4317 }
4318
4319 #define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4320
4321 static int intel_iommu_add_device(struct device *dev)
4322 {
4323         struct pci_dev *pdev = to_pci_dev(dev);
4324         struct pci_dev *bridge, *dma_pdev = NULL;
4325         struct iommu_group *group;
4326         int ret;
4327
4328         if (!device_to_iommu(pci_domain_nr(pdev->bus),
4329                              pdev->bus->number, pdev->devfn))
4330                 return -ENODEV;
4331
4332         bridge = pci_find_upstream_pcie_bridge(pdev);
4333         if (bridge) {
4334                 if (pci_is_pcie(bridge))
4335                         dma_pdev = pci_get_domain_bus_and_slot(
4336                                                 pci_domain_nr(pdev->bus),
4337                                                 bridge->subordinate->number, 0);
4338                 if (!dma_pdev)
4339                         dma_pdev = pci_dev_get(bridge);
4340         } else
4341                 dma_pdev = pci_dev_get(pdev);
4342
4343         /* Account for quirked devices */
4344         swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4345
4346         /*
4347          * If it's a multifunction device that does not support our
4348          * required ACS flags, add to the same group as lowest numbered
4349          * function that also does not support the required ACS flags.
4350          */
4351         if (dma_pdev->multifunction &&
4352             !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4353                 u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4354
4355                 for (i = 0; i < 8; i++) {
4356                         struct pci_dev *tmp;
4357
4358                         tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4359                         if (!tmp)
4360                                 continue;
4361
4362                         if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4363                                 swap_pci_ref(&dma_pdev, tmp);
4364                                 break;
4365                         }
4366                         pci_dev_put(tmp);
4367                 }
4368         }
4369
4370         /*
4371          * Devices on the root bus go through the iommu.  If that's not us,
4372          * find the next upstream device and test ACS up to the root bus.
4373          * Finding the next device may require skipping virtual buses.
4374          */
4375         while (!pci_is_root_bus(dma_pdev->bus)) {
4376                 struct pci_bus *bus = dma_pdev->bus;
4377
4378                 while (!bus->self) {
4379                         if (!pci_is_root_bus(bus))
4380                                 bus = bus->parent;
4381                         else
4382                                 goto root_bus;
4383                 }
4384
4385                 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4386                         break;
4387
4388                 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4389         }
4390
4391 root_bus:
4392         group = iommu_group_get(&dma_pdev->dev);
4393         pci_dev_put(dma_pdev);
4394         if (!group) {
4395                 group = iommu_group_alloc();
4396                 if (IS_ERR(group))
4397                         return PTR_ERR(group);
4398         }
4399
4400         ret = iommu_group_add_device(group, dev);
4401
4402         iommu_group_put(group);
4403         return ret;
4404 }
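
/*
 * Illustrative sketch, not part of the driver: the grouping walk above
 * essentially asks "does every bridge between this device and the root
 * bus enforce REQ_ACS_FLAGS?".  A hypothetical helper expressing only
 * that core test might look as follows; the real code must additionally
 * handle DMA source quirks, legacy PCI bridges and virtual buses.
 */
#if 0
static bool example_pdev_is_isolated(struct pci_dev *pdev)
{
        /* Multifunction devices also need ACS on the endpoint itself. */
        if (pdev->multifunction && !pci_acs_enabled(pdev, REQ_ACS_FLAGS))
                return false;

        /* Check every switch and root port from the device to the root. */
        return pci_acs_path_enabled(pdev, NULL, REQ_ACS_FLAGS);
}
#endif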
4405
4406 static void intel_iommu_remove_device(struct device *dev)
4407 {
4408         iommu_group_remove_device(dev);
4409 }
4410
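/*
 * These callbacks are handed to the generic IOMMU layer by
 * intel_iommu_init(), which registers them for every PCI device via
 * bus_set_iommu(&pci_bus_type, &intel_iommu_ops).
 */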
4411 static struct iommu_ops intel_iommu_ops = {
4412         .domain_init    = intel_iommu_domain_init,
4413         .domain_destroy = intel_iommu_domain_destroy,
4414         .attach_dev     = intel_iommu_attach_device,
4415         .detach_dev     = intel_iommu_detach_device,
4416         .map            = intel_iommu_map,
4417         .unmap          = intel_iommu_unmap,
4418         .iova_to_phys   = intel_iommu_iova_to_phys,
4419         .domain_has_cap = intel_iommu_domain_has_cap,
4420         .add_device     = intel_iommu_add_device,
4421         .remove_device  = intel_iommu_remove_device,
4422         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4423 };
4424
4425 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4426 {
4427         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4428         printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4429         dmar_map_gfx = 0;
4430 }
4431
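/*
 * Header fixups run while the PCI core is enumerating devices, well
 * before intel_iommu_init() consults dmar_map_gfx, so marking these
 * G4x/GM45 chipset IDs here reliably keeps their integrated graphics
 * out of DMAR translation.
 */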
4432 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4433 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4434 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4435 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4436 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4437 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4438 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4439
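/*
 * rwbf_quirk is consumed by iommu_flush_write_buffer(): when set, the
 * write buffer is flushed even if the capability register does not
 * advertise RWBF.
 */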
4440 static void quirk_iommu_rwbf(struct pci_dev *dev)
4441 {
4442         /*
4443          * Mobile 4 Series Chipset neglects to set RWBF capability,
4444          * but needs it. Same seems to hold for the desktop versions.
4445          */
4446         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4447         rwbf_quirk = 1;
4448 }
4449
4450 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4451 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4452 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4453 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4454 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4455 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4456 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4457
4458 #define GGC 0x52
4459 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4460 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4461 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4462 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4463 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4464 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4465 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4466 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4467
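/*
 * GGC is the graphics control register in the Calpella/Ironlake host
 * bridge configuration space; per the masks above, bits 11:8 report how
 * much stolen memory the BIOS reserved and whether a VT-d shadow GTT was
 * allocated.  Without the shadow GTT the integrated graphics cannot be
 * translated at all; with it, we fall back to strict (unbatched) IOTLB
 * flushing.
 */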
4468 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4469 {
4470         unsigned short ggc;
4471
4472         if (pci_read_config_word(dev, GGC, &ggc))
4473                 return;
4474
4475         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4476                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4477                 dmar_map_gfx = 0;
4478         } else if (dmar_map_gfx) {
4479                 /* we have to ensure the gfx device is idle before we flush */
4480                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4481                 intel_iommu_strict = 1;
4482         }
4483 }
4484 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4485 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4486 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4487 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4488
4489 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4490    ISOCH DMAR unit for the Azalia sound device, but not give it any
4491    TLB entries, which causes it to deadlock. Check for that.  We do
4492    this in a function called from init_dmars(), instead of in a PCI
4493    quirk, because we don't want to print the obnoxious "BIOS broken"
4494    message if VT-d is actually disabled.
4495 */
4496 static void __init check_tylersburg_isoch(void)
4497 {
4498         struct pci_dev *pdev;
4499         uint32_t vtisochctrl;
4500
4501         /* If there's no Azalia in the system anyway, forget it. */
4502         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4503         if (!pdev)
4504                 return;
4505         pci_dev_put(pdev);
4506
4507         /* System Management Registers. Might be hidden, in which case
4508            we can't do the sanity check. But that's OK, because the
4509            known-broken BIOSes _don't_ actually hide it, so far. */
4510         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4511         if (!pdev)
4512                 return;
4513
4514         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4515                 pci_dev_put(pdev);
4516                 return;
4517         }
4518
4519         pci_dev_put(pdev);
4520
4521         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4522         if (vtisochctrl & 1)
4523                 return;
4524
4525         /* Drop all bits other than the number of TLB entries */
4526         vtisochctrl &= 0x1c;
4527
4528         /* If we have the recommended number of TLB entries (16), fine. */
4529         if (vtisochctrl == 0x10)
4530                 return;
4531
4532         /* Zero TLB entries? You get to ride the short bus to school. */
4533         if (!vtisochctrl) {
4534                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4535                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4536                      dmi_get_system_info(DMI_BIOS_VENDOR),
4537                      dmi_get_system_info(DMI_BIOS_VERSION),
4538                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4539                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4540                 return;
4541         }
4542
4543         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4544                vtisochctrl);
4545 }
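
/*
 * When the zero-TLB case above is detected, IDENTMAP_AZALIA causes
 * iommu_should_identity_map() to place the Azalia audio device (see
 * IS_AZALIA()) into the static identity (1:1) domain instead of a
 * per-device DMA domain, working around the deadlock described above.
 */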