drivers/iommu/intel-iommu.c (linux-2.6-microblaze.git, commit 40dbafd376fb3cfa4b692a9a71b4be3e38b08c55)
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  */
19
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <asm/irq_remapping.h>
43 #include <asm/cacheflush.h>
44 #include <asm/iommu.h>
45
46 #include "irq_remapping.h"
47 #include "pci.h"
48
49 #define ROOT_SIZE               VTD_PAGE_SIZE
50 #define CONTEXT_SIZE            VTD_PAGE_SIZE
51
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
64
65 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
66 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
67
68 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
69    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
70 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
71                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
72 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
73
74 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
75 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
76 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
77
78 /* page table handling */
79 #define LEVEL_STRIDE            (9)
80 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
81
82 /*
83  * This bitmap is used to advertise the page sizes our hardware supports
84  * to the IOMMU core, which will then use this information to split
85  * physically contiguous memory regions it is mapping into page sizes
86  * that we support.
87  *
88  * Traditionally the IOMMU core just handed us the mappings directly,
89  * after making sure the size is a power-of-two multiple of 4KiB and that the
90  * mapping has natural alignment.
91  *
92  * To retain this behavior, we currently advertise that we support
93  * all page sizes that are a power-of-two multiple of 4KiB.
94  *
95  * If at some point we'd like to utilize the IOMMU core's new behavior,
96  * we could change this to advertise the real page sizes we support.
97  */
98 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
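/*
 * A quick reading of the bitmap above: bit n set means a page size of
 * 2^n bytes is advertised, so ~0xFFFUL (bits 12 and up) advertises every
 * power-of-two size from 4KiB (1 << 12) upward, e.g. 8KiB, 2MiB, 1GiB, ...
 */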
99
100 static inline int agaw_to_level(int agaw)
101 {
102         return agaw + 2;
103 }
104
105 static inline int agaw_to_width(int agaw)
106 {
107         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
108 }
109
110 static inline int width_to_agaw(int width)
111 {
112         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
113 }
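/*
 * Worked example of the three helpers above, using the default 48-bit
 * domain width: width_to_agaw(48) = DIV_ROUND_UP(18, 9) = 2,
 * agaw_to_width(2) = 30 + 2 * 9 = 48, and agaw_to_level(2) = 4, i.e. a
 * 48-bit domain uses a 4-level page table. Each extra agaw step adds one
 * level and 9 bits of address width.
 */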
114
115 static inline unsigned int level_to_offset_bits(int level)
116 {
117         return (level - 1) * LEVEL_STRIDE;
118 }
119
120 static inline int pfn_level_offset(unsigned long pfn, int level)
121 {
122         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
123 }
124
125 static inline unsigned long level_mask(int level)
126 {
127         return -1UL << level_to_offset_bits(level);
128 }
129
130 static inline unsigned long level_size(int level)
131 {
132         return 1UL << level_to_offset_bits(level);
133 }
134
135 static inline unsigned long align_to_level(unsigned long pfn, int level)
136 {
137         return (pfn + level_size(level) - 1) & level_mask(level);
138 }
139
140 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
141 {
142         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
143 }
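/*
 * For example, lvl_to_nr_pages(1) is 1 page (4KiB), lvl_to_nr_pages(2)
 * is 1 << 9 = 512 pages (a 2MiB region) and lvl_to_nr_pages(3) is
 * 1 << 18 pages (a 1GiB region): a pte at walk level N covers
 * level_size(N) VT-d pages.
 */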
144
145 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
146    are never going to work. */
147 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
148 {
149         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
150 }
151
152 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
153 {
154         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
155 }
156 static inline unsigned long page_to_dma_pfn(struct page *pg)
157 {
158         return mm_to_dma_pfn(page_to_pfn(pg));
159 }
160 static inline unsigned long virt_to_dma_pfn(void *p)
161 {
162         return page_to_dma_pfn(virt_to_page(p));
163 }
164
165 /* global iommu list, set NULL for ignored DMAR units */
166 static struct intel_iommu **g_iommus;
167
168 static void __init check_tylersburg_isoch(void);
169 static int rwbf_quirk;
170
171 /*
172  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
173  * (used when the kernel is launched with TXT)
174  */
175 static int force_on = 0;
176
177 /*
178  * 0: Present
179  * 1-11: Reserved
180  * 12-63: Context Ptr (12 - (haw-1))
181  * 64-127: Reserved
182  */
183 struct root_entry {
184         u64     val;
185         u64     rsvd1;
186 };
187 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 static inline bool root_present(struct root_entry *root)
189 {
190         return (root->val & 1);
191 }
192 static inline void set_root_present(struct root_entry *root)
193 {
194         root->val |= 1;
195 }
196 static inline void set_root_value(struct root_entry *root, unsigned long value)
197 {
198         root->val |= value & VTD_PAGE_MASK;
199 }
200
201 static inline struct context_entry *
202 get_context_addr_from_root(struct root_entry *root)
203 {
204         return (struct context_entry *)
205                 (root_present(root) ?
206                  phys_to_virt(root->val & VTD_PAGE_MASK) :
207                  NULL);
208 }
209
210 /*
211  * low 64 bits:
212  * 0: present
213  * 1: fault processing disable
214  * 2-3: translation type
215  * 12-63: address space root
216  * high 64 bits:
217  * 0-2: address width
218  * 3-6: avail
219  * 8-23: domain id
220  */
221 struct context_entry {
222         u64 lo;
223         u64 hi;
224 };
225
226 static inline bool context_present(struct context_entry *context)
227 {
228         return (context->lo & 1);
229 }
230 static inline void context_set_present(struct context_entry *context)
231 {
232         context->lo |= 1;
233 }
234
235 static inline void context_set_fault_enable(struct context_entry *context)
236 {
237         context->lo &= (((u64)-1) << 2) | 1;
238 }
239
240 static inline void context_set_translation_type(struct context_entry *context,
241                                                 unsigned long value)
242 {
243         context->lo &= (((u64)-1) << 4) | 3;
244         context->lo |= (value & 3) << 2;
245 }
246
247 static inline void context_set_address_root(struct context_entry *context,
248                                             unsigned long value)
249 {
250         context->lo |= value & VTD_PAGE_MASK;
251 }
252
253 static inline void context_set_address_width(struct context_entry *context,
254                                              unsigned long value)
255 {
256         context->hi |= value & 7;
257 }
258
259 static inline void context_set_domain_id(struct context_entry *context,
260                                          unsigned long value)
261 {
262         context->hi |= (value & ((1 << 16) - 1)) << 8;
263 }
264
265 static inline void context_clear_entry(struct context_entry *context)
266 {
267         context->lo = 0;
268         context->hi = 0;
269 }
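/*
 * Taken together, the setters above pack an entry as
 *   lo = (address space root & VTD_PAGE_MASK) | (translation type << 2) | 1
 *   hi = (domain id << 8) | (address width & 7)
 * matching the bit layout documented above struct context_entry.
 */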
270
271 /*
272  * 0: readable
273  * 1: writable
274  * 2-6: reserved
275  * 7: super page
276  * 8-10: available
277  * 11: snoop behavior
278  * 12-63: Host physical address
279  */
280 struct dma_pte {
281         u64 val;
282 };
283
284 static inline void dma_clear_pte(struct dma_pte *pte)
285 {
286         pte->val = 0;
287 }
288
289 static inline u64 dma_pte_addr(struct dma_pte *pte)
290 {
291 #ifdef CONFIG_64BIT
292         return pte->val & VTD_PAGE_MASK;
293 #else
294         /* Must have a full atomic 64-bit read */
295         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
296 #endif
297 }
298
299 static inline bool dma_pte_present(struct dma_pte *pte)
300 {
301         return (pte->val & 3) != 0;
302 }
303
304 static inline bool dma_pte_superpage(struct dma_pte *pte)
305 {
306         return (pte->val & (1 << 7));
307 }
308
309 static inline int first_pte_in_page(struct dma_pte *pte)
310 {
311         return !((unsigned long)pte & ~VTD_PAGE_MASK);
312 }
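/*
 * A page-table page holds VTD_PAGE_SIZE / sizeof(struct dma_pte) = 512
 * entries, so first_pte_in_page() simply checks whether the pointer is
 * aligned to the start of its page-table page; the walk routines below
 * use it to notice when they have run off the end of the current table.
 */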
313
314 /*
315  * This domain is a static identity mapping domain.
316  *      1. This domain creates a static 1:1 mapping to all usable memory.
317  *      2. It maps to each iommu if successful.
318  *      3. Each iommu maps to this domain if successful.
319  */
320 static struct dmar_domain *si_domain;
321 static int hw_pass_through = 1;
322
323 /* devices under the same p2p bridge are owned in one domain */
324 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
325
326 /* domain represents a virtual machine; more than one device
327  * across iommus may be owned in one domain, e.g. a kvm guest.
328  */
329 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
330
331 /* si_domain contains multiple devices */
332 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
333
334 /* define the limit of IOMMUs supported in each domain */
335 #ifdef  CONFIG_X86
336 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
337 #else
338 # define        IOMMU_UNITS_SUPPORTED   64
339 #endif
340
341 struct dmar_domain {
342         int     id;                     /* domain id */
343         int     nid;                    /* node id */
344         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
345                                         /* bitmap of iommus this domain uses*/
346
347         struct list_head devices;       /* all devices' list */
348         struct iova_domain iovad;       /* iova's that belong to this domain */
349
350         struct dma_pte  *pgd;           /* virtual address */
351         int             gaw;            /* max guest address width */
352
353         /* adjusted guest address width, 0 is level 2 30-bit */
354         int             agaw;
355
356         int             flags;          /* flags to find out type of domain */
357
358         int             iommu_coherency;/* indicate coherency of iommu access */
359         int             iommu_snooping; /* indicate snooping control feature*/
360         int             iommu_count;    /* reference count of iommu */
361         int             iommu_superpage;/* Level of superpages supported:
362                                            0 == 4KiB (no superpages), 1 == 2MiB,
363                                            2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
364         spinlock_t      iommu_lock;     /* protect iommu set in domain */
365         u64             max_addr;       /* maximum mapped address */
366 };
367
368 /* PCI domain-device relationship */
369 struct device_domain_info {
370         struct list_head link;  /* link to domain siblings */
371         struct list_head global; /* link to global list */
372         int segment;            /* PCI domain */
373         u8 bus;                 /* PCI bus number */
374         u8 devfn;               /* PCI devfn number */
375         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
376         struct intel_iommu *iommu; /* IOMMU used by this device */
377         struct dmar_domain *domain; /* pointer to domain */
378 };
379
380 struct dmar_rmrr_unit {
381         struct list_head list;          /* list of rmrr units   */
382         struct acpi_dmar_header *hdr;   /* ACPI header          */
383         u64     base_address;           /* reserved base address*/
384         u64     end_address;            /* reserved end address */
385         struct dmar_dev_scope *devices; /* target devices */
386         int     devices_cnt;            /* target device count */
387 };
388
389 struct dmar_atsr_unit {
390         struct list_head list;          /* list of ATSR units */
391         struct acpi_dmar_header *hdr;   /* ACPI header */
392         struct dmar_dev_scope *devices; /* target devices */
393         int devices_cnt;                /* target device count */
394         u8 include_all:1;               /* include all ports */
395 };
396
397 static LIST_HEAD(dmar_atsr_units);
398 static LIST_HEAD(dmar_rmrr_units);
399
400 #define for_each_rmrr_units(rmrr) \
401         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
402
403 static void flush_unmaps_timeout(unsigned long data);
404
405 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
406
407 #define HIGH_WATER_MARK 250
408 struct deferred_flush_tables {
409         int next;
410         struct iova *iova[HIGH_WATER_MARK];
411         struct dmar_domain *domain[HIGH_WATER_MARK];
412         struct page *freelist[HIGH_WATER_MARK];
413 };
414
415 static struct deferred_flush_tables *deferred_flush;
416
417 /* number of iommus; bounds the index into g_iommus and per-domain iommu bitmaps */
418 static int g_num_of_iommus;
419
420 static DEFINE_SPINLOCK(async_umap_flush_lock);
421 static LIST_HEAD(unmaps_to_do);
422
423 static int timer_on;
424 static long list_size;
425
426 static void domain_exit(struct dmar_domain *domain);
427 static void domain_remove_dev_info(struct dmar_domain *domain);
428 static void domain_remove_one_dev_info(struct dmar_domain *domain,
429                                        struct pci_dev *pdev);
430 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
431                                            struct pci_dev *pdev);
432
433 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
434 int dmar_disabled = 0;
435 #else
436 int dmar_disabled = 1;
437 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
438
439 int intel_iommu_enabled = 0;
440 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
441
442 static int dmar_map_gfx = 1;
443 static int dmar_forcedac;
444 static int intel_iommu_strict;
445 static int intel_iommu_superpage = 1;
446
447 int intel_iommu_gfx_mapped;
448 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
449
450 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
451 static DEFINE_SPINLOCK(device_domain_lock);
452 static LIST_HEAD(device_domain_list);
453
454 static struct iommu_ops intel_iommu_ops;
455
456 static int __init intel_iommu_setup(char *str)
457 {
458         if (!str)
459                 return -EINVAL;
460         while (*str) {
461                 if (!strncmp(str, "on", 2)) {
462                         dmar_disabled = 0;
463                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
464                 } else if (!strncmp(str, "off", 3)) {
465                         dmar_disabled = 1;
466                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
467                 } else if (!strncmp(str, "igfx_off", 8)) {
468                         dmar_map_gfx = 0;
469                         printk(KERN_INFO
470                                 "Intel-IOMMU: disable GFX device mapping\n");
471                 } else if (!strncmp(str, "forcedac", 8)) {
472                         printk(KERN_INFO
473                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
474                         dmar_forcedac = 1;
475                 } else if (!strncmp(str, "strict", 6)) {
476                         printk(KERN_INFO
477                                 "Intel-IOMMU: disable batched IOTLB flush\n");
478                         intel_iommu_strict = 1;
479                 } else if (!strncmp(str, "sp_off", 6)) {
480                         printk(KERN_INFO
481                                 "Intel-IOMMU: disable supported super page\n");
482                         intel_iommu_superpage = 0;
483                 }
484
485                 str += strcspn(str, ",");
486                 while (*str == ',')
487                         str++;
488         }
489         return 0;
490 }
491 __setup("intel_iommu=", intel_iommu_setup);
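/*
 * Example kernel command line use of the options parsed above
 * (comma-separated, order does not matter):
 *
 *   intel_iommu=on,strict,sp_off
 *
 * which enables the IOMMU, disables batched IOTLB flushing and disables
 * superpage use.
 */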
492
493 static struct kmem_cache *iommu_domain_cache;
494 static struct kmem_cache *iommu_devinfo_cache;
495 static struct kmem_cache *iommu_iova_cache;
496
497 static inline void *alloc_pgtable_page(int node)
498 {
499         struct page *page;
500         void *vaddr = NULL;
501
502         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
503         if (page)
504                 vaddr = page_address(page);
505         return vaddr;
506 }
507
508 static inline void free_pgtable_page(void *vaddr)
509 {
510         free_page((unsigned long)vaddr);
511 }
512
513 static inline void *alloc_domain_mem(void)
514 {
515         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
516 }
517
518 static void free_domain_mem(void *vaddr)
519 {
520         kmem_cache_free(iommu_domain_cache, vaddr);
521 }
522
523 static inline void * alloc_devinfo_mem(void)
524 {
525         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
526 }
527
528 static inline void free_devinfo_mem(void *vaddr)
529 {
530         kmem_cache_free(iommu_devinfo_cache, vaddr);
531 }
532
533 struct iova *alloc_iova_mem(void)
534 {
535         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
536 }
537
538 void free_iova_mem(struct iova *iova)
539 {
540         kmem_cache_free(iommu_iova_cache, iova);
541 }
542
543
544 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
545 {
546         unsigned long sagaw;
547         int agaw = -1;
548
549         sagaw = cap_sagaw(iommu->cap);
550         for (agaw = width_to_agaw(max_gaw);
551              agaw >= 0; agaw--) {
552                 if (test_bit(agaw, &sagaw))
553                         break;
554         }
555
556         return agaw;
557 }
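/*
 * Example: with max_gaw == 48 the loop starts at width_to_agaw(48) == 2
 * and walks down, so an iommu whose SAGAW capability has bit 2 set gets
 * agaw 2 (48-bit, 4-level tables); one that only sets bit 1 falls back to
 * agaw 1 (39-bit, 3-level tables); -1 means nothing suitable is supported.
 */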
558
559 /*
560  * Calculate max SAGAW for each iommu.
561  */
562 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
563 {
564         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
565 }
566
567 /*
568  * calculate agaw for each iommu.
569  * "SAGAW" may be different across iommus, use a default agaw, and
570  * get a supported less agaw for iommus that don't support the default agaw.
571  */
572 int iommu_calculate_agaw(struct intel_iommu *iommu)
573 {
574         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
575 }
576
577 /* This function only returns a single iommu in a domain */
578 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
579 {
580         int iommu_id;
581
582         /* si_domain and vm domain should not get here. */
583         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
584         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
585
586         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
587         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
588                 return NULL;
589
590         return g_iommus[iommu_id];
591 }
592
593 static void domain_update_iommu_coherency(struct dmar_domain *domain)
594 {
595         struct dmar_drhd_unit *drhd;
596         struct intel_iommu *iommu;
597         int i, found = 0;
598
599         domain->iommu_coherency = 1;
600
601         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
602                 found = 1;
603                 if (!ecap_coherent(g_iommus[i]->ecap)) {
604                         domain->iommu_coherency = 0;
605                         break;
606                 }
607         }
608         if (found)
609                 return;
610
611         /* No hardware attached; use lowest common denominator */
612         rcu_read_lock();
613         for_each_active_iommu(iommu, drhd) {
614                 if (!ecap_coherent(iommu->ecap)) {
615                         domain->iommu_coherency = 0;
616                         break;
617                 }
618         }
619         rcu_read_unlock();
620 }
621
622 static void domain_update_iommu_snooping(struct dmar_domain *domain)
623 {
624         int i;
625
626         domain->iommu_snooping = 1;
627
628         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
629                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
630                         domain->iommu_snooping = 0;
631                         break;
632                 }
633         }
634 }
635
636 static void domain_update_iommu_superpage(struct dmar_domain *domain)
637 {
638         struct dmar_drhd_unit *drhd;
639         struct intel_iommu *iommu = NULL;
640         int mask = 0xf;
641
642         if (!intel_iommu_superpage) {
643                 domain->iommu_superpage = 0;
644                 return;
645         }
646
647         /* set iommu_superpage to the smallest common denominator */
648         rcu_read_lock();
649         for_each_active_iommu(iommu, drhd) {
650                 mask &= cap_super_page_val(iommu->cap);
651                 if (!mask) {
652                         break;
653                 }
654         }
655         rcu_read_unlock();
656
657         domain->iommu_superpage = fls(mask);
658 }
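/*
 * Example: if the capability bits common to all active iommus are 0x3,
 * fls(0x3) == 2 selects superpage level 2 (1GiB, per the iommu_superpage
 * comment in struct dmar_domain above); if any iommu supports no
 * superpages the mask drops to 0 and fls(0) == 0 leaves the domain at
 * 4KiB pages only.
 */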
659
660 /* Some capabilities may be different across iommus */
661 static void domain_update_iommu_cap(struct dmar_domain *domain)
662 {
663         domain_update_iommu_coherency(domain);
664         domain_update_iommu_snooping(domain);
665         domain_update_iommu_superpage(domain);
666 }
667
668 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
669 {
670         struct dmar_drhd_unit *drhd = NULL;
671         struct intel_iommu *iommu;
672         struct device *dev;
673         struct pci_dev *pdev;
674         int i;
675
676         rcu_read_lock();
677         for_each_active_iommu(iommu, drhd) {
678                 if (segment != drhd->segment)
679                         continue;
680
681                 for_each_active_dev_scope(drhd->devices,
682                                           drhd->devices_cnt, i, dev) {
683                         if (!dev_is_pci(dev))
684                                 continue;
685                         pdev = to_pci_dev(dev);
686                         if (pdev->bus->number == bus && pdev->devfn == devfn)
687                                 goto out;
688                         if (pdev->subordinate &&
689                             pdev->subordinate->number <= bus &&
690                             pdev->subordinate->busn_res.end >= bus)
691                                 goto out;
692                 }
693
694                 if (drhd->include_all)
695                         goto out;
696         }
697         iommu = NULL;
698 out:
699         rcu_read_unlock();
700
701         return iommu;
702 }
703
704 static void domain_flush_cache(struct dmar_domain *domain,
705                                void *addr, int size)
706 {
707         if (!domain->iommu_coherency)
708                 clflush_cache_range(addr, size);
709 }
710
711 /* Gets context entry for a given bus and devfn */
712 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
713                 u8 bus, u8 devfn)
714 {
715         struct root_entry *root;
716         struct context_entry *context;
717         unsigned long phy_addr;
718         unsigned long flags;
719
720         spin_lock_irqsave(&iommu->lock, flags);
721         root = &iommu->root_entry[bus];
722         context = get_context_addr_from_root(root);
723         if (!context) {
724                 context = (struct context_entry *)
725                                 alloc_pgtable_page(iommu->node);
726                 if (!context) {
727                         spin_unlock_irqrestore(&iommu->lock, flags);
728                         return NULL;
729                 }
730                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
731                 phy_addr = virt_to_phys((void *)context);
732                 set_root_value(root, phy_addr);
733                 set_root_present(root);
734                 __iommu_flush_cache(iommu, root, sizeof(*root));
735         }
736         spin_unlock_irqrestore(&iommu->lock, flags);
737         return &context[devfn];
738 }
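/*
 * Root/context table layout as used above: the 256-entry root table is
 * indexed by bus number, each present root entry points to a 4KiB context
 * table, and that table is indexed by devfn, so every (bus, devfn) pair
 * maps to exactly one context entry. Missing context tables are allocated
 * lazily here under iommu->lock.
 */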
739
740 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
741 {
742         struct root_entry *root;
743         struct context_entry *context;
744         int ret;
745         unsigned long flags;
746
747         spin_lock_irqsave(&iommu->lock, flags);
748         root = &iommu->root_entry[bus];
749         context = get_context_addr_from_root(root);
750         if (!context) {
751                 ret = 0;
752                 goto out;
753         }
754         ret = context_present(&context[devfn]);
755 out:
756         spin_unlock_irqrestore(&iommu->lock, flags);
757         return ret;
758 }
759
760 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
761 {
762         struct root_entry *root;
763         struct context_entry *context;
764         unsigned long flags;
765
766         spin_lock_irqsave(&iommu->lock, flags);
767         root = &iommu->root_entry[bus];
768         context = get_context_addr_from_root(root);
769         if (context) {
770                 context_clear_entry(&context[devfn]);
771                 __iommu_flush_cache(iommu, &context[devfn], \
772                         sizeof(*context));
773         }
774         spin_unlock_irqrestore(&iommu->lock, flags);
775 }
776
777 static void free_context_table(struct intel_iommu *iommu)
778 {
779         struct root_entry *root;
780         int i;
781         unsigned long flags;
782         struct context_entry *context;
783
784         spin_lock_irqsave(&iommu->lock, flags);
785         if (!iommu->root_entry) {
786                 goto out;
787         }
788         for (i = 0; i < ROOT_ENTRY_NR; i++) {
789                 root = &iommu->root_entry[i];
790                 context = get_context_addr_from_root(root);
791                 if (context)
792                         free_pgtable_page(context);
793         }
794         free_pgtable_page(iommu->root_entry);
795         iommu->root_entry = NULL;
796 out:
797         spin_unlock_irqrestore(&iommu->lock, flags);
798 }
799
800 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
801                                       unsigned long pfn, int *target_level)
802 {
803         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
804         struct dma_pte *parent, *pte = NULL;
805         int level = agaw_to_level(domain->agaw);
806         int offset;
807
808         BUG_ON(!domain->pgd);
809
810         if (addr_width < BITS_PER_LONG && pfn >> addr_width)
811                 /* Address beyond IOMMU's addressing capabilities. */
812                 return NULL;
813
814         parent = domain->pgd;
815
816         while (1) {
817                 void *tmp_page;
818
819                 offset = pfn_level_offset(pfn, level);
820                 pte = &parent[offset];
821                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
822                         break;
823                 if (level == *target_level)
824                         break;
825
826                 if (!dma_pte_present(pte)) {
827                         uint64_t pteval;
828
829                         tmp_page = alloc_pgtable_page(domain->nid);
830
831                         if (!tmp_page)
832                                 return NULL;
833
834                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
835                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
836                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
837                                 /* Someone else set it while we were thinking; use theirs. */
838                                 free_pgtable_page(tmp_page);
839                         } else {
840                                 dma_pte_addr(pte);
841                                 domain_flush_cache(domain, pte, sizeof(*pte));
842                         }
843                 }
844                 if (level == 1)
845                         break;
846
847                 parent = phys_to_virt(dma_pte_addr(pte));
848                 level--;
849         }
850
851         if (!*target_level)
852                 *target_level = level;
853
854         return pte;
855 }
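/*
 * Index selection in the walk above, for a 4-level (agaw 2) domain:
 * level 4 uses pfn bits 27-35, level 3 bits 18-26, level 2 bits 9-17 and
 * level 1 bits 0-8 (see pfn_level_offset()). With *target_level == 1 the
 * walk allocates any missing intermediate tables and returns the leaf pte;
 * with *target_level == 0 it stops at the first superpage or non-present
 * entry and reports the level it stopped at.
 */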
856
857
858 /* return address's pte at specific level */
859 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
860                                          unsigned long pfn,
861                                          int level, int *large_page)
862 {
863         struct dma_pte *parent, *pte = NULL;
864         int total = agaw_to_level(domain->agaw);
865         int offset;
866
867         parent = domain->pgd;
868         while (level <= total) {
869                 offset = pfn_level_offset(pfn, total);
870                 pte = &parent[offset];
871                 if (level == total)
872                         return pte;
873
874                 if (!dma_pte_present(pte)) {
875                         *large_page = total;
876                         break;
877                 }
878
879                 if (pte->val & DMA_PTE_LARGE_PAGE) {
880                         *large_page = total;
881                         return pte;
882                 }
883
884                 parent = phys_to_virt(dma_pte_addr(pte));
885                 total--;
886         }
887         return NULL;
888 }
889
890 /* clear last level pte; a tlb flush should follow */
891 static void dma_pte_clear_range(struct dmar_domain *domain,
892                                 unsigned long start_pfn,
893                                 unsigned long last_pfn)
894 {
895         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
896         unsigned int large_page = 1;
897         struct dma_pte *first_pte, *pte;
898
899         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
900         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
901         BUG_ON(start_pfn > last_pfn);
902
903         /* we don't need lock here; nobody else touches the iova range */
904         do {
905                 large_page = 1;
906                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
907                 if (!pte) {
908                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
909                         continue;
910                 }
911                 do {
912                         dma_clear_pte(pte);
913                         start_pfn += lvl_to_nr_pages(large_page);
914                         pte++;
915                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
916
917                 domain_flush_cache(domain, first_pte,
918                                    (void *)pte - (void *)first_pte);
919
920         } while (start_pfn && start_pfn <= last_pfn);
921 }
922
923 static void dma_pte_free_level(struct dmar_domain *domain, int level,
924                                struct dma_pte *pte, unsigned long pfn,
925                                unsigned long start_pfn, unsigned long last_pfn)
926 {
927         pfn = max(start_pfn, pfn);
928         pte = &pte[pfn_level_offset(pfn, level)];
929
930         do {
931                 unsigned long level_pfn;
932                 struct dma_pte *level_pte;
933
934                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
935                         goto next;
936
937                 level_pfn = pfn & level_mask(level - 1);
938                 level_pte = phys_to_virt(dma_pte_addr(pte));
939
940                 if (level > 2)
941                         dma_pte_free_level(domain, level - 1, level_pte,
942                                            level_pfn, start_pfn, last_pfn);
943
944                 /* If range covers entire pagetable, free it */
945                 if (!(start_pfn > level_pfn ||
946                       last_pfn < level_pfn + level_size(level) - 1)) {
947                         dma_clear_pte(pte);
948                         domain_flush_cache(domain, pte, sizeof(*pte));
949                         free_pgtable_page(level_pte);
950                 }
951 next:
952                 pfn += level_size(level);
953         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
954 }
955
956 /* free page table pages. last level pte should already be cleared */
957 static void dma_pte_free_pagetable(struct dmar_domain *domain,
958                                    unsigned long start_pfn,
959                                    unsigned long last_pfn)
960 {
961         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
962
963         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
964         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
965         BUG_ON(start_pfn > last_pfn);
966
967         /* We don't need lock here; nobody else touches the iova range */
968         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
969                            domain->pgd, 0, start_pfn, last_pfn);
970
971         /* free pgd */
972         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
973                 free_pgtable_page(domain->pgd);
974                 domain->pgd = NULL;
975         }
976 }
977
978 /* When a page at a given level is being unlinked from its parent, we don't
979    need to *modify* it at all. All we need to do is make a list of all the
980    pages which can be freed just as soon as we've flushed the IOTLB and we
981    know the hardware page-walk will no longer touch them.
982    The 'pte' argument is the *parent* PTE, pointing to the page that is to
983    be freed. */
984 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
985                                             int level, struct dma_pte *pte,
986                                             struct page *freelist)
987 {
988         struct page *pg;
989
990         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
991         pg->freelist = freelist;
992         freelist = pg;
993
994         if (level == 1)
995                 return freelist;
996
997         for (pte = page_address(pg); !first_pte_in_page(pte); pte++) {
998                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
999                         freelist = dma_pte_list_pagetables(domain, level - 1,
1000                                                            pte, freelist);
1001         }
1002
1003         return freelist;
1004 }
1005
1006 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1007                                         struct dma_pte *pte, unsigned long pfn,
1008                                         unsigned long start_pfn,
1009                                         unsigned long last_pfn,
1010                                         struct page *freelist)
1011 {
1012         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1013
1014         pfn = max(start_pfn, pfn);
1015         pte = &pte[pfn_level_offset(pfn, level)];
1016
1017         do {
1018                 unsigned long level_pfn;
1019
1020                 if (!dma_pte_present(pte))
1021                         goto next;
1022
1023                 level_pfn = pfn & level_mask(level);
1024
1025                 /* If range covers entire pagetable, free it */
1026                 if (start_pfn <= level_pfn &&
1027                     last_pfn >= level_pfn + level_size(level) - 1) {
1028                         /* These subordinate page tables are going away entirely. Don't
1029                            bother to clear them; we're just going to *free* them. */
1030                         if (level > 1 && !dma_pte_superpage(pte))
1031                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1032
1033                         dma_clear_pte(pte);
1034                         if (!first_pte)
1035                                 first_pte = pte;
1036                         last_pte = pte;
1037                 } else if (level > 1) {
1038                         /* Recurse down into a level that isn't *entirely* obsolete */
1039                         freelist = dma_pte_clear_level(domain, level - 1,
1040                                                        phys_to_virt(dma_pte_addr(pte)),
1041                                                        level_pfn, start_pfn, last_pfn,
1042                                                        freelist);
1043                 }
1044 next:
1045                 pfn += level_size(level);
1046         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1047
1048         if (first_pte)
1049                 domain_flush_cache(domain, first_pte,
1050                                    (void *)++last_pte - (void *)first_pte);
1051
1052         return freelist;
1053 }
1054
1055 /* We can't just free the pages because the IOMMU may still be walking
1056    the page tables, and may have cached the intermediate levels. The
1057    pages can only be freed after the IOTLB flush has been done. */
1058 struct page *domain_unmap(struct dmar_domain *domain,
1059                           unsigned long start_pfn,
1060                           unsigned long last_pfn)
1061 {
1062         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1063         struct page *freelist = NULL;
1064
1065         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
1066         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
1067         BUG_ON(start_pfn > last_pfn);
1068
1069         /* we don't need lock here; nobody else touches the iova range */
1070         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1071                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1072
1073         /* free pgd */
1074         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1075                 struct page *pgd_page = virt_to_page(domain->pgd);
1076                 pgd_page->freelist = freelist;
1077                 freelist = pgd_page;
1078
1079                 domain->pgd = NULL;
1080         }
1081
1082         return freelist;
1083 }
1084
1085 void dma_free_pagelist(struct page *freelist)
1086 {
1087         struct page *pg;
1088
1089         while ((pg = freelist)) {
1090                 freelist = pg->freelist;
1091                 free_pgtable_page(page_address(pg));
1092         }
1093 }
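/*
 * Typical use of the two helpers above: call domain_unmap() to detach the
 * page-table pages and collect them on a freelist (chained through
 * page->freelist), flush the IOTLB so the hardware can no longer walk
 * them, and only then hand the list to dma_free_pagelist().
 */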
1094
1095 /* iommu handling */
1096 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1097 {
1098         struct root_entry *root;
1099         unsigned long flags;
1100
1101         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1102         if (!root)
1103                 return -ENOMEM;
1104
1105         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1106
1107         spin_lock_irqsave(&iommu->lock, flags);
1108         iommu->root_entry = root;
1109         spin_unlock_irqrestore(&iommu->lock, flags);
1110
1111         return 0;
1112 }
1113
1114 static void iommu_set_root_entry(struct intel_iommu *iommu)
1115 {
1116         void *addr;
1117         u32 sts;
1118         unsigned long flag;
1119
1120         addr = iommu->root_entry;
1121
1122         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1123         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1124
1125         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1126
1127         /* Make sure hardware completes it */
1128         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1129                       readl, (sts & DMA_GSTS_RTPS), sts);
1130
1131         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1132 }
1133
1134 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1135 {
1136         u32 val;
1137         unsigned long flag;
1138
1139         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1140                 return;
1141
1142         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1143         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1144
1145         /* Make sure hardware completes it */
1146         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1147                       readl, (!(val & DMA_GSTS_WBFS)), val);
1148
1149         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1150 }
1151
1152 /* return value determines whether we need a write buffer flush */
1153 static void __iommu_flush_context(struct intel_iommu *iommu,
1154                                   u16 did, u16 source_id, u8 function_mask,
1155                                   u64 type)
1156 {
1157         u64 val = 0;
1158         unsigned long flag;
1159
1160         switch (type) {
1161         case DMA_CCMD_GLOBAL_INVL:
1162                 val = DMA_CCMD_GLOBAL_INVL;
1163                 break;
1164         case DMA_CCMD_DOMAIN_INVL:
1165                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1166                 break;
1167         case DMA_CCMD_DEVICE_INVL:
1168                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1169                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1170                 break;
1171         default:
1172                 BUG();
1173         }
1174         val |= DMA_CCMD_ICC;
1175
1176         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1177         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1178
1179         /* Make sure hardware completes it */
1180         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1181                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1182
1183         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1184 }
1185
1186 /* return value determines whether we need a write buffer flush */
1187 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1188                                 u64 addr, unsigned int size_order, u64 type)
1189 {
1190         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1191         u64 val = 0, val_iva = 0;
1192         unsigned long flag;
1193
1194         switch (type) {
1195         case DMA_TLB_GLOBAL_FLUSH:
1196                 /* global flush doesn't need to set IVA_REG */
1197                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1198                 break;
1199         case DMA_TLB_DSI_FLUSH:
1200                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1201                 break;
1202         case DMA_TLB_PSI_FLUSH:
1203                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1204                 /* IH bit is passed in as part of address */
1205                 val_iva = size_order | addr;
1206                 break;
1207         default:
1208                 BUG();
1209         }
1210         /* Note: set drain read/write */
1211 #if 0
1212         /*
1213          * This is probably only there to be extra safe; it looks like we
1214          * can ignore it without any impact.
1215          */
1216         if (cap_read_drain(iommu->cap))
1217                 val |= DMA_TLB_READ_DRAIN;
1218 #endif
1219         if (cap_write_drain(iommu->cap))
1220                 val |= DMA_TLB_WRITE_DRAIN;
1221
1222         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1223         /* Note: Only uses first TLB reg currently */
1224         if (val_iva)
1225                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1226         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1227
1228         /* Make sure hardware completes it */
1229         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1230                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1231
1232         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1233
1234         /* check IOTLB invalidation granularity */
1235         if (DMA_TLB_IAIG(val) == 0)
1236                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1237         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1238                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1239                         (unsigned long long)DMA_TLB_IIRG(type),
1240                         (unsigned long long)DMA_TLB_IAIG(val));
1241 }
1242
1243 static struct device_domain_info *iommu_support_dev_iotlb(
1244         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1245 {
1246         int found = 0;
1247         unsigned long flags;
1248         struct device_domain_info *info;
1249         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1250
1251         if (!ecap_dev_iotlb_support(iommu->ecap))
1252                 return NULL;
1253
1254         if (!iommu->qi)
1255                 return NULL;
1256
1257         spin_lock_irqsave(&device_domain_lock, flags);
1258         list_for_each_entry(info, &domain->devices, link)
1259                 if (info->bus == bus && info->devfn == devfn) {
1260                         found = 1;
1261                         break;
1262                 }
1263         spin_unlock_irqrestore(&device_domain_lock, flags);
1264
1265         if (!found || !info->dev)
1266                 return NULL;
1267
1268         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1269                 return NULL;
1270
1271         if (!dmar_find_matched_atsr_unit(info->dev))
1272                 return NULL;
1273
1274         info->iommu = iommu;
1275
1276         return info;
1277 }
1278
1279 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1280 {
1281         if (!info)
1282                 return;
1283
1284         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1285 }
1286
1287 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1288 {
1289         if (!info->dev || !pci_ats_enabled(info->dev))
1290                 return;
1291
1292         pci_disable_ats(info->dev);
1293 }
1294
1295 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1296                                   u64 addr, unsigned mask)
1297 {
1298         u16 sid, qdep;
1299         unsigned long flags;
1300         struct device_domain_info *info;
1301
1302         spin_lock_irqsave(&device_domain_lock, flags);
1303         list_for_each_entry(info, &domain->devices, link) {
1304                 if (!info->dev || !pci_ats_enabled(info->dev))
1305                         continue;
1306
1307                 sid = info->bus << 8 | info->devfn;
1308                 qdep = pci_ats_queue_depth(info->dev);
1309                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1310         }
1311         spin_unlock_irqrestore(&device_domain_lock, flags);
1312 }
1313
1314 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1315                                   unsigned long pfn, unsigned int pages, int ih, int map)
1316 {
1317         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1318         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1319
1320         BUG_ON(pages == 0);
1321
1322         if (ih)
1323                 ih = 1 << 6;
1324         /*
1325          * Fall back to domain-selective flush if there is no PSI support or
1326          * the size is too big.
1327          * PSI requires the page size to be a power of two and the base
1328          * address to be naturally aligned to the size.
1329          */
1330         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1331                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1332                                                 DMA_TLB_DSI_FLUSH);
1333         else
1334                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1335                                                 DMA_TLB_PSI_FLUSH);
1336
1337         /*
1338          * In caching mode, changes of pages from non-present to present require
1339          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1340          */
1341         if (!cap_caching_mode(iommu->cap) || !map)
1342                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1343 }
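/*
 * Example for the PSI path above: pages == 3 gives
 * mask = ilog2(roundup_pow_of_two(3)) = 2, so a 2^2 = 4 page (16KiB)
 * naturally aligned region around addr is invalidated; if PSI is not
 * supported or mask exceeds cap_max_amask_val() the code falls back to a
 * domain-selective flush.
 */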
1344
1345 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1346 {
1347         u32 pmen;
1348         unsigned long flags;
1349
1350         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1351         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1352         pmen &= ~DMA_PMEN_EPM;
1353         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1354
1355         /* wait for the protected region status bit to clear */
1356         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1357                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1358
1359         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1360 }
1361
1362 static int iommu_enable_translation(struct intel_iommu *iommu)
1363 {
1364         u32 sts;
1365         unsigned long flags;
1366
1367         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1368         iommu->gcmd |= DMA_GCMD_TE;
1369         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1370
1371         /* Make sure hardware completes it */
1372         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1373                       readl, (sts & DMA_GSTS_TES), sts);
1374
1375         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1376         return 0;
1377 }
1378
1379 static int iommu_disable_translation(struct intel_iommu *iommu)
1380 {
1381         u32 sts;
1382         unsigned long flag;
1383
1384         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1385         iommu->gcmd &= ~DMA_GCMD_TE;
1386         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1387
1388         /* Make sure hardware completes it */
1389         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1390                       readl, (!(sts & DMA_GSTS_TES)), sts);
1391
1392         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1393         return 0;
1394 }
1395
1396
1397 static int iommu_init_domains(struct intel_iommu *iommu)
1398 {
1399         unsigned long ndomains;
1400         unsigned long nlongs;
1401
1402         ndomains = cap_ndoms(iommu->cap);
1403         pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1404                  iommu->seq_id, ndomains);
1405         nlongs = BITS_TO_LONGS(ndomains);
1406
1407         spin_lock_init(&iommu->lock);
1408
1409         /* TBD: there might be 64K domains,
1410          * consider a different allocation scheme for future chips
1411          */
1412         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1413         if (!iommu->domain_ids) {
1414                 pr_err("IOMMU%d: allocating domain id array failed\n",
1415                        iommu->seq_id);
1416                 return -ENOMEM;
1417         }
1418         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1419                         GFP_KERNEL);
1420         if (!iommu->domains) {
1421                 pr_err("IOMMU%d: allocating domain array failed\n",
1422                        iommu->seq_id);
1423                 kfree(iommu->domain_ids);
1424                 iommu->domain_ids = NULL;
1425                 return -ENOMEM;
1426         }
1427
1428         /*
1429          * If Caching mode is set, then invalid translations are tagged
1430          * with domain id 0. Hence we need to pre-allocate it.
1431          */
1432         if (cap_caching_mode(iommu->cap))
1433                 set_bit(0, iommu->domain_ids);
1434         return 0;
1435 }
1436
1437 static void free_dmar_iommu(struct intel_iommu *iommu)
1438 {
1439         struct dmar_domain *domain;
1440         int i, count;
1441         unsigned long flags;
1442
1443         if ((iommu->domains) && (iommu->domain_ids)) {
1444                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1445                         /*
1446                          * Domain id 0 is reserved for invalid translation
1447                          * if hardware supports caching mode.
1448                          */
1449                         if (cap_caching_mode(iommu->cap) && i == 0)
1450                                 continue;
1451
1452                         domain = iommu->domains[i];
1453                         clear_bit(i, iommu->domain_ids);
1454
1455                         spin_lock_irqsave(&domain->iommu_lock, flags);
1456                         count = --domain->iommu_count;
1457                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1458                         if (count == 0)
1459                                 domain_exit(domain);
1460                 }
1461         }
1462
1463         if (iommu->gcmd & DMA_GCMD_TE)
1464                 iommu_disable_translation(iommu);
1465
1466         kfree(iommu->domains);
1467         kfree(iommu->domain_ids);
1468         iommu->domains = NULL;
1469         iommu->domain_ids = NULL;
1470
1471         g_iommus[iommu->seq_id] = NULL;
1472
1473         /* free context mapping */
1474         free_context_table(iommu);
1475 }
1476
1477 static struct dmar_domain *alloc_domain(bool vm)
1478 {
1479         /* domain id for a virtual machine; it won't be set in a context entry */
1480         static atomic_t vm_domid = ATOMIC_INIT(0);
1481         struct dmar_domain *domain;
1482
1483         domain = alloc_domain_mem();
1484         if (!domain)
1485                 return NULL;
1486
1487         domain->nid = -1;
1488         domain->iommu_count = 0;
1489         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1490         domain->flags = 0;
1491         spin_lock_init(&domain->iommu_lock);
1492         INIT_LIST_HEAD(&domain->devices);
1493         if (vm) {
1494                 domain->id = atomic_inc_return(&vm_domid);
1495                 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
1496         }
1497
1498         return domain;
1499 }
1500
1501 static int iommu_attach_domain(struct dmar_domain *domain,
1502                                struct intel_iommu *iommu)
1503 {
1504         int num;
1505         unsigned long ndomains;
1506         unsigned long flags;
1507
1508         ndomains = cap_ndoms(iommu->cap);
1509
1510         spin_lock_irqsave(&iommu->lock, flags);
1511
1512         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1513         if (num >= ndomains) {
1514                 spin_unlock_irqrestore(&iommu->lock, flags);
1515                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1516                 return -ENOMEM;
1517         }
1518
1519         domain->id = num;
1520         domain->iommu_count++;
1521         set_bit(num, iommu->domain_ids);
1522         set_bit(iommu->seq_id, domain->iommu_bmp);
1523         iommu->domains[num] = domain;
1524         spin_unlock_irqrestore(&iommu->lock, flags);
1525
1526         return 0;
1527 }
1528
1529 static void iommu_detach_domain(struct dmar_domain *domain,
1530                                 struct intel_iommu *iommu)
1531 {
1532         unsigned long flags;
1533         int num, ndomains;
1534
1535         spin_lock_irqsave(&iommu->lock, flags);
1536         ndomains = cap_ndoms(iommu->cap);
1537         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1538                 if (iommu->domains[num] == domain) {
1539                         clear_bit(num, iommu->domain_ids);
1540                         iommu->domains[num] = NULL;
1541                         break;
1542                 }
1543         }
1544         spin_unlock_irqrestore(&iommu->lock, flags);
1545 }
1546
1547 static struct iova_domain reserved_iova_list;
1548 static struct lock_class_key reserved_rbtree_key;
1549
1550 static int dmar_init_reserved_ranges(void)
1551 {
1552         struct pci_dev *pdev = NULL;
1553         struct iova *iova;
1554         int i;
1555
1556         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1557
1558         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1559                 &reserved_rbtree_key);
1560
1561         /* IOAPIC ranges shouldn't be accessed by DMA */
1562         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1563                 IOVA_PFN(IOAPIC_RANGE_END));
1564         if (!iova) {
1565                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1566                 return -ENODEV;
1567         }
1568
1569         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1570         for_each_pci_dev(pdev) {
1571                 struct resource *r;
1572
1573                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1574                         r = &pdev->resource[i];
1575                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1576                                 continue;
1577                         iova = reserve_iova(&reserved_iova_list,
1578                                             IOVA_PFN(r->start),
1579                                             IOVA_PFN(r->end));
1580                         if (!iova) {
1581                                 printk(KERN_ERR "Reserve iova failed\n");
1582                                 return -ENODEV;
1583                         }
1584                 }
1585         }
1586         return 0;
1587 }
1588
1589 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1590 {
1591         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1592 }
1593
1594 static inline int guestwidth_to_adjustwidth(int gaw)
1595 {
1596         int agaw;
1597         int r = (gaw - 12) % 9;
1598
1599         if (r == 0)
1600                 agaw = gaw;
1601         else
1602                 agaw = gaw + 9 - r;
1603         if (agaw > 64)
1604                 agaw = 64;
1605         return agaw;
1606 }
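/*
 * Worked examples for the rounding above (the guest widths are hypothetical):
 *
 *	gaw = 48: r = (48 - 12) % 9 = 0, so agaw = 48
 *	gaw = 40: r = (40 - 12) % 9 = 1, so agaw = 40 + 9 - 1 = 48
 *	gaw = 62: r = (62 - 12) % 9 = 5, rounds up to 66 and is clamped to 64
 *
 * i.e. the adjusted width is the next width of the form 12 + 9*n that can
 * hold the requested guest width, capped at 64 bits.
 */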
1607
1608 static int domain_init(struct dmar_domain *domain, int guest_width)
1609 {
1610         struct intel_iommu *iommu;
1611         int adjust_width, agaw;
1612         unsigned long sagaw;
1613
1614         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1615         domain_reserve_special_ranges(domain);
1616
1617         /* calculate AGAW */
1618         iommu = domain_get_iommu(domain);
1619         if (guest_width > cap_mgaw(iommu->cap))
1620                 guest_width = cap_mgaw(iommu->cap);
1621         domain->gaw = guest_width;
1622         adjust_width = guestwidth_to_adjustwidth(guest_width);
1623         agaw = width_to_agaw(adjust_width);
1624         sagaw = cap_sagaw(iommu->cap);
1625         if (!test_bit(agaw, &sagaw)) {
1626                 /* hardware doesn't support it, choose a bigger one */
1627                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1628                 agaw = find_next_bit(&sagaw, 5, agaw);
1629                 if (agaw >= 5)
1630                         return -ENODEV;
1631         }
1632         domain->agaw = agaw;
1633
1634         if (ecap_coherent(iommu->ecap))
1635                 domain->iommu_coherency = 1;
1636         else
1637                 domain->iommu_coherency = 0;
1638
1639         if (ecap_sc_support(iommu->ecap))
1640                 domain->iommu_snooping = 1;
1641         else
1642                 domain->iommu_snooping = 0;
1643
1644         if (intel_iommu_superpage)
1645                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1646         else
1647                 domain->iommu_superpage = 0;
1648
1649         domain->nid = iommu->node;
1650
1651         /* always allocate the top pgd */
1652         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1653         if (!domain->pgd)
1654                 return -ENOMEM;
1655         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1656         return 0;
1657 }
1658
1659 static void domain_exit(struct dmar_domain *domain)
1660 {
1661         struct dmar_drhd_unit *drhd;
1662         struct intel_iommu *iommu;
1663         struct page *freelist = NULL;
1664
1665         /* Domain 0 is reserved, so don't process it */
1666         if (!domain)
1667                 return;
1668
1669         /* Flush any lazy unmaps that may reference this domain */
1670         if (!intel_iommu_strict)
1671                 flush_unmaps_timeout(0);
1672
1673         /* remove associated devices */
1674         domain_remove_dev_info(domain);
1675
1676         /* destroy iovas */
1677         put_iova_domain(&domain->iovad);
1678
1679         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1680
1681         /* clear attached or cached domains */
1682         rcu_read_lock();
1683         for_each_active_iommu(iommu, drhd)
1684                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1685                     test_bit(iommu->seq_id, domain->iommu_bmp))
1686                         iommu_detach_domain(domain, iommu);
1687         rcu_read_unlock();
1688
1689         dma_free_pagelist(freelist);
1690
1691         free_domain_mem(domain);
1692 }
1693
1694 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1695                                  u8 bus, u8 devfn, int translation)
1696 {
1697         struct context_entry *context;
1698         unsigned long flags;
1699         struct intel_iommu *iommu;
1700         struct dma_pte *pgd;
1701         unsigned long num;
1702         unsigned long ndomains;
1703         int id;
1704         int agaw;
1705         struct device_domain_info *info = NULL;
1706
1707         pr_debug("Set context mapping for %02x:%02x.%d\n",
1708                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1709
1710         BUG_ON(!domain->pgd);
1711         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1712                translation != CONTEXT_TT_MULTI_LEVEL);
1713
1714         iommu = device_to_iommu(segment, bus, devfn);
1715         if (!iommu)
1716                 return -ENODEV;
1717
1718         context = device_to_context_entry(iommu, bus, devfn);
1719         if (!context)
1720                 return -ENOMEM;
1721         spin_lock_irqsave(&iommu->lock, flags);
1722         if (context_present(context)) {
1723                 spin_unlock_irqrestore(&iommu->lock, flags);
1724                 return 0;
1725         }
1726
1727         id = domain->id;
1728         pgd = domain->pgd;
1729
1730         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1731             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1732                 int found = 0;
1733
1734                 /* find an available domain id for this device in iommu */
1735                 ndomains = cap_ndoms(iommu->cap);
1736                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1737                         if (iommu->domains[num] == domain) {
1738                                 id = num;
1739                                 found = 1;
1740                                 break;
1741                         }
1742                 }
1743
1744                 if (found == 0) {
1745                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1746                         if (num >= ndomains) {
1747                                 spin_unlock_irqrestore(&iommu->lock, flags);
1748                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1749                                 return -EFAULT;
1750                         }
1751
1752                         set_bit(num, iommu->domain_ids);
1753                         iommu->domains[num] = domain;
1754                         id = num;
1755                 }
1756
1757                 /* Skip top levels of page tables for
1758                  * an iommu which has less agaw than the default.
1759                  * Unnecessary for PT mode.
1760                  */
1761                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1762                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1763                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1764                                 if (!dma_pte_present(pgd)) {
1765                                         spin_unlock_irqrestore(&iommu->lock, flags);
1766                                         return -ENOMEM;
1767                                 }
1768                         }
1769                 }
1770         }
1771
1772         context_set_domain_id(context, id);
1773
1774         if (translation != CONTEXT_TT_PASS_THROUGH) {
1775                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1776                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1777                                      CONTEXT_TT_MULTI_LEVEL;
1778         }
1779         /*
1780          * In pass through mode, AW must be programmed to indicate the largest
1781          * AGAW value supported by hardware. And ASR is ignored by hardware.
1782          */
1783         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1784                 context_set_address_width(context, iommu->msagaw);
1785         else {
1786                 context_set_address_root(context, virt_to_phys(pgd));
1787                 context_set_address_width(context, iommu->agaw);
1788         }
1789
1790         context_set_translation_type(context, translation);
1791         context_set_fault_enable(context);
1792         context_set_present(context);
1793         domain_flush_cache(domain, context, sizeof(*context));
1794
1795         /*
1796          * It's a non-present to present mapping. If hardware doesn't cache
1797          * non-present entries we only need to flush the write-buffer. If it
1798          * _does_ cache non-present entries, then it does so in the special
1799          * domain #0, which we have to flush:
1800          */
1801         if (cap_caching_mode(iommu->cap)) {
1802                 iommu->flush.flush_context(iommu, 0,
1803                                            (((u16)bus) << 8) | devfn,
1804                                            DMA_CCMD_MASK_NOBIT,
1805                                            DMA_CCMD_DEVICE_INVL);
1806                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1807         } else {
1808                 iommu_flush_write_buffer(iommu);
1809         }
1810         iommu_enable_dev_iotlb(info);
1811         spin_unlock_irqrestore(&iommu->lock, flags);
1812
1813         spin_lock_irqsave(&domain->iommu_lock, flags);
1814         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1815                 domain->iommu_count++;
1816                 if (domain->iommu_count == 1)
1817                         domain->nid = iommu->node;
1818                 domain_update_iommu_cap(domain);
1819         }
1820         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1821         return 0;
1822 }
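/*
 * Summary of what domain_context_mapping_one() programs into the context
 * entry (all taken from the code in this function):
 *
 *	domain id	-> context_set_domain_id(context, id)
 *	ASR		-> virt_to_phys(pgd), skipped in pass-through mode
 *	AW		-> iommu->agaw, or iommu->msagaw in pass-through mode
 *	TT		-> MULTI_LEVEL, DEV_IOTLB or PASS_THROUGH
 *	FPD/P		-> fault reporting enabled, entry marked present
 */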
1823
1824 static int
1825 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1826                         int translation)
1827 {
1828         int ret;
1829         struct pci_dev *tmp, *parent;
1830
1831         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1832                                          pdev->bus->number, pdev->devfn,
1833                                          translation);
1834         if (ret)
1835                 return ret;
1836
1837         /* dependent device mapping */
1838         tmp = pci_find_upstream_pcie_bridge(pdev);
1839         if (!tmp)
1840                 return 0;
1841         /* Secondary interface's bus number and devfn 0 */
1842         parent = pdev->bus->self;
1843         while (parent != tmp) {
1844                 ret = domain_context_mapping_one(domain,
1845                                                  pci_domain_nr(parent->bus),
1846                                                  parent->bus->number,
1847                                                  parent->devfn, translation);
1848                 if (ret)
1849                         return ret;
1850                 parent = parent->bus->self;
1851         }
1852         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1853                 return domain_context_mapping_one(domain,
1854                                         pci_domain_nr(tmp->subordinate),
1855                                         tmp->subordinate->number, 0,
1856                                         translation);
1857         else /* this is a legacy PCI bridge */
1858                 return domain_context_mapping_one(domain,
1859                                                   pci_domain_nr(tmp->bus),
1860                                                   tmp->bus->number,
1861                                                   tmp->devfn,
1862                                                   translation);
1863 }
1864
1865 static int domain_context_mapped(struct pci_dev *pdev)
1866 {
1867         int ret;
1868         struct pci_dev *tmp, *parent;
1869         struct intel_iommu *iommu;
1870
1871         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1872                                 pdev->devfn);
1873         if (!iommu)
1874                 return -ENODEV;
1875
1876         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1877         if (!ret)
1878                 return ret;
1879         /* dependent device mapping */
1880         tmp = pci_find_upstream_pcie_bridge(pdev);
1881         if (!tmp)
1882                 return ret;
1883         /* Secondary interface's bus number and devfn 0 */
1884         parent = pdev->bus->self;
1885         while (parent != tmp) {
1886                 ret = device_context_mapped(iommu, parent->bus->number,
1887                                             parent->devfn);
1888                 if (!ret)
1889                         return ret;
1890                 parent = parent->bus->self;
1891         }
1892         if (pci_is_pcie(tmp))
1893                 return device_context_mapped(iommu, tmp->subordinate->number,
1894                                              0);
1895         else
1896                 return device_context_mapped(iommu, tmp->bus->number,
1897                                              tmp->devfn);
1898 }
1899
1900 /* Returns a number of VTD pages, but aligned to MM page size */
1901 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1902                                             size_t size)
1903 {
1904         host_addr &= ~PAGE_MASK;
1905         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1906 }
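/*
 * Worked example for aligned_nrpages(), assuming 4KiB pages and the
 * hypothetical arguments host_addr = 0x1234, size = 0x2000:
 *
 *	host_addr & ~PAGE_MASK		-> 0x234 (offset into the page)
 *	PAGE_ALIGN(0x234 + 0x2000)	-> 0x3000
 *	0x3000 >> VTD_PAGE_SHIFT	-> 3 VT-d pages
 *
 * i.e. a buffer that straddles page boundaries is rounded out to whole
 * pages before being handed to the mapping code.
 */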
1907
1908 /* Return largest possible superpage level for a given mapping */
1909 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1910                                           unsigned long iov_pfn,
1911                                           unsigned long phy_pfn,
1912                                           unsigned long pages)
1913 {
1914         int support, level = 1;
1915         unsigned long pfnmerge;
1916
1917         support = domain->iommu_superpage;
1918
1919         /* To use a large page, the virtual *and* physical addresses
1920            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1921            of them will mean we have to use smaller pages. So just
1922            merge them and check both at once. */
1923         pfnmerge = iov_pfn | phy_pfn;
1924
1925         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1926                 pages >>= VTD_STRIDE_SHIFT;
1927                 if (!pages)
1928                         break;
1929                 pfnmerge >>= VTD_STRIDE_SHIFT;
1930                 level++;
1931                 support--;
1932         }
1933         return level;
1934 }
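/*
 * Example of the level calculation above, assuming hardware that reports a
 * single superpage size (iommu_superpage == 1, i.e. 2MiB) and a hypothetical
 * request with iov_pfn = 0x200, phy_pfn = 0x1400, pages = 0x400:
 *
 *	pfnmerge = 0x200 | 0x1400 = 0x1600; its low 9 bits are clear, so both
 *	addresses are 2MiB aligned, and pages >> 9 is still non-zero, so the
 *	loop runs once and returns level = 2 (2MiB superpages).
 *
 * If either pfn had any of its low 9 bits set, the loop would not run and
 * level 1 (4KiB pages) would be used.
 */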
1935
1936 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1937                             struct scatterlist *sg, unsigned long phys_pfn,
1938                             unsigned long nr_pages, int prot)
1939 {
1940         struct dma_pte *first_pte = NULL, *pte = NULL;
1941         phys_addr_t uninitialized_var(pteval);
1942         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1943         unsigned long sg_res;
1944         unsigned int largepage_lvl = 0;
1945         unsigned long lvl_pages = 0;
1946
1947         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1948
1949         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1950                 return -EINVAL;
1951
1952         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1953
1954         if (sg)
1955                 sg_res = 0;
1956         else {
1957                 sg_res = nr_pages + 1;
1958                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1959         }
1960
1961         while (nr_pages > 0) {
1962                 uint64_t tmp;
1963
1964                 if (!sg_res) {
1965                         sg_res = aligned_nrpages(sg->offset, sg->length);
1966                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1967                         sg->dma_length = sg->length;
1968                         pteval = page_to_phys(sg_page(sg)) | prot;
1969                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1970                 }
1971
1972                 if (!pte) {
1973                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1974
1975                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
1976                         if (!pte)
1977                                 return -ENOMEM;
1978                         /* It is a large page */
1979                         if (largepage_lvl > 1) {
1980                                 pteval |= DMA_PTE_LARGE_PAGE;
1981                                 /* Ensure that old small page tables are removed to make room
1982                                    for superpage, if they exist. */
1983                                 dma_pte_clear_range(domain, iov_pfn,
1984                                                     iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1985                                 dma_pte_free_pagetable(domain, iov_pfn,
1986                                                        iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1987                         } else {
1988                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1989                         }
1990
1991                 }
1992                 /* We don't need a lock here; nobody else
1993                  * touches the iova range
1994                  */
1995                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1996                 if (tmp) {
1997                         static int dumps = 5;
1998                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1999                                iov_pfn, tmp, (unsigned long long)pteval);
2000                         if (dumps) {
2001                                 dumps--;
2002                                 debug_dma_dump_mappings(NULL);
2003                         }
2004                         WARN_ON(1);
2005                 }
2006
2007                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2008
2009                 BUG_ON(nr_pages < lvl_pages);
2010                 BUG_ON(sg_res < lvl_pages);
2011
2012                 nr_pages -= lvl_pages;
2013                 iov_pfn += lvl_pages;
2014                 phys_pfn += lvl_pages;
2015                 pteval += lvl_pages * VTD_PAGE_SIZE;
2016                 sg_res -= lvl_pages;
2017
2018                 /* If the next PTE would be the first in a new page, then we
2019                    need to flush the cache on the entries we've just written.
2020                    And then we'll need to recalculate 'pte', so clear it and
2021                    let it get set again in the if (!pte) block above.
2022
2023                    If we're done (!nr_pages) we need to flush the cache too.
2024
2025                    Also if we've been setting superpages, we may need to
2026                    recalculate 'pte' and switch back to smaller pages for the
2027                    end of the mapping, if the trailing size is not enough to
2028                    use another superpage (i.e. sg_res < lvl_pages). */
2029                 pte++;
2030                 if (!nr_pages || first_pte_in_page(pte) ||
2031                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2032                         domain_flush_cache(domain, first_pte,
2033                                            (void *)pte - (void *)first_pte);
2034                         pte = NULL;
2035                 }
2036
2037                 if (!sg_res && nr_pages)
2038                         sg = sg_next(sg);
2039         }
2040         return 0;
2041 }
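/*
 * Sketch of the PTE value written above for one hypothetical 2MiB superpage
 * mapping at phys_pfn = 0x1400 with read/write permission:
 *
 *	pteval = (0x1400 << VTD_PAGE_SHIFT)	physical address 0x1400000
 *	       | DMA_PTE_READ | DMA_PTE_WRITE	permission bits
 *	       | DMA_PTE_LARGE_PAGE		level-2 leaf entry
 *
 * lvl_to_nr_pages(2) is 512, so nr_pages, iov_pfn, phys_pfn and pteval all
 * advance by the equivalent of 512 4KiB pages in that single iteration.
 */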
2042
2043 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2044                                     struct scatterlist *sg, unsigned long nr_pages,
2045                                     int prot)
2046 {
2047         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2048 }
2049
2050 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2051                                      unsigned long phys_pfn, unsigned long nr_pages,
2052                                      int prot)
2053 {
2054         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2055 }
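/*
 * Example usage of the wrappers above (a minimal sketch; "dom" and the pfn
 * values are hypothetical): map 16 contiguous VT-d pages at IOVA pfn 0x1000
 * onto physical pfn 0x2000, read/write:
 *
 *	ret = domain_pfn_mapping(dom, 0x1000, 0x2000, 16,
 *				 DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * domain_sg_mapping() is the scatterlist flavour: the physical addresses are
 * taken from the sg entries instead of a starting pfn.
 */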
2056
2057 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2058 {
2059         if (!iommu)
2060                 return;
2061
2062         clear_context_table(iommu, bus, devfn);
2063         iommu->flush.flush_context(iommu, 0, 0, 0,
2064                                            DMA_CCMD_GLOBAL_INVL);
2065         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2066 }
2067
2068 static inline void unlink_domain_info(struct device_domain_info *info)
2069 {
2070         assert_spin_locked(&device_domain_lock);
2071         list_del(&info->link);
2072         list_del(&info->global);
2073         if (info->dev)
2074                 info->dev->dev.archdata.iommu = NULL;
2075 }
2076
2077 static void domain_remove_dev_info(struct dmar_domain *domain)
2078 {
2079         struct device_domain_info *info;
2080         unsigned long flags, flags2;
2081         struct intel_iommu *iommu;
2082
2083         spin_lock_irqsave(&device_domain_lock, flags);
2084         while (!list_empty(&domain->devices)) {
2085                 info = list_entry(domain->devices.next,
2086                         struct device_domain_info, link);
2087                 unlink_domain_info(info);
2088                 spin_unlock_irqrestore(&device_domain_lock, flags);
2089
2090                 iommu_disable_dev_iotlb(info);
2091                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
2092                 iommu_detach_dev(iommu, info->bus, info->devfn);
2093
2094                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
2095                         iommu_detach_dependent_devices(iommu, info->dev);
2096                         /* clear this iommu in iommu_bmp, update iommu count
2097                          * and capabilities
2098                          */
2099                         spin_lock_irqsave(&domain->iommu_lock, flags2);
2100                         if (test_and_clear_bit(iommu->seq_id,
2101                                                domain->iommu_bmp)) {
2102                                 domain->iommu_count--;
2103                                 domain_update_iommu_cap(domain);
2104                         }
2105                         spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2106                 }
2107
2108                 free_devinfo_mem(info);
2109                 spin_lock_irqsave(&device_domain_lock, flags);
2110         }
2111         spin_unlock_irqrestore(&device_domain_lock, flags);
2112 }
2113
2114 /*
2115  * find_domain
2116  * Note: we use struct device->archdata.iommu to store the info
2117  */
2118 static struct dmar_domain *find_domain(struct device *dev)
2119 {
2120         struct device_domain_info *info;
2121
2122         /* No lock here, assumes no domain exit in normal case */
2123         info = dev->archdata.iommu;
2124         if (info)
2125                 return info->domain;
2126         return NULL;
2127 }
2128
2129 static inline struct dmar_domain *
2130 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2131 {
2132         struct device_domain_info *info;
2133
2134         list_for_each_entry(info, &device_domain_list, global)
2135                 if (info->segment == segment && info->bus == bus &&
2136                     info->devfn == devfn)
2137                         return info->domain;
2138
2139         return NULL;
2140 }
2141
2142 static int dmar_insert_dev_info(int segment, int bus, int devfn,
2143                                 struct pci_dev *dev, struct dmar_domain **domp)
2144 {
2145         struct dmar_domain *found, *domain = *domp;
2146         struct device_domain_info *info;
2147         unsigned long flags;
2148
2149         info = alloc_devinfo_mem();
2150         if (!info)
2151                 return -ENOMEM;
2152
2153         info->segment = segment;
2154         info->bus = bus;
2155         info->devfn = devfn;
2156         info->dev = dev;
2157         info->domain = domain;
2158         if (!dev)
2159                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2160
2161         spin_lock_irqsave(&device_domain_lock, flags);
2162         if (dev)
2163                 found = find_domain(&dev->dev);
2164         else
2165                 found = dmar_search_domain_by_dev_info(segment, bus, devfn);
2166         if (found) {
2167                 spin_unlock_irqrestore(&device_domain_lock, flags);
2168                 free_devinfo_mem(info);
2169                 if (found != domain) {
2170                         domain_exit(domain);
2171                         *domp = found;
2172                 }
2173         } else {
2174                 list_add(&info->link, &domain->devices);
2175                 list_add(&info->global, &device_domain_list);
2176                 if (dev)
2177                         dev->dev.archdata.iommu = info;
2178                 spin_unlock_irqrestore(&device_domain_lock, flags);
2179         }
2180
2181         return 0;
2182 }
2183
2184 /* domain is initialized */
2185 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
2186 {
2187         struct dmar_domain *domain, *free = NULL;
2188         struct intel_iommu *iommu;
2189         struct dmar_drhd_unit *drhd;
2190         struct pci_dev *dev_tmp;
2191         unsigned long flags;
2192         int bus = 0, devfn = 0;
2193         int segment;
2194
2195         domain = find_domain(&pdev->dev);
2196         if (domain)
2197                 return domain;
2198
2199         segment = pci_domain_nr(pdev->bus);
2200
2201         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
2202         if (dev_tmp) {
2203                 if (pci_is_pcie(dev_tmp)) {
2204                         bus = dev_tmp->subordinate->number;
2205                         devfn = 0;
2206                 } else {
2207                         bus = dev_tmp->bus->number;
2208                         devfn = dev_tmp->devfn;
2209                 }
2210                 spin_lock_irqsave(&device_domain_lock, flags);
2211                 domain = dmar_search_domain_by_dev_info(segment, bus, devfn);
2212                 spin_unlock_irqrestore(&device_domain_lock, flags);
2213                 /* pcie-pci bridge already has a domain, use it */
2214                 if (domain)
2215                         goto found_domain;
2216         }
2217
2218         drhd = dmar_find_matched_drhd_unit(pdev);
2219         if (!drhd) {
2220                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2221                         pci_name(pdev));
2222                 return NULL;
2223         }
2224         iommu = drhd->iommu;
2225
2226         /* Allocate and initialize a new domain for the device */
2227         domain = alloc_domain(false);
2228         if (!domain)
2229                 goto error;
2230         if (iommu_attach_domain(domain, iommu)) {
2231                 free_domain_mem(domain);
2232                 goto error;
2233         }
2234         free = domain;
2235         if (domain_init(domain, gaw))
2236                 goto error;
2237
2238         /* register pcie-to-pci device */
2239         if (dev_tmp) {
2240                 if (dmar_insert_dev_info(segment, bus, devfn, NULL, &domain))
2241                         goto error;
2242                 else
2243                         free = NULL;
2244         }
2245
2246 found_domain:
2247         if (dmar_insert_dev_info(segment, pdev->bus->number, pdev->devfn,
2248                                  pdev, &domain) == 0)
2249                 return domain;
2250 error:
2251         if (free)
2252                 domain_exit(free);
2253         /* recheck it here, maybe others set it */
2254         return find_domain(&pdev->dev);
2255 }
2256
2257 static int iommu_identity_mapping;
2258 #define IDENTMAP_ALL            1
2259 #define IDENTMAP_GFX            2
2260 #define IDENTMAP_AZALIA         4
2261
2262 static int iommu_domain_identity_map(struct dmar_domain *domain,
2263                                      unsigned long long start,
2264                                      unsigned long long end)
2265 {
2266         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2267         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2268
2269         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2270                           dma_to_mm_pfn(last_vpfn))) {
2271                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2272                 return -ENOMEM;
2273         }
2274
2275         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2276                  start, end, domain->id);
2277         /*
2278          * RMRR range might have overlap with physical memory range,
2279          * clear it first
2280          */
2281         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2282
2283         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2284                                   last_vpfn - first_vpfn + 1,
2285                                   DMA_PTE_READ|DMA_PTE_WRITE);
2286 }
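/*
 * Worked example for the identity map above, with hypothetical bounds
 * start = 0x100000 and end = 0x2fffff (the 1MiB-3MiB range):
 *
 *	first_vpfn = 0x100000 >> VTD_PAGE_SHIFT = 0x100
 *	last_vpfn  = 0x2fffff >> VTD_PAGE_SHIFT = 0x2ff
 *
 * so 0x200 pfns are reserved in the domain's iova tree and then mapped 1:1
 * (IOVA pfn == physical pfn) with read/write permission.
 */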
2287
2288 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2289                                       unsigned long long start,
2290                                       unsigned long long end)
2291 {
2292         struct dmar_domain *domain;
2293         int ret;
2294
2295         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2296         if (!domain)
2297                 return -ENOMEM;
2298
2299         /* For _hardware_ passthrough, don't bother. But for software
2300            passthrough, we do it anyway -- it may indicate a memory
2301            range which is reserved in E820, and so didn't get set
2302            up to start with in si_domain */
2303         if (domain == si_domain && hw_pass_through) {
2304                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2305                        pci_name(pdev), start, end);
2306                 return 0;
2307         }
2308
2309         printk(KERN_INFO
2310                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2311                pci_name(pdev), start, end);
2312
2313         if (end < start) {
2314                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2315                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2316                         dmi_get_system_info(DMI_BIOS_VENDOR),
2317                         dmi_get_system_info(DMI_BIOS_VERSION),
2318                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2319                 ret = -EIO;
2320                 goto error;
2321         }
2322
2323         if (end >> agaw_to_width(domain->agaw)) {
2324                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2325                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2326                      agaw_to_width(domain->agaw),
2327                      dmi_get_system_info(DMI_BIOS_VENDOR),
2328                      dmi_get_system_info(DMI_BIOS_VERSION),
2329                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2330                 ret = -EIO;
2331                 goto error;
2332         }
2333
2334         ret = iommu_domain_identity_map(domain, start, end);
2335         if (ret)
2336                 goto error;
2337
2338         /* context entry init */
2339         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2340         if (ret)
2341                 goto error;
2342
2343         return 0;
2344
2345  error:
2346         domain_exit(domain);
2347         return ret;
2348 }
2349
2350 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2351         struct pci_dev *pdev)
2352 {
2353         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2354                 return 0;
2355         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2356                 rmrr->end_address);
2357 }
2358
2359 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2360 static inline void iommu_prepare_isa(void)
2361 {
2362         struct pci_dev *pdev;
2363         int ret;
2364
2365         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2366         if (!pdev)
2367                 return;
2368
2369         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2370         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2371
2372         if (ret)
2373                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2374                        "floppy might not work\n");
2375
2376 }
2377 #else
2378 static inline void iommu_prepare_isa(void)
2379 {
2380         return;
2381 }
2382 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2383
2384 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2385
2386 static int __init si_domain_init(int hw)
2387 {
2388         struct dmar_drhd_unit *drhd;
2389         struct intel_iommu *iommu;
2390         int nid, ret = 0;
2391
2392         si_domain = alloc_domain(false);
2393         if (!si_domain)
2394                 return -EFAULT;
2395
2396         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2397
2398         for_each_active_iommu(iommu, drhd) {
2399                 ret = iommu_attach_domain(si_domain, iommu);
2400                 if (ret) {
2401                         domain_exit(si_domain);
2402                         return -EFAULT;
2403                 }
2404         }
2405
2406         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2407                 domain_exit(si_domain);
2408                 return -EFAULT;
2409         }
2410
2411         pr_debug("IOMMU: identity mapping domain is domain %d\n",
2412                  si_domain->id);
2413
2414         if (hw)
2415                 return 0;
2416
2417         for_each_online_node(nid) {
2418                 unsigned long start_pfn, end_pfn;
2419                 int i;
2420
2421                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2422                         ret = iommu_domain_identity_map(si_domain,
2423                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2424                         if (ret)
2425                                 return ret;
2426                 }
2427         }
2428
2429         return 0;
2430 }
2431
2432 static int identity_mapping(struct pci_dev *pdev)
2433 {
2434         struct device_domain_info *info;
2435
2436         if (likely(!iommu_identity_mapping))
2437                 return 0;
2438
2439         info = pdev->dev.archdata.iommu;
2440         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2441                 return (info->domain == si_domain);
2442
2443         return 0;
2444 }
2445
2446 static int domain_add_dev_info(struct dmar_domain *domain,
2447                                struct pci_dev *pdev,
2448                                int translation)
2449 {
2450         struct device_domain_info *info;
2451         unsigned long flags;
2452         int ret;
2453
2454         info = alloc_devinfo_mem();
2455         if (!info)
2456                 return -ENOMEM;
2457
2458         info->segment = pci_domain_nr(pdev->bus);
2459         info->bus = pdev->bus->number;
2460         info->devfn = pdev->devfn;
2461         info->dev = pdev;
2462         info->domain = domain;
2463
2464         spin_lock_irqsave(&device_domain_lock, flags);
2465         list_add(&info->link, &domain->devices);
2466         list_add(&info->global, &device_domain_list);
2467         pdev->dev.archdata.iommu = info;
2468         spin_unlock_irqrestore(&device_domain_lock, flags);
2469
2470         ret = domain_context_mapping(domain, pdev, translation);
2471         if (ret) {
2472                 spin_lock_irqsave(&device_domain_lock, flags);
2473                 unlink_domain_info(info);
2474                 spin_unlock_irqrestore(&device_domain_lock, flags);
2475                 free_devinfo_mem(info);
2476                 return ret;
2477         }
2478
2479         return 0;
2480 }
2481
2482 static bool device_has_rmrr(struct pci_dev *dev)
2483 {
2484         struct dmar_rmrr_unit *rmrr;
2485         struct device *tmp;
2486         int i;
2487
2488         rcu_read_lock();
2489         for_each_rmrr_units(rmrr) {
2490                 /*
2491                  * Return TRUE if this RMRR contains the device that
2492                  * is passed in.
2493                  */
2494                 for_each_active_dev_scope(rmrr->devices,
2495                                           rmrr->devices_cnt, i, tmp)
2496                         if (tmp == &dev->dev) {
2497                                 rcu_read_unlock();
2498                                 return true;
2499                         }
2500         }
2501         rcu_read_unlock();
2502         return false;
2503 }
2504
2505 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2506 {
2507
2508         /*
2509          * We want to prevent any device associated with an RMRR from
2510          * getting placed into the SI Domain. This is done because
2511          * problems exist when devices are moved in and out of domains
2512          * and their respective RMRR info is lost. We exempt USB devices
2513          * from this process due to their usage of RMRRs that are known
2514          * to not be needed after BIOS hand-off to OS.
2515          */
2516         if (device_has_rmrr(pdev) &&
2517             (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2518                 return 0;
2519
2520         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2521                 return 1;
2522
2523         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2524                 return 1;
2525
2526         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2527                 return 0;
2528
2529         /*
2530          * We want to start off with all devices in the 1:1 domain, and
2531          * take them out later if we find they can't access all of memory.
2532          *
2533          * However, we can't do this for PCI devices behind bridges,
2534          * because all PCI devices behind the same bridge will end up
2535          * with the same source-id on their transactions.
2536          *
2537          * Practically speaking, we can't change things around for these
2538          * devices at run-time, because we can't be sure there'll be no
2539          * DMA transactions in flight for any of their siblings.
2540          * 
2541          * So PCI devices (unless they're on the root bus) as well as
2542          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2543          * the 1:1 domain, just in _case_ one of their siblings turns out
2544          * not to be able to map all of memory.
2545          */
2546         if (!pci_is_pcie(pdev)) {
2547                 if (!pci_is_root_bus(pdev->bus))
2548                         return 0;
2549                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2550                         return 0;
2551         } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2552                 return 0;
2553
2554         /* 
2555          * At boot time, we don't yet know if devices will be 64-bit capable.
2556          * Assume that they will -- if they turn out not to be, then we can 
2557          * take them out of the 1:1 domain later.
2558          */
2559         if (!startup) {
2560                 /*
2561                  * If the device's dma_mask is less than the system's memory
2562                  * size then this is not a candidate for identity mapping.
2563                  */
2564                 u64 dma_mask = pdev->dma_mask;
2565
2566                 if (pdev->dev.coherent_dma_mask &&
2567                     pdev->dev.coherent_dma_mask < dma_mask)
2568                         dma_mask = pdev->dev.coherent_dma_mask;
2569
2570                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2571         }
2572
2573         return 1;
2574 }
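/*
 * Example of the run-time dma_mask check above (the device is hypothetical):
 * a device advertising a 32-bit dma_mask on a machine with 8GiB of RAM has
 * dma_get_required_mask() > DMA_BIT_MASK(32), so the function returns 0 and
 * the device is left out of (or later removed from) the 1:1 domain, while a
 * 64-bit capable device stays identity mapped.
 */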
2575
2576 static int __init iommu_prepare_static_identity_mapping(int hw)
2577 {
2578         struct pci_dev *pdev = NULL;
2579         int ret;
2580
2581         ret = si_domain_init(hw);
2582         if (ret)
2583                 return -EFAULT;
2584
2585         for_each_pci_dev(pdev) {
2586                 if (iommu_should_identity_map(pdev, 1)) {
2587                         ret = domain_add_dev_info(si_domain, pdev,
2588                                              hw ? CONTEXT_TT_PASS_THROUGH :
2589                                                   CONTEXT_TT_MULTI_LEVEL);
2590                         if (ret) {
2591                                 /* device not associated with an iommu */
2592                                 if (ret == -ENODEV)
2593                                         continue;
2594                                 return ret;
2595                         }
2596                         pr_info("IOMMU: %s identity mapping for device %s\n",
2597                                 hw ? "hardware" : "software", pci_name(pdev));
2598                 }
2599         }
2600
2601         return 0;
2602 }
2603
2604 static int __init init_dmars(void)
2605 {
2606         struct dmar_drhd_unit *drhd;
2607         struct dmar_rmrr_unit *rmrr;
2608         struct device *dev;
2609         struct intel_iommu *iommu;
2610         int i, ret;
2611
2612         /*
2613          * for each drhd
2614          *    allocate root
2615          *    initialize and program root entry to not present
2616          * endfor
2617          */
2618         for_each_drhd_unit(drhd) {
2619                 /*
2620                  * lock not needed as this is only incremented in the single
2621                  * threaded kernel __init code path; all other accesses are
2622                  * read only
2623                  */
2624                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2625                         g_num_of_iommus++;
2626                         continue;
2627                 }
2628                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2629                           IOMMU_UNITS_SUPPORTED);
2630         }
2631
2632         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2633                         GFP_KERNEL);
2634         if (!g_iommus) {
2635                 printk(KERN_ERR "Allocating global iommu array failed\n");
2636                 ret = -ENOMEM;
2637                 goto error;
2638         }
2639
2640         deferred_flush = kzalloc(g_num_of_iommus *
2641                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2642         if (!deferred_flush) {
2643                 ret = -ENOMEM;
2644                 goto free_g_iommus;
2645         }
2646
2647         for_each_active_iommu(iommu, drhd) {
2648                 g_iommus[iommu->seq_id] = iommu;
2649
2650                 ret = iommu_init_domains(iommu);
2651                 if (ret)
2652                         goto free_iommu;
2653
2654                 /*
2655                  * TBD:
2656                  * we could share the same root & context tables
2657                  * among all IOMMUs. Need to split it later.
2658                  */
2659                 ret = iommu_alloc_root_entry(iommu);
2660                 if (ret) {
2661                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2662                         goto free_iommu;
2663                 }
2664                 if (!ecap_pass_through(iommu->ecap))
2665                         hw_pass_through = 0;
2666         }
2667
2668         /*
2669          * Start from a sane iommu hardware state.
2670          */
2671         for_each_active_iommu(iommu, drhd) {
2672                 /*
2673                  * If the queued invalidation is already initialized by us
2674                  * (for example, while enabling interrupt-remapping) then
2675                  * we already have things rolling from a sane state.
2676                  */
2677                 if (iommu->qi)
2678                         continue;
2679
2680                 /*
2681                  * Clear any previous faults.
2682                  */
2683                 dmar_fault(-1, iommu);
2684                 /*
2685                  * Disable queued invalidation if supported and already enabled
2686                  * before OS handover.
2687                  */
2688                 dmar_disable_qi(iommu);
2689         }
2690
2691         for_each_active_iommu(iommu, drhd) {
2692                 if (dmar_enable_qi(iommu)) {
2693                         /*
2694                          * Queued Invalidate not enabled, use Register Based
2695                          * Invalidate
2696                          */
2697                         iommu->flush.flush_context = __iommu_flush_context;
2698                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2699                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2700                                "invalidation\n",
2701                                 iommu->seq_id,
2702                                (unsigned long long)drhd->reg_base_addr);
2703                 } else {
2704                         iommu->flush.flush_context = qi_flush_context;
2705                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2706                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2707                                "invalidation\n",
2708                                 iommu->seq_id,
2709                                (unsigned long long)drhd->reg_base_addr);
2710                 }
2711         }
2712
2713         if (iommu_pass_through)
2714                 iommu_identity_mapping |= IDENTMAP_ALL;
2715
2716 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2717         iommu_identity_mapping |= IDENTMAP_GFX;
2718 #endif
2719
2720         check_tylersburg_isoch();
2721
2722         /*
2723          * If pass through is not set or not enabled, set up context entries for
2724          * identity mappings for rmrr, gfx, and isa, and fall back to a static
2725          * identity mapping if iommu_identity_mapping is set.
2726          */
2727         if (iommu_identity_mapping) {
2728                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2729                 if (ret) {
2730                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2731                         goto free_iommu;
2732                 }
2733         }
2734         /*
2735          * For each rmrr
2736          *   for each dev attached to rmrr
2737          *   do
2738          *     locate drhd for dev, alloc domain for dev
2739          *     allocate free domain
2740          *     allocate page table entries for rmrr
2741          *     if context not allocated for bus
2742          *           allocate and init context
2743          *           set present in root table for this bus
2744          *     init context with domain, translation etc
2745          *    endfor
2746          * endfor
2747          */
2748         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2749         for_each_rmrr_units(rmrr) {
2750                 /* some BIOSes list non-existent devices in the DMAR table. */
2751                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2752                                           i, dev) {
2753                         if (!dev_is_pci(dev))
2754                                 continue;
2755                         ret = iommu_prepare_rmrr_dev(rmrr, to_pci_dev(dev));
2756                         if (ret)
2757                                 printk(KERN_ERR
2758                                        "IOMMU: mapping reserved region failed\n");
2759                 }
2760         }
2761
2762         iommu_prepare_isa();
2763
2764         /*
2765          * for each drhd
2766          *   enable fault log
2767          *   global invalidate context cache
2768          *   global invalidate iotlb
2769          *   enable translation
2770          */
2771         for_each_iommu(iommu, drhd) {
2772                 if (drhd->ignored) {
2773                         /*
2774                          * we always have to disable PMRs or DMA may fail on
2775                          * this device
2776                          */
2777                         if (force_on)
2778                                 iommu_disable_protect_mem_regions(iommu);
2779                         continue;
2780                 }
2781
2782                 iommu_flush_write_buffer(iommu);
2783
2784                 ret = dmar_set_interrupt(iommu);
2785                 if (ret)
2786                         goto free_iommu;
2787
2788                 iommu_set_root_entry(iommu);
2789
2790                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2791                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2792
2793                 ret = iommu_enable_translation(iommu);
2794                 if (ret)
2795                         goto free_iommu;
2796
2797                 iommu_disable_protect_mem_regions(iommu);
2798         }
2799
2800         return 0;
2801
2802 free_iommu:
2803         for_each_active_iommu(iommu, drhd)
2804                 free_dmar_iommu(iommu);
2805         kfree(deferred_flush);
2806 free_g_iommus:
2807         kfree(g_iommus);
2808 error:
2809         return ret;
2810 }
2811
2812 /* This takes a number of _MM_ pages, not VTD pages */
2813 static struct iova *intel_alloc_iova(struct device *dev,
2814                                      struct dmar_domain *domain,
2815                                      unsigned long nrpages, uint64_t dma_mask)
2816 {
2817         struct pci_dev *pdev = to_pci_dev(dev);
2818         struct iova *iova = NULL;
2819
2820         /* Restrict dma_mask to the width that the iommu can handle */
2821         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2822
2823         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2824                 /*
2825                  * First try to allocate an io virtual address in
2826                  * DMA_BIT_MASK(32) and if that fails then try allocating
2827                  * from higher range
2828                  */
2829                 iova = alloc_iova(&domain->iovad, nrpages,
2830                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2831                 if (iova)
2832                         return iova;
2833         }
2834         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2835         if (unlikely(!iova)) {
2836                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2837                        nrpages, pci_name(pdev));
2838                 return NULL;
2839         }
2840
2841         return iova;
2842 }
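/*
 * Sketch of the allocation policy above for a hypothetical device with a
 * 64-bit dma_mask in a domain with gaw = 48:
 *
 *	dma_mask  = min(DOMAIN_MAX_ADDR(48), DMA_BIT_MASK(64))
 *	1st try:    alloc_iova(&domain->iovad, nrpages,
 *			       IOVA_PFN(DMA_BIT_MASK(32)), 1)
 *	fallback:   alloc_iova(&domain->iovad, nrpages,
 *			       IOVA_PFN(dma_mask), 1)
 *
 * i.e. unless forcedac is given, IOVAs are preferentially carved out below
 * 4GiB so that devices which later turn out to be 32-bit limited still have
 * address space left.
 */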
2843
2844 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2845 {
2846         struct dmar_domain *domain;
2847         int ret;
2848
2849         domain = get_domain_for_dev(pdev,
2850                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2851         if (!domain) {
2852                 printk(KERN_ERR
2853                         "Allocating domain for %s failed\n", pci_name(pdev));
2854                 return NULL;
2855         }
2856
2857         /* make sure context mapping is ok */
2858         if (unlikely(!domain_context_mapped(pdev))) {
2859                 ret = domain_context_mapping(domain, pdev,
2860                                              CONTEXT_TT_MULTI_LEVEL);
2861                 if (ret) {
2862                         printk(KERN_ERR
2863                                 "Domain context map for %s failed\n",
2864                                 pci_name(pdev));
2865                         return NULL;
2866                 }
2867         }
2868
2869         return domain;
2870 }
2871
2872 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2873 {
2874         struct device_domain_info *info;
2875
2876         /* No lock here, assumes no domain exit in normal case */
2877         info = dev->dev.archdata.iommu;
2878         if (likely(info))
2879                 return info->domain;
2880
2881         return __get_valid_domain_for_dev(dev);
2882 }
2883
2884 static int iommu_dummy(struct device *dev)
2885 {
2886         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2887 }
2888
2889 /* Check if the device needs to go through the non-identity map and unmap process. */
2890 static int iommu_no_mapping(struct device *dev)
2891 {
2892         struct pci_dev *pdev;
2893         int found;
2894
2895         if (unlikely(!dev_is_pci(dev)))
2896                 return 1;
2897
2898         if (iommu_dummy(dev))
2899                 return 1;
2900
2901         if (!iommu_identity_mapping)
2902                 return 0;
2903
2904         pdev = to_pci_dev(dev);
2905         found = identity_mapping(pdev);
2906         if (found) {
2907                 if (iommu_should_identity_map(pdev, 0))
2908                         return 1;
2909                 else {
2910                         /*
2911                          * The 32 bit DMA device is removed from si_domain and falls back
2912                          * to non-identity mapping.
2913                          */
2914                         domain_remove_one_dev_info(si_domain, pdev);
2915                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2916                                pci_name(pdev));
2917                         return 0;
2918                 }
2919         } else {
2920                 /*
2921                  * In case of a 64 bit DMA device detached from a VM, the device
2922                  * is put into si_domain for identity mapping.
2923                  */
2924                 if (iommu_should_identity_map(pdev, 0)) {
2925                         int ret;
2926                         ret = domain_add_dev_info(si_domain, pdev,
2927                                                   hw_pass_through ?
2928                                                   CONTEXT_TT_PASS_THROUGH :
2929                                                   CONTEXT_TT_MULTI_LEVEL);
2930                         if (!ret) {
2931                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2932                                        pci_name(pdev));
2933                                 return 1;
2934                         }
2935                 }
2936         }
2937
2938         return 0;
2939 }
2940
2941 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2942                                      size_t size, int dir, u64 dma_mask)
2943 {
2944         struct pci_dev *pdev = to_pci_dev(hwdev);
2945         struct dmar_domain *domain;
2946         phys_addr_t start_paddr;
2947         struct iova *iova;
2948         int prot = 0;
2949         int ret;
2950         struct intel_iommu *iommu;
2951         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2952
2953         BUG_ON(dir == DMA_NONE);
2954
2955         if (iommu_no_mapping(hwdev))
2956                 return paddr;
2957
2958         domain = get_valid_domain_for_dev(pdev);
2959         if (!domain)
2960                 return 0;
2961
2962         iommu = domain_get_iommu(domain);
2963         size = aligned_nrpages(paddr, size);
2964
2965         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2966         if (!iova)
2967                 goto error;
2968
2969         /*
2970          * Check if DMAR supports zero-length reads on write only
2971          * mappings..
2972          */
2973         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2974                         !cap_zlr(iommu->cap))
2975                 prot |= DMA_PTE_READ;
2976         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2977                 prot |= DMA_PTE_WRITE;
2978         /*
2979          * The range paddr..paddr + size may cover partial pages, so map the
2980          * whole pages. Note: if two parts of one page are mapped separately,
2981          * we might end up with two guest addresses mapping to the same host
2982          * paddr, but this is not a big problem.
2983          */
2984         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2985                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2986         if (ret)
2987                 goto error;
2988
2989         /* it's a non-present to present mapping. Only flush if caching mode is set */
2990         if (cap_caching_mode(iommu->cap))
2991                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
2992         else
2993                 iommu_flush_write_buffer(iommu);
2994
2995         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2996         start_paddr += paddr & ~PAGE_MASK;
2997         return start_paddr;
2998
2999 error:
3000         if (iova)
3001                 __free_iova(&domain->iovad, iova);
3002         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
3003                 pci_name(pdev), size, (unsigned long long)paddr, dir);
3004         return 0;
3005 }
3006
3007 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3008                                  unsigned long offset, size_t size,
3009                                  enum dma_data_direction dir,
3010                                  struct dma_attrs *attrs)
3011 {
3012         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3013                                   dir, to_pci_dev(dev)->dma_mask);
3014 }
3015
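/*
 * Drain the deferred-unmap queues of every IOMMU: invalidate the IOTLB
 * (globally, or per-range when caching mode is set), then release the
 * queued IOVAs and page-table freelists. Called with
 * async_umap_flush_lock held.
 */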
3016 static void flush_unmaps(void)
3017 {
3018         int i, j;
3019
3020         timer_on = 0;
3021
3022         /* just flush them all */
3023         for (i = 0; i < g_num_of_iommus; i++) {
3024                 struct intel_iommu *iommu = g_iommus[i];
3025                 if (!iommu)
3026                         continue;
3027
3028                 if (!deferred_flush[i].next)
3029                         continue;
3030
3031                 /* In caching mode, global flushes make emulation expensive */
3032                 if (!cap_caching_mode(iommu->cap))
3033                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3034                                          DMA_TLB_GLOBAL_FLUSH);
3035                 for (j = 0; j < deferred_flush[i].next; j++) {
3036                         unsigned long mask;
3037                         struct iova *iova = deferred_flush[i].iova[j];
3038                         struct dmar_domain *domain = deferred_flush[i].domain[j];
3039
3040                         /* On real hardware multiple invalidations are expensive */
3041                         if (cap_caching_mode(iommu->cap))
3042                                 iommu_flush_iotlb_psi(iommu, domain->id,
3043                                         iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1,
3044                                         !deferred_flush[i].freelist[j], 0);
3045                         else {
3046                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
3047                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3048                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3049                         }
3050                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3051                         if (deferred_flush[i].freelist[j])
3052                                 dma_free_pagelist(deferred_flush[i].freelist[j]);
3053                 }
3054                 deferred_flush[i].next = 0;
3055         }
3056
3057         list_size = 0;
3058 }
3059
3060 static void flush_unmaps_timeout(unsigned long data)
3061 {
3062         unsigned long flags;
3063
3064         spin_lock_irqsave(&async_umap_flush_lock, flags);
3065         flush_unmaps();
3066         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3067 }
3068
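/*
 * Queue an IOVA range (and its page-table freelist) for deferred release
 * on the owning IOMMU. The queue is drained immediately once it reaches
 * HIGH_WATER_MARK, and a flush timer is armed when none is pending.
 */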
3069 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3070 {
3071         unsigned long flags;
3072         int next, iommu_id;
3073         struct intel_iommu *iommu;
3074
3075         spin_lock_irqsave(&async_umap_flush_lock, flags);
3076         if (list_size == HIGH_WATER_MARK)
3077                 flush_unmaps();
3078
3079         iommu = domain_get_iommu(dom);
3080         iommu_id = iommu->seq_id;
3081
3082         next = deferred_flush[iommu_id].next;
3083         deferred_flush[iommu_id].domain[next] = dom;
3084         deferred_flush[iommu_id].iova[next] = iova;
3085         deferred_flush[iommu_id].freelist[next] = freelist;
3086         deferred_flush[iommu_id].next++;
3087
3088         if (!timer_on) {
3089                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3090                 timer_on = 1;
3091         }
3092         list_size++;
3093         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3094 }
3095
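/*
 * dma_map_ops unmap_page callback: look up the IOVA backing @dev_addr,
 * tear down its page tables, and either flush the IOTLB immediately
 * (intel_iommu_strict) or defer the flush and IOVA release via add_unmap().
 */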
3096 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3097                              size_t size, enum dma_data_direction dir,
3098                              struct dma_attrs *attrs)
3099 {
3100         struct pci_dev *pdev = to_pci_dev(dev);
3101         struct dmar_domain *domain;
3102         unsigned long start_pfn, last_pfn;
3103         struct iova *iova;
3104         struct intel_iommu *iommu;
3105         struct page *freelist;
3106
3107         if (iommu_no_mapping(dev))
3108                 return;
3109
3110         domain = find_domain(dev);
3111         BUG_ON(!domain);
3112
3113         iommu = domain_get_iommu(domain);
3114
3115         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3116         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3117                       (unsigned long long)dev_addr))
3118                 return;
3119
3120         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3121         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3122
3123         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3124                  pci_name(pdev), start_pfn, last_pfn);
3125
3126         freelist = domain_unmap(domain, start_pfn, last_pfn);
3127
3128         if (intel_iommu_strict) {
3129                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3130                                       last_pfn - start_pfn + 1, !freelist, 0);
3131                 /* free iova */
3132                 __free_iova(&domain->iovad, iova);
3133                 dma_free_pagelist(freelist);
3134         } else {
3135                 add_unmap(domain, iova, freelist);
3136                 /*
3137                  * Queue up the release of the unmap to save roughly 1/6th of
3138                  * the CPU time otherwise spent on the iotlb flush operation...
3139                  */
3140         }
3141 }
3142
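/*
 * dma_map_ops alloc callback: allocate zeroed pages, honouring GFP_DMA/
 * GFP_DMA32 only when the device is not translated, then map them
 * bidirectionally through __intel_map_single().
 */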
3143 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3144                                   dma_addr_t *dma_handle, gfp_t flags,
3145                                   struct dma_attrs *attrs)
3146 {
3147         void *vaddr;
3148         int order;
3149
3150         size = PAGE_ALIGN(size);
3151         order = get_order(size);
3152
3153         if (!iommu_no_mapping(hwdev))
3154                 flags &= ~(GFP_DMA | GFP_DMA32);
3155         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3156                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3157                         flags |= GFP_DMA;
3158                 else
3159                         flags |= GFP_DMA32;
3160         }
3161
3162         vaddr = (void *)__get_free_pages(flags, order);
3163         if (!vaddr)
3164                 return NULL;
3165         memset(vaddr, 0, size);
3166
3167         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3168                                          DMA_BIDIRECTIONAL,
3169                                          hwdev->coherent_dma_mask);
3170         if (*dma_handle)
3171                 return vaddr;
3172         free_pages((unsigned long)vaddr, order);
3173         return NULL;
3174 }
3175
3176 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3177                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3178 {
3179         int order;
3180
3181         size = PAGE_ALIGN(size);
3182         order = get_order(size);
3183
3184         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3185         free_pages((unsigned long)vaddr, order);
3186 }
3187
3188 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3189                            int nelems, enum dma_data_direction dir,
3190                            struct dma_attrs *attrs)
3191 {
3192         struct pci_dev *pdev = to_pci_dev(hwdev);
3193         struct dmar_domain *domain;
3194         unsigned long start_pfn, last_pfn;
3195         struct iova *iova;
3196         struct intel_iommu *iommu;
3197         struct page *freelist;
3198
3199         if (iommu_no_mapping(hwdev))
3200                 return;
3201
3202         domain = find_domain(hwdev);
3203         BUG_ON(!domain);
3204
3205         iommu = domain_get_iommu(domain);
3206
3207         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3208         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3209                       (unsigned long long)sglist[0].dma_address))
3210                 return;
3211
3212         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3213         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3214
3215         freelist = domain_unmap(domain, start_pfn, last_pfn);
3216
3217         if (intel_iommu_strict) {
3218                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3219                                       last_pfn - start_pfn + 1, !freelist, 0);
3220                 /* free iova */
3221                 __free_iova(&domain->iovad, iova);
3222                 dma_free_pagelist(freelist);
3223         } else {
3224                 add_unmap(domain, iova, freelist);
3225                 /*
3226                  * Queue up the release of the unmap to save roughly 1/6th of
3227                  * the CPU time otherwise spent on the iotlb flush operation...
3228                  */
3229         }
3230 }
3231
3232 static int intel_nontranslate_map_sg(struct device *hwdev,
3233         struct scatterlist *sglist, int nelems, int dir)
3234 {
3235         int i;
3236         struct scatterlist *sg;
3237
3238         for_each_sg(sglist, sg, nelems, i) {
3239                 BUG_ON(!sg_page(sg));
3240                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3241                 sg->dma_length = sg->length;
3242         }
3243         return nelems;
3244 }
3245
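/*
 * dma_map_ops map_sg callback: allocate one IOVA range large enough for
 * the whole scatterlist, map it in a single pass and flush the IOTLB or
 * write buffer as needed. Falls back to a 1:1 physical mapping when the
 * device is not translated.
 */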
3246 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3247                         enum dma_data_direction dir, struct dma_attrs *attrs)
3248 {
3249         int i;
3250         struct pci_dev *pdev = to_pci_dev(hwdev);
3251         struct dmar_domain *domain;
3252         size_t size = 0;
3253         int prot = 0;
3254         struct iova *iova = NULL;
3255         int ret;
3256         struct scatterlist *sg;
3257         unsigned long start_vpfn;
3258         struct intel_iommu *iommu;
3259
3260         BUG_ON(dir == DMA_NONE);
3261         if (iommu_no_mapping(hwdev))
3262                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3263
3264         domain = get_valid_domain_for_dev(pdev);
3265         if (!domain)
3266                 return 0;
3267
3268         iommu = domain_get_iommu(domain);
3269
3270         for_each_sg(sglist, sg, nelems, i)
3271                 size += aligned_nrpages(sg->offset, sg->length);
3272
3273         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3274                                 pdev->dma_mask);
3275         if (!iova) {
3276                 sglist->dma_length = 0;
3277                 return 0;
3278         }
3279
3280         /*
3281          * Check if DMAR supports zero-length reads on write only
3282          * mappings..
3283          */
3284         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3285                         !cap_zlr(iommu->cap))
3286                 prot |= DMA_PTE_READ;
3287         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3288                 prot |= DMA_PTE_WRITE;
3289
3290         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3291
3292         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3293         if (unlikely(ret)) {
3294                 /*  clear the page */
3295                 dma_pte_clear_range(domain, start_vpfn,
3296                                     start_vpfn + size - 1);
3297                 /* free page tables */
3298                 dma_pte_free_pagetable(domain, start_vpfn,
3299                                        start_vpfn + size - 1);
3300                 /* free iova */
3301                 __free_iova(&domain->iovad, iova);
3302                 return 0;
3303         }
3304
3305         /* it's a non-present to present mapping. Only flush if caching mode is set */
3306         if (cap_caching_mode(iommu->cap))
3307                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3308         else
3309                 iommu_flush_write_buffer(iommu);
3310
3311         return nelems;
3312 }
3313
3314 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3315 {
3316         return !dma_addr;
3317 }
3318
3319 struct dma_map_ops intel_dma_ops = {
3320         .alloc = intel_alloc_coherent,
3321         .free = intel_free_coherent,
3322         .map_sg = intel_map_sg,
3323         .unmap_sg = intel_unmap_sg,
3324         .map_page = intel_map_page,
3325         .unmap_page = intel_unmap_page,
3326         .mapping_error = intel_mapping_error,
3327 };
3328
3329 static inline int iommu_domain_cache_init(void)
3330 {
3331         int ret = 0;
3332
3333         iommu_domain_cache = kmem_cache_create("iommu_domain",
3334                                          sizeof(struct dmar_domain),
3335                                          0,
3336                                          SLAB_HWCACHE_ALIGN,
3338                                          NULL);
3339         if (!iommu_domain_cache) {
3340                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3341                 ret = -ENOMEM;
3342         }
3343
3344         return ret;
3345 }
3346
3347 static inline int iommu_devinfo_cache_init(void)
3348 {
3349         int ret = 0;
3350
3351         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3352                                          sizeof(struct device_domain_info),
3353                                          0,
3354                                          SLAB_HWCACHE_ALIGN,
3355                                          NULL);
3356         if (!iommu_devinfo_cache) {
3357                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3358                 ret = -ENOMEM;
3359         }
3360
3361         return ret;
3362 }
3363
3364 static inline int iommu_iova_cache_init(void)
3365 {
3366         int ret = 0;
3367
3368         iommu_iova_cache = kmem_cache_create("iommu_iova",
3369                                          sizeof(struct iova),
3370                                          0,
3371                                          SLAB_HWCACHE_ALIGN,
3372                                          NULL);
3373         if (!iommu_iova_cache) {
3374                 printk(KERN_ERR "Couldn't create iova cache\n");
3375                 ret = -ENOMEM;
3376         }
3377
3378         return ret;
3379 }
3380
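/*
 * Create the slab caches used for iova, dmar_domain and
 * device_domain_info allocations, unwinding on failure.
 */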
3381 static int __init iommu_init_mempool(void)
3382 {
3383         int ret;
3384         ret = iommu_iova_cache_init();
3385         if (ret)
3386                 return ret;
3387
3388         ret = iommu_domain_cache_init();
3389         if (ret)
3390                 goto domain_error;
3391
3392         ret = iommu_devinfo_cache_init();
3393         if (!ret)
3394                 return ret;
3395
3396         kmem_cache_destroy(iommu_domain_cache);
3397 domain_error:
3398         kmem_cache_destroy(iommu_iova_cache);
3399
3400         return -ENOMEM;
3401 }
3402
3403 static void __init iommu_exit_mempool(void)
3404 {
3405         kmem_cache_destroy(iommu_devinfo_cache);
3406         kmem_cache_destroy(iommu_domain_cache);
3407         kmem_cache_destroy(iommu_iova_cache);
3408
3409 }
3410
3411 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3412 {
3413         struct dmar_drhd_unit *drhd;
3414         u32 vtbar;
3415         int rc;
3416
3417         /* We know that this device on this chipset has its own IOMMU.
3418          * If we find it under a different IOMMU, then the BIOS is lying
3419          * to us. Hope that the IOMMU for this device is actually
3420          * disabled, and it needs no translation...
3421          */
3422         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3423         if (rc) {
3424                 /* "can't" happen */
3425                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3426                 return;
3427         }
3428         vtbar &= 0xffff0000;
3429
3430         /* we know that this iommu should be at offset 0xa000 from vtbar */
3431         drhd = dmar_find_matched_drhd_unit(pdev);
3432         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3433                             TAINT_FIRMWARE_WORKAROUND,
3434                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3435                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3436 }
3437 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3438
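/*
 * Mark DMAR units that cover no devices as ignored. Units that cover
 * only graphics devices are either noted via intel_iommu_gfx_mapped or,
 * when dmar_map_gfx is clear, bypassed entirely.
 */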
3439 static void __init init_no_remapping_devices(void)
3440 {
3441         struct dmar_drhd_unit *drhd;
3442         struct device *dev;
3443         int i;
3444
3445         for_each_drhd_unit(drhd) {
3446                 if (!drhd->include_all) {
3447                         for_each_active_dev_scope(drhd->devices,
3448                                                   drhd->devices_cnt, i, dev)
3449                                 break;
3450                         /* ignore DMAR unit if no devices exist */
3451                         if (i == drhd->devices_cnt)
3452                                 drhd->ignored = 1;
3453                 }
3454         }
3455
3456         for_each_active_drhd_unit(drhd) {
3457                 if (drhd->include_all)
3458                         continue;
3459
3460                 for_each_active_dev_scope(drhd->devices,
3461                                           drhd->devices_cnt, i, dev)
3462                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3463                                 break;
3464                 if (i < drhd->devices_cnt)
3465                         continue;
3466
3467                 /* This IOMMU has *only* gfx devices. Either bypass it or
3468                    set the gfx_mapped flag, as appropriate */
3469                 if (dmar_map_gfx) {
3470                         intel_iommu_gfx_mapped = 1;
3471                 } else {
3472                         drhd->ignored = 1;
3473                         for_each_active_dev_scope(drhd->devices,
3474                                                   drhd->devices_cnt, i, dev)
3475                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3476                 }
3477         }
3478 }
3479
3480 #ifdef CONFIG_SUSPEND
3481 static int init_iommu_hw(void)
3482 {
3483         struct dmar_drhd_unit *drhd;
3484         struct intel_iommu *iommu = NULL;
3485
3486         for_each_active_iommu(iommu, drhd)
3487                 if (iommu->qi)
3488                         dmar_reenable_qi(iommu);
3489
3490         for_each_iommu(iommu, drhd) {
3491                 if (drhd->ignored) {
3492                         /*
3493                          * we always have to disable PMRs or DMA may fail on
3494                          * this device
3495                          */
3496                         if (force_on)
3497                                 iommu_disable_protect_mem_regions(iommu);
3498                         continue;
3499                 }
3500
3501                 iommu_flush_write_buffer(iommu);
3502
3503                 iommu_set_root_entry(iommu);
3504
3505                 iommu->flush.flush_context(iommu, 0, 0, 0,
3506                                            DMA_CCMD_GLOBAL_INVL);
3507                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3508                                          DMA_TLB_GLOBAL_FLUSH);
3509                 if (iommu_enable_translation(iommu))
3510                         return 1;
3511                 iommu_disable_protect_mem_regions(iommu);
3512         }
3513
3514         return 0;
3515 }
3516
3517 static void iommu_flush_all(void)
3518 {
3519         struct dmar_drhd_unit *drhd;
3520         struct intel_iommu *iommu;
3521
3522         for_each_active_iommu(iommu, drhd) {
3523                 iommu->flush.flush_context(iommu, 0, 0, 0,
3524                                            DMA_CCMD_GLOBAL_INVL);
3525                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3526                                          DMA_TLB_GLOBAL_FLUSH);
3527         }
3528 }
3529
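/*
 * Syscore suspend hook: flush everything, disable translation and save
 * the fault-event control/data/address registers of each active IOMMU
 * so they can be restored on resume.
 */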
3530 static int iommu_suspend(void)
3531 {
3532         struct dmar_drhd_unit *drhd;
3533         struct intel_iommu *iommu = NULL;
3534         unsigned long flag;
3535
3536         for_each_active_iommu(iommu, drhd) {
3537                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3538                                                  GFP_ATOMIC);
3539                 if (!iommu->iommu_state)
3540                         goto nomem;
3541         }
3542
3543         iommu_flush_all();
3544
3545         for_each_active_iommu(iommu, drhd) {
3546                 iommu_disable_translation(iommu);
3547
3548                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3549
3550                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3551                         readl(iommu->reg + DMAR_FECTL_REG);
3552                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3553                         readl(iommu->reg + DMAR_FEDATA_REG);
3554                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3555                         readl(iommu->reg + DMAR_FEADDR_REG);
3556                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3557                         readl(iommu->reg + DMAR_FEUADDR_REG);
3558
3559                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3560         }
3561         return 0;
3562
3563 nomem:
3564         for_each_active_iommu(iommu, drhd)
3565                 kfree(iommu->iommu_state);
3566
3567         return -ENOMEM;
3568 }
3569
3570 static void iommu_resume(void)
3571 {
3572         struct dmar_drhd_unit *drhd;
3573         struct intel_iommu *iommu = NULL;
3574         unsigned long flag;
3575
3576         if (init_iommu_hw()) {
3577                 if (force_on)
3578                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3579                 else
3580                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3581                 return;
3582         }
3583
3584         for_each_active_iommu(iommu, drhd) {
3585
3586                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3587
3588                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3589                         iommu->reg + DMAR_FECTL_REG);
3590                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3591                         iommu->reg + DMAR_FEDATA_REG);
3592                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3593                         iommu->reg + DMAR_FEADDR_REG);
3594                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3595                         iommu->reg + DMAR_FEUADDR_REG);
3596
3597                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3598         }
3599
3600         for_each_active_iommu(iommu, drhd)
3601                 kfree(iommu->iommu_state);
3602 }
3603
3604 static struct syscore_ops iommu_syscore_ops = {
3605         .resume         = iommu_resume,
3606         .suspend        = iommu_suspend,
3607 };
3608
3609 static void __init init_iommu_pm_ops(void)
3610 {
3611         register_syscore_ops(&iommu_syscore_ops);
3612 }
3613
3614 #else
3615 static inline void init_iommu_pm_ops(void) {}
3616 #endif  /* CONFIG_SUSPEND */
3617
3618
3619 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3620 {
3621         struct acpi_dmar_reserved_memory *rmrr;
3622         struct dmar_rmrr_unit *rmrru;
3623
3624         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3625         if (!rmrru)
3626                 return -ENOMEM;
3627
3628         rmrru->hdr = header;
3629         rmrr = (struct acpi_dmar_reserved_memory *)header;
3630         rmrru->base_address = rmrr->base_address;
3631         rmrru->end_address = rmrr->end_address;
3632         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3633                                 ((void *)rmrr) + rmrr->header.length,
3634                                 &rmrru->devices_cnt);
3635         if (rmrru->devices_cnt && rmrru->devices == NULL) {
3636                 kfree(rmrru);
3637                 return -ENOMEM;
3638         }
3639
3640         list_add(&rmrru->list, &dmar_rmrr_units);
3641
3642         return 0;
3643 }
3644
3645 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3646 {
3647         struct acpi_dmar_atsr *atsr;
3648         struct dmar_atsr_unit *atsru;
3649
3650         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3651         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3652         if (!atsru)
3653                 return -ENOMEM;
3654
3655         atsru->hdr = hdr;
3656         atsru->include_all = atsr->flags & 0x1;
3657         if (!atsru->include_all) {
3658                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3659                                 (void *)atsr + atsr->header.length,
3660                                 &atsru->devices_cnt);
3661                 if (atsru->devices_cnt && atsru->devices == NULL) {
3662                         kfree(atsru);
3663                         return -ENOMEM;
3664                 }
3665         }
3666
3667         list_add_rcu(&atsru->list, &dmar_atsr_units);
3668
3669         return 0;
3670 }
3671
3672 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3673 {
3674         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3675         kfree(atsru);
3676 }
3677
3678 static void intel_iommu_free_dmars(void)
3679 {
3680         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3681         struct dmar_atsr_unit *atsru, *atsr_n;
3682
3683         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3684                 list_del(&rmrru->list);
3685                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3686                 kfree(rmrru);
3687         }
3688
3689         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3690                 list_del(&atsru->list);
3691                 intel_iommu_free_atsr(atsru);
3692         }
3693 }
3694
3695 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3696 {
3697         int i, ret = 1;
3698         struct pci_bus *bus;
3699         struct pci_dev *bridge = NULL;
3700         struct device *tmp;
3701         struct acpi_dmar_atsr *atsr;
3702         struct dmar_atsr_unit *atsru;
3703
3704         dev = pci_physfn(dev);
3705         for (bus = dev->bus; bus; bus = bus->parent) {
3706                 bridge = bus->self;
3707                 if (!bridge || !pci_is_pcie(bridge) ||
3708                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3709                         return 0;
3710                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3711                         break;
3712         }
3713         if (!bridge)
3714                 return 0;
3715
3716         rcu_read_lock();
3717         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3718                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3719                 if (atsr->segment != pci_domain_nr(dev->bus))
3720                         continue;
3721
3722                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3723                         if (tmp == &bridge->dev)
3724                                 goto out;
3725
3726                 if (atsru->include_all)
3727                         goto out;
3728         }
3729         ret = 0;
3730 out:
3731         rcu_read_unlock();
3732
3733         return ret;
3734 }
3735
3736 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3737 {
3738         int ret = 0;
3739         struct dmar_rmrr_unit *rmrru;
3740         struct dmar_atsr_unit *atsru;
3741         struct acpi_dmar_atsr *atsr;
3742         struct acpi_dmar_reserved_memory *rmrr;
3743
3744         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3745                 return 0;
3746
3747         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3748                 rmrr = container_of(rmrru->hdr,
3749                                     struct acpi_dmar_reserved_memory, header);
3750                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3751                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3752                                 ((void *)rmrr) + rmrr->header.length,
3753                                 rmrr->segment, rmrru->devices,
3754                                 rmrru->devices_cnt);
3755                         if (ret > 0)
3756                                 break;
3757                         else if (ret < 0)
3758                                 return ret;
3759                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3760                         if (dmar_remove_dev_scope(info, rmrr->segment,
3761                                 rmrru->devices, rmrru->devices_cnt))
3762                                 break;
3763                 }
3764         }
3765
3766         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3767                 if (atsru->include_all)
3768                         continue;
3769
3770                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3771                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3772                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3773                                         (void *)atsr + atsr->header.length,
3774                                         atsr->segment, atsru->devices,
3775                                         atsru->devices_cnt);
3776                         if (ret > 0)
3777                                 break;
3778                         else if (ret < 0)
3779                                 return ret;
3780                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3781                         if (dmar_remove_dev_scope(info, atsr->segment,
3782                                         atsru->devices, atsru->devices_cnt))
3783                                 break;
3784                 }
3785         }
3786
3787         return 0;
3788 }
3789
3790 /*
3791  * Here we only respond to a device being unbound from its driver.
3792  *
3793  * A newly added device is not attached to its DMAR domain here yet;
3794  * that happens when the device is first mapped to an iova.
3795  */
3796 static int device_notifier(struct notifier_block *nb,
3797                                   unsigned long action, void *data)
3798 {
3799         struct device *dev = data;
3800         struct pci_dev *pdev = to_pci_dev(dev);
3801         struct dmar_domain *domain;
3802
3803         if (iommu_dummy(dev))
3804                 return 0;
3805
3806         if (action != BUS_NOTIFY_UNBOUND_DRIVER &&
3807             action != BUS_NOTIFY_DEL_DEVICE)
3808                 return 0;
3809
3810         domain = find_domain(dev);
3811         if (!domain)
3812                 return 0;
3813
3814         down_read(&dmar_global_lock);
3815         domain_remove_one_dev_info(domain, pdev);
3816         if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3817             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3818             list_empty(&domain->devices))
3819                 domain_exit(domain);
3820         up_read(&dmar_global_lock);
3821
3822         return 0;
3823 }
3824
3825 static struct notifier_block device_nb = {
3826         .notifier_call = device_notifier,
3827 };
3828
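/*
 * Keep the static identity map (si_domain) in sync with memory hotplug:
 * add an identity mapping for memory going online and unmap and flush
 * the corresponding IOVA range when it goes offline again.
 */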
3829 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3830                                        unsigned long val, void *v)
3831 {
3832         struct memory_notify *mhp = v;
3833         unsigned long long start, end;
3834         unsigned long start_vpfn, last_vpfn;
3835
3836         switch (val) {
3837         case MEM_GOING_ONLINE:
3838                 start = mhp->start_pfn << PAGE_SHIFT;
3839                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
3840                 if (iommu_domain_identity_map(si_domain, start, end)) {
3841                         pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
3842                                 start, end);
3843                         return NOTIFY_BAD;
3844                 }
3845                 break;
3846
3847         case MEM_OFFLINE:
3848         case MEM_CANCEL_ONLINE:
3849                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3850                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
3851                 while (start_vpfn <= last_vpfn) {
3852                         struct iova *iova;
3853                         struct dmar_drhd_unit *drhd;
3854                         struct intel_iommu *iommu;
3855                         struct page *freelist;
3856
3857                         iova = find_iova(&si_domain->iovad, start_vpfn);
3858                         if (iova == NULL) {
3859                                 pr_debug("dmar: failed get IOVA for PFN %lx\n",
3860                                          start_vpfn);
3861                                 break;
3862                         }
3863
3864                         iova = split_and_remove_iova(&si_domain->iovad, iova,
3865                                                      start_vpfn, last_vpfn);
3866                         if (iova == NULL) {
3867                                 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
3868                                         start_vpfn, last_vpfn);
3869                                 return NOTIFY_BAD;
3870                         }
3871
3872                         freelist = domain_unmap(si_domain, iova->pfn_lo,
3873                                                iova->pfn_hi);
3874
3875                         rcu_read_lock();
3876                         for_each_active_iommu(iommu, drhd)
3877                                 iommu_flush_iotlb_psi(iommu, si_domain->id,
3878                                         iova->pfn_lo,
3879                                         iova->pfn_hi - iova->pfn_lo + 1,
3880                                         !freelist, 0);
3881                         rcu_read_unlock();
3882                         dma_free_pagelist(freelist);
3883
3884                         start_vpfn = iova->pfn_hi + 1;
3885                         free_iova_mem(iova);
3886                 }
3887                 break;
3888         }
3889
3890         return NOTIFY_OK;
3891 }
3892
3893 static struct notifier_block intel_iommu_memory_nb = {
3894         .notifier_call = intel_iommu_memory_notifier,
3895         .priority = 0
3896 };
3897
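/*
 * Main entry point: parse the DMAR table and device scopes, set up the
 * DMA remapping hardware, and switch the kernel's dma_ops over to the
 * Intel IOMMU implementation.
 */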
3898 int __init intel_iommu_init(void)
3899 {
3900         int ret = -ENODEV;
3901         struct dmar_drhd_unit *drhd;
3902         struct intel_iommu *iommu;
3903
3904         /* VT-d is required for a TXT/tboot launch, so enforce that */
3905         force_on = tboot_force_iommu();
3906
3907         if (iommu_init_mempool()) {
3908                 if (force_on)
3909                         panic("tboot: Failed to initialize iommu memory\n");
3910                 return -ENOMEM;
3911         }
3912
3913         down_write(&dmar_global_lock);
3914         if (dmar_table_init()) {
3915                 if (force_on)
3916                         panic("tboot: Failed to initialize DMAR table\n");
3917                 goto out_free_dmar;
3918         }
3919
3920         /*
3921          * Disable translation if already enabled prior to OS handover.
3922          */
3923         for_each_active_iommu(iommu, drhd)
3924                 if (iommu->gcmd & DMA_GCMD_TE)
3925                         iommu_disable_translation(iommu);
3926
3927         if (dmar_dev_scope_init() < 0) {
3928                 if (force_on)
3929                         panic("tboot: Failed to initialize DMAR device scope\n");
3930                 goto out_free_dmar;
3931         }
3932
3933         if (no_iommu || dmar_disabled)
3934                 goto out_free_dmar;
3935
3936         if (list_empty(&dmar_rmrr_units))
3937                 printk(KERN_INFO "DMAR: No RMRR found\n");
3938
3939         if (list_empty(&dmar_atsr_units))
3940                 printk(KERN_INFO "DMAR: No ATSR found\n");
3941
3942         if (dmar_init_reserved_ranges()) {
3943                 if (force_on)
3944                         panic("tboot: Failed to reserve iommu ranges\n");
3945                 goto out_free_reserved_range;
3946         }
3947
3948         init_no_remapping_devices();
3949
3950         ret = init_dmars();
3951         if (ret) {
3952                 if (force_on)
3953                         panic("tboot: Failed to initialize DMARs\n");
3954                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3955                 goto out_free_reserved_range;
3956         }
3957         up_write(&dmar_global_lock);
3958         printk(KERN_INFO
3959         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3960
3961         init_timer(&unmap_timer);
3962 #ifdef CONFIG_SWIOTLB
3963         swiotlb = 0;
3964 #endif
3965         dma_ops = &intel_dma_ops;
3966
3967         init_iommu_pm_ops();
3968
3969         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3970         bus_register_notifier(&pci_bus_type, &device_nb);
3971         if (si_domain && !hw_pass_through)
3972                 register_memory_notifier(&intel_iommu_memory_nb);
3973
3974         intel_iommu_enabled = 1;
3975
3976         return 0;
3977
3978 out_free_reserved_range:
3979         put_iova_domain(&reserved_iova_list);
3980 out_free_dmar:
3981         intel_iommu_free_dmars();
3982         up_write(&dmar_global_lock);
3983         iommu_exit_mempool();
3984         return ret;
3985 }
3986
3987 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3988                                            struct pci_dev *pdev)
3989 {
3990         struct pci_dev *tmp, *parent;
3991
3992         if (!iommu || !pdev)
3993                 return;
3994
3995         /* dependent device detach */
3996         tmp = pci_find_upstream_pcie_bridge(pdev);
3997         /* Secondary interface's bus number and devfn 0 */
3998         if (tmp) {
3999                 parent = pdev->bus->self;
4000                 while (parent != tmp) {
4001                         iommu_detach_dev(iommu, parent->bus->number,
4002                                          parent->devfn);
4003                         parent = parent->bus->self;
4004                 }
4005                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
4006                         iommu_detach_dev(iommu,
4007                                 tmp->subordinate->number, 0);
4008                 else /* this is a legacy PCI bridge */
4009                         iommu_detach_dev(iommu, tmp->bus->number,
4010                                          tmp->devfn);
4011         }
4012 }
4013
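/*
 * Detach @pdev from @domain: clear its context entries (and those of any
 * intermediate bridges), free its device_domain_info, and drop the
 * domain's reference to the IOMMU when no other device behind that IOMMU
 * remains in the domain.
 */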
4014 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4015                                           struct pci_dev *pdev)
4016 {
4017         struct device_domain_info *info, *tmp;
4018         struct intel_iommu *iommu;
4019         unsigned long flags;
4020         int found = 0;
4021
4022         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4023                                 pdev->devfn);
4024         if (!iommu)
4025                 return;
4026
4027         spin_lock_irqsave(&device_domain_lock, flags);
4028         list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4029                 if (info->segment == pci_domain_nr(pdev->bus) &&
4030                     info->bus == pdev->bus->number &&
4031                     info->devfn == pdev->devfn) {
4032                         unlink_domain_info(info);
4033                         spin_unlock_irqrestore(&device_domain_lock, flags);
4034
4035                         iommu_disable_dev_iotlb(info);
4036                         iommu_detach_dev(iommu, info->bus, info->devfn);
4037                         iommu_detach_dependent_devices(iommu, pdev);
4038                         free_devinfo_mem(info);
4039
4040                         spin_lock_irqsave(&device_domain_lock, flags);
4041
4042                         if (found)
4043                                 break;
4044                         else
4045                                 continue;
4046                 }
4047
4048                 /* if there are no other devices under the same iommu
4049                  * owned by this domain, clear this iommu in iommu_bmp and
4050                  * update the iommu count and coherency
4051                  */
4052                 if (iommu == device_to_iommu(info->segment, info->bus,
4053                                             info->devfn))
4054                         found = 1;
4055         }
4056
4057         spin_unlock_irqrestore(&device_domain_lock, flags);
4058
4059         if (found == 0) {
4060                 unsigned long tmp_flags;
4061                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
4062                 clear_bit(iommu->seq_id, domain->iommu_bmp);
4063                 domain->iommu_count--;
4064                 domain_update_iommu_cap(domain);
4065                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
4066
4067                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
4068                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
4069                         spin_lock_irqsave(&iommu->lock, tmp_flags);
4070                         clear_bit(domain->id, iommu->domain_ids);
4071                         iommu->domains[domain->id] = NULL;
4072                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
4073                 }
4074         }
4075 }
4076
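/*
 * Minimal domain initialisation used for domains created through the
 * generic IOMMU API: set up the IOVA allocator, compute the address
 * widths and allocate the top-level page directory.
 */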
4077 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4078 {
4079         int adjust_width;
4080
4081         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
4082         domain_reserve_special_ranges(domain);
4083
4084         /* calculate AGAW */
4085         domain->gaw = guest_width;
4086         adjust_width = guestwidth_to_adjustwidth(guest_width);
4087         domain->agaw = width_to_agaw(adjust_width);
4088
4089         domain->iommu_coherency = 0;
4090         domain->iommu_snooping = 0;
4091         domain->iommu_superpage = 0;
4092         domain->max_addr = 0;
4093         domain->nid = -1;
4094
4095         /* always allocate the top pgd */
4096         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4097         if (!domain->pgd)
4098                 return -ENOMEM;
4099         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4100         return 0;
4101 }
4102
4103 static int intel_iommu_domain_init(struct iommu_domain *domain)
4104 {
4105         struct dmar_domain *dmar_domain;
4106
4107         dmar_domain = alloc_domain(true);
4108         if (!dmar_domain) {
4109                 printk(KERN_ERR
4110                         "intel_iommu_domain_init: dmar_domain == NULL\n");
4111                 return -ENOMEM;
4112         }
4113         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4114                 printk(KERN_ERR
4115                         "intel_iommu_domain_init() failed\n");
4116                 domain_exit(dmar_domain);
4117                 return -ENOMEM;
4118         }
4119         domain_update_iommu_cap(dmar_domain);
4120         domain->priv = dmar_domain;
4121
4122         domain->geometry.aperture_start = 0;
4123         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4124         domain->geometry.force_aperture = true;
4125
4126         return 0;
4127 }
4128
4129 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4130 {
4131         struct dmar_domain *dmar_domain = domain->priv;
4132
4133         domain->priv = NULL;
4134         domain_exit(dmar_domain);
4135 }
4136
4137 static int intel_iommu_attach_device(struct iommu_domain *domain,
4138                                      struct device *dev)
4139 {
4140         struct dmar_domain *dmar_domain = domain->priv;
4141         struct pci_dev *pdev = to_pci_dev(dev);
4142         struct intel_iommu *iommu;
4143         int addr_width;
4144
4145         /* normally pdev is not mapped */
4146         if (unlikely(domain_context_mapped(pdev))) {
4147                 struct dmar_domain *old_domain;
4148
4149                 old_domain = find_domain(dev);
4150                 if (old_domain) {
4151                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4152                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4153                                 domain_remove_one_dev_info(old_domain, pdev);
4154                         else
4155                                 domain_remove_dev_info(old_domain);
4156                 }
4157         }
4158
4159         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4160                                 pdev->devfn);
4161         if (!iommu)
4162                 return -ENODEV;
4163
4164         /* check if this iommu agaw is sufficient for max mapped address */
4165         addr_width = agaw_to_width(iommu->agaw);
4166         if (addr_width > cap_mgaw(iommu->cap))
4167                 addr_width = cap_mgaw(iommu->cap);
4168
4169         if (dmar_domain->max_addr > (1LL << addr_width)) {
4170                 printk(KERN_ERR "%s: iommu width (%d) is not "
4171                        "sufficient for the mapped address (%llx)\n",
4172                        __func__, addr_width, dmar_domain->max_addr);
4173                 return -EFAULT;
4174         }
4175         dmar_domain->gaw = addr_width;
4176
4177         /*
4178          * Knock out extra levels of page tables if necessary
4179          */
4180         while (iommu->agaw < dmar_domain->agaw) {
4181                 struct dma_pte *pte;
4182
4183                 pte = dmar_domain->pgd;
4184                 if (dma_pte_present(pte)) {
4185                         dmar_domain->pgd = (struct dma_pte *)
4186                                 phys_to_virt(dma_pte_addr(pte));
4187                         free_pgtable_page(pte);
4188                 }
4189                 dmar_domain->agaw--;
4190         }
4191
4192         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4193 }
4194
4195 static void intel_iommu_detach_device(struct iommu_domain *domain,
4196                                       struct device *dev)
4197 {
4198         struct dmar_domain *dmar_domain = domain->priv;
4199         struct pci_dev *pdev = to_pci_dev(dev);
4200
4201         domain_remove_one_dev_info(dmar_domain, pdev);
4202 }
4203
4204 static int intel_iommu_map(struct iommu_domain *domain,
4205                            unsigned long iova, phys_addr_t hpa,
4206                            size_t size, int iommu_prot)
4207 {
4208         struct dmar_domain *dmar_domain = domain->priv;
4209         u64 max_addr;
4210         int prot = 0;
4211         int ret;
4212
4213         if (iommu_prot & IOMMU_READ)
4214                 prot |= DMA_PTE_READ;
4215         if (iommu_prot & IOMMU_WRITE)
4216                 prot |= DMA_PTE_WRITE;
4217         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4218                 prot |= DMA_PTE_SNP;
4219
4220         max_addr = iova + size;
4221         if (dmar_domain->max_addr < max_addr) {
4222                 u64 end;
4223
4224                 /* check if minimum agaw is sufficient for mapped address */
4225                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4226                 if (end < max_addr) {
4227                         printk(KERN_ERR "%s: iommu width (%d) is not "
4228                                "sufficient for the mapped address (%llx)\n",
4229                                __func__, dmar_domain->gaw, max_addr);
4230                         return -EFAULT;
4231                 }
4232                 dmar_domain->max_addr = max_addr;
4233         }
4234         /* Round up size to next multiple of PAGE_SIZE, if it and
4235            the low bits of hpa would take us onto the next page */
4236         size = aligned_nrpages(hpa, size);
4237         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4238                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4239         return ret;
4240 }
4241
4242 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4243                                 unsigned long iova, size_t size)
4244 {
4245         struct dmar_domain *dmar_domain = domain->priv;
4246         struct page *freelist = NULL;
4247         struct intel_iommu *iommu;
4248         unsigned long start_pfn, last_pfn;
4249         unsigned int npages;
4250         int iommu_id, num, ndomains, level = 0;
4251
4252         /* Cope with horrid API which requires us to unmap more than the
4253            size argument if it happens to be a large-page mapping. */
4254         if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4255                 BUG();
4256
4257         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4258                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4259
4260         start_pfn = iova >> VTD_PAGE_SHIFT;
4261         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4262
4263         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4264
4265         npages = last_pfn - start_pfn + 1;
4266
4267         for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4268                 iommu = g_iommus[iommu_id];
4269
4270                 /*
4271                  * find bit position of dmar_domain
4272                  */
4273                 ndomains = cap_ndoms(iommu->cap);
4274                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4275                         if (iommu->domains[num] == dmar_domain)
4276                                 iommu_flush_iotlb_psi(iommu, num, start_pfn,
4277                                                       npages, !freelist, 0);
4278                 }
4279
4280         }
4281
4282         dma_free_pagelist(freelist);
4283
4284         if (dmar_domain->max_addr == iova + size)
4285                 dmar_domain->max_addr = iova;
4286
4287         return size;
4288 }
4289
4290 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4291                                             dma_addr_t iova)
4292 {
4293         struct dmar_domain *dmar_domain = domain->priv;
4294         struct dma_pte *pte;
4295         int level = 0;
4296         u64 phys = 0;
4297
4298         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4299         if (pte)
4300                 phys = dma_pte_addr(pte);
4301
4302         return phys;
4303 }
4304
4305 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4306                                       unsigned long cap)
4307 {
4308         struct dmar_domain *dmar_domain = domain->priv;
4309
4310         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4311                 return dmar_domain->iommu_snooping;
4312         if (cap == IOMMU_CAP_INTR_REMAP)
4313                 return irq_remapping_enabled;
4314
4315         return 0;
4316 }
4317
4318 #define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4319
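/*
 * iommu_ops add_device callback: walk up from the device to the entity
 * that actually issues DMA on its behalf (upstream PCIe bridge, DMA
 * quirk source, or the first ACS-isolated ancestor) and join, or
 * create, that entity's iommu group.
 */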
4320 static int intel_iommu_add_device(struct device *dev)
4321 {
4322         struct pci_dev *pdev = to_pci_dev(dev);
4323         struct pci_dev *bridge, *dma_pdev = NULL;
4324         struct iommu_group *group;
4325         int ret;
4326
4327         if (!device_to_iommu(pci_domain_nr(pdev->bus),
4328                              pdev->bus->number, pdev->devfn))
4329                 return -ENODEV;
4330
4331         bridge = pci_find_upstream_pcie_bridge(pdev);
4332         if (bridge) {
4333                 if (pci_is_pcie(bridge))
4334                         dma_pdev = pci_get_domain_bus_and_slot(
4335                                                 pci_domain_nr(pdev->bus),
4336                                                 bridge->subordinate->number, 0);
4337                 if (!dma_pdev)
4338                         dma_pdev = pci_dev_get(bridge);
4339         } else
4340                 dma_pdev = pci_dev_get(pdev);
4341
4342         /* Account for quirked devices */
4343         swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4344
4345         /*
4346          * If it's a multifunction device that does not support our
4347          * required ACS flags, add it to the same group as the lowest
4348          * numbered function that also does not support the required ACS flags.
4349          */
4350         if (dma_pdev->multifunction &&
4351             !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4352                 u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4353
4354                 for (i = 0; i < 8; i++) {
4355                         struct pci_dev *tmp;
4356
4357                         tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4358                         if (!tmp)
4359                                 continue;
4360
4361                         if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4362                                 swap_pci_ref(&dma_pdev, tmp);
4363                                 break;
4364                         }
4365                         pci_dev_put(tmp);
4366                 }
4367         }
4368
4369         /*
4370          * Devices on the root bus go through the iommu.  If that's not us,
4371          * find the next upstream device and test ACS up to the root bus.
4372          * Finding the next device may require skipping virtual buses.
4373          */
4374         while (!pci_is_root_bus(dma_pdev->bus)) {
4375                 struct pci_bus *bus = dma_pdev->bus;
4376
4377                 while (!bus->self) {
4378                         if (!pci_is_root_bus(bus))
4379                                 bus = bus->parent;
4380                         else
4381                                 goto root_bus;
4382                 }
4383
4384                 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4385                         break;
4386
4387                 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4388         }
4389
4390 root_bus:
4391         group = iommu_group_get(&dma_pdev->dev);
4392         pci_dev_put(dma_pdev);
4393         if (!group) {
4394                 group = iommu_group_alloc();
4395                 if (IS_ERR(group))
4396                         return PTR_ERR(group);
4397         }
4398
4399         ret = iommu_group_add_device(group, dev);
4400
4401         iommu_group_put(group);
4402         return ret;
4403 }
4404
4405 static void intel_iommu_remove_device(struct device *dev)
4406 {
4407         iommu_group_remove_device(dev);
4408 }
4409
4410 static struct iommu_ops intel_iommu_ops = {
4411         .domain_init    = intel_iommu_domain_init,
4412         .domain_destroy = intel_iommu_domain_destroy,
4413         .attach_dev     = intel_iommu_attach_device,
4414         .detach_dev     = intel_iommu_detach_device,
4415         .map            = intel_iommu_map,
4416         .unmap          = intel_iommu_unmap,
4417         .iova_to_phys   = intel_iommu_iova_to_phys,
4418         .domain_has_cap = intel_iommu_domain_has_cap,
4419         .add_device     = intel_iommu_add_device,
4420         .remove_device  = intel_iommu_remove_device,
4421         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4422 };
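
/*
 * A sketch of how these ops get wired up (the registration itself lives
 * in intel_iommu_init(), not here): the driver hands the ops to the
 * IOMMU core for the PCI bus, roughly
 *
 *        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
 *
 * after which the core invokes add_device()/remove_device() above as
 * PCI devices come and go.
 */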
4423
4424 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4425 {
4426         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4427         printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4428         dmar_map_gfx = 0;
4429 }
4430
4431 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4432 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4433 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4434 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4435 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4436 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4437 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4438
4439 static void quirk_iommu_rwbf(struct pci_dev *dev)
4440 {
4441         /*
4442          * Mobile 4 Series Chipset neglects to set RWBF capability,
4443          * but needs it. Same seems to hold for the desktop versions.
4444          */
4445         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4446         rwbf_quirk = 1;
4447 }
4448
4449 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4450 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4451 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4452 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4453 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4454 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4455 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4456
4457 #define GGC 0x52
4458 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4459 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4460 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4461 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4462 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4463 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4464 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4465 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
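
/*
 * Decoding sketch for the GGC (graphics control) register used by the
 * quirk below; illustrative only, the quirk itself only cares about the
 * VT-enable bit:
 *
 *        u16 ggc;
 *
 *        if (!pci_read_config_word(dev, GGC, &ggc) &&
 *            (ggc & GGC_MEMORY_VT_ENABLED))
 *                vt_stolen = ggc & GGC_MEMORY_SIZE_MASK;
 *
 * GGC_MEMORY_VT_ENABLED (bit 11) means the BIOS allocated GTT stolen
 * memory for VT-d use; the GGC_MEMORY_SIZE_*_VT values encode how much.
 * The name "vt_stolen" above is just for illustration.
 */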
4466
4467 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4468 {
4469         unsigned short ggc;
4470
4471         if (pci_read_config_word(dev, GGC, &ggc))
4472                 return;
4473
4474         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4475                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4476                 dmar_map_gfx = 0;
4477         } else if (dmar_map_gfx) {
4478                 /* we have to ensure the gfx device is idle before we flush */
4479                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4480                 intel_iommu_strict = 1;
4481         }
4482 }
4483 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4484 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4485 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4486 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4487
4488 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4489    ISOCH DMAR unit for the Azalia sound device, but not give it any
4490    TLB entries, which causes it to deadlock. Check for that.  We do
4491    this in a function called from init_dmars(), instead of in a PCI
4492    quirk, because we don't want to print the obnoxious "BIOS broken"
4493    message if VT-d is actually disabled.
4494 */
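/*
 * Summary of the check below: bit 0 of the register at offset 0x188 in
 * the System Management device (0x342e) set means Azalia DMA is routed
 * to the non-isoch DMAR unit (fine); otherwise the value masked with
 * 0x1c is the number of TLB entries the BIOS granted the isoch unit,
 * where 16 is the recommended value and 0 forces identity mapping for
 * Azalia.
 */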
4495 static void __init check_tylersburg_isoch(void)
4496 {
4497         struct pci_dev *pdev;
4498         uint32_t vtisochctrl;
4499
4500         /* If there's no Azalia in the system anyway, forget it. */
4501         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4502         if (!pdev)
4503                 return;
4504         pci_dev_put(pdev);
4505
4506         /* System Management Registers. Might be hidden, in which case
4507            we can't do the sanity check. But that's OK, because the
4508            known-broken BIOSes _don't_ actually hide it, so far. */
4509         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4510         if (!pdev)
4511                 return;
4512
4513         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4514                 pci_dev_put(pdev);
4515                 return;
4516         }
4517
4518         pci_dev_put(pdev);
4519
4520         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4521         if (vtisochctrl & 1)
4522                 return;
4523
4524         /* Drop all bits other than the number of TLB entries */
4525         vtisochctrl &= 0x1c;
4526
4527         /* If we have the recommended number of TLB entries (16), fine. */
4528         if (vtisochctrl == 0x10)
4529                 return;
4530
4531         /* Zero TLB entries? You get to ride the short bus to school. */
4532         if (!vtisochctrl) {
4533                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4534                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4535                      dmi_get_system_info(DMI_BIOS_VENDOR),
4536                      dmi_get_system_info(DMI_BIOS_VERSION),
4537                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4538                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4539                 return;
4540         }
4541
4542         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4543                vtisochctrl);
4544 }