iommu/vt-d: Implement map/unmap_pages() iommu_ops callback
[linux-2.6-microblaze.git] / drivers/iommu/intel/iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/syscore_ops.h>
37 #include <linux/tboot.h>
38 #include <linux/dmi.h>
39 #include <linux/pci-ats.h>
40 #include <linux/memblock.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47
48 #include "../irq_remapping.h"
49 #include "../iommu-sva-lib.h"
50 #include "pasid.h"
51 #include "cap_audit.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
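/*
 * Each page-table level decodes LEVEL_STRIDE (9) bits of the DMA PFN, so a
 * table holds 512 eight-byte entries, i.e. exactly one VTD_PAGE_SIZE page.
 * For example, pfn_level_offset(pfn, 2) selects bits 17:9 of the PFN.
 */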
87
88 static inline int agaw_to_level(int agaw)
89 {
90         return agaw + 2;
91 }
92
93 static inline int agaw_to_width(int agaw)
94 {
95         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
96 }
97
98 static inline int width_to_agaw(int width)
99 {
100         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
101 }
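/*
 * For example, a 48-bit address width corresponds to agaw 2 and 4-level
 * paging, while the 57-bit DEFAULT_DOMAIN_ADDRESS_WIDTH corresponds to
 * agaw 3 and 5-level paging.
 */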
102
103 static inline unsigned int level_to_offset_bits(int level)
104 {
105         return (level - 1) * LEVEL_STRIDE;
106 }
107
108 static inline int pfn_level_offset(u64 pfn, int level)
109 {
110         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
111 }
112
113 static inline u64 level_mask(int level)
114 {
115         return -1ULL << level_to_offset_bits(level);
116 }
117
118 static inline u64 level_size(int level)
119 {
120         return 1ULL << level_to_offset_bits(level);
121 }
122
123 static inline u64 align_to_level(u64 pfn, int level)
124 {
125         return (pfn + level_size(level) - 1) & level_mask(level);
126 }
127
128 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
129 {
130         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
131 }
132
133 /* VT-d pages must never be larger than MM pages. Otherwise things
134    are never going to work. */
135 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
136 {
137         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
138 }
139
140 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
141 {
142         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
143 }
144 static inline unsigned long page_to_dma_pfn(struct page *pg)
145 {
146         return mm_to_dma_pfn(page_to_pfn(pg));
147 }
148 static inline unsigned long virt_to_dma_pfn(void *p)
149 {
150         return page_to_dma_pfn(virt_to_page(p));
151 }
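/*
 * On x86 both PAGE_SHIFT and VTD_PAGE_SHIFT are 12, so the conversions
 * above are identity operations; they only do real work when MM pages are
 * larger than VT-d pages.
 */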
152
153 /* global iommu list, set NULL for ignored DMAR units */
154 static struct intel_iommu **g_iommus;
155
156 static void __init check_tylersburg_isoch(void);
157 static int rwbf_quirk;
158
159 /*
160  * set to 1 to panic the kernel if VT-d cannot be enabled successfully
161  * (used when kernel is launched w/ TXT)
162  */
163 static int force_on = 0;
164 static int intel_iommu_tboot_noforce;
165 static int no_platform_optin;
166
167 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
168
169 /*
170  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
171  * if marked present.
172  */
173 static phys_addr_t root_entry_lctp(struct root_entry *re)
174 {
175         if (!(re->lo & 1))
176                 return 0;
177
178         return re->lo & VTD_PAGE_MASK;
179 }
180
181 /*
182  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
183  * if marked present.
184  */
185 static phys_addr_t root_entry_uctp(struct root_entry *re)
186 {
187         if (!(re->hi & 1))
188                 return 0;
189
190         return re->hi & VTD_PAGE_MASK;
191 }
192
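/*
 * The helpers below poke individual fields of a legacy-mode context entry:
 * in the low qword, bit 0 is the present bit, bit 1 (when set) disables
 * fault reporting, bits 3:2 select the translation type and bits 63:12 hold
 * the second-level page-table pointer; in the high qword, bits 2:0 encode
 * the address width and bits 23:8 the domain id.  Bit 11 of the low qword
 * (PASID enable) and bit 3 of the high qword (copied) are used to handle
 * context entries inherited from a previous kernel.
 */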
193 static inline void context_clear_pasid_enable(struct context_entry *context)
194 {
195         context->lo &= ~(1ULL << 11);
196 }
197
198 static inline bool context_pasid_enabled(struct context_entry *context)
199 {
200         return !!(context->lo & (1ULL << 11));
201 }
202
203 static inline void context_set_copied(struct context_entry *context)
204 {
205         context->hi |= (1ull << 3);
206 }
207
208 static inline bool context_copied(struct context_entry *context)
209 {
210         return !!(context->hi & (1ULL << 3));
211 }
212
213 static inline bool __context_present(struct context_entry *context)
214 {
215         return (context->lo & 1);
216 }
217
218 bool context_present(struct context_entry *context)
219 {
220         return context_pasid_enabled(context) ?
221              __context_present(context) :
222              __context_present(context) && !context_copied(context);
223 }
224
225 static inline void context_set_present(struct context_entry *context)
226 {
227         context->lo |= 1;
228 }
229
230 static inline void context_set_fault_enable(struct context_entry *context)
231 {
232         context->lo &= (((u64)-1) << 2) | 1;
233 }
234
235 static inline void context_set_translation_type(struct context_entry *context,
236                                                 unsigned long value)
237 {
238         context->lo &= (((u64)-1) << 4) | 3;
239         context->lo |= (value & 3) << 2;
240 }
241
242 static inline void context_set_address_root(struct context_entry *context,
243                                             unsigned long value)
244 {
245         context->lo &= ~VTD_PAGE_MASK;
246         context->lo |= value & VTD_PAGE_MASK;
247 }
248
249 static inline void context_set_address_width(struct context_entry *context,
250                                              unsigned long value)
251 {
252         context->hi |= value & 7;
253 }
254
255 static inline void context_set_domain_id(struct context_entry *context,
256                                          unsigned long value)
257 {
258         context->hi |= (value & ((1 << 16) - 1)) << 8;
259 }
260
261 static inline int context_domain_id(struct context_entry *c)
262 {
263         return((c->hi >> 8) & 0xffff);
264 }
265
266 static inline void context_clear_entry(struct context_entry *context)
267 {
268         context->lo = 0;
269         context->hi = 0;
270 }
271
272 /*
273  * This domain is a static identity mapping domain.
274  *      1. This domain creates a static 1:1 mapping to all usable memory.
275  *      2. It maps to each iommu if successful.
276  *      3. Each iommu maps to this domain if successful.
277  */
278 static struct dmar_domain *si_domain;
279 static int hw_pass_through = 1;
280
281 #define for_each_domain_iommu(idx, domain)                      \
282         for (idx = 0; idx < g_num_of_iommus; idx++)             \
283                 if (domain->iommu_refcnt[idx])
284
285 struct dmar_rmrr_unit {
286         struct list_head list;          /* list of rmrr units   */
287         struct acpi_dmar_header *hdr;   /* ACPI header          */
288         u64     base_address;           /* reserved base address*/
289         u64     end_address;            /* reserved end address */
290         struct dmar_dev_scope *devices; /* target devices */
291         int     devices_cnt;            /* target device count */
292 };
293
294 struct dmar_atsr_unit {
295         struct list_head list;          /* list of ATSR units */
296         struct acpi_dmar_header *hdr;   /* ACPI header */
297         struct dmar_dev_scope *devices; /* target devices */
298         int devices_cnt;                /* target device count */
299         u8 include_all:1;               /* include all ports */
300 };
301
302 struct dmar_satc_unit {
303         struct list_head list;          /* list of SATC units */
304         struct acpi_dmar_header *hdr;   /* ACPI header */
305         struct dmar_dev_scope *devices; /* target devices */
306         struct intel_iommu *iommu;      /* the corresponding iommu */
307         int devices_cnt;                /* target device count */
308         u8 atc_required:1;              /* ATS is required */
309 };
310
311 static LIST_HEAD(dmar_atsr_units);
312 static LIST_HEAD(dmar_rmrr_units);
313 static LIST_HEAD(dmar_satc_units);
314
315 #define for_each_rmrr_units(rmrr) \
316         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
317
318 /* number of registered intel_iommus, used to size the g_iommus array */
319 static int g_num_of_iommus;
320
321 static void domain_exit(struct dmar_domain *domain);
322 static void domain_remove_dev_info(struct dmar_domain *domain);
323 static void dmar_remove_one_dev_info(struct device *dev);
324 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
325 static int intel_iommu_attach_device(struct iommu_domain *domain,
326                                      struct device *dev);
327 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
328                                             dma_addr_t iova);
329
330 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
331 int dmar_disabled = 0;
332 #else
333 int dmar_disabled = 1;
334 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
335
336 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
337 int intel_iommu_sm = 1;
338 #else
339 int intel_iommu_sm;
340 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
341
342 int intel_iommu_enabled = 0;
343 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
344
345 static int dmar_map_gfx = 1;
346 static int intel_iommu_superpage = 1;
347 static int iommu_identity_mapping;
348 static int iommu_skip_te_disable;
349
350 #define IDENTMAP_GFX            2
351 #define IDENTMAP_AZALIA         4
352
353 int intel_iommu_gfx_mapped;
354 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
355
356 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
357 struct device_domain_info *get_domain_info(struct device *dev)
358 {
359         struct device_domain_info *info;
360
361         if (!dev)
362                 return NULL;
363
364         info = dev_iommu_priv_get(dev);
365         if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
366                 return NULL;
367
368         return info;
369 }
370
371 DEFINE_SPINLOCK(device_domain_lock);
372 static LIST_HEAD(device_domain_list);
373
374 /*
375  * Iterate over elements in device_domain_list and call the specified
376  * callback @fn against each element.
377  */
378 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
379                                      void *data), void *data)
380 {
381         int ret = 0;
382         unsigned long flags;
383         struct device_domain_info *info;
384
385         spin_lock_irqsave(&device_domain_lock, flags);
386         list_for_each_entry(info, &device_domain_list, global) {
387                 ret = fn(info, data);
388                 if (ret) {
389                         spin_unlock_irqrestore(&device_domain_lock, flags);
390                         return ret;
391                 }
392         }
393         spin_unlock_irqrestore(&device_domain_lock, flags);
394
395         return 0;
396 }
397
398 const struct iommu_ops intel_iommu_ops;
399
400 static bool translation_pre_enabled(struct intel_iommu *iommu)
401 {
402         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
403 }
404
405 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
406 {
407         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
408 }
409
410 static void init_translation_status(struct intel_iommu *iommu)
411 {
412         u32 gsts;
413
414         gsts = readl(iommu->reg + DMAR_GSTS_REG);
415         if (gsts & DMA_GSTS_TES)
416                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
417 }
418
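/*
 * Parse the "intel_iommu=" kernel command line parameter.  Options are
 * comma separated, e.g. booting with "intel_iommu=on,sm_on" both enables
 * the IOMMU and turns on scalable mode.
 */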
419 static int __init intel_iommu_setup(char *str)
420 {
421         if (!str)
422                 return -EINVAL;
423         while (*str) {
424                 if (!strncmp(str, "on", 2)) {
425                         dmar_disabled = 0;
426                         pr_info("IOMMU enabled\n");
427                 } else if (!strncmp(str, "off", 3)) {
428                         dmar_disabled = 1;
429                         no_platform_optin = 1;
430                         pr_info("IOMMU disabled\n");
431                 } else if (!strncmp(str, "igfx_off", 8)) {
432                         dmar_map_gfx = 0;
433                         pr_info("Disable GFX device mapping\n");
434                 } else if (!strncmp(str, "forcedac", 8)) {
435                         pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
436                         iommu_dma_forcedac = true;
437                 } else if (!strncmp(str, "strict", 6)) {
438                         pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
439                         iommu_set_dma_strict();
440                 } else if (!strncmp(str, "sp_off", 6)) {
441                         pr_info("Disable supported super page\n");
442                         intel_iommu_superpage = 0;
443                 } else if (!strncmp(str, "sm_on", 5)) {
444                         pr_info("Intel-IOMMU: scalable mode supported\n");
445                         intel_iommu_sm = 1;
446                 } else if (!strncmp(str, "tboot_noforce", 13)) {
447                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
448                         intel_iommu_tboot_noforce = 1;
449                 }
450
451                 str += strcspn(str, ",");
452                 while (*str == ',')
453                         str++;
454         }
455         return 0;
456 }
457 __setup("intel_iommu=", intel_iommu_setup);
458
459 static struct kmem_cache *iommu_domain_cache;
460 static struct kmem_cache *iommu_devinfo_cache;
461
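/*
 * Per-IOMMU domain-ID lookup: iommu->domains is a two-level table, with the
 * high byte of the domain id selecting a lazily allocated array of 256
 * dmar_domain pointers and the low byte selecting the entry within it.
 */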
462 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
463 {
464         struct dmar_domain **domains;
465         int idx = did >> 8;
466
467         domains = iommu->domains[idx];
468         if (!domains)
469                 return NULL;
470
471         return domains[did & 0xff];
472 }
473
474 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
475                              struct dmar_domain *domain)
476 {
477         struct dmar_domain **domains;
478         int idx = did >> 8;
479
480         if (!iommu->domains[idx]) {
481                 size_t size = 256 * sizeof(struct dmar_domain *);
482                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
483         }
484
485         domains = iommu->domains[idx];
486         if (WARN_ON(!domains))
487                 return;
488         else
489                 domains[did & 0xff] = domain;
490 }
491
492 void *alloc_pgtable_page(int node)
493 {
494         struct page *page;
495         void *vaddr = NULL;
496
497         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
498         if (page)
499                 vaddr = page_address(page);
500         return vaddr;
501 }
502
503 void free_pgtable_page(void *vaddr)
504 {
505         free_page((unsigned long)vaddr);
506 }
507
508 static inline void *alloc_domain_mem(void)
509 {
510         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
511 }
512
513 static void free_domain_mem(void *vaddr)
514 {
515         kmem_cache_free(iommu_domain_cache, vaddr);
516 }
517
518 static inline void * alloc_devinfo_mem(void)
519 {
520         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
521 }
522
523 static inline void free_devinfo_mem(void *vaddr)
524 {
525         kmem_cache_free(iommu_devinfo_cache, vaddr);
526 }
527
528 static inline int domain_type_is_si(struct dmar_domain *domain)
529 {
530         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
531 }
532
533 static inline bool domain_use_first_level(struct dmar_domain *domain)
534 {
535         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
536 }
537
538 static inline int domain_pfn_supported(struct dmar_domain *domain,
539                                        unsigned long pfn)
540 {
541         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
542
543         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
544 }
545
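/*
 * Pick the largest adjusted guest address width (agaw) that both fits in
 * @max_gaw and is advertised in the IOMMU's SAGAW capability field, e.g.
 * SAGAW bit 2 means 48-bit (4-level) paging is supported.
 */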
546 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
547 {
548         unsigned long sagaw;
549         int agaw;
550
551         sagaw = cap_sagaw(iommu->cap);
552         for (agaw = width_to_agaw(max_gaw);
553              agaw >= 0; agaw--) {
554                 if (test_bit(agaw, &sagaw))
555                         break;
556         }
557
558         return agaw;
559 }
560
561 /*
562  * Calculate max SAGAW for each iommu.
563  */
564 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
565 {
566         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
567 }
568
569 /*
570  * calculate agaw for each iommu.
571  * "SAGAW" may be different across iommus; use a default agaw, and
572  * fall back to a smaller supported agaw for iommus that don't support the default.
573  */
574 int iommu_calculate_agaw(struct intel_iommu *iommu)
575 {
576         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
577 }
578
579 /* This function only returns a single iommu in a domain */
580 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
581 {
582         int iommu_id;
583
584         /* si_domain and vm domain should not get here. */
585         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
586                 return NULL;
587
588         for_each_domain_iommu(iommu_id, domain)
589                 break;
590
591         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
592                 return NULL;
593
594         return g_iommus[iommu_id];
595 }
596
597 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
598 {
599         return sm_supported(iommu) ?
600                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
601 }
602
603 static void domain_update_iommu_coherency(struct dmar_domain *domain)
604 {
605         struct dmar_drhd_unit *drhd;
606         struct intel_iommu *iommu;
607         bool found = false;
608         int i;
609
610         domain->iommu_coherency = true;
611
612         for_each_domain_iommu(i, domain) {
613                 found = true;
614                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
615                         domain->iommu_coherency = false;
616                         break;
617                 }
618         }
619         if (found)
620                 return;
621
622         /* No hardware attached; use lowest common denominator */
623         rcu_read_lock();
624         for_each_active_iommu(iommu, drhd) {
625                 if (!iommu_paging_structure_coherency(iommu)) {
626                         domain->iommu_coherency = false;
627                         break;
628                 }
629         }
630         rcu_read_unlock();
631 }
632
633 static bool domain_update_iommu_snooping(struct intel_iommu *skip)
634 {
635         struct dmar_drhd_unit *drhd;
636         struct intel_iommu *iommu;
637         bool ret = true;
638
639         rcu_read_lock();
640         for_each_active_iommu(iommu, drhd) {
641                 if (iommu != skip) {
642                         /*
643                          * If the hardware is operating in the scalable mode,
644                          * the snooping control is always supported since we
645                          * always set PASID-table-entry.PGSNP bit if the domain
646                          * is managed outside (UNMANAGED).
647                          */
648                         if (!sm_supported(iommu) &&
649                             !ecap_sc_support(iommu->ecap)) {
650                                 ret = false;
651                                 break;
652                         }
653                 }
654         }
655         rcu_read_unlock();
656
657         return ret;
658 }
659
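/*
 * Compute the superpage level common to all active IOMMUs: returns 0 for no
 * superpage support, 1 for 2MiB pages and 2 for 2MiB plus 1GiB pages
 * (i.e. fls() of the common capability mask).
 */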
660 static int domain_update_iommu_superpage(struct dmar_domain *domain,
661                                          struct intel_iommu *skip)
662 {
663         struct dmar_drhd_unit *drhd;
664         struct intel_iommu *iommu;
665         int mask = 0x3;
666
667         if (!intel_iommu_superpage)
668                 return 0;
669
670         /* set iommu_superpage to the smallest common denominator */
671         rcu_read_lock();
672         for_each_active_iommu(iommu, drhd) {
673                 if (iommu != skip) {
674                         if (domain && domain_use_first_level(domain)) {
675                                 if (!cap_fl1gp_support(iommu->cap))
676                                         mask = 0x1;
677                         } else {
678                                 mask &= cap_super_page_val(iommu->cap);
679                         }
680
681                         if (!mask)
682                                 break;
683                 }
684         }
685         rcu_read_unlock();
686
687         return fls(mask);
688 }
689
690 static int domain_update_device_node(struct dmar_domain *domain)
691 {
692         struct device_domain_info *info;
693         int nid = NUMA_NO_NODE;
694
695         assert_spin_locked(&device_domain_lock);
696
697         if (list_empty(&domain->devices))
698                 return NUMA_NO_NODE;
699
700         list_for_each_entry(info, &domain->devices, link) {
701                 if (!info->dev)
702                         continue;
703
704                 /*
705                  * There could possibly be multiple device numa nodes as devices
706                  * within the same domain may sit behind different IOMMUs. There
707                  * is no perfect answer in such a situation, so we select a first
708                  * come, first served policy.
709                  */
710                 nid = dev_to_node(info->dev);
711                 if (nid != NUMA_NO_NODE)
712                         break;
713         }
714
715         return nid;
716 }
717
718 static void domain_update_iotlb(struct dmar_domain *domain);
719
720 /* Return the super pagesize bitmap if supported. */
721 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
722 {
723         unsigned long bitmap = 0;
724
725         /*
726          * 1-level super page supports page size of 2MiB, 2-level super page
727          * supports page size of both 2MiB and 1GiB.
728          */
729         if (domain->iommu_superpage == 1)
730                 bitmap |= SZ_2M;
731         else if (domain->iommu_superpage == 2)
732                 bitmap |= SZ_2M | SZ_1G;
733
734         return bitmap;
735 }
736
737 /* Some capabilities may be different across iommus */
738 static void domain_update_iommu_cap(struct dmar_domain *domain)
739 {
740         domain_update_iommu_coherency(domain);
741         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
742         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
743
744         /*
745          * If RHSA is missing, we should default to the device numa domain
746          * as fall back.
747          */
748         if (domain->nid == NUMA_NO_NODE)
749                 domain->nid = domain_update_device_node(domain);
750
751         /*
752          * First-level translation restricts the input-address to a
753          * canonical address (i.e., address bits 63:N have the same
754          * value as address bit [N-1], where N is 48-bits with 4-level
755          * paging and 57-bits with 5-level paging). Hence, skip bit
756          * [N-1].
757          */
758         if (domain_use_first_level(domain))
759                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
760         else
761                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
762
763         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
764         domain_update_iotlb(domain);
765 }
766
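/*
 * Return the context entry for bus/devfn, optionally allocating the context
 * table.  In scalable mode a root entry covers only half of the devfn space
 * (lo for 0x00-0x7f, hi for 0x80-0xff) and each context entry is twice the
 * legacy size, hence the devfn adjustments below.
 */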
767 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
768                                          u8 devfn, int alloc)
769 {
770         struct root_entry *root = &iommu->root_entry[bus];
771         struct context_entry *context;
772         u64 *entry;
773
774         entry = &root->lo;
775         if (sm_supported(iommu)) {
776                 if (devfn >= 0x80) {
777                         devfn -= 0x80;
778                         entry = &root->hi;
779                 }
780                 devfn *= 2;
781         }
782         if (*entry & 1)
783                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
784         else {
785                 unsigned long phy_addr;
786                 if (!alloc)
787                         return NULL;
788
789                 context = alloc_pgtable_page(iommu->node);
790                 if (!context)
791                         return NULL;
792
793                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
794                 phy_addr = virt_to_phys((void *)context);
795                 *entry = phy_addr | 1;
796                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
797         }
798         return &context[devfn];
799 }
800
801 static bool attach_deferred(struct device *dev)
802 {
803         return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
804 }
805
806 /**
807  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
808  *                               sub-hierarchy of a candidate PCI-PCI bridge
809  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
810  * @bridge: the candidate PCI-PCI bridge
811  *
812  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
813  */
814 static bool
815 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
816 {
817         struct pci_dev *pdev, *pbridge;
818
819         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
820                 return false;
821
822         pdev = to_pci_dev(dev);
823         pbridge = to_pci_dev(bridge);
824
825         if (pbridge->subordinate &&
826             pbridge->subordinate->number <= pdev->bus->number &&
827             pbridge->subordinate->busn_res.end >= pdev->bus->number)
828                 return true;
829
830         return false;
831 }
832
833 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
834 {
835         struct dmar_drhd_unit *drhd;
836         u32 vtbar;
837         int rc;
838
839         /* We know that this device on this chipset has its own IOMMU.
840          * If we find it under a different IOMMU, then the BIOS is lying
841          * to us. Hope that the IOMMU for this device is actually
842          * disabled, and it needs no translation...
843          */
844         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
845         if (rc) {
846                 /* "can't" happen */
847                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
848                 return false;
849         }
850         vtbar &= 0xffff0000;
851
852         /* we know that this iommu should be at offset 0xa000 from vtbar */
853         drhd = dmar_find_matched_drhd_unit(pdev);
854         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
855                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
856                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
857                 return true;
858         }
859
860         return false;
861 }
862
863 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
864 {
865         if (!iommu || iommu->drhd->ignored)
866                 return true;
867
868         if (dev_is_pci(dev)) {
869                 struct pci_dev *pdev = to_pci_dev(dev);
870
871                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
872                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
873                     quirk_ioat_snb_local_iommu(pdev))
874                         return true;
875         }
876
877         return false;
878 }
879
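/*
 * Find the IOMMU unit covering @dev by walking the DMAR scope tables.  PCI
 * VFs are not listed in the scope tables, so the lookup goes through their
 * PF, but the VF's own bus/devfn are reported back through @bus and @devfn.
 * Returns NULL if the device is not translated by any (real) IOMMU.
 */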
880 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
881 {
882         struct dmar_drhd_unit *drhd = NULL;
883         struct pci_dev *pdev = NULL;
884         struct intel_iommu *iommu;
885         struct device *tmp;
886         u16 segment = 0;
887         int i;
888
889         if (!dev)
890                 return NULL;
891
892         if (dev_is_pci(dev)) {
893                 struct pci_dev *pf_pdev;
894
895                 pdev = pci_real_dma_dev(to_pci_dev(dev));
896
897                 /* VFs aren't listed in scope tables; we need to look up
898                  * the PF instead to find the IOMMU. */
899                 pf_pdev = pci_physfn(pdev);
900                 dev = &pf_pdev->dev;
901                 segment = pci_domain_nr(pdev->bus);
902         } else if (has_acpi_companion(dev))
903                 dev = &ACPI_COMPANION(dev)->dev;
904
905         rcu_read_lock();
906         for_each_iommu(iommu, drhd) {
907                 if (pdev && segment != drhd->segment)
908                         continue;
909
910                 for_each_active_dev_scope(drhd->devices,
911                                           drhd->devices_cnt, i, tmp) {
912                         if (tmp == dev) {
913                                 /* For a VF use its original BDF# not that of the PF
914                                  * which we used for the IOMMU lookup. Strictly speaking
915                                  * we could do this for all PCI devices; we only need to
916                                  * get the BDF# from the scope table for ACPI matches. */
917                                 if (pdev && pdev->is_virtfn)
918                                         goto got_pdev;
919
920                                 if (bus && devfn) {
921                                         *bus = drhd->devices[i].bus;
922                                         *devfn = drhd->devices[i].devfn;
923                                 }
924                                 goto out;
925                         }
926
927                         if (is_downstream_to_pci_bridge(dev, tmp))
928                                 goto got_pdev;
929                 }
930
931                 if (pdev && drhd->include_all) {
932                 got_pdev:
933                         if (bus && devfn) {
934                                 *bus = pdev->bus->number;
935                                 *devfn = pdev->devfn;
936                         }
937                         goto out;
938                 }
939         }
940         iommu = NULL;
941  out:
942         if (iommu_is_dummy(iommu, dev))
943                 iommu = NULL;
944
945         rcu_read_unlock();
946
947         return iommu;
948 }
949
950 static void domain_flush_cache(struct dmar_domain *domain,
951                                void *addr, int size)
952 {
953         if (!domain->iommu_coherency)
954                 clflush_cache_range(addr, size);
955 }
956
957 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
958 {
959         struct context_entry *context;
960         int ret = 0;
961         unsigned long flags;
962
963         spin_lock_irqsave(&iommu->lock, flags);
964         context = iommu_context_addr(iommu, bus, devfn, 0);
965         if (context)
966                 ret = context_present(context);
967         spin_unlock_irqrestore(&iommu->lock, flags);
968         return ret;
969 }
970
971 static void free_context_table(struct intel_iommu *iommu)
972 {
973         int i;
974         unsigned long flags;
975         struct context_entry *context;
976
977         spin_lock_irqsave(&iommu->lock, flags);
978         if (!iommu->root_entry) {
979                 goto out;
980         }
981         for (i = 0; i < ROOT_ENTRY_NR; i++) {
982                 context = iommu_context_addr(iommu, i, 0, 0);
983                 if (context)
984                         free_pgtable_page(context);
985
986                 if (!sm_supported(iommu))
987                         continue;
988
989                 context = iommu_context_addr(iommu, i, 0x80, 0);
990                 if (context)
991                         free_pgtable_page(context);
992
993         }
994         free_pgtable_page(iommu->root_entry);
995         iommu->root_entry = NULL;
996 out:
997         spin_unlock_irqrestore(&iommu->lock, flags);
998 }
999
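/*
 * Walk, and if necessary build, the page table down to the PTE mapping
 * @pfn.  A zero *target_level means "stop at the first superpage or
 * non-present entry" and is updated to the level actually reached.
 */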
1000 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1001                                       unsigned long pfn, int *target_level)
1002 {
1003         struct dma_pte *parent, *pte;
1004         int level = agaw_to_level(domain->agaw);
1005         int offset;
1006
1007         BUG_ON(!domain->pgd);
1008
1009         if (!domain_pfn_supported(domain, pfn))
1010                 /* Address beyond IOMMU's addressing capabilities. */
1011                 return NULL;
1012
1013         parent = domain->pgd;
1014
1015         while (1) {
1016                 void *tmp_page;
1017
1018                 offset = pfn_level_offset(pfn, level);
1019                 pte = &parent[offset];
1020                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1021                         break;
1022                 if (level == *target_level)
1023                         break;
1024
1025                 if (!dma_pte_present(pte)) {
1026                         uint64_t pteval;
1027
1028                         tmp_page = alloc_pgtable_page(domain->nid);
1029
1030                         if (!tmp_page)
1031                                 return NULL;
1032
1033                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1034                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1035                         if (domain_use_first_level(domain)) {
1036                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1037                                 if (domain->domain.type == IOMMU_DOMAIN_DMA)
1038                                         pteval |= DMA_FL_PTE_ACCESS;
1039                         }
1040                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1041                                 /* Someone else set it while we were thinking; use theirs. */
1042                                 free_pgtable_page(tmp_page);
1043                         else
1044                                 domain_flush_cache(domain, pte, sizeof(*pte));
1045                 }
1046                 if (level == 1)
1047                         break;
1048
1049                 parent = phys_to_virt(dma_pte_addr(pte));
1050                 level--;
1051         }
1052
1053         if (!*target_level)
1054                 *target_level = level;
1055
1056         return pte;
1057 }
1058
1059 /* return address's pte at specific level */
1060 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1061                                          unsigned long pfn,
1062                                          int level, int *large_page)
1063 {
1064         struct dma_pte *parent, *pte;
1065         int total = agaw_to_level(domain->agaw);
1066         int offset;
1067
1068         parent = domain->pgd;
1069         while (level <= total) {
1070                 offset = pfn_level_offset(pfn, total);
1071                 pte = &parent[offset];
1072                 if (level == total)
1073                         return pte;
1074
1075                 if (!dma_pte_present(pte)) {
1076                         *large_page = total;
1077                         break;
1078                 }
1079
1080                 if (dma_pte_superpage(pte)) {
1081                         *large_page = total;
1082                         return pte;
1083                 }
1084
1085                 parent = phys_to_virt(dma_pte_addr(pte));
1086                 total--;
1087         }
1088         return NULL;
1089 }
1090
1091 /* clear last level ptes; a TLB flush must follow */
1092 static void dma_pte_clear_range(struct dmar_domain *domain,
1093                                 unsigned long start_pfn,
1094                                 unsigned long last_pfn)
1095 {
1096         unsigned int large_page;
1097         struct dma_pte *first_pte, *pte;
1098
1099         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1100         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1101         BUG_ON(start_pfn > last_pfn);
1102
1103         /* we don't need lock here; nobody else touches the iova range */
1104         do {
1105                 large_page = 1;
1106                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1107                 if (!pte) {
1108                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1109                         continue;
1110                 }
1111                 do {
1112                         dma_clear_pte(pte);
1113                         start_pfn += lvl_to_nr_pages(large_page);
1114                         pte++;
1115                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1116
1117                 domain_flush_cache(domain, first_pte,
1118                                    (void *)pte - (void *)first_pte);
1119
1120         } while (start_pfn && start_pfn <= last_pfn);
1121 }
1122
1123 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1124                                int retain_level, struct dma_pte *pte,
1125                                unsigned long pfn, unsigned long start_pfn,
1126                                unsigned long last_pfn)
1127 {
1128         pfn = max(start_pfn, pfn);
1129         pte = &pte[pfn_level_offset(pfn, level)];
1130
1131         do {
1132                 unsigned long level_pfn;
1133                 struct dma_pte *level_pte;
1134
1135                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1136                         goto next;
1137
1138                 level_pfn = pfn & level_mask(level);
1139                 level_pte = phys_to_virt(dma_pte_addr(pte));
1140
1141                 if (level > 2) {
1142                         dma_pte_free_level(domain, level - 1, retain_level,
1143                                            level_pte, level_pfn, start_pfn,
1144                                            last_pfn);
1145                 }
1146
1147                 /*
1148                  * Free the page table if we're below the level we want to
1149                  * retain and the range covers the entire table.
1150                  */
1151                 if (level < retain_level && !(start_pfn > level_pfn ||
1152                       last_pfn < level_pfn + level_size(level) - 1)) {
1153                         dma_clear_pte(pte);
1154                         domain_flush_cache(domain, pte, sizeof(*pte));
1155                         free_pgtable_page(level_pte);
1156                 }
1157 next:
1158                 pfn += level_size(level);
1159         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1160 }
1161
1162 /*
1163  * clear last level (leaf) ptes and free page table pages below the
1164  * level we wish to keep intact.
1165  */
1166 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1167                                    unsigned long start_pfn,
1168                                    unsigned long last_pfn,
1169                                    int retain_level)
1170 {
1171         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1172         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1173         BUG_ON(start_pfn > last_pfn);
1174
1175         dma_pte_clear_range(domain, start_pfn, last_pfn);
1176
1177         /* We don't need lock here; nobody else touches the iova range */
1178         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1179                            domain->pgd, 0, start_pfn, last_pfn);
1180
1181         /* free pgd */
1182         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1183                 free_pgtable_page(domain->pgd);
1184                 domain->pgd = NULL;
1185         }
1186 }
1187
1188 /* When a page at a given level is being unlinked from its parent, we don't
1189    need to *modify* it at all. All we need to do is make a list of all the
1190    pages which can be freed just as soon as we've flushed the IOTLB and we
1191    know the hardware page-walk will no longer touch them.
1192    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1193    be freed. */
1194 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1195                                             int level, struct dma_pte *pte,
1196                                             struct page *freelist)
1197 {
1198         struct page *pg;
1199
1200         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1201         pg->freelist = freelist;
1202         freelist = pg;
1203
1204         if (level == 1)
1205                 return freelist;
1206
1207         pte = page_address(pg);
1208         do {
1209                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1210                         freelist = dma_pte_list_pagetables(domain, level - 1,
1211                                                            pte, freelist);
1212                 pte++;
1213         } while (!first_pte_in_page(pte));
1214
1215         return freelist;
1216 }
1217
1218 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1219                                         struct dma_pte *pte, unsigned long pfn,
1220                                         unsigned long start_pfn,
1221                                         unsigned long last_pfn,
1222                                         struct page *freelist)
1223 {
1224         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1225
1226         pfn = max(start_pfn, pfn);
1227         pte = &pte[pfn_level_offset(pfn, level)];
1228
1229         do {
1230                 unsigned long level_pfn;
1231
1232                 if (!dma_pte_present(pte))
1233                         goto next;
1234
1235                 level_pfn = pfn & level_mask(level);
1236
1237                 /* If range covers entire pagetable, free it */
1238                 if (start_pfn <= level_pfn &&
1239                     last_pfn >= level_pfn + level_size(level) - 1) {
1240                         /* These subordinate page tables are going away entirely. Don't
1241                            bother to clear them; we're just going to *free* them. */
1242                         if (level > 1 && !dma_pte_superpage(pte))
1243                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1244
1245                         dma_clear_pte(pte);
1246                         if (!first_pte)
1247                                 first_pte = pte;
1248                         last_pte = pte;
1249                 } else if (level > 1) {
1250                         /* Recurse down into a level that isn't *entirely* obsolete */
1251                         freelist = dma_pte_clear_level(domain, level - 1,
1252                                                        phys_to_virt(dma_pte_addr(pte)),
1253                                                        level_pfn, start_pfn, last_pfn,
1254                                                        freelist);
1255                 }
1256 next:
1257                 pfn += level_size(level);
1258         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1259
1260         if (first_pte)
1261                 domain_flush_cache(domain, first_pte,
1262                                    (void *)++last_pte - (void *)first_pte);
1263
1264         return freelist;
1265 }
1266
1267 /* We can't just free the pages because the IOMMU may still be walking
1268    the page tables, and may have cached the intermediate levels. The
1269    pages can only be freed after the IOTLB flush has been done. */
1270 static struct page *domain_unmap(struct dmar_domain *domain,
1271                                  unsigned long start_pfn,
1272                                  unsigned long last_pfn,
1273                                  struct page *freelist)
1274 {
1275         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1276         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1277         BUG_ON(start_pfn > last_pfn);
1278
1279         /* we don't need lock here; nobody else touches the iova range */
1280         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1281                                        domain->pgd, 0, start_pfn, last_pfn,
1282                                        freelist);
1283
1284         /* free pgd */
1285         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1286                 struct page *pgd_page = virt_to_page(domain->pgd);
1287                 pgd_page->freelist = freelist;
1288                 freelist = pgd_page;
1289
1290                 domain->pgd = NULL;
1291         }
1292
1293         return freelist;
1294 }
1295
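/*
 * Release a list of page-table pages collected by domain_unmap().  This may
 * only run after the IOTLB has been flushed, since the hardware could still
 * be walking (or caching) the old tables until then.
 */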
1296 static void dma_free_pagelist(struct page *freelist)
1297 {
1298         struct page *pg;
1299
1300         while ((pg = freelist)) {
1301                 freelist = pg->freelist;
1302                 free_pgtable_page(page_address(pg));
1303         }
1304 }
1305
1306 /* iommu handling */
1307 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1308 {
1309         struct root_entry *root;
1310         unsigned long flags;
1311
1312         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1313         if (!root) {
1314                 pr_err("Allocating root entry for %s failed\n",
1315                         iommu->name);
1316                 return -ENOMEM;
1317         }
1318
1319         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1320
1321         spin_lock_irqsave(&iommu->lock, flags);
1322         iommu->root_entry = root;
1323         spin_unlock_irqrestore(&iommu->lock, flags);
1324
1325         return 0;
1326 }
1327
1328 static void iommu_set_root_entry(struct intel_iommu *iommu)
1329 {
1330         u64 addr;
1331         u32 sts;
1332         unsigned long flag;
1333
1334         addr = virt_to_phys(iommu->root_entry);
1335         if (sm_supported(iommu))
1336                 addr |= DMA_RTADDR_SMT;
1337
1338         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1339         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1340
1341         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1342
1343         /* Make sure hardware complete it */
1344         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1345                       readl, (sts & DMA_GSTS_RTPS), sts);
1346
1347         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1348
1349         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1350         if (sm_supported(iommu))
1351                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1352         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1353 }
1354
1355 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1356 {
1357         u32 val;
1358         unsigned long flag;
1359
1360         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1361                 return;
1362
1363         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1364         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1365
1366         /* Make sure hardware complete it */
1367         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1368                       readl, (!(val & DMA_GSTS_WBFS)), val);
1369
1370         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1371 }
1372
1373 /* Invalidate the context-cache using the register-based interface */
1374 static void __iommu_flush_context(struct intel_iommu *iommu,
1375                                   u16 did, u16 source_id, u8 function_mask,
1376                                   u64 type)
1377 {
1378         u64 val = 0;
1379         unsigned long flag;
1380
1381         switch (type) {
1382         case DMA_CCMD_GLOBAL_INVL:
1383                 val = DMA_CCMD_GLOBAL_INVL;
1384                 break;
1385         case DMA_CCMD_DOMAIN_INVL:
1386                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1387                 break;
1388         case DMA_CCMD_DEVICE_INVL:
1389                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1390                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1391                 break;
1392         default:
1393                 BUG();
1394         }
1395         val |= DMA_CCMD_ICC;
1396
1397         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1398         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1399
1400         /* Make sure hardware complete it */
1401         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1402                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1403
1404         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1405 }
1406
1407 /* Invalidate the IOTLB using the register-based interface */
1408 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1409                                 u64 addr, unsigned int size_order, u64 type)
1410 {
1411         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1412         u64 val = 0, val_iva = 0;
1413         unsigned long flag;
1414
1415         switch (type) {
1416         case DMA_TLB_GLOBAL_FLUSH:
1417                 /* global flush doesn't need to set IVA_REG */
1418                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1419                 break;
1420         case DMA_TLB_DSI_FLUSH:
1421                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1422                 break;
1423         case DMA_TLB_PSI_FLUSH:
1424                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1425                 /* IH bit is passed in as part of address */
1426                 val_iva = size_order | addr;
1427                 break;
1428         default:
1429                 BUG();
1430         }
1431         /* Note: set drain read/write */
1432 #if 0
1433         /*
1434          * This is probably to be super secure.. Looks like we can
1435          * This is probably only there to be extra safe; it looks like we
1436          * can ignore it without any impact.
1437         if (cap_read_drain(iommu->cap))
1438                 val |= DMA_TLB_READ_DRAIN;
1439 #endif
1440         if (cap_write_drain(iommu->cap))
1441                 val |= DMA_TLB_WRITE_DRAIN;
1442
1443         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1444         /* Note: Only uses first TLB reg currently */
1445         if (val_iva)
1446                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1447         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1448
1449         /* Make sure hardware complete it */
1450         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1451                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1452
1453         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1454
1455         /* check IOTLB invalidation granularity */
1456         if (DMA_TLB_IAIG(val) == 0)
1457                 pr_err("Flush IOTLB failed\n");
1458         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1459                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1460                         (unsigned long long)DMA_TLB_IIRG(type),
1461                         (unsigned long long)DMA_TLB_IAIG(val));
1462 }
1463
1464 static struct device_domain_info *
1465 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1466                          u8 bus, u8 devfn)
1467 {
1468         struct device_domain_info *info;
1469
1470         assert_spin_locked(&device_domain_lock);
1471
1472         if (!iommu->qi)
1473                 return NULL;
1474
1475         list_for_each_entry(info, &domain->devices, link)
1476                 if (info->iommu == iommu && info->bus == bus &&
1477                     info->devfn == devfn) {
1478                         if (info->ats_supported && info->dev)
1479                                 return info;
1480                         break;
1481                 }
1482
1483         return NULL;
1484 }
1485
1486 static void domain_update_iotlb(struct dmar_domain *domain)
1487 {
1488         struct device_domain_info *info;
1489         bool has_iotlb_device = false;
1490
1491         assert_spin_locked(&device_domain_lock);
1492
1493         list_for_each_entry(info, &domain->devices, link)
1494                 if (info->ats_enabled) {
1495                         has_iotlb_device = true;
1496                         break;
1497                 }
1498
1499         if (!has_iotlb_device) {
1500                 struct subdev_domain_info *sinfo;
1501
1502                 list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1503                         info = get_domain_info(sinfo->pdev);
1504                         if (info && info->ats_enabled) {
1505                                 has_iotlb_device = true;
1506                                 break;
1507                         }
1508                 }
1509         }
1510
1511         domain->has_iotlb_device = has_iotlb_device;
1512 }
1513
1514 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1515 {
1516         struct pci_dev *pdev;
1517
1518         assert_spin_locked(&device_domain_lock);
1519
1520         if (!info || !dev_is_pci(info->dev))
1521                 return;
1522
1523         pdev = to_pci_dev(info->dev);
1524         /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1525          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1526          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1527          * reserved, which should be set to 0.
1528          */
1529         if (!ecap_dit(info->iommu->ecap))
1530                 info->pfsid = 0;
1531         else {
1532                 struct pci_dev *pf_pdev;
1533
1534                 /* pdev will be returned if device is not a vf */
1535                 pf_pdev = pci_physfn(pdev);
1536                 info->pfsid = pci_dev_id(pf_pdev);
1537         }
1538
1539 #ifdef CONFIG_INTEL_IOMMU_SVM
1540         /* The PCIe spec, in its wisdom, declares that the behaviour of
1541            the device is undefined if you enable PASID support after ATS
1542            support. So always enable PASID support on devices which
1543            have it, even if we can't yet know if we're ever going to
1544            use it. */
1545         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1546                 info->pasid_enabled = 1;
1547
1548         if (info->pri_supported &&
1549             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1550             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1551                 info->pri_enabled = 1;
1552 #endif
1553         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1554             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1555                 info->ats_enabled = 1;
1556                 domain_update_iotlb(info->domain);
1557                 info->ats_qdep = pci_ats_queue_depth(pdev);
1558         }
1559 }
1560
1561 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1562 {
1563         struct pci_dev *pdev;
1564
1565         assert_spin_locked(&device_domain_lock);
1566
1567         if (!dev_is_pci(info->dev))
1568                 return;
1569
1570         pdev = to_pci_dev(info->dev);
1571
1572         if (info->ats_enabled) {
1573                 pci_disable_ats(pdev);
1574                 info->ats_enabled = 0;
1575                 domain_update_iotlb(info->domain);
1576         }
1577 #ifdef CONFIG_INTEL_IOMMU_SVM
1578         if (info->pri_enabled) {
1579                 pci_disable_pri(pdev);
1580                 info->pri_enabled = 0;
1581         }
1582         if (info->pasid_enabled) {
1583                 pci_disable_pasid(pdev);
1584                 info->pasid_enabled = 0;
1585         }
1586 #endif
1587 }
1588
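/*
 * Issue a device IOTLB (ATS) invalidation for a single device. The
 * source-id is built from the device's bus and devfn; this is a no-op if
 * ATS is not enabled on the device.
 */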
1589 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1590                                     u64 addr, unsigned int mask)
1591 {
1592         u16 sid, qdep;
1593
1594         if (!info || !info->ats_enabled)
1595                 return;
1596
1597         sid = info->bus << 8 | info->devfn;
1598         qdep = info->ats_qdep;
1599         qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1600                            qdep, addr, mask);
1601 }
1602
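/*
 * Flush the device IOTLBs of all devices and subdevices attached to the
 * domain. Skipped entirely when the domain has no ATS-enabled device.
 */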
1603 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1604                                   u64 addr, unsigned mask)
1605 {
1606         unsigned long flags;
1607         struct device_domain_info *info;
1608         struct subdev_domain_info *sinfo;
1609
1610         if (!domain->has_iotlb_device)
1611                 return;
1612
1613         spin_lock_irqsave(&device_domain_lock, flags);
1614         list_for_each_entry(info, &domain->devices, link)
1615                 __iommu_flush_dev_iotlb(info, addr, mask);
1616
1617         list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1618                 info = get_domain_info(sinfo->pdev);
1619                 __iommu_flush_dev_iotlb(info, addr, mask);
1620         }
1621         spin_unlock_irqrestore(&device_domain_lock, flags);
1622 }
1623
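/*
 * PASID-based IOTLB invalidation for first-level translation: flush the
 * domain's default PASID (if set) and PASID_RID2PASID for the attached
 * devices.
 */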
1624 static void domain_flush_piotlb(struct intel_iommu *iommu,
1625                                 struct dmar_domain *domain,
1626                                 u64 addr, unsigned long npages, bool ih)
1627 {
1628         u16 did = domain->iommu_did[iommu->seq_id];
1629
1630         if (domain->default_pasid)
1631                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1632                                 addr, npages, ih);
1633
1634         if (!list_empty(&domain->devices))
1635                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1636 }
1637
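/*
 * Page-selective-within-domain IOTLB invalidation. The address mask is the
 * order of the page count rounded up to a power of two; 'ih' sets the
 * invalidation hint and 'map' indicates a non-present to present change.
 */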
1638 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1639                                   struct dmar_domain *domain,
1640                                   unsigned long pfn, unsigned int pages,
1641                                   int ih, int map)
1642 {
1643         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1644         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1645         u16 did = domain->iommu_did[iommu->seq_id];
1646
1647         BUG_ON(pages == 0);
1648
1649         if (ih)
1650                 ih = 1 << 6;
1651
1652         if (domain_use_first_level(domain)) {
1653                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1654         } else {
1655                 /*
1656                  * Fall back to a domain-selective flush if there is no PSI support
1657                  * or the size is too big. PSI requires the page size to be 2^x and
1658                  * the base address to be naturally aligned to that size.
1659                  */
1660                 if (!cap_pgsel_inv(iommu->cap) ||
1661                     mask > cap_max_amask_val(iommu->cap))
1662                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1663                                                         DMA_TLB_DSI_FLUSH);
1664                 else
1665                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1666                                                         DMA_TLB_PSI_FLUSH);
1667         }
1668
1669         /*
1670          * In caching mode, changes of pages from non-present to present require
1671          * a flush. However, the device IOTLB does not need to be flushed here.
1672          */
1673         if (!cap_caching_mode(iommu->cap) || !map)
1674                 iommu_flush_dev_iotlb(domain, addr, mask);
1675 }
1676
1677 /* Notification for newly created mappings */
1678 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1679                                         struct dmar_domain *domain,
1680                                         unsigned long pfn, unsigned int pages)
1681 {
1682         /*
1683          * It's a non-present to present mapping. Only flush if we are in
1684          * caching mode and using second-level translation.
1685          */
1686         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1687                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1688         else
1689                 iommu_flush_write_buffer(iommu);
1690 }
1691
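/*
 * iommu_ops flush_iotlb_all callback: perform a domain-selective flush (or
 * a PASID-based flush for first-level domains) on every IOMMU the domain is
 * attached to, plus a device IOTLB flush when not in caching mode.
 */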
1692 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1693 {
1694         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1695         int idx;
1696
1697         for_each_domain_iommu(idx, dmar_domain) {
1698                 struct intel_iommu *iommu = g_iommus[idx];
1699                 u16 did = dmar_domain->iommu_did[iommu->seq_id];
1700
1701                 if (domain_use_first_level(dmar_domain))
1702                         domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1703                 else
1704                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1705                                                  DMA_TLB_DSI_FLUSH);
1706
1707                 if (!cap_caching_mode(iommu->cap))
1708                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1709                                               0, MAX_AGAW_PFN_WIDTH);
1710         }
1711 }
1712
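/*
 * Disable the protected low/high memory regions by clearing the Enable
 * Protected Memory bit and waiting for the protected region status bit to
 * clear.
 */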
1713 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1714 {
1715         u32 pmen;
1716         unsigned long flags;
1717
1718         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1719                 return;
1720
1721         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1722         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1723         pmen &= ~DMA_PMEN_EPM;
1724         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1725
1726         /* wait for the protected region status bit to clear */
1727         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1728                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1729
1730         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1731 }
1732
1733 static void iommu_enable_translation(struct intel_iommu *iommu)
1734 {
1735         u32 sts;
1736         unsigned long flags;
1737
1738         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1739         iommu->gcmd |= DMA_GCMD_TE;
1740         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1741
1742         /* Make sure hardware completes it */
1743         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1744                       readl, (sts & DMA_GSTS_TES), sts);
1745
1746         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1747 }
1748
1749 static void iommu_disable_translation(struct intel_iommu *iommu)
1750 {
1751         u32 sts;
1752         unsigned long flag;
1753
1754         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1755             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1756                 return;
1757
1758         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1759         iommu->gcmd &= ~DMA_GCMD_TE;
1760         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1761
1762                 /* Make sure hardware completes it */
1763         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1764                       readl, (!(sts & DMA_GSTS_TES)), sts);
1765
1766         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1767 }
1768
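/*
 * Allocate the per-IOMMU domain-id bitmap and the two-level domain pointer
 * array (in chunks of 256 entries), and reserve the domain ids that must
 * never be handed out to real domains.
 */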
1769 static int iommu_init_domains(struct intel_iommu *iommu)
1770 {
1771         u32 ndomains, nlongs;
1772         size_t size;
1773
1774         ndomains = cap_ndoms(iommu->cap);
1775         pr_debug("%s: Number of Domains supported <%d>\n",
1776                  iommu->name, ndomains);
1777         nlongs = BITS_TO_LONGS(ndomains);
1778
1779         spin_lock_init(&iommu->lock);
1780
1781         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1782         if (!iommu->domain_ids) {
1783                 pr_err("%s: Allocating domain id array failed\n",
1784                        iommu->name);
1785                 return -ENOMEM;
1786         }
1787
1788         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1789         iommu->domains = kzalloc(size, GFP_KERNEL);
1790
1791         if (iommu->domains) {
1792                 size = 256 * sizeof(struct dmar_domain *);
1793                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1794         }
1795
1796         if (!iommu->domains || !iommu->domains[0]) {
1797                 pr_err("%s: Allocating domain array failed\n",
1798                        iommu->name);
1799                 kfree(iommu->domain_ids);
1800                 kfree(iommu->domains);
1801                 iommu->domain_ids = NULL;
1802                 iommu->domains    = NULL;
1803                 return -ENOMEM;
1804         }
1805
1806         /*
1807          * If Caching mode is set, then invalid translations are tagged
1808          * with domain-id 0, hence we need to pre-allocate it. We also
1809          * use domain-id 0 as a marker for non-allocated domain-id, so
1810          * make sure it is not used for a real domain.
1811          */
1812         set_bit(0, iommu->domain_ids);
1813
1814         /*
1815          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1816          * entry for first-level or pass-through translation modes should
1817          * be programmed with a domain id different from those used for
1818          * second-level or nested translation. We reserve a domain id for
1819          * this purpose.
1820          */
1821         if (sm_supported(iommu))
1822                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1823
1824         return 0;
1825 }
1826
1827 static void disable_dmar_iommu(struct intel_iommu *iommu)
1828 {
1829         struct device_domain_info *info, *tmp;
1830         unsigned long flags;
1831
1832         if (!iommu->domains || !iommu->domain_ids)
1833                 return;
1834
1835         spin_lock_irqsave(&device_domain_lock, flags);
1836         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1837                 if (info->iommu != iommu)
1838                         continue;
1839
1840                 if (!info->dev || !info->domain)
1841                         continue;
1842
1843                 __dmar_remove_one_dev_info(info);
1844         }
1845         spin_unlock_irqrestore(&device_domain_lock, flags);
1846
1847         if (iommu->gcmd & DMA_GCMD_TE)
1848                 iommu_disable_translation(iommu);
1849 }
1850
1851 static void free_dmar_iommu(struct intel_iommu *iommu)
1852 {
1853         if ((iommu->domains) && (iommu->domain_ids)) {
1854                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1855                 int i;
1856
1857                 for (i = 0; i < elems; i++)
1858                         kfree(iommu->domains[i]);
1859                 kfree(iommu->domains);
1860                 kfree(iommu->domain_ids);
1861                 iommu->domains = NULL;
1862                 iommu->domain_ids = NULL;
1863         }
1864
1865         g_iommus[iommu->seq_id] = NULL;
1866
1867         /* free context mapping */
1868         free_context_table(iommu);
1869
1870 #ifdef CONFIG_INTEL_IOMMU_SVM
1871         if (pasid_supported(iommu)) {
1872                 if (ecap_prs(iommu->ecap))
1873                         intel_svm_finish_prq(iommu);
1874         }
1875         if (vccap_pasid(iommu->vccap))
1876                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1877
1878 #endif
1879 }
1880
1881 /*
1882  * Check and return whether first level is used by default for
1883  * DMA translation.
1884  */
1885 static bool first_level_by_default(void)
1886 {
1887         return scalable_mode_support() && intel_cap_flts_sanity();
1888 }
1889
1890 static struct dmar_domain *alloc_domain(int flags)
1891 {
1892         struct dmar_domain *domain;
1893
1894         domain = alloc_domain_mem();
1895         if (!domain)
1896                 return NULL;
1897
1898         memset(domain, 0, sizeof(*domain));
1899         domain->nid = NUMA_NO_NODE;
1900         domain->flags = flags;
1901         if (first_level_by_default())
1902                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1903         domain->has_iotlb_device = false;
1904         INIT_LIST_HEAD(&domain->devices);
1905         INIT_LIST_HEAD(&domain->subdevices);
1906
1907         return domain;
1908 }
1909
1910 /* Must be called with iommu->lock */
1911 static int domain_attach_iommu(struct dmar_domain *domain,
1912                                struct intel_iommu *iommu)
1913 {
1914         unsigned long ndomains;
1915         int num;
1916
1917         assert_spin_locked(&device_domain_lock);
1918         assert_spin_locked(&iommu->lock);
1919
1920         domain->iommu_refcnt[iommu->seq_id] += 1;
1921         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1922                 ndomains = cap_ndoms(iommu->cap);
1923                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1924
1925                 if (num >= ndomains) {
1926                         pr_err("%s: No free domain ids\n", iommu->name);
1927                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1928                         return -ENOSPC;
1929                 }
1930
1931                 set_bit(num, iommu->domain_ids);
1932                 set_iommu_domain(iommu, num, domain);
1933
1934                 domain->iommu_did[iommu->seq_id] = num;
1935                 domain->nid                      = iommu->node;
1936
1937                 domain_update_iommu_cap(domain);
1938         }
1939
1940         return 0;
1941 }
1942
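/*
 * Drop the domain's reference on this IOMMU; when the refcount hits zero,
 * release the domain id and update the cached domain capabilities. Must be
 * called with device_domain_lock and iommu->lock held.
 */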
1943 static void domain_detach_iommu(struct dmar_domain *domain,
1944                                 struct intel_iommu *iommu)
1945 {
1946         int num;
1947
1948         assert_spin_locked(&device_domain_lock);
1949         assert_spin_locked(&iommu->lock);
1950
1951         domain->iommu_refcnt[iommu->seq_id] -= 1;
1952         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1953                 num = domain->iommu_did[iommu->seq_id];
1954                 clear_bit(num, iommu->domain_ids);
1955                 set_iommu_domain(iommu, num, NULL);
1956
1957                 domain_update_iommu_cap(domain);
1958                 domain->iommu_did[iommu->seq_id] = 0;
1959         }
1960 }
1961
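/*
 * Round a guest address width up to the nearest width the page-table
 * hierarchy can express (12 bits plus a multiple of 9), capped at 64.
 */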
1962 static inline int guestwidth_to_adjustwidth(int gaw)
1963 {
1964         int agaw;
1965         int r = (gaw - 12) % 9;
1966
1967         if (r == 0)
1968                 agaw = gaw;
1969         else
1970                 agaw = gaw + 9 - r;
1971         if (agaw > 64)
1972                 agaw = 64;
1973         return agaw;
1974 }
1975
1976 static void domain_exit(struct dmar_domain *domain)
1977 {
1978
1979         /* Remove associated devices and clear attached or cached domains */
1980         domain_remove_dev_info(domain);
1981
1982         /* destroy iovas */
1983         if (domain->domain.type == IOMMU_DOMAIN_DMA)
1984                 iommu_put_dma_cookie(&domain->domain);
1985
1986         if (domain->pgd) {
1987                 struct page *freelist;
1988
1989                 freelist = domain_unmap(domain, 0,
1990                                         DOMAIN_MAX_PFN(domain->gaw), NULL);
1991                 dma_free_pagelist(freelist);
1992         }
1993
1994         free_domain_mem(domain);
1995 }
1996
1997 /*
1998  * Get the PASID directory size for scalable mode context entry.
1999  * Value of X in the PDTS field of a scalable mode context entry
2000  * indicates a PASID directory with 2^(X + 7) entries.
2001  */
2002 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2003 {
2004         int pds, max_pde;
2005
2006         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2007         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2008         if (pds < 7)
2009                 return 0;
2010
2011         return pds - 7;
2012 }
2013
2014 /*
2015  * Set the RID_PASID field of a scalable mode context entry. The
2016  * IOMMU hardware will use the PASID value set in this field for
2017  * DMA translations of DMA requests without PASID.
2018  */
2019 static inline void
2020 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2021 {
2022         context->hi |= pasid & ((1 << 20) - 1);
2023 }
2024
2025 /*
2026  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2027  * entry.
2028  */
2029 static inline void context_set_sm_dte(struct context_entry *context)
2030 {
2031         context->lo |= (1 << 2);
2032 }
2033
2034 /*
2035  * Set the PRE(Page Request Enable) field of a scalable mode context
2036  * entry.
2037  */
2038 static inline void context_set_sm_pre(struct context_entry *context)
2039 {
2040         context->lo |= (1 << 4);
2041 }
2042
2043 /* Convert value to context PASID directory size field coding. */
2044 #define context_pdts(pds)       (((pds) & 0x7) << 9)
2045
2046 static int domain_context_mapping_one(struct dmar_domain *domain,
2047                                       struct intel_iommu *iommu,
2048                                       struct pasid_table *table,
2049                                       u8 bus, u8 devfn)
2050 {
2051         u16 did = domain->iommu_did[iommu->seq_id];
2052         int translation = CONTEXT_TT_MULTI_LEVEL;
2053         struct device_domain_info *info = NULL;
2054         struct context_entry *context;
2055         unsigned long flags;
2056         int ret;
2057
2058         WARN_ON(did == 0);
2059
2060         if (hw_pass_through && domain_type_is_si(domain))
2061                 translation = CONTEXT_TT_PASS_THROUGH;
2062
2063         pr_debug("Set context mapping for %02x:%02x.%d\n",
2064                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2065
2066         BUG_ON(!domain->pgd);
2067
2068         spin_lock_irqsave(&device_domain_lock, flags);
2069         spin_lock(&iommu->lock);
2070
2071         ret = -ENOMEM;
2072         context = iommu_context_addr(iommu, bus, devfn, 1);
2073         if (!context)
2074                 goto out_unlock;
2075
2076         ret = 0;
2077         if (context_present(context))
2078                 goto out_unlock;
2079
2080         /*
2081          * For kdump cases, old valid entries may be cached due to the
2082          * in-flight DMA and copied pgtable, but there is no unmapping
2083          * behaviour for them, thus we need an explicit cache flush for
2084          * the newly-mapped device. For kdump, at this point, the device
2085          * is supposed to finish reset at its driver probe stage, so no
2086          * in-flight DMA will exist, and we don't need to worry about it
2087          * hereafter.
2088          */
2089         if (context_copied(context)) {
2090                 u16 did_old = context_domain_id(context);
2091
2092                 if (did_old < cap_ndoms(iommu->cap)) {
2093                         iommu->flush.flush_context(iommu, did_old,
2094                                                    (((u16)bus) << 8) | devfn,
2095                                                    DMA_CCMD_MASK_NOBIT,
2096                                                    DMA_CCMD_DEVICE_INVL);
2097                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2098                                                  DMA_TLB_DSI_FLUSH);
2099                 }
2100         }
2101
2102         context_clear_entry(context);
2103
2104         if (sm_supported(iommu)) {
2105                 unsigned long pds;
2106
2107                 WARN_ON(!table);
2108
2109                 /* Setup the PASID DIR pointer: */
2110                 pds = context_get_sm_pds(table);
2111                 context->lo = (u64)virt_to_phys(table->table) |
2112                                 context_pdts(pds);
2113
2114                 /* Setup the RID_PASID field: */
2115                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2116
2117                 /*
2118                  * Setup the Device-TLB enable bit and Page request
2119                  * Enable bit:
2120                  */
2121                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2122                 if (info && info->ats_supported)
2123                         context_set_sm_dte(context);
2124                 if (info && info->pri_supported)
2125                         context_set_sm_pre(context);
2126         } else {
2127                 struct dma_pte *pgd = domain->pgd;
2128                 int agaw;
2129
2130                 context_set_domain_id(context, did);
2131
2132                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2133                         /*
2134                          * Skip top levels of page tables for an IOMMU which has
2135                          * a smaller agaw than the default. Unnecessary for PT mode.
2136                          */
2137                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2138                                 ret = -ENOMEM;
2139                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2140                                 if (!dma_pte_present(pgd))
2141                                         goto out_unlock;
2142                         }
2143
2144                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2145                         if (info && info->ats_supported)
2146                                 translation = CONTEXT_TT_DEV_IOTLB;
2147                         else
2148                                 translation = CONTEXT_TT_MULTI_LEVEL;
2149
2150                         context_set_address_root(context, virt_to_phys(pgd));
2151                         context_set_address_width(context, agaw);
2152                 } else {
2153                         /*
2154                          * In pass through mode, AW must be programmed to
2155                          * indicate the largest AGAW value supported by
2156                          * hardware. And ASR is ignored by hardware.
2157                          */
2158                         context_set_address_width(context, iommu->msagaw);
2159                 }
2160
2161                 context_set_translation_type(context, translation);
2162         }
2163
2164         context_set_fault_enable(context);
2165         context_set_present(context);
2166         if (!ecap_coherent(iommu->ecap))
2167                 clflush_cache_range(context, sizeof(*context));
2168
2169         /*
2170          * It's a non-present to present mapping. If hardware doesn't cache
2171          * non-present entries we only need to flush the write-buffer. If it
2172          * _does_ cache non-present entries, then it does so in the special
2173          * domain #0, which we have to flush:
2174          */
2175         if (cap_caching_mode(iommu->cap)) {
2176                 iommu->flush.flush_context(iommu, 0,
2177                                            (((u16)bus) << 8) | devfn,
2178                                            DMA_CCMD_MASK_NOBIT,
2179                                            DMA_CCMD_DEVICE_INVL);
2180                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2181         } else {
2182                 iommu_flush_write_buffer(iommu);
2183         }
2184         iommu_enable_dev_iotlb(info);
2185
2186         ret = 0;
2187
2188 out_unlock:
2189         spin_unlock(&iommu->lock);
2190         spin_unlock_irqrestore(&device_domain_lock, flags);
2191
2192         return ret;
2193 }
2194
2195 struct domain_context_mapping_data {
2196         struct dmar_domain *domain;
2197         struct intel_iommu *iommu;
2198         struct pasid_table *table;
2199 };
2200
2201 static int domain_context_mapping_cb(struct pci_dev *pdev,
2202                                      u16 alias, void *opaque)
2203 {
2204         struct domain_context_mapping_data *data = opaque;
2205
2206         return domain_context_mapping_one(data->domain, data->iommu,
2207                                           data->table, PCI_BUS_NUM(alias),
2208                                           alias & 0xff);
2209 }
2210
2211 static int
2212 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2213 {
2214         struct domain_context_mapping_data data;
2215         struct pasid_table *table;
2216         struct intel_iommu *iommu;
2217         u8 bus, devfn;
2218
2219         iommu = device_to_iommu(dev, &bus, &devfn);
2220         if (!iommu)
2221                 return -ENODEV;
2222
2223         table = intel_pasid_get_table(dev);
2224
2225         if (!dev_is_pci(dev))
2226                 return domain_context_mapping_one(domain, iommu, table,
2227                                                   bus, devfn);
2228
2229         data.domain = domain;
2230         data.iommu = iommu;
2231         data.table = table;
2232
2233         return pci_for_each_dma_alias(to_pci_dev(dev),
2234                                       &domain_context_mapping_cb, &data);
2235 }
2236
2237 static int domain_context_mapped_cb(struct pci_dev *pdev,
2238                                     u16 alias, void *opaque)
2239 {
2240         struct intel_iommu *iommu = opaque;
2241
2242         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2243 }
2244
2245 static int domain_context_mapped(struct device *dev)
2246 {
2247         struct intel_iommu *iommu;
2248         u8 bus, devfn;
2249
2250         iommu = device_to_iommu(dev, &bus, &devfn);
2251         if (!iommu)
2252                 return -ENODEV;
2253
2254         if (!dev_is_pci(dev))
2255                 return device_context_mapped(iommu, bus, devfn);
2256
2257         return !pci_for_each_dma_alias(to_pci_dev(dev),
2258                                        domain_context_mapped_cb, iommu);
2259 }
2260
2261 /* Returns the number of VT-d pages, but aligned to the MM page size */
2262 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2263                                             size_t size)
2264 {
2265         host_addr &= ~PAGE_MASK;
2266         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2267 }
2268
2269 /* Return largest possible superpage level for a given mapping */
2270 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2271                                           unsigned long iov_pfn,
2272                                           unsigned long phy_pfn,
2273                                           unsigned long pages)
2274 {
2275         int support, level = 1;
2276         unsigned long pfnmerge;
2277
2278         support = domain->iommu_superpage;
2279
2280         /* To use a large page, the virtual *and* physical addresses
2281            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2282            of them will mean we have to use smaller pages. So just
2283            merge them and check both at once. */
2284         pfnmerge = iov_pfn | phy_pfn;
2285
2286         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2287                 pages >>= VTD_STRIDE_SHIFT;
2288                 if (!pages)
2289                         break;
2290                 pfnmerge >>= VTD_STRIDE_SHIFT;
2291                 level++;
2292                 support--;
2293         }
2294         return level;
2295 }
2296
2297 /*
2298  * Ensure that old small page tables are removed to make room for superpage(s).
2299  * We're going to add new large pages, so make sure we don't remove their parent
2300  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2301  */
2302 static void switch_to_super_page(struct dmar_domain *domain,
2303                                  unsigned long start_pfn,
2304                                  unsigned long end_pfn, int level)
2305 {
2306         unsigned long lvl_pages = lvl_to_nr_pages(level);
2307         struct dma_pte *pte = NULL;
2308         int i;
2309
2310         while (start_pfn <= end_pfn) {
2311                 if (!pte)
2312                         pte = pfn_to_dma_pte(domain, start_pfn, &level);
2313
2314                 if (dma_pte_present(pte)) {
2315                         dma_pte_free_pagetable(domain, start_pfn,
2316                                                start_pfn + lvl_pages - 1,
2317                                                level + 1);
2318
2319                         for_each_domain_iommu(i, domain)
2320                                 iommu_flush_iotlb_psi(g_iommus[i], domain,
2321                                                       start_pfn, lvl_pages,
2322                                                       0, 0);
2323                 }
2324
2325                 pte++;
2326                 start_pfn += lvl_pages;
2327                 if (first_pte_in_page(pte))
2328                         pte = NULL;
2329         }
2330 }
2331
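/*
 * Install PTEs mapping [iov_pfn, iov_pfn + nr_pages) to phys_pfn with the
 * given protection bits, using superpages whenever alignment and remaining
 * size allow. Cache flushing of the leaf PTEs is left to the
 * iotlb_sync_map() callback.
 */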
2332 static int
2333 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2334                  unsigned long phys_pfn, unsigned long nr_pages, int prot)
2335 {
2336         unsigned int largepage_lvl = 0;
2337         unsigned long lvl_pages = 0;
2338         struct dma_pte *pte = NULL;
2339         phys_addr_t pteval;
2340         u64 attr;
2341
2342         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2343
2344         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2345                 return -EINVAL;
2346
2347         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2348         attr |= DMA_FL_PTE_PRESENT;
2349         if (domain_use_first_level(domain)) {
2350                 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
2351
2352                 if (domain->domain.type == IOMMU_DOMAIN_DMA) {
2353                         attr |= DMA_FL_PTE_ACCESS;
2354                         if (prot & DMA_PTE_WRITE)
2355                                 attr |= DMA_FL_PTE_DIRTY;
2356                 }
2357         }
2358
2359         pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2360
2361         while (nr_pages > 0) {
2362                 uint64_t tmp;
2363
2364                 if (!pte) {
2365                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2366                                         phys_pfn, nr_pages);
2367
2368                         pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2369                         if (!pte)
2370                                 return -ENOMEM;
2371                         /* It is a large page */
2372                         if (largepage_lvl > 1) {
2373                                 unsigned long end_pfn;
2374
2375                                 pteval |= DMA_PTE_LARGE_PAGE;
2376                                 end_pfn = ((iov_pfn + nr_pages) & level_mask(largepage_lvl)) - 1;
2377                                 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2378                         } else {
2379                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2380                         }
2381
2382                 }
2383                 /* We don't need a lock here; nobody else
2384                  * touches this IOVA range.
2385                  */
2386                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2387                 if (tmp) {
2388                         static int dumps = 5;
2389                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2390                                 iov_pfn, tmp, (unsigned long long)pteval);
2391                         if (dumps) {
2392                                 dumps--;
2393                                 debug_dma_dump_mappings(NULL);
2394                         }
2395                         WARN_ON(1);
2396                 }
2397
2398                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2399
2400                 BUG_ON(nr_pages < lvl_pages);
2401
2402                 nr_pages -= lvl_pages;
2403                 iov_pfn += lvl_pages;
2404                 phys_pfn += lvl_pages;
2405                 pteval += lvl_pages * VTD_PAGE_SIZE;
2406
2407                 /* If the next PTE would be the first in a new page, then we
2408                  * need to flush the cache on the entries we've just written.
2409                  * And then we'll need to recalculate 'pte', so clear it and
2410                  * let it get set again in the if (!pte) block above.
2411                  *
2412                  * If we're done (!nr_pages) we need to flush the cache too.
2413                  *
2414                  * Also if we've been setting superpages, we may need to
2415                  * recalculate 'pte' and switch back to smaller pages for the
2416                  * end of the mapping, if the trailing size is not enough to
2417                  * use another superpage (i.e. nr_pages < lvl_pages).
2418                  *
2419                  * We leave clflush for the leaf pte changes to iotlb_sync_map()
2420                  * callback.
2421                  */
2422                 pte++;
2423                 if (!nr_pages || first_pte_in_page(pte) ||
2424                     (largepage_lvl > 1 && nr_pages < lvl_pages))
2425                         pte = NULL;
2426         }
2427
2428         return 0;
2429 }
2430
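/*
 * Tear down the context entry for one (bus, devfn) and invalidate the
 * context cache, the PASID cache (in scalable mode), the IOTLB and the
 * device IOTLB for the old domain id.
 */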
2431 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2432 {
2433         struct intel_iommu *iommu = info->iommu;
2434         struct context_entry *context;
2435         unsigned long flags;
2436         u16 did_old;
2437
2438         if (!iommu)
2439                 return;
2440
2441         spin_lock_irqsave(&iommu->lock, flags);
2442         context = iommu_context_addr(iommu, bus, devfn, 0);
2443         if (!context) {
2444                 spin_unlock_irqrestore(&iommu->lock, flags);
2445                 return;
2446         }
2447
2448         if (sm_supported(iommu)) {
2449                 if (hw_pass_through && domain_type_is_si(info->domain))
2450                         did_old = FLPT_DEFAULT_DID;
2451                 else
2452                         did_old = info->domain->iommu_did[iommu->seq_id];
2453         } else {
2454                 did_old = context_domain_id(context);
2455         }
2456
2457         context_clear_entry(context);
2458         __iommu_flush_cache(iommu, context, sizeof(*context));
2459         spin_unlock_irqrestore(&iommu->lock, flags);
2460         iommu->flush.flush_context(iommu,
2461                                    did_old,
2462                                    (((u16)bus) << 8) | devfn,
2463                                    DMA_CCMD_MASK_NOBIT,
2464                                    DMA_CCMD_DEVICE_INVL);
2465
2466         if (sm_supported(iommu))
2467                 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2468
2469         iommu->flush.flush_iotlb(iommu,
2470                                  did_old,
2471                                  0,
2472                                  0,
2473                                  DMA_TLB_DSI_FLUSH);
2474
2475         __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2476 }
2477
2478 static inline void unlink_domain_info(struct device_domain_info *info)
2479 {
2480         assert_spin_locked(&device_domain_lock);
2481         list_del(&info->link);
2482         list_del(&info->global);
2483         if (info->dev)
2484                 dev_iommu_priv_set(info->dev, NULL);
2485 }
2486
2487 static void domain_remove_dev_info(struct dmar_domain *domain)
2488 {
2489         struct device_domain_info *info, *tmp;
2490         unsigned long flags;
2491
2492         spin_lock_irqsave(&device_domain_lock, flags);
2493         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2494                 __dmar_remove_one_dev_info(info);
2495         spin_unlock_irqrestore(&device_domain_lock, flags);
2496 }
2497
2498 struct dmar_domain *find_domain(struct device *dev)
2499 {
2500         struct device_domain_info *info;
2501
2502         if (unlikely(!dev || !dev->iommu))
2503                 return NULL;
2504
2505         if (unlikely(attach_deferred(dev)))
2506                 return NULL;
2507
2508         /* No locking here; we assume no domain exit in the normal case */
2509         info = get_domain_info(dev);
2510         if (likely(info))
2511                 return info->domain;
2512
2513         return NULL;
2514 }
2515
2516 static inline struct device_domain_info *
2517 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2518 {
2519         struct device_domain_info *info;
2520
2521         list_for_each_entry(info, &device_domain_list, global)
2522                 if (info->segment == segment && info->bus == bus &&
2523                     info->devfn == devfn)
2524                         return info;
2525
2526         return NULL;
2527 }
2528
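/*
 * Set up a first-level PASID entry for the device, skipping page-table
 * levels the IOMMU cannot walk and selecting 4- or 5-level paging from the
 * resulting AGAW.
 */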
2529 static int domain_setup_first_level(struct intel_iommu *iommu,
2530                                     struct dmar_domain *domain,
2531                                     struct device *dev,
2532                                     u32 pasid)
2533 {
2534         struct dma_pte *pgd = domain->pgd;
2535         int agaw, level;
2536         int flags = 0;
2537
2538         /*
2539          * Skip top levels of page tables for an IOMMU which has
2540          * a smaller agaw than the default. Unnecessary for PT mode.
2541          */
2542         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2543                 pgd = phys_to_virt(dma_pte_addr(pgd));
2544                 if (!dma_pte_present(pgd))
2545                         return -ENOMEM;
2546         }
2547
2548         level = agaw_to_level(agaw);
2549         if (level != 4 && level != 5)
2550                 return -EINVAL;
2551
2552         if (pasid != PASID_RID2PASID)
2553                 flags |= PASID_FLAG_SUPERVISOR_MODE;
2554         if (level == 5)
2555                 flags |= PASID_FLAG_FL5LP;
2556
2557         if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2558                 flags |= PASID_FLAG_PAGE_SNOOP;
2559
2560         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2561                                              domain->iommu_did[iommu->seq_id],
2562                                              flags);
2563 }
2564
2565 static bool dev_is_real_dma_subdevice(struct device *dev)
2566 {
2567         return dev && dev_is_pci(dev) &&
2568                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2569 }
2570
2571 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2572                                                     int bus, int devfn,
2573                                                     struct device *dev,
2574                                                     struct dmar_domain *domain)
2575 {
2576         struct dmar_domain *found = NULL;
2577         struct device_domain_info *info;
2578         unsigned long flags;
2579         int ret;
2580
2581         info = alloc_devinfo_mem();
2582         if (!info)
2583                 return NULL;
2584
2585         if (!dev_is_real_dma_subdevice(dev)) {
2586                 info->bus = bus;
2587                 info->devfn = devfn;
2588                 info->segment = iommu->segment;
2589         } else {
2590                 struct pci_dev *pdev = to_pci_dev(dev);
2591
2592                 info->bus = pdev->bus->number;
2593                 info->devfn = pdev->devfn;
2594                 info->segment = pci_domain_nr(pdev->bus);
2595         }
2596
2597         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2598         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2599         info->ats_qdep = 0;
2600         info->dev = dev;
2601         info->domain = domain;
2602         info->iommu = iommu;
2603         info->pasid_table = NULL;
2604         info->auxd_enabled = 0;
2605         INIT_LIST_HEAD(&info->subdevices);
2606
2607         if (dev && dev_is_pci(dev)) {
2608                 struct pci_dev *pdev = to_pci_dev(info->dev);
2609
2610                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2611                     pci_ats_supported(pdev) &&
2612                     dmar_find_matched_atsr_unit(pdev))
2613                         info->ats_supported = 1;
2614
2615                 if (sm_supported(iommu)) {
2616                         if (pasid_supported(iommu)) {
2617                                 int features = pci_pasid_features(pdev);
2618                                 if (features >= 0)
2619                                         info->pasid_supported = features | 1;
2620                         }
2621
2622                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2623                             pci_pri_supported(pdev))
2624                                 info->pri_supported = 1;
2625                 }
2626         }
2627
2628         spin_lock_irqsave(&device_domain_lock, flags);
2629         if (dev)
2630                 found = find_domain(dev);
2631
2632         if (!found) {
2633                 struct device_domain_info *info2;
2634                 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2635                                                        info->devfn);
2636                 if (info2) {
2637                         found      = info2->domain;
2638                         info2->dev = dev;
2639                 }
2640         }
2641
2642         if (found) {
2643                 spin_unlock_irqrestore(&device_domain_lock, flags);
2644                 free_devinfo_mem(info);
2645                 /* Caller must free the original domain */
2646                 return found;
2647         }
2648
2649         spin_lock(&iommu->lock);
2650         ret = domain_attach_iommu(domain, iommu);
2651         spin_unlock(&iommu->lock);
2652
2653         if (ret) {
2654                 spin_unlock_irqrestore(&device_domain_lock, flags);
2655                 free_devinfo_mem(info);
2656                 return NULL;
2657         }
2658
2659         list_add(&info->link, &domain->devices);
2660         list_add(&info->global, &device_domain_list);
2661         if (dev)
2662                 dev_iommu_priv_set(dev, info);
2663         spin_unlock_irqrestore(&device_domain_lock, flags);
2664
2665         /* PASID table is mandatory for a PCI device in scalable mode. */
2666         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2667                 ret = intel_pasid_alloc_table(dev);
2668                 if (ret) {
2669                         dev_err(dev, "PASID table allocation failed\n");
2670                         dmar_remove_one_dev_info(dev);
2671                         return NULL;
2672                 }
2673
2674                 /* Setup the PASID entry for requests without PASID: */
2675                 spin_lock_irqsave(&iommu->lock, flags);
2676                 if (hw_pass_through && domain_type_is_si(domain))
2677                         ret = intel_pasid_setup_pass_through(iommu, domain,
2678                                         dev, PASID_RID2PASID);
2679                 else if (domain_use_first_level(domain))
2680                         ret = domain_setup_first_level(iommu, domain, dev,
2681                                         PASID_RID2PASID);
2682                 else
2683                         ret = intel_pasid_setup_second_level(iommu, domain,
2684                                         dev, PASID_RID2PASID);
2685                 spin_unlock_irqrestore(&iommu->lock, flags);
2686                 if (ret) {
2687                         dev_err(dev, "Setup RID2PASID failed\n");
2688                         dmar_remove_one_dev_info(dev);
2689                         return NULL;
2690                 }
2691         }
2692
2693         if (dev && domain_context_mapping(domain, dev)) {
2694                 dev_err(dev, "Domain context map failed\n");
2695                 dmar_remove_one_dev_info(dev);
2696                 return NULL;
2697         }
2698
2699         return domain;
2700 }
2701
2702 static int iommu_domain_identity_map(struct dmar_domain *domain,
2703                                      unsigned long first_vpfn,
2704                                      unsigned long last_vpfn)
2705 {
2706         /*
2707          * The RMRR range might overlap with the physical memory range,
2708          * so clear it first.
2709          */
2710         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2711
2712         return __domain_mapping(domain, first_vpfn,
2713                                 first_vpfn, last_vpfn - first_vpfn + 1,
2714                                 DMA_PTE_READ|DMA_PTE_WRITE);
2715 }
2716
2717 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2718
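/*
 * Build the static identity (si) domain. Unless hardware pass-through is
 * used, identity-map every usable physical memory range and the RMRR
 * regions into it.
 */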
2719 static int __init si_domain_init(int hw)
2720 {
2721         struct dmar_rmrr_unit *rmrr;
2722         struct device *dev;
2723         int i, nid, ret;
2724
2725         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2726         if (!si_domain)
2727                 return -EFAULT;
2728
2729         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2730                 domain_exit(si_domain);
2731                 return -EFAULT;
2732         }
2733
2734         if (hw)
2735                 return 0;
2736
2737         for_each_online_node(nid) {
2738                 unsigned long start_pfn, end_pfn;
2739                 int i;
2740
2741                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2742                         ret = iommu_domain_identity_map(si_domain,
2743                                         mm_to_dma_pfn(start_pfn),
2744                                         mm_to_dma_pfn(end_pfn));
2745                         if (ret)
2746                                 return ret;
2747                 }
2748         }
2749
2750         /*
2751          * Identity map the RMRRs so that devices with RMRRs can also use
2752          * the si_domain.
2753          */
2754         for_each_rmrr_units(rmrr) {
2755                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2756                                           i, dev) {
2757                         unsigned long long start = rmrr->base_address;
2758                         unsigned long long end = rmrr->end_address;
2759
2760                         if (WARN_ON(end < start ||
2761                                     end >> agaw_to_width(si_domain->agaw)))
2762                                 continue;
2763
2764                         ret = iommu_domain_identity_map(si_domain,
2765                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2766                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2767                         if (ret)
2768                                 return ret;
2769                 }
2770         }
2771
2772         return 0;
2773 }
2774
2775 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2776 {
2777         struct dmar_domain *ndomain;
2778         struct intel_iommu *iommu;
2779         u8 bus, devfn;
2780
2781         iommu = device_to_iommu(dev, &bus, &devfn);
2782         if (!iommu)
2783                 return -ENODEV;
2784
2785         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2786         if (ndomain != domain)
2787                 return -EBUSY;
2788
2789         return 0;
2790 }
2791
2792 static bool device_has_rmrr(struct device *dev)
2793 {
2794         struct dmar_rmrr_unit *rmrr;
2795         struct device *tmp;
2796         int i;
2797
2798         rcu_read_lock();
2799         for_each_rmrr_units(rmrr) {
2800                 /*
2801                  * Return TRUE if this RMRR contains the device that
2802                  * is passed in.
2803                  */
2804                 for_each_active_dev_scope(rmrr->devices,
2805                                           rmrr->devices_cnt, i, tmp)
2806                         if (tmp == dev ||
2807                             is_downstream_to_pci_bridge(dev, tmp)) {
2808                                 rcu_read_unlock();
2809                                 return true;
2810                         }
2811         }
2812         rcu_read_unlock();
2813         return false;
2814 }
2815
2816 /**
2817  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2818  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2819  * @dev: device handle
2820  *
2821  * We assume that PCI USB devices with RMRRs have them largely
2822  * for historical reasons and that the RMRR space is not actively used post
2823  * boot.  This exclusion may change if vendors begin to abuse it.
2824  *
2825  * The same exception is made for graphics devices, with the requirement that
2826  * any use of the RMRR regions will be torn down before assigning the device
2827  * to a guest.
2828  *
2829  * Return: true if the RMRR is relaxable, false otherwise
2830  */
2831 static bool device_rmrr_is_relaxable(struct device *dev)
2832 {
2833         struct pci_dev *pdev;
2834
2835         if (!dev_is_pci(dev))
2836                 return false;
2837
2838         pdev = to_pci_dev(dev);
2839         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2840                 return true;
2841         else
2842                 return false;
2843 }
2844
2845 /*
2846  * There are a couple of cases where we need to restrict the functionality of
2847  * devices associated with RMRRs.  The first is when evaluating a device for
2848  * identity mapping because problems exist when devices are moved in and out
2849  * of domains and their respective RMRR information is lost.  This means that
2850  * a device with associated RMRRs will never be in a "passthrough" domain.
2851  * The second is use of the device through the IOMMU API.  This interface
2852  * expects to have full control of the IOVA space for the device.  We cannot
2853  * satisfy both the requirement that RMRR access is maintained and have an
2854  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2855  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2856  * We therefore prevent devices associated with an RMRR from participating in
2857  * the IOMMU API, which eliminates them from device assignment.
2858  *
2859  * In both cases, devices which have relaxable RMRRs are not concerned by this
2860  * restriction. See device_rmrr_is_relaxable comment.
2861  */
2862 static bool device_is_rmrr_locked(struct device *dev)
2863 {
2864         if (!device_has_rmrr(dev))
2865                 return false;
2866
2867         if (device_rmrr_is_relaxable(dev))
2868                 return false;
2869
2870         return true;
2871 }
2872
2873 /*
2874  * Return the required default domain type for a specific device.
2875  *
2876  * @dev: the device in question
2878  *
2879  * Returns:
2880  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2881  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2882  *  - 0: both identity and dynamic domains work for this device
2883  */
2884 static int device_def_domain_type(struct device *dev)
2885 {
2886         if (dev_is_pci(dev)) {
2887                 struct pci_dev *pdev = to_pci_dev(dev);
2888
2889                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2890                         return IOMMU_DOMAIN_IDENTITY;
2891
2892                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2893                         return IOMMU_DOMAIN_IDENTITY;
2894         }
2895
2896         return 0;
2897 }
2898
2899 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2900 {
2901         /*
2902          * Start from a sane IOMMU hardware state.
2903          * If queued invalidation was already initialized by us
2904          * (for example, while enabling interrupt remapping) then
2905          * things are already rolling from a sane state.
2906          */
2907         if (!iommu->qi) {
2908                 /*
2909                  * Clear any previous faults.
2910                  */
2911                 dmar_fault(-1, iommu);
2912                 /*
2913                  * Disable queued invalidation if supported and already enabled
2914                  * before OS handover.
2915                  */
2916                 dmar_disable_qi(iommu);
2917         }
2918
2919         if (dmar_enable_qi(iommu)) {
2920                 /*
2921                  * Queued invalidation is not enabled; use register-based invalidation.
2922                  */
2923                 iommu->flush.flush_context = __iommu_flush_context;
2924                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2925                 pr_info("%s: Using Register based invalidation\n",
2926                         iommu->name);
2927         } else {
2928                 iommu->flush.flush_context = qi_flush_context;
2929                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2930                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2931         }
2932 }
2933
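/*
 * Copy one bus worth of context entries from the old kernel's tables (the
 * kdump case), marking each copied entry and reserving the domain ids it
 * references so they are not reused.
 */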
2934 static int copy_context_table(struct intel_iommu *iommu,
2935                               struct root_entry *old_re,
2936                               struct context_entry **tbl,
2937                               int bus, bool ext)
2938 {
2939         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2940         struct context_entry *new_ce = NULL, ce;
2941         struct context_entry *old_ce = NULL;
2942         struct root_entry re;
2943         phys_addr_t old_ce_phys;
2944
2945         tbl_idx = ext ? bus * 2 : bus;
2946         memcpy(&re, old_re, sizeof(re));
2947
2948         for (devfn = 0; devfn < 256; devfn++) {
2949                 /* First calculate the correct index */
2950                 idx = (ext ? devfn * 2 : devfn) % 256;
2951
2952                 if (idx == 0) {
2953                         /* First save what we may have and clean up */
2954                         if (new_ce) {
2955                                 tbl[tbl_idx] = new_ce;
2956                                 __iommu_flush_cache(iommu, new_ce,
2957                                                     VTD_PAGE_SIZE);
2958                                 pos = 1;
2959                         }
2960
2961                         if (old_ce)
2962                                 memunmap(old_ce);
2963
2964                         ret = 0;
2965                         if (devfn < 0x80)
2966                                 old_ce_phys = root_entry_lctp(&re);
2967                         else
2968                                 old_ce_phys = root_entry_uctp(&re);
2969
2970                         if (!old_ce_phys) {
2971                                 if (ext && devfn == 0) {
2972                                         /* No LCTP, try UCTP */
2973                                         devfn = 0x7f;
2974                                         continue;
2975                                 } else {
2976                                         goto out;
2977                                 }
2978                         }
2979
2980                         ret = -ENOMEM;
2981                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2982                                         MEMREMAP_WB);
2983                         if (!old_ce)
2984                                 goto out;
2985
2986                         new_ce = alloc_pgtable_page(iommu->node);
2987                         if (!new_ce)
2988                                 goto out_unmap;
2989
2990                         ret = 0;
2991                 }
2992
2993                 /* Now copy the context entry */
2994                 memcpy(&ce, old_ce + idx, sizeof(ce));
2995
2996                 if (!__context_present(&ce))
2997                         continue;
2998
2999                 did = context_domain_id(&ce);
3000                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3001                         set_bit(did, iommu->domain_ids);
3002
3003                 /*
3004                  * We need a marker for copied context entries. This
3005                  * marker needs to work for the old format as well as
3006                  * for extended context entries.
3007                  *
3008                  * Bit 67 of the context entry is used. In the old
3009                  * format this bit is available to software, in the
3010                  * extended format it is the PGE bit, but PGE is ignored
3011                  * by HW if PASIDs are disabled (and thus still
3012                  * available).
3013                  *
3014                  * So disable PASIDs first and then mark the entry
3015                  * copied. This means that we don't copy PASID
3016                  * translations from the old kernel, but this is fine as
3017                  * faults there are not fatal.
3018                  */
3019                 context_clear_pasid_enable(&ce);
3020                 context_set_copied(&ce);
3021
3022                 new_ce[idx] = ce;
3023         }
3024
3025         tbl[tbl_idx + pos] = new_ce;
3026
3027         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3028
3029 out_unmap:
3030         memunmap(old_ce);
3031
3032 out:
3033         return ret;
3034 }
3035
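/*
 * Called when translation was left enabled by the previous kernel (typically
 * when booting a kdump kernel): take over the old root table by copying all
 * context tables into memory owned by this kernel and wiring them into the
 * freshly allocated root entries, without ever disabling translation.
 */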
3036 static int copy_translation_tables(struct intel_iommu *iommu)
3037 {
3038         struct context_entry **ctxt_tbls;
3039         struct root_entry *old_rt;
3040         phys_addr_t old_rt_phys;
3041         int ctxt_table_entries;
3042         unsigned long flags;
3043         u64 rtaddr_reg;
3044         int bus, ret;
3045         bool new_ext, ext;
3046
3047         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3048         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3049         new_ext    = !!ecap_ecs(iommu->ecap);
3050
3051         /*
3052          * The RTT bit can only be changed while translation is disabled,
3053          * but disabling translation would open a window for data
3054          * corruption. So bail out and don't copy anything if we would
3055          * have to change the bit.
3056          */
3057         if (new_ext != ext)
3058                 return -EINVAL;
3059
3060         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3061         if (!old_rt_phys)
3062                 return -EINVAL;
3063
3064         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3065         if (!old_rt)
3066                 return -ENOMEM;
3067
3068         /* This is too big for the stack - allocate it from slab */
3069         ctxt_table_entries = ext ? 512 : 256;
3070         ret = -ENOMEM;
3071         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3072         if (!ctxt_tbls)
3073                 goto out_unmap;
3074
3075         for (bus = 0; bus < 256; bus++) {
3076                 ret = copy_context_table(iommu, &old_rt[bus],
3077                                          ctxt_tbls, bus, ext);
3078                 if (ret) {
3079                         pr_err("%s: Failed to copy context table for bus %d\n",
3080                                 iommu->name, bus);
3081                         continue;
3082                 }
3083         }
3084
3085         spin_lock_irqsave(&iommu->lock, flags);
3086
3087         /* Context tables are copied, now write them to the root_entry table */
3088         for (bus = 0; bus < 256; bus++) {
3089                 int idx = ext ? bus * 2 : bus;
3090                 u64 val;
3091
3092                 if (ctxt_tbls[idx]) {
3093                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3094                         iommu->root_entry[bus].lo = val;
3095                 }
3096
3097                 if (!ext || !ctxt_tbls[idx + 1])
3098                         continue;
3099
3100                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3101                 iommu->root_entry[bus].hi = val;
3102         }
3103
3104         spin_unlock_irqrestore(&iommu->lock, flags);
3105
3106         kfree(ctxt_tbls);
3107
3108         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3109
3110         ret = 0;
3111
3112 out_unmap:
3113         memunmap(old_rt);
3114
3115         return ret;
3116 }
3117
3118 #ifdef CONFIG_INTEL_IOMMU_SVM
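/*
 * PASID (IOASID) allocation callbacks used when running as a guest on top of
 * a virtual VT-d that exposes the virtual command interface: allocations and
 * frees are forwarded to the host via the VCMD mechanism.
 */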
3119 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3120 {
3121         struct intel_iommu *iommu = data;
3122         ioasid_t ioasid;
3123
3124         if (!iommu)
3125                 return INVALID_IOASID;
3126         /*
3127          * The VT-d virtual command interface always uses the full 20-bit
3128          * PASID range. The host may partition the guest PASID range based
3129          * on its policies, but that is outside the guest's control.
3130          */
3131         if (min < PASID_MIN || max > intel_pasid_max_id)
3132                 return INVALID_IOASID;
3133
3134         if (vcmd_alloc_pasid(iommu, &ioasid))
3135                 return INVALID_IOASID;
3136
3137         return ioasid;
3138 }
3139
3140 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3141 {
3142         struct intel_iommu *iommu = data;
3143
3144         if (!iommu)
3145                 return;
3146         /*
3147          * The sanity check of the ioasid owner is done at an upper layer,
3148          * e.g. VFIO. We can only free the PASID when all devices are unbound.
3149          */
3150         if (ioasid_find(NULL, ioasid, NULL)) {
3151                 pr_alert("Cannot free active IOASID %d\n", ioasid);
3152                 return;
3153         }
3154         vcmd_free_pasid(iommu, ioasid);
3155 }
3156
3157 static void register_pasid_allocator(struct intel_iommu *iommu)
3158 {
3159         /*
3160          * If we are running in the host, there is no need for a custom
3161          * allocator, since PASIDs are allocated by the host system-wide.
3162          */
3163         if (!cap_caching_mode(iommu->cap))
3164                 return;
3165
3166         if (!sm_supported(iommu)) {
3167                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3168                 return;
3169         }
3170
3171         /*
3172          * Register a custom PASID allocator if we are running in a guest;
3173          * guest PASIDs must be obtained via the virtual command interface.
3174          * There can be multiple vIOMMUs in each guest but only one allocator
3175          * is active. All vIOMMU allocators eventually call the same host
3176          * allocator.
3177          */
3178         if (!vccap_pasid(iommu->vccap))
3179                 return;
3180
3181         pr_info("Register custom PASID allocator\n");
3182         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3183         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3184         iommu->pasid_allocator.pdata = (void *)iommu;
3185         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3186                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3187                 /*
3188                  * Disable scalable mode on this IOMMU if there is
3189                  * no custom allocator. Mixing SM-capable and non-SM
3190                  * vIOMMUs is not supported.
3191                  */
3192                 intel_iommu_sm = 0;
3193         }
3194 }
3195 #endif
3196
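/*
 * One-time DMA remapping setup for all DMAR units found at boot: allocate
 * per-IOMMU state, root entries and domain-id bitmaps, optionally copy
 * translation tables from the previous kernel, initialize the static
 * identity domain and enable fault reporting.
 */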
3197 static int __init init_dmars(void)
3198 {
3199         struct dmar_drhd_unit *drhd;
3200         struct intel_iommu *iommu;
3201         int ret;
3202
3203         /*
3204          * for each drhd
3205          *    allocate root
3206          *    initialize and program root entry to not present
3207          * endfor
3208          */
3209         for_each_drhd_unit(drhd) {
3210                 /*
3211                  * No lock needed: this is only incremented in the
3212                  * single-threaded kernel __init code path; all other
3213                  * accesses are read-only.
3214                  */
3215                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3216                         g_num_of_iommus++;
3217                         continue;
3218                 }
3219                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3220         }
3221
3222         /* Preallocate enough resources for IOMMU hot-addition */
3223         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3224                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3225
3226         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3227                         GFP_KERNEL);
3228         if (!g_iommus) {
3229                 pr_err("Allocating global iommu array failed\n");
3230                 ret = -ENOMEM;
3231                 goto error;
3232         }
3233
3234         ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3235         if (ret)
3236                 goto free_iommu;
3237
3238         for_each_iommu(iommu, drhd) {
3239                 if (drhd->ignored) {
3240                         iommu_disable_translation(iommu);
3241                         continue;
3242                 }
3243
3244                 /*
3245                  * Find the smallest maximum PASID size across all IOMMUs
3246                  * in the system; the system-wide PASID table must be no
3247                  * bigger than any single IOMMU supports.
3248                  */
3249                 if (pasid_supported(iommu)) {
3250                         u32 temp = 2 << ecap_pss(iommu->ecap);
3251
3252                         intel_pasid_max_id = min_t(u32, temp,
3253                                                    intel_pasid_max_id);
3254                 }
3255
3256                 g_iommus[iommu->seq_id] = iommu;
3257
3258                 intel_iommu_init_qi(iommu);
3259
3260                 ret = iommu_init_domains(iommu);
3261                 if (ret)
3262                         goto free_iommu;
3263
3264                 init_translation_status(iommu);
3265
3266                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3267                         iommu_disable_translation(iommu);
3268                         clear_translation_pre_enabled(iommu);
3269                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3270                                 iommu->name);
3271                 }
3272
3273                 /*
3274                  * TBD:
3275                  * We could share the same root & context tables
3276                  * among all IOMMUs. This needs to be split out later.
3277                  */
3278                 ret = iommu_alloc_root_entry(iommu);
3279                 if (ret)
3280                         goto free_iommu;
3281
3282                 if (translation_pre_enabled(iommu)) {
3283                         pr_info("Translation already enabled - trying to copy translation structures\n");
3284
3285                         ret = copy_translation_tables(iommu);
3286                         if (ret) {
3287                                 /*
3288                                  * We found the IOMMU with translation
3289                                  * enabled - but failed to copy over the
3290                                  * old root-entry table. Try to proceed
3291                                  * by disabling translation now and
3292                                  * allocating a clean root-entry table.
3293                                  * This might cause DMAR faults, but
3294                                  * probably the dump will still succeed.
3295                                  */
3296                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3297                                        iommu->name);
3298                                 iommu_disable_translation(iommu);
3299                                 clear_translation_pre_enabled(iommu);
3300                         } else {
3301                                 pr_info("Copied translation tables from previous kernel for %s\n",
3302                                         iommu->name);
3303                         }
3304                 }
3305
3306                 if (!ecap_pass_through(iommu->ecap))
3307                         hw_pass_through = 0;
3308                 intel_svm_check(iommu);
3309         }
3310
3311         /*
3312          * Now that qi is enabled on all iommus, set the root entry and flush
3313          * caches. This is required on some Intel X58 chipsets; otherwise
3314          * the flush_context function loops forever and the boot hangs.
3315          */
3316         for_each_active_iommu(iommu, drhd) {
3317                 iommu_flush_write_buffer(iommu);
3318 #ifdef CONFIG_INTEL_IOMMU_SVM
3319                 register_pasid_allocator(iommu);
3320 #endif
3321                 iommu_set_root_entry(iommu);
3322         }
3323
3324 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3325         dmar_map_gfx = 0;
3326 #endif
3327
3328         if (!dmar_map_gfx)
3329                 iommu_identity_mapping |= IDENTMAP_GFX;
3330
3331         check_tylersburg_isoch();
3332
3333         ret = si_domain_init(hw_pass_through);
3334         if (ret)
3335                 goto free_iommu;
3336
3337         /*
3338          * for each drhd
3339          *   enable fault log
3340          *   global invalidate context cache
3341          *   global invalidate iotlb
3342          *   enable translation
3343          */
3344         for_each_iommu(iommu, drhd) {
3345                 if (drhd->ignored) {
3346                         /*
3347                          * we always have to disable PMRs or DMA may fail on
3348                          * this device
3349                          */
3350                         if (force_on)
3351                                 iommu_disable_protect_mem_regions(iommu);
3352                         continue;
3353                 }
3354
3355                 iommu_flush_write_buffer(iommu);
3356
3357 #ifdef CONFIG_INTEL_IOMMU_SVM
3358                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3359                         /*
3360                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3361                          * could cause a lock race; drop the lock around this call.
3362                          */
3363                         up_write(&dmar_global_lock);
3364                         ret = intel_svm_enable_prq(iommu);
3365                         down_write(&dmar_global_lock);
3366                         if (ret)
3367                                 goto free_iommu;
3368                 }
3369 #endif
3370                 ret = dmar_set_interrupt(iommu);
3371                 if (ret)
3372                         goto free_iommu;
3373         }
3374
3375         return 0;
3376
3377 free_iommu:
3378         for_each_active_iommu(iommu, drhd) {
3379                 disable_dmar_iommu(iommu);
3380                 free_dmar_iommu(iommu);
3381         }
3382
3383         kfree(g_iommus);
3384
3385 error:
3386         return ret;
3387 }
3388
3389 static inline int iommu_domain_cache_init(void)
3390 {
3391         int ret = 0;
3392
3393         iommu_domain_cache = kmem_cache_create("iommu_domain",
3394                                          sizeof(struct dmar_domain),
3395                                          0,
3396                                          SLAB_HWCACHE_ALIGN,
3397
3398                                          NULL);
3399         if (!iommu_domain_cache) {
3400                 pr_err("Couldn't create iommu_domain cache\n");
3401                 ret = -ENOMEM;
3402         }
3403
3404         return ret;
3405 }
3406
3407 static inline int iommu_devinfo_cache_init(void)
3408 {
3409         int ret = 0;
3410
3411         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3412                                          sizeof(struct device_domain_info),
3413                                          0,
3414                                          SLAB_HWCACHE_ALIGN,
3415                                          NULL);
3416         if (!iommu_devinfo_cache) {
3417                 pr_err("Couldn't create devinfo cache\n");
3418                 ret = -ENOMEM;
3419         }
3420
3421         return ret;
3422 }
3423
3424 static int __init iommu_init_mempool(void)
3425 {
3426         int ret;
3427         ret = iova_cache_get();
3428         if (ret)
3429                 return ret;
3430
3431         ret = iommu_domain_cache_init();
3432         if (ret)
3433                 goto domain_error;
3434
3435         ret = iommu_devinfo_cache_init();
3436         if (!ret)
3437                 return ret;
3438
3439         kmem_cache_destroy(iommu_domain_cache);
3440 domain_error:
3441         iova_cache_put();
3442
3443         return -ENOMEM;
3444 }
3445
3446 static void __init iommu_exit_mempool(void)
3447 {
3448         kmem_cache_destroy(iommu_devinfo_cache);
3449         kmem_cache_destroy(iommu_domain_cache);
3450         iova_cache_put();
3451 }
3452
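/*
 * Mark DRHD units that can be skipped: units whose device scope matches no
 * present devices, and (when gfx mapping is disabled) units that cover
 * nothing but graphics devices.
 */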
3453 static void __init init_no_remapping_devices(void)
3454 {
3455         struct dmar_drhd_unit *drhd;
3456         struct device *dev;
3457         int i;
3458
3459         for_each_drhd_unit(drhd) {
3460                 if (!drhd->include_all) {
3461                         for_each_active_dev_scope(drhd->devices,
3462                                                   drhd->devices_cnt, i, dev)
3463                                 break;
3464                         /* ignore DMAR unit if no devices exist */
3465                         if (i == drhd->devices_cnt)
3466                                 drhd->ignored = 1;
3467                 }
3468         }
3469
3470         for_each_active_drhd_unit(drhd) {
3471                 if (drhd->include_all)
3472                         continue;
3473
3474                 for_each_active_dev_scope(drhd->devices,
3475                                           drhd->devices_cnt, i, dev)
3476                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3477                                 break;
3478                 if (i < drhd->devices_cnt)
3479                         continue;
3480
3481                 /* This IOMMU has *only* gfx devices. Mark it as gfx
3482                    dedicated and, if gfx mapping is disabled, bypass it. */
3483                 drhd->gfx_dedicated = 1;
3484                 if (!dmar_map_gfx)
3485                         drhd->ignored = 1;
3486         }
3487 }
3488
3489 #ifdef CONFIG_SUSPEND
3490 static int init_iommu_hw(void)
3491 {
3492         struct dmar_drhd_unit *drhd;
3493         struct intel_iommu *iommu = NULL;
3494
3495         for_each_active_iommu(iommu, drhd)
3496                 if (iommu->qi)
3497                         dmar_reenable_qi(iommu);
3498
3499         for_each_iommu(iommu, drhd) {
3500                 if (drhd->ignored) {
3501                         /*
3502                          * we always have to disable PMRs or DMA may fail on
3503                          * this device
3504                          */
3505                         if (force_on)
3506                                 iommu_disable_protect_mem_regions(iommu);
3507                         continue;
3508                 }
3509
3510                 iommu_flush_write_buffer(iommu);
3511                 iommu_set_root_entry(iommu);
3512                 iommu_enable_translation(iommu);
3513                 iommu_disable_protect_mem_regions(iommu);
3514         }
3515
3516         return 0;
3517 }
3518
3519 static void iommu_flush_all(void)
3520 {
3521         struct dmar_drhd_unit *drhd;
3522         struct intel_iommu *iommu;
3523
3524         for_each_active_iommu(iommu, drhd) {
3525                 iommu->flush.flush_context(iommu, 0, 0, 0,
3526                                            DMA_CCMD_GLOBAL_INVL);
3527                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3528                                          DMA_TLB_GLOBAL_FLUSH);
3529         }
3530 }
3531
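/*
 * Suspend support: flush all caches, disable translation and save the
 * fault-event registers (FECTL/FEDATA/FEADDR/FEUADDR) of every active IOMMU;
 * iommu_resume() re-initializes the hardware and restores those registers.
 */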
3532 static int iommu_suspend(void)
3533 {
3534         struct dmar_drhd_unit *drhd;
3535         struct intel_iommu *iommu = NULL;
3536         unsigned long flag;
3537
3538         for_each_active_iommu(iommu, drhd) {
3539                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3540                                              GFP_KERNEL);
3541                 if (!iommu->iommu_state)
3542                         goto nomem;
3543         }
3544
3545         iommu_flush_all();
3546
3547         for_each_active_iommu(iommu, drhd) {
3548                 iommu_disable_translation(iommu);
3549
3550                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3551
3552                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3553                         readl(iommu->reg + DMAR_FECTL_REG);
3554                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3555                         readl(iommu->reg + DMAR_FEDATA_REG);
3556                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3557                         readl(iommu->reg + DMAR_FEADDR_REG);
3558                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3559                         readl(iommu->reg + DMAR_FEUADDR_REG);
3560
3561                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3562         }
3563         return 0;
3564
3565 nomem:
3566         for_each_active_iommu(iommu, drhd)
3567                 kfree(iommu->iommu_state);
3568
3569         return -ENOMEM;
3570 }
3571
3572 static void iommu_resume(void)
3573 {
3574         struct dmar_drhd_unit *drhd;
3575         struct intel_iommu *iommu = NULL;
3576         unsigned long flag;
3577
3578         if (init_iommu_hw()) {
3579                 if (force_on)
3580                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3581                 else
3582                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3583                 return;
3584         }
3585
3586         for_each_active_iommu(iommu, drhd) {
3587
3588                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3589
3590                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3591                         iommu->reg + DMAR_FECTL_REG);
3592                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3593                         iommu->reg + DMAR_FEDATA_REG);
3594                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3595                         iommu->reg + DMAR_FEADDR_REG);
3596                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3597                         iommu->reg + DMAR_FEUADDR_REG);
3598
3599                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3600         }
3601
3602         for_each_active_iommu(iommu, drhd)
3603                 kfree(iommu->iommu_state);
3604 }
3605
3606 static struct syscore_ops iommu_syscore_ops = {
3607         .resume         = iommu_resume,
3608         .suspend        = iommu_suspend,
3609 };
3610
3611 static void __init init_iommu_pm_ops(void)
3612 {
3613         register_syscore_ops(&iommu_syscore_ops);
3614 }
3615
3616 #else
3617 static inline void init_iommu_pm_ops(void) {}
3618 #endif  /* CONFIG_SUSPEND */
3619
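/*
 * RMRRs describe memory regions that firmware expects certain devices to
 * keep using for DMA; a sane RMRR must be page aligned, non-empty and pass
 * the architecture specific checks.
 */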
3620 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3621 {
3622         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3623             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3624             rmrr->end_address <= rmrr->base_address ||
3625             arch_rmrr_sanity_check(rmrr))
3626                 return -EINVAL;
3627
3628         return 0;
3629 }
3630
3631 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3632 {
3633         struct acpi_dmar_reserved_memory *rmrr;
3634         struct dmar_rmrr_unit *rmrru;
3635
3636         rmrr = (struct acpi_dmar_reserved_memory *)header;
3637         if (rmrr_sanity_check(rmrr)) {
3638                 pr_warn(FW_BUG
3639                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3640                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3641                            rmrr->base_address, rmrr->end_address,
3642                            dmi_get_system_info(DMI_BIOS_VENDOR),
3643                            dmi_get_system_info(DMI_BIOS_VERSION),
3644                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3645                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3646         }
3647
3648         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3649         if (!rmrru)
3650                 goto out;
3651
3652         rmrru->hdr = header;
3653
3654         rmrru->base_address = rmrr->base_address;
3655         rmrru->end_address = rmrr->end_address;
3656
3657         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3658                                 ((void *)rmrr) + rmrr->header.length,
3659                                 &rmrru->devices_cnt);
3660         if (rmrru->devices_cnt && rmrru->devices == NULL)
3661                 goto free_rmrru;
3662
3663         list_add(&rmrru->list, &dmar_rmrr_units);
3664
3665         return 0;
3666 free_rmrru:
3667         kfree(rmrru);
3668 out:
3669         return -ENOMEM;
3670 }
3671
3672 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3673 {
3674         struct dmar_atsr_unit *atsru;
3675         struct acpi_dmar_atsr *tmp;
3676
3677         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3678                                 dmar_rcu_check()) {
3679                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3680                 if (atsr->segment != tmp->segment)
3681                         continue;
3682                 if (atsr->header.length != tmp->header.length)
3683                         continue;
3684                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3685                         return atsru;
3686         }
3687
3688         return NULL;
3689 }
3690
3691 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3692 {
3693         struct acpi_dmar_atsr *atsr;
3694         struct dmar_atsr_unit *atsru;
3695
3696         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3697                 return 0;
3698
3699         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3700         atsru = dmar_find_atsr(atsr);
3701         if (atsru)
3702                 return 0;
3703
3704         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3705         if (!atsru)
3706                 return -ENOMEM;
3707
3708         /*
3709          * If memory is allocated from slab by ACPI _DSM method, we need to
3710          * copy the memory content because the memory buffer will be freed
3711          * on return.
3712          */
3713         atsru->hdr = (void *)(atsru + 1);
3714         memcpy(atsru->hdr, hdr, hdr->length);
3715         atsru->include_all = atsr->flags & 0x1;
3716         if (!atsru->include_all) {
3717                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3718                                 (void *)atsr + atsr->header.length,
3719                                 &atsru->devices_cnt);
3720                 if (atsru->devices_cnt && atsru->devices == NULL) {
3721                         kfree(atsru);
3722                         return -ENOMEM;
3723                 }
3724         }
3725
3726         list_add_rcu(&atsru->list, &dmar_atsr_units);
3727
3728         return 0;
3729 }
3730
3731 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3732 {
3733         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3734         kfree(atsru);
3735 }
3736
3737 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3738 {
3739         struct acpi_dmar_atsr *atsr;
3740         struct dmar_atsr_unit *atsru;
3741
3742         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3743         atsru = dmar_find_atsr(atsr);
3744         if (atsru) {
3745                 list_del_rcu(&atsru->list);
3746                 synchronize_rcu();
3747                 intel_iommu_free_atsr(atsru);
3748         }
3749
3750         return 0;
3751 }
3752
3753 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3754 {
3755         int i;
3756         struct device *dev;
3757         struct acpi_dmar_atsr *atsr;
3758         struct dmar_atsr_unit *atsru;
3759
3760         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3761         atsru = dmar_find_atsr(atsr);
3762         if (!atsru)
3763                 return 0;
3764
3765         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3766                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3767                                           i, dev)
3768                         return -EBUSY;
3769         }
3770
3771         return 0;
3772 }
3773
3774 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3775 {
3776         struct dmar_satc_unit *satcu;
3777         struct acpi_dmar_satc *tmp;
3778
3779         list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3780                                 dmar_rcu_check()) {
3781                 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3782                 if (satc->segment != tmp->segment)
3783                         continue;
3784                 if (satc->header.length != tmp->header.length)
3785                         continue;
3786                 if (memcmp(satc, tmp, satc->header.length) == 0)
3787                         return satcu;
3788         }
3789
3790         return NULL;
3791 }
3792
3793 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3794 {
3795         struct acpi_dmar_satc *satc;
3796         struct dmar_satc_unit *satcu;
3797
3798         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3799                 return 0;
3800
3801         satc = container_of(hdr, struct acpi_dmar_satc, header);
3802         satcu = dmar_find_satc(satc);
3803         if (satcu)
3804                 return 0;
3805
3806         satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3807         if (!satcu)
3808                 return -ENOMEM;
3809
3810         satcu->hdr = (void *)(satcu + 1);
3811         memcpy(satcu->hdr, hdr, hdr->length);
3812         satcu->atc_required = satc->flags & 0x1;
3813         satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3814                                               (void *)satc + satc->header.length,
3815                                               &satcu->devices_cnt);
3816         if (satcu->devices_cnt && !satcu->devices) {
3817                 kfree(satcu);
3818                 return -ENOMEM;
3819         }
3820         list_add_rcu(&satcu->list, &dmar_satc_units);
3821
3822         return 0;
3823 }
3824
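/*
 * Bring a hot-added DMAR unit online: audit its capabilities against the
 * already-running configuration (pass-through, snooping, superpage support)
 * and then repeat, for this one unit, the per-IOMMU setup normally done by
 * init_dmars() at boot.
 */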
3825 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3826 {
3827         int sp, ret;
3828         struct intel_iommu *iommu = dmaru->iommu;
3829
3830         if (g_iommus[iommu->seq_id])
3831                 return 0;
3832
3833         ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3834         if (ret)
3835                 goto out;
3836
3837         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3838                 pr_warn("%s: Doesn't support hardware pass through.\n",
3839                         iommu->name);
3840                 return -ENXIO;
3841         }
3842         if (!ecap_sc_support(iommu->ecap) &&
3843             domain_update_iommu_snooping(iommu)) {
3844                 pr_warn("%s: Doesn't support snooping.\n",
3845                         iommu->name);
3846                 return -ENXIO;
3847         }
3848         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3849         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3850                 pr_warn("%s: Doesn't support large page.\n",
3851                         iommu->name);
3852                 return -ENXIO;
3853         }
3854
3855         /*
3856          * Disable translation if already enabled prior to OS handover.
3857          */
3858         if (iommu->gcmd & DMA_GCMD_TE)
3859                 iommu_disable_translation(iommu);
3860
3861         g_iommus[iommu->seq_id] = iommu;
3862         ret = iommu_init_domains(iommu);
3863         if (ret == 0)
3864                 ret = iommu_alloc_root_entry(iommu);
3865         if (ret)
3866                 goto out;
3867
3868         intel_svm_check(iommu);
3869
3870         if (dmaru->ignored) {
3871                 /*
3872                  * we always have to disable PMRs or DMA may fail on this device
3873                  */
3874                 if (force_on)
3875                         iommu_disable_protect_mem_regions(iommu);
3876                 return 0;
3877         }
3878
3879         intel_iommu_init_qi(iommu);
3880         iommu_flush_write_buffer(iommu);
3881
3882 #ifdef CONFIG_INTEL_IOMMU_SVM
3883         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3884                 ret = intel_svm_enable_prq(iommu);
3885                 if (ret)
3886                         goto disable_iommu;
3887         }
3888 #endif
3889         ret = dmar_set_interrupt(iommu);
3890         if (ret)
3891                 goto disable_iommu;
3892
3893         iommu_set_root_entry(iommu);
3894         iommu_enable_translation(iommu);
3895
3896         iommu_disable_protect_mem_regions(iommu);
3897         return 0;
3898
3899 disable_iommu:
3900         disable_dmar_iommu(iommu);
3901 out:
3902         free_dmar_iommu(iommu);
3903         return ret;
3904 }
3905
3906 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3907 {
3908         int ret = 0;
3909         struct intel_iommu *iommu = dmaru->iommu;
3910
3911         if (!intel_iommu_enabled)
3912                 return 0;
3913         if (iommu == NULL)
3914                 return -EINVAL;
3915
3916         if (insert) {
3917                 ret = intel_iommu_add(dmaru);
3918         } else {
3919                 disable_dmar_iommu(iommu);
3920                 free_dmar_iommu(iommu);
3921         }
3922
3923         return ret;
3924 }
3925
3926 static void intel_iommu_free_dmars(void)
3927 {
3928         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3929         struct dmar_atsr_unit *atsru, *atsr_n;
3930         struct dmar_satc_unit *satcu, *satc_n;
3931
3932         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3933                 list_del(&rmrru->list);
3934                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3935                 kfree(rmrru);
3936         }
3937
3938         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3939                 list_del(&atsru->list);
3940                 intel_iommu_free_atsr(atsru);
3941         }
3942         list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3943                 list_del(&satcu->list);
3944                 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3945                 kfree(satcu);
3946         }
3947 }
3948
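/*
 * Decide whether ATS may be used for @dev: walk up to its root port and
 * check the ATSR structures for that PCI segment. Root-complex integrated
 * devices (no bridge above them) are always allowed.
 */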
3949 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3950 {
3951         int i, ret = 1;
3952         struct pci_bus *bus;
3953         struct pci_dev *bridge = NULL;
3954         struct device *tmp;
3955         struct acpi_dmar_atsr *atsr;
3956         struct dmar_atsr_unit *atsru;
3957
3958         dev = pci_physfn(dev);
3959         for (bus = dev->bus; bus; bus = bus->parent) {
3960                 bridge = bus->self;
3961                 /* If it's an integrated device, allow ATS */
3962                 if (!bridge)
3963                         return 1;
3964                 /* Connected via non-PCIe: no ATS */
3965                 if (!pci_is_pcie(bridge) ||
3966                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3967                         return 0;
3968                 /* If we found the root port, look it up in the ATSR */
3969                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3970                         break;
3971         }
3972
3973         rcu_read_lock();
3974         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3975                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3976                 if (atsr->segment != pci_domain_nr(dev->bus))
3977                         continue;
3978
3979                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3980                         if (tmp == &bridge->dev)
3981                                 goto out;
3982
3983                 if (atsru->include_all)
3984                         goto out;
3985         }
3986         ret = 0;
3987 out:
3988         rcu_read_unlock();
3989
3990         return ret;
3991 }
3992
3993 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3994 {
3995         int ret;
3996         struct dmar_rmrr_unit *rmrru;
3997         struct dmar_atsr_unit *atsru;
3998         struct dmar_satc_unit *satcu;
3999         struct acpi_dmar_atsr *atsr;
4000         struct acpi_dmar_reserved_memory *rmrr;
4001         struct acpi_dmar_satc *satc;
4002
4003         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4004                 return 0;
4005
4006         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4007                 rmrr = container_of(rmrru->hdr,
4008                                     struct acpi_dmar_reserved_memory, header);
4009                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4010                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4011                                 ((void *)rmrr) + rmrr->header.length,
4012                                 rmrr->segment, rmrru->devices,
4013                                 rmrru->devices_cnt);
4014                         if (ret < 0)
4015                                 return ret;
4016                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4017                         dmar_remove_dev_scope(info, rmrr->segment,
4018                                 rmrru->devices, rmrru->devices_cnt);
4019                 }
4020         }
4021
4022         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4023                 if (atsru->include_all)
4024                         continue;
4025
4026                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4027                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4028                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4029                                         (void *)atsr + atsr->header.length,
4030                                         atsr->segment, atsru->devices,
4031                                         atsru->devices_cnt);
4032                         if (ret > 0)
4033                                 break;
4034                         else if (ret < 0)
4035                                 return ret;
4036                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4037                         if (dmar_remove_dev_scope(info, atsr->segment,
4038                                         atsru->devices, atsru->devices_cnt))
4039                                 break;
4040                 }
4041         }
4042         list_for_each_entry(satcu, &dmar_satc_units, list) {
4043                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
4044                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4045                         ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
4046                                         (void *)satc + satc->header.length,
4047                                         satc->segment, satcu->devices,
4048                                         satcu->devices_cnt);
4049                         if (ret > 0)
4050                                 break;
4051                         else if (ret < 0)
4052                                 return ret;
4053                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4054                         if (dmar_remove_dev_scope(info, satc->segment,
4055                                         satcu->devices, satcu->devices_cnt))
4056                                 break;
4057                 }
4058         }
4059
4060         return 0;
4061 }
4062
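/*
 * Memory hotplug notifier for the static identity (si) domain: extend the
 * identity map when a memory block is about to come online, and unmap it
 * (flushing the IOTLBs) when it goes offline again.
 */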
4063 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4064                                        unsigned long val, void *v)
4065 {
4066         struct memory_notify *mhp = v;
4067         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4068         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4069                         mhp->nr_pages - 1);
4070
4071         switch (val) {
4072         case MEM_GOING_ONLINE:
4073                 if (iommu_domain_identity_map(si_domain,
4074                                               start_vpfn, last_vpfn)) {
4075                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
4076                                 start_vpfn, last_vpfn);
4077                         return NOTIFY_BAD;
4078                 }
4079                 break;
4080
4081         case MEM_OFFLINE:
4082         case MEM_CANCEL_ONLINE:
4083                 {
4084                         struct dmar_drhd_unit *drhd;
4085                         struct intel_iommu *iommu;
4086                         struct page *freelist;
4087
4088                         freelist = domain_unmap(si_domain,
4089                                                 start_vpfn, last_vpfn,
4090                                                 NULL);
4091
4092                         rcu_read_lock();
4093                         for_each_active_iommu(iommu, drhd)
4094                                 iommu_flush_iotlb_psi(iommu, si_domain,
4095                                         start_vpfn, mhp->nr_pages,
4096                                         !freelist, 0);
4097                         rcu_read_unlock();
4098                         dma_free_pagelist(freelist);
4099                 }
4100                 break;
4101         }
4102
4103         return NOTIFY_OK;
4104 }
4105
4106 static struct notifier_block intel_iommu_memory_nb = {
4107         .notifier_call = intel_iommu_memory_notifier,
4108         .priority = 0
4109 };
4110
4111 static void intel_disable_iommus(void)
4112 {
4113         struct intel_iommu *iommu = NULL;
4114         struct dmar_drhd_unit *drhd;
4115
4116         for_each_iommu(iommu, drhd)
4117                 iommu_disable_translation(iommu);
4118 }
4119
4120 void intel_iommu_shutdown(void)
4121 {
4122         struct dmar_drhd_unit *drhd;
4123         struct intel_iommu *iommu = NULL;
4124
4125         if (no_iommu || dmar_disabled)
4126                 return;
4127
4128         down_write(&dmar_global_lock);
4129
4130         /* Disable PMRs explicitly here. */
4131         for_each_iommu(iommu, drhd)
4132                 iommu_disable_protect_mem_regions(iommu);
4133
4134         /* Make sure the IOMMUs are switched off */
4135         intel_disable_iommus();
4136
4137         up_write(&dmar_global_lock);
4138 }
4139
4140 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4141 {
4142         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4143
4144         return container_of(iommu_dev, struct intel_iommu, iommu);
4145 }
4146
4147 static ssize_t version_show(struct device *dev,
4148                             struct device_attribute *attr, char *buf)
4149 {
4150         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4151         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4152         return sprintf(buf, "%d:%d\n",
4153                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4154 }
4155 static DEVICE_ATTR_RO(version);
4156
4157 static ssize_t address_show(struct device *dev,
4158                             struct device_attribute *attr, char *buf)
4159 {
4160         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4161         return sprintf(buf, "%llx\n", iommu->reg_phys);
4162 }
4163 static DEVICE_ATTR_RO(address);
4164
4165 static ssize_t cap_show(struct device *dev,
4166                         struct device_attribute *attr, char *buf)
4167 {
4168         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4169         return sprintf(buf, "%llx\n", iommu->cap);
4170 }
4171 static DEVICE_ATTR_RO(cap);
4172
4173 static ssize_t ecap_show(struct device *dev,
4174                          struct device_attribute *attr, char *buf)
4175 {
4176         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4177         return sprintf(buf, "%llx\n", iommu->ecap);
4178 }
4179 static DEVICE_ATTR_RO(ecap);
4180
4181 static ssize_t domains_supported_show(struct device *dev,
4182                                       struct device_attribute *attr, char *buf)
4183 {
4184         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4185         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4186 }
4187 static DEVICE_ATTR_RO(domains_supported);
4188
4189 static ssize_t domains_used_show(struct device *dev,
4190                                  struct device_attribute *attr, char *buf)
4191 {
4192         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4193         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4194                                                   cap_ndoms(iommu->cap)));
4195 }
4196 static DEVICE_ATTR_RO(domains_used);
4197
4198 static struct attribute *intel_iommu_attrs[] = {
4199         &dev_attr_version.attr,
4200         &dev_attr_address.attr,
4201         &dev_attr_cap.attr,
4202         &dev_attr_ecap.attr,
4203         &dev_attr_domains_supported.attr,
4204         &dev_attr_domains_used.attr,
4205         NULL,
4206 };
4207
4208 static struct attribute_group intel_iommu_group = {
4209         .name = "intel-iommu",
4210         .attrs = intel_iommu_attrs,
4211 };
4212
4213 const struct attribute_group *intel_iommu_groups[] = {
4214         &intel_iommu_group,
4215         NULL,
4216 };
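/*
 * These attributes are exposed under the per-IOMMU sysfs device once
 * iommu_device_sysfs_add() has run, e.g. (the exact path depends on how the
 * IOMMU devices are named on a given system):
 *
 *   /sys/class/iommu/dmar0/intel-iommu/version
 *   /sys/class/iommu/dmar0/intel-iommu/domains_used
 */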
4217
4218 static inline bool has_external_pci(void)
4219 {
4220         struct pci_dev *pdev = NULL;
4221
4222         for_each_pci_dev(pdev)
4223                 if (pdev->external_facing)
4224                         return true;
4225
4226         return false;
4227 }
4228
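/*
 * Honor the platform's DMA protection opt-in (the DMAR platform opt-in
 * flag): if the firmware requested it and external-facing PCI ports exist,
 * force the IOMMU on even when it was disabled on the command line, unless
 * the user explicitly opted out of the platform opt-in.
 */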
4229 static int __init platform_optin_force_iommu(void)
4230 {
4231         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4232                 return 0;
4233
4234         if (no_iommu || dmar_disabled)
4235                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4236
4237         /*
4238          * If the Intel IOMMU is disabled by default, apply the identity
4239          * map to all devices except those marked as untrusted.
4240          */
4241         if (dmar_disabled)
4242                 iommu_set_default_passthrough(false);
4243
4244         dmar_disabled = 0;
4245         no_iommu = 0;
4246
4247         return 1;
4248 }
4249
4250 static int __init probe_acpi_namespace_devices(void)
4251 {
4252         struct dmar_drhd_unit *drhd;
4253         /* To avoid a -Wunused-but-set-variable warning. */
4254         struct intel_iommu *iommu __maybe_unused;
4255         struct device *dev;
4256         int i, ret = 0;
4257
4258         for_each_active_iommu(iommu, drhd) {
4259                 for_each_active_dev_scope(drhd->devices,
4260                                           drhd->devices_cnt, i, dev) {
4261                         struct acpi_device_physical_node *pn;
4262                         struct iommu_group *group;
4263                         struct acpi_device *adev;
4264
4265                         if (dev->bus != &acpi_bus_type)
4266                                 continue;
4267
4268                         adev = to_acpi_device(dev);
4269                         mutex_lock(&adev->physical_node_lock);
4270                         list_for_each_entry(pn,
4271                                             &adev->physical_node_list, node) {
4272                                 group = iommu_group_get(pn->dev);
4273                                 if (group) {
4274                                         iommu_group_put(group);
4275                                         continue;
4276                                 }
4277
4278                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4279                                 ret = iommu_probe_device(pn->dev);
4280                                 if (ret)
4281                                         break;
4282                         }
4283                         mutex_unlock(&adev->physical_node_lock);
4284
4285                         if (ret)
4286                                 return ret;
4287                 }
4288         }
4289
4290         return 0;
4291 }
4292
4293 int __init intel_iommu_init(void)
4294 {
4295         int ret = -ENODEV;
4296         struct dmar_drhd_unit *drhd;
4297         struct intel_iommu *iommu;
4298
4299         /*
4300          * Intel IOMMU is required for a TXT/tboot launch or platform
4301          * opt in, so enforce that.
4302          */
4303         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4304                     platform_optin_force_iommu();
4305
4306         if (iommu_init_mempool()) {
4307                 if (force_on)
4308                         panic("tboot: Failed to initialize iommu memory\n");
4309                 return -ENOMEM;
4310         }
4311
4312         down_write(&dmar_global_lock);
4313         if (dmar_table_init()) {
4314                 if (force_on)
4315                         panic("tboot: Failed to initialize DMAR table\n");
4316                 goto out_free_dmar;
4317         }
4318
4319         if (dmar_dev_scope_init() < 0) {
4320                 if (force_on)
4321                         panic("tboot: Failed to initialize DMAR device scope\n");
4322                 goto out_free_dmar;
4323         }
4324
4325         up_write(&dmar_global_lock);
4326
4327         /*
4328          * The bus notifier takes the dmar_global_lock, so lockdep will
4329          * complain later when we register it under the lock.
4330          */
4331         dmar_register_bus_notifier();
4332
4333         down_write(&dmar_global_lock);
4334
4335         if (!no_iommu)
4336                 intel_iommu_debugfs_init();
4337
4338         if (no_iommu || dmar_disabled) {
4339                 /*
4340                  * We exit the function here to ensure the IOMMU's remapping
4341                  * and mempool aren't set up, which means the IOMMU's PMRs
4342                  * won't be disabled via the call to init_dmars(). So disable
4343                  * them explicitly here. The PMRs were set up by tboot prior
4344                  * to calling SENTER, but the kernel is expected to reset/tear
4345                  * them down.
4346                  */
4347                 if (intel_iommu_tboot_noforce) {
4348                         for_each_iommu(iommu, drhd)
4349                                 iommu_disable_protect_mem_regions(iommu);
4350                 }
4351
4352                 /*
4353                  * Make sure the IOMMUs are switched off, even when we
4354                  * boot into a kexec kernel and the previous kernel left
4355                  * them enabled
4356                  */
4357                 intel_disable_iommus();
4358                 goto out_free_dmar;
4359         }
4360
4361         if (list_empty(&dmar_rmrr_units))
4362                 pr_info("No RMRR found\n");
4363
4364         if (list_empty(&dmar_atsr_units))
4365                 pr_info("No ATSR found\n");
4366
4367         if (list_empty(&dmar_satc_units))
4368                 pr_info("No SATC found\n");
4369
4370         if (dmar_map_gfx)
4371                 intel_iommu_gfx_mapped = 1;
4372
4373         init_no_remapping_devices();
4374
4375         ret = init_dmars();
4376         if (ret) {
4377                 if (force_on)
4378                         panic("tboot: Failed to initialize DMARs\n");
4379                 pr_err("Initialization failed\n");
4380                 goto out_free_dmar;
4381         }
4382         up_write(&dmar_global_lock);
4383
4384         init_iommu_pm_ops();
4385
4386         down_read(&dmar_global_lock);
4387         for_each_active_iommu(iommu, drhd) {
4388                 /*
4389                  * The flush queue implementation does not perform
4390                  * page-selective invalidations that are required for efficient
4391                  * TLB flushes in virtual environments.  The benefit of batching
4392                  * is likely to be much lower than the overhead of synchronizing
4393                  * the virtual and physical IOMMU page-tables.
4394                  */
4395                 if (cap_caching_mode(iommu->cap)) {
4396                         pr_info_once("IOMMU batching disallowed due to virtualization\n");
4397                         iommu_set_dma_strict();
4398                 }
4399                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4400                                        intel_iommu_groups,
4401                                        "%s", iommu->name);
4402                 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4403         }
4404         up_read(&dmar_global_lock);
4405
4406         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4407         if (si_domain && !hw_pass_through)
4408                 register_memory_notifier(&intel_iommu_memory_nb);
4409
4410         down_read(&dmar_global_lock);
4411         if (probe_acpi_namespace_devices())
4412                 pr_warn("ACPI name space devices didn't probe correctly\n");
4413
4414         /* Finally, we enable the DMA remapping hardware. */
4415         for_each_iommu(iommu, drhd) {
4416                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4417                         iommu_enable_translation(iommu);
4418
4419                 iommu_disable_protect_mem_regions(iommu);
4420         }
4421         up_read(&dmar_global_lock);
4422
4423         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4424
4425         intel_iommu_enabled = 1;
4426
4427         return 0;
4428
4429 out_free_dmar:
4430         intel_iommu_free_dmars();
4431         up_write(&dmar_global_lock);
4432         iommu_exit_mempool();
4433         return ret;
4434 }
4435
4436 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4437 {
4438         struct device_domain_info *info = opaque;
4439
4440         domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4441         return 0;
4442 }
4443
4444 /*
4445  * NB - intel-iommu lacks any sort of reference counting for the users of
4446  * dependent devices.  If multiple endpoints have intersecting dependent
4447  * devices, unbinding the driver from any one of them will possibly leave
4448  * the others unable to operate.
4449  */
4450 static void domain_context_clear(struct device_domain_info *info)
4451 {
4452         if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4453                 return;
4454
4455         pci_for_each_dma_alias(to_pci_dev(info->dev),
4456                                &domain_context_clear_one_cb, info);
4457 }
4458
4459 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4460 {
4461         struct dmar_domain *domain;
4462         struct intel_iommu *iommu;
4463         unsigned long flags;
4464
4465         assert_spin_locked(&device_domain_lock);
4466
4467         if (WARN_ON(!info))
4468                 return;
4469
4470         iommu = info->iommu;
4471         domain = info->domain;
4472
4473         if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4474                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4475                         intel_pasid_tear_down_entry(iommu, info->dev,
4476                                         PASID_RID2PASID, false);
4477
4478                 iommu_disable_dev_iotlb(info);
4479                 domain_context_clear(info);
4480                 intel_pasid_free_table(info->dev);
4481         }
4482
4483         unlink_domain_info(info);
4484
4485         spin_lock_irqsave(&iommu->lock, flags);
4486         domain_detach_iommu(domain, iommu);
4487         spin_unlock_irqrestore(&iommu->lock, flags);
4488
4489         free_devinfo_mem(info);
4490 }
4491
4492 static void dmar_remove_one_dev_info(struct device *dev)
4493 {
4494         struct device_domain_info *info;
4495         unsigned long flags;
4496
4497         spin_lock_irqsave(&device_domain_lock, flags);
4498         info = get_domain_info(dev);
4499         if (info)
4500                 __dmar_remove_one_dev_info(info);
4501         spin_unlock_irqrestore(&device_domain_lock, flags);
4502 }
4503
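/*
 * Initialize a domain created through the iommu_ops domain_alloc path:
 * derive the AGAW from the requested guest address width and allocate the
 * top level page directory.
 */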
4504 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4505 {
4506         int adjust_width;
4507
4508         /* calculate AGAW */
4509         domain->gaw = guest_width;
4510         adjust_width = guestwidth_to_adjustwidth(guest_width);
4511         domain->agaw = width_to_agaw(adjust_width);
4512
4513         domain->iommu_coherency = false;
4514         domain->iommu_snooping = false;
4515         domain->iommu_superpage = 0;
4516         domain->max_addr = 0;
4517
4518         /* always allocate the top pgd */
4519         domain->pgd = alloc_pgtable_page(domain->nid);
4520         if (!domain->pgd)
4521                 return -ENOMEM;
4522         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4523         return 0;
4524 }
4525
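/*
 * domain_alloc() callback: DMA and unmanaged domains get a freshly
 * initialized dmar_domain (plus an IOVA cookie for DMA domains), while
 * identity requests share the static si_domain.
 */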
4526 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4527 {
4528         struct dmar_domain *dmar_domain;
4529         struct iommu_domain *domain;
4530
4531         switch (type) {
4532         case IOMMU_DOMAIN_DMA:
4533         case IOMMU_DOMAIN_UNMANAGED:
4534                 dmar_domain = alloc_domain(0);
4535                 if (!dmar_domain) {
4536                         pr_err("Can't allocate dmar_domain\n");
4537                         return NULL;
4538                 }
4539                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4540                         pr_err("Domain initialization failed\n");
4541                         domain_exit(dmar_domain);
4542                         return NULL;
4543                 }
4544
4545                 if (type == IOMMU_DOMAIN_DMA &&
4546                     iommu_get_dma_cookie(&dmar_domain->domain))
4547                         return NULL;
4548
4549                 domain = &dmar_domain->domain;
4550                 domain->geometry.aperture_start = 0;
4551                 domain->geometry.aperture_end   =
4552                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4553                 domain->geometry.force_aperture = true;
4554
4555                 return domain;
4556         case IOMMU_DOMAIN_IDENTITY:
4557                 return &si_domain->domain;
4558         default:
4559                 return NULL;
4560         }
4561
4562         return NULL;
4563 }
4564
4565 static void intel_iommu_domain_free(struct iommu_domain *domain)
4566 {
4567         if (domain != &si_domain->domain)
4568                 domain_exit(to_dmar_domain(domain));
4569 }
4570
4571 /*
4572  * Check whether a @domain could be attached to the @dev through the
4573  * aux-domain attach/detach APIs.
4574  */
4575 static inline bool
4576 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4577 {
4578         struct device_domain_info *info = get_domain_info(dev);
4579
4580         return info && info->auxd_enabled &&
4581                         domain->type == IOMMU_DOMAIN_UNMANAGED;
4582 }
4583
4584 static inline struct subdev_domain_info *
4585 lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4586 {
4587         struct subdev_domain_info *sinfo;
4588
4589         if (!list_empty(&domain->subdevices)) {
4590                 list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4591                         if (sinfo->pdev == dev)
4592                                 return sinfo;
4593                 }
4594         }
4595
4596         return NULL;
4597 }
4598
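/*
 * Link @dev as a subdevice of @domain, allocating the subdev_domain_info on
 * first use.  Returns the new user count on success or a negative errno.
 * Caller must hold device_domain_lock.
 */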
4599 static int auxiliary_link_device(struct dmar_domain *domain,
4600                                  struct device *dev)
4601 {
4602         struct device_domain_info *info = get_domain_info(dev);
4603         struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4604
4605         assert_spin_locked(&device_domain_lock);
4606         if (WARN_ON(!info))
4607                 return -EINVAL;
4608
4609         if (!sinfo) {
4610                 sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
4611                 if (!sinfo)
4612                         return -ENOMEM;
4613                 sinfo->domain = domain;
4614                 sinfo->pdev = dev;
4615                 list_add(&sinfo->link_phys, &info->subdevices);
4616                 list_add(&sinfo->link_domain, &domain->subdevices);
4617         }
4618
4619         return ++sinfo->users;
4620 }
4621
4622 static int auxiliary_unlink_device(struct dmar_domain *domain,
4623                                    struct device *dev)
4624 {
4625         struct device_domain_info *info = get_domain_info(dev);
4626         struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4627         int ret;
4628
4629         assert_spin_locked(&device_domain_lock);
4630         if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4631                 return -EINVAL;
4632
4633         ret = --sinfo->users;
4634         if (!ret) {
4635                 list_del(&sinfo->link_phys);
4636                 list_del(&sinfo->link_domain);
4637                 kfree(sinfo);
4638         }
4639
4640         return ret;
4641 }
4642
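/*
 * Attach @domain to @dev as an aux-domain: allocate the domain's default
 * PASID if necessary, link the subdevice, and set up the PASID table entry
 * on the first attachment from this physical device.
 */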
4643 static int aux_domain_add_dev(struct dmar_domain *domain,
4644                               struct device *dev)
4645 {
4646         int ret;
4647         unsigned long flags;
4648         struct intel_iommu *iommu;
4649
4650         iommu = device_to_iommu(dev, NULL, NULL);
4651         if (!iommu)
4652                 return -ENODEV;
4653
4654         if (domain->default_pasid <= 0) {
4655                 u32 pasid;
4656
4657                 /* No private data needed for the default pasid */
4658                 pasid = ioasid_alloc(NULL, PASID_MIN,
4659                                      pci_max_pasids(to_pci_dev(dev)) - 1,
4660                                      NULL);
4661                 if (pasid == INVALID_IOASID) {
4662                         pr_err("Can't allocate default pasid\n");
4663                         return -ENODEV;
4664                 }
4665                 domain->default_pasid = pasid;
4666         }
4667
4668         spin_lock_irqsave(&device_domain_lock, flags);
4669         ret = auxiliary_link_device(domain, dev);
4670         if (ret <= 0)
4671                 goto link_failed;
4672
4673         /*
4674          * Subdevices from the same physical device can be attached to the
4675          * same domain. For such cases, only the first subdevice attachment
4676          * needs to go through the full steps in this function. So if ret >
4677          * 1, just goto out.
4678          */
4679         if (ret > 1)
4680                 goto out;
4681
4682         /*
4683          * iommu->lock must be held to attach the domain to the iommu and set
4684          * up the PASID entry for second level translation.
4685          */
4686         spin_lock(&iommu->lock);
4687         ret = domain_attach_iommu(domain, iommu);
4688         if (ret)
4689                 goto attach_failed;
4690
4691         /* Set up the PASID entry for mediated devices: */
4692         if (domain_use_first_level(domain))
4693                 ret = domain_setup_first_level(iommu, domain, dev,
4694                                                domain->default_pasid);
4695         else
4696                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
4697                                                      domain->default_pasid);
4698         if (ret)
4699                 goto table_failed;
4700
4701         spin_unlock(&iommu->lock);
4702 out:
4703         spin_unlock_irqrestore(&device_domain_lock, flags);
4704
4705         return 0;
4706
4707 table_failed:
4708         domain_detach_iommu(domain, iommu);
4709 attach_failed:
4710         spin_unlock(&iommu->lock);
4711         auxiliary_unlink_device(domain, dev);
4712 link_failed:
4713         spin_unlock_irqrestore(&device_domain_lock, flags);
4714         if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4715                 ioasid_put(domain->default_pasid);
4716
4717         return ret;
4718 }
4719
4720 static void aux_domain_remove_dev(struct dmar_domain *domain,
4721                                   struct device *dev)
4722 {
4723         struct device_domain_info *info;
4724         struct intel_iommu *iommu;
4725         unsigned long flags;
4726
4727         if (!is_aux_domain(dev, &domain->domain))
4728                 return;
4729
4730         spin_lock_irqsave(&device_domain_lock, flags);
4731         info = get_domain_info(dev);
4732         iommu = info->iommu;
4733
4734         if (!auxiliary_unlink_device(domain, dev)) {
4735                 spin_lock(&iommu->lock);
4736                 intel_pasid_tear_down_entry(iommu, dev,
4737                                             domain->default_pasid, false);
4738                 domain_detach_iommu(domain, iommu);
4739                 spin_unlock(&iommu->lock);
4740         }
4741
4742         spin_unlock_irqrestore(&device_domain_lock, flags);
4743
4744         if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4745                 ioasid_put(domain->default_pasid);
4746 }
4747
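/*
 * Check that @dev's IOMMU can handle @domain (nesting support, address
 * width) and trim the domain's page table depth to what the IOMMU's AGAW
 * supports.
 */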
4748 static int prepare_domain_attach_device(struct iommu_domain *domain,
4749                                         struct device *dev)
4750 {
4751         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4752         struct intel_iommu *iommu;
4753         int addr_width;
4754
4755         iommu = device_to_iommu(dev, NULL, NULL);
4756         if (!iommu)
4757                 return -ENODEV;
4758
4759         if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) &&
4760             !ecap_nest(iommu->ecap)) {
4761                 dev_err(dev, "%s: iommu does not support nested translation\n",
4762                         iommu->name);
4763                 return -EINVAL;
4764         }
4765
4766         /* check if this iommu agaw is sufficient for max mapped address */
4767         addr_width = agaw_to_width(iommu->agaw);
4768         if (addr_width > cap_mgaw(iommu->cap))
4769                 addr_width = cap_mgaw(iommu->cap);
4770
4771         if (dmar_domain->max_addr > (1LL << addr_width)) {
4772                 dev_err(dev, "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4773                         __func__, addr_width,
4774                         dmar_domain->max_addr);
4775                 return -EFAULT;
4776         }
4777         dmar_domain->gaw = addr_width;
4778
4779         /*
4780          * Knock out extra levels of page tables if necessary
4781          */
4782         while (iommu->agaw < dmar_domain->agaw) {
4783                 struct dma_pte *pte;
4784
4785                 pte = dmar_domain->pgd;
4786                 if (dma_pte_present(pte)) {
4787                         dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4788                         free_pgtable_page(pte);
4789                 }
4790                 dmar_domain->agaw--;
4791         }
4792
4793         return 0;
4794 }
4795
4796 static int intel_iommu_attach_device(struct iommu_domain *domain,
4797                                      struct device *dev)
4798 {
4799         int ret;
4800
4801         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4802             device_is_rmrr_locked(dev)) {
4803                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4804                 return -EPERM;
4805         }
4806
4807         if (is_aux_domain(dev, domain))
4808                 return -EPERM;
4809
4810         /* normally dev is not mapped */
4811         if (unlikely(domain_context_mapped(dev))) {
4812                 struct dmar_domain *old_domain;
4813
4814                 old_domain = find_domain(dev);
4815                 if (old_domain)
4816                         dmar_remove_one_dev_info(dev);
4817         }
4818
4819         ret = prepare_domain_attach_device(domain, dev);
4820         if (ret)
4821                 return ret;
4822
4823         return domain_add_dev_info(to_dmar_domain(domain), dev);
4824 }
4825
4826 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4827                                          struct device *dev)
4828 {
4829         int ret;
4830
4831         if (!is_aux_domain(dev, domain))
4832                 return -EPERM;
4833
4834         ret = prepare_domain_attach_device(domain, dev);
4835         if (ret)
4836                 return ret;
4837
4838         return aux_domain_add_dev(to_dmar_domain(domain), dev);
4839 }
4840
4841 static void intel_iommu_detach_device(struct iommu_domain *domain,
4842                                       struct device *dev)
4843 {
4844         dmar_remove_one_dev_info(dev);
4845 }
4846
4847 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4848                                           struct device *dev)
4849 {
4850         aux_domain_remove_dev(to_dmar_domain(domain), dev);
4851 }
4852
4853 #ifdef CONFIG_INTEL_IOMMU_SVM
4854 /*
4855  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4856  * VT-d granularity. Invalidation is typically included in the unmap operation
4857  * as a result of a DMA or VFIO unmap. However, for assigned devices the
4858  * guest owns the first level page tables. Invalidations of translation
4859  * caches in the guest are trapped and passed down to the host.
4860  *
4861  * The vIOMMU in the guest will only expose first level page tables, so we
4862  * do not support IOTLB granularity for requests without a PASID (second level).
4863  *
4864  * For example, to find the VT-d granularity encoding for IOTLB
4865  * type and page selective granularity within PASID:
4866  * X: indexed by iommu cache type
4867  * Y: indexed by enum iommu_inv_granularity
4868  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4869  */
4870
4871 static const int
4872 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4873         /*
4874          * PASID based IOTLB invalidation: PASID selective (per PASID),
4875          * page selective (address granularity)
4876          */
4877         {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4878         /* PASID based dev TLBs */
4879         {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4880         /* PASID cache */
4881         {-EINVAL, -EINVAL, -EINVAL}
4882 };
4883
4884 static inline int to_vtd_granularity(int type, int granu)
4885 {
4886         return inv_type_granu_table[type][granu];
4887 }
4888
4889 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4890 {
4891         u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4892
4893         /* VT-d size is encoded as 2^size of 4K pages, 0 for 4K, 9 for 2MB, etc.
4894          * The IOMMU cache invalidate API passes granu_size in bytes and the
4895          * number of granules that are contiguous in memory.
4896          */
4897         return order_base_2(nr_pages);
4898 }
4899
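/*
 * Cache invalidation passdown from the guest: translate the generic
 * iommu_cache_invalidate_info into VT-d PASID-based IOTLB and device-TLB
 * invalidations and issue them through the invalidation queue.
 */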
4900 static int
4901 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4902                            struct iommu_cache_invalidate_info *inv_info)
4903 {
4904         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4905         struct device_domain_info *info;
4906         struct intel_iommu *iommu;
4907         unsigned long flags;
4908         int cache_type;
4909         u8 bus, devfn;
4910         u16 did, sid;
4911         int ret = 0;
4912         u64 size = 0;
4913
4914         if (!inv_info || !dmar_domain)
4915                 return -EINVAL;
4916
4917         if (!dev || !dev_is_pci(dev))
4918                 return -ENODEV;
4919
4920         iommu = device_to_iommu(dev, &bus, &devfn);
4921         if (!iommu)
4922                 return -ENODEV;
4923
4924         if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4925                 return -EINVAL;
4926
4927         spin_lock_irqsave(&device_domain_lock, flags);
4928         spin_lock(&iommu->lock);
4929         info = get_domain_info(dev);
4930         if (!info) {
4931                 ret = -EINVAL;
4932                 goto out_unlock;
4933         }
4934         did = dmar_domain->iommu_did[iommu->seq_id];
4935         sid = PCI_DEVID(bus, devfn);
4936
4937         /* Size is only valid in address selective invalidation */
4938         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
4939                 size = to_vtd_size(inv_info->granu.addr_info.granule_size,
4940                                    inv_info->granu.addr_info.nb_granules);
4941
4942         for_each_set_bit(cache_type,
4943                          (unsigned long *)&inv_info->cache,
4944                          IOMMU_CACHE_INV_TYPE_NR) {
4945                 int granu = 0;
4946                 u64 pasid = 0;
4947                 u64 addr = 0;
4948
4949                 granu = to_vtd_granularity(cache_type, inv_info->granularity);
4950                 if (granu == -EINVAL) {
4951                         pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
4952                                            cache_type, inv_info->granularity);
4953                         break;
4954                 }
4955
4956                 /*
4957                  * PASID is stored in different locations based on the
4958                  * granularity.
4959                  */
4960                 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
4961                     (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
4962                         pasid = inv_info->granu.pasid_info.pasid;
4963                 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4964                          (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
4965                         pasid = inv_info->granu.addr_info.pasid;
4966
4967                 switch (BIT(cache_type)) {
4968                 case IOMMU_CACHE_INV_TYPE_IOTLB:
4969                         /* HW will ignore LSB bits based on address mask */
4970                         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4971                             size &&
4972                             (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
4973                                 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
4974                                                    inv_info->granu.addr_info.addr, size);
4975                         }
4976
4977                         /*
4978                          * If granu is PASID-selective, address is ignored.
4979                          * We use npages = -1 to indicate that.
4980                          */
4981                         qi_flush_piotlb(iommu, did, pasid,
4982                                         mm_to_dma_pfn(inv_info->granu.addr_info.addr),
4983                                         (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
4984                                         inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
4985
4986                         if (!info->ats_enabled)
4987                                 break;
4988                         /*
4989                          * Always flush device IOTLB if ATS is enabled. vIOMMU
4990                          * in the guest may assume IOTLB flush is inclusive,
4991                          * which is more efficient.
4992                          */
4993                         fallthrough;
4994                 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
4995                         /*
4996                          * PASID based device TLB invalidation does not support
4997                          * IOMMU_INV_GRANU_PASID granularity but only supports
4998                          * IOMMU_INV_GRANU_ADDR.
4999                          * The equivalent is to set the size to cover the
5000                          * entire 64-bit address range; the user only provides
5001                          * PASID info without address info, so set addr to 0.
5002                          */
5003                         if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5004                                 size = 64 - VTD_PAGE_SHIFT;
5005                                 addr = 0;
5006                         } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5007                                 addr = inv_info->granu.addr_info.addr;
5008                         }
5009
5010                         if (info->ats_enabled)
5011                                 qi_flush_dev_iotlb_pasid(iommu, sid,
5012                                                 info->pfsid, pasid,
5013                                                 info->ats_qdep, addr,
5014                                                 size);
5015                         else
5016                                 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5017                         break;
5018                 default:
5019                         dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5020                                             cache_type);
5021                         ret = -EINVAL;
5022                 }
5023         }
5024 out_unlock:
5025         spin_unlock(&iommu->lock);
5026         spin_unlock_irqrestore(&device_domain_lock, flags);
5027
5028         return ret;
5029 }
5030 #endif
5031
5032 static int intel_iommu_map(struct iommu_domain *domain,
5033                            unsigned long iova, phys_addr_t hpa,
5034                            size_t size, int iommu_prot, gfp_t gfp)
5035 {
5036         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5037         u64 max_addr;
5038         int prot = 0;
5039
5040         if (iommu_prot & IOMMU_READ)
5041                 prot |= DMA_PTE_READ;
5042         if (iommu_prot & IOMMU_WRITE)
5043                 prot |= DMA_PTE_WRITE;
5044         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5045                 prot |= DMA_PTE_SNP;
5046
5047         max_addr = iova + size;
5048         if (dmar_domain->max_addr < max_addr) {
5049                 u64 end;
5050
5051                 /* check if minimum agaw is sufficient for mapped address */
5052                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5053                 if (end < max_addr) {
5054                         pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5055                                __func__, dmar_domain->gaw,
5056                                max_addr);
5057                         return -EFAULT;
5058                 }
5059                 dmar_domain->max_addr = max_addr;
5060         }
5061         /* Round up size to next multiple of PAGE_SIZE, if it and
5062            the low bits of hpa would take us onto the next page */
5063         size = aligned_nrpages(hpa, size);
5064         return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5065                                 hpa >> VTD_PAGE_SHIFT, size, prot);
5066 }
5067
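/*
 * map_pages() callback: map @pgcount contiguous pages of size @pgsize
 * starting at @iova.  Only 4K, 2M and 1G page sizes are accepted, and both
 * @iova and @paddr must be aligned to @pgsize; on success the total mapped
 * size is reported back through @mapped.
 *
 * For reference, callers normally reach this through the core API, e.g.
 *
 *	iommu_map(domain, iova, paddr, size, IOMMU_READ | IOMMU_WRITE);
 *
 * which splits the request into (pgsize, pgcount) chunks based on the
 * domain's pgsize_bitmap before invoking this callback.
 */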
5068 static int intel_iommu_map_pages(struct iommu_domain *domain,
5069                                  unsigned long iova, phys_addr_t paddr,
5070                                  size_t pgsize, size_t pgcount,
5071                                  int prot, gfp_t gfp, size_t *mapped)
5072 {
5073         unsigned long pgshift = __ffs(pgsize);
5074         size_t size = pgcount << pgshift;
5075         int ret;
5076
5077         if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
5078                 return -EINVAL;
5079
5080         if (!IS_ALIGNED(iova | paddr, pgsize))
5081                 return -EINVAL;
5082
5083         ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
5084         if (!ret && mapped)
5085                 *mapped = size;
5086
5087         return ret;
5088 }
5089
5090 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5091                                 unsigned long iova, size_t size,
5092                                 struct iommu_iotlb_gather *gather)
5093 {
5094         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5095         unsigned long start_pfn, last_pfn;
5096         int level = 0;
5097
5098         /* Cope with horrid API which requires us to unmap more than the
5099            size argument if it happens to be a large-page mapping. */
5100         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5101
5102         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5103                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5104
5105         start_pfn = iova >> VTD_PAGE_SHIFT;
5106         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5107
5108         gather->freelist = domain_unmap(dmar_domain, start_pfn,
5109                                         last_pfn, gather->freelist);
5110
5111         if (dmar_domain->max_addr == iova + size)
5112                 dmar_domain->max_addr = iova;
5113
5114         iommu_iotlb_gather_add_page(domain, gather, iova, size);
5115
5116         return size;
5117 }
5118
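/*
 * unmap_pages() callback: undo a previous map_pages() call.  The request is
 * handed to intel_iommu_unmap(), which may round the size up if the range
 * sits inside a large-page mapping.
 */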
5119 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
5120                                       unsigned long iova,
5121                                       size_t pgsize, size_t pgcount,
5122                                       struct iommu_iotlb_gather *gather)
5123 {
5124         unsigned long pgshift = __ffs(pgsize);
5125         size_t size = pgcount << pgshift;
5126
5127         return intel_iommu_unmap(domain, iova, size, gather);
5128 }
5129
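/*
 * iotlb_sync() callback: flush the IOTLB for the range accumulated in
 * @gather on every IOMMU serving this domain, then free the page-table
 * pages queued on gather->freelist.
 */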
5130 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5131                                  struct iommu_iotlb_gather *gather)
5132 {
5133         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5134         unsigned long iova_pfn = IOVA_PFN(gather->start);
5135         size_t size = gather->end - gather->start;
5136         unsigned long start_pfn;
5137         unsigned long nrpages;
5138         int iommu_id;
5139
5140         nrpages = aligned_nrpages(gather->start, size);
5141         start_pfn = mm_to_dma_pfn(iova_pfn);
5142
5143         for_each_domain_iommu(iommu_id, dmar_domain)
5144                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5145                                       start_pfn, nrpages, !gather->freelist, 0);
5146
5147         dma_free_pagelist(gather->freelist);
5148 }
5149
5150 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5151                                             dma_addr_t iova)
5152 {
5153         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5154         struct dma_pte *pte;
5155         int level = 0;
5156         u64 phys = 0;
5157
5158         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5159         if (pte && dma_pte_present(pte))
5160                 phys = dma_pte_addr(pte) +
5161                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5162                                                 VTD_PAGE_SHIFT) - 1));
5163
5164         return phys;
5165 }
5166
5167 static bool intel_iommu_capable(enum iommu_cap cap)
5168 {
5169         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5170                 return domain_update_iommu_snooping(NULL);
5171         if (cap == IOMMU_CAP_INTR_REMAP)
5172                 return irq_remapping_enabled == 1;
5173
5174         return false;
5175 }
5176
5177 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5178 {
5179         struct intel_iommu *iommu;
5180
5181         iommu = device_to_iommu(dev, NULL, NULL);
5182         if (!iommu)
5183                 return ERR_PTR(-ENODEV);
5184
5185         if (translation_pre_enabled(iommu))
5186                 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5187
5188         return &iommu->iommu;
5189 }
5190
5191 static void intel_iommu_release_device(struct device *dev)
5192 {
5193         struct intel_iommu *iommu;
5194
5195         iommu = device_to_iommu(dev, NULL, NULL);
5196         if (!iommu)
5197                 return;
5198
5199         dmar_remove_one_dev_info(dev);
5200
5201         set_dma_ops(dev, NULL);
5202 }
5203
5204 static void intel_iommu_probe_finalize(struct device *dev)
5205 {
5206         struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
5207
5208         if (domain && domain->type == IOMMU_DOMAIN_DMA)
5209                 iommu_setup_dma_ops(dev, 0, U64_MAX);
5210         else
5211                 set_dma_ops(dev, NULL);
5212 }
5213
5214 static void intel_iommu_get_resv_regions(struct device *device,
5215                                          struct list_head *head)
5216 {
5217         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5218         struct iommu_resv_region *reg;
5219         struct dmar_rmrr_unit *rmrr;
5220         struct device *i_dev;
5221         int i;
5222
5223         down_read(&dmar_global_lock);
5224         for_each_rmrr_units(rmrr) {
5225                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5226                                           i, i_dev) {
5227                         struct iommu_resv_region *resv;
5228                         enum iommu_resv_type type;
5229                         size_t length;
5230
5231                         if (i_dev != device &&
5232                             !is_downstream_to_pci_bridge(device, i_dev))
5233                                 continue;
5234
5235                         length = rmrr->end_address - rmrr->base_address + 1;
5236
5237                         type = device_rmrr_is_relaxable(device) ?
5238                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5239
5240                         resv = iommu_alloc_resv_region(rmrr->base_address,
5241                                                        length, prot, type);
5242                         if (!resv)
5243                                 break;
5244
5245                         list_add_tail(&resv->list, head);
5246                 }
5247         }
5248         up_read(&dmar_global_lock);
5249
5250 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5251         if (dev_is_pci(device)) {
5252                 struct pci_dev *pdev = to_pci_dev(device);
5253
5254                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5255                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5256                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5257                         if (reg)
5258                                 list_add_tail(&reg->list, head);
5259                 }
5260         }
5261 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5262
5263         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5264                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5265                                       0, IOMMU_RESV_MSI);
5266         if (!reg)
5267                 return;
5268         list_add_tail(&reg->list, head);
5269 }
5270
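/*
 * Enable PASID support for @dev: set the PASID-enable bit in the device's
 * context entry (flushing the context cache if it changed) and enable the
 * device-side capabilities via iommu_enable_dev_iotlb() if PASID was not
 * already enabled.
 */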
5271 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5272 {
5273         struct device_domain_info *info;
5274         struct context_entry *context;
5275         struct dmar_domain *domain;
5276         unsigned long flags;
5277         u64 ctx_lo;
5278         int ret;
5279
5280         domain = find_domain(dev);
5281         if (!domain)
5282                 return -EINVAL;
5283
5284         spin_lock_irqsave(&device_domain_lock, flags);
5285         spin_lock(&iommu->lock);
5286
5287         ret = -EINVAL;
5288         info = get_domain_info(dev);
5289         if (!info || !info->pasid_supported)
5290                 goto out;
5291
5292         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5293         if (WARN_ON(!context))
5294                 goto out;
5295
5296         ctx_lo = context[0].lo;
5297
5298         if (!(ctx_lo & CONTEXT_PASIDE)) {
5299                 ctx_lo |= CONTEXT_PASIDE;
5300                 context[0].lo = ctx_lo;
5301                 wmb();
5302                 iommu->flush.flush_context(iommu,
5303                                            domain->iommu_did[iommu->seq_id],
5304                                            PCI_DEVID(info->bus, info->devfn),
5305                                            DMA_CCMD_MASK_NOBIT,
5306                                            DMA_CCMD_DEVICE_INVL);
5307         }
5308
5309         /* Enable PASID support in the device, if it wasn't already */
5310         if (!info->pasid_enabled)
5311                 iommu_enable_dev_iotlb(info);
5312
5313         ret = 0;
5314
5315  out:
5316         spin_unlock(&iommu->lock);
5317         spin_unlock_irqrestore(&device_domain_lock, flags);
5318
5319         return ret;
5320 }
5321
5322 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5323 {
5324         if (dev_is_pci(dev))
5325                 return pci_device_group(dev);
5326         return generic_device_group(dev);
5327 }
5328
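/*
 * Enable aux-domain support for @dev: requires a scalable-mode IOMMU with
 * PASID support and PASID successfully enabled on the device.
 */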
5329 static int intel_iommu_enable_auxd(struct device *dev)
5330 {
5331         struct device_domain_info *info;
5332         struct intel_iommu *iommu;
5333         unsigned long flags;
5334         int ret;
5335
5336         iommu = device_to_iommu(dev, NULL, NULL);
5337         if (!iommu || dmar_disabled)
5338                 return -EINVAL;
5339
5340         if (!sm_supported(iommu) || !pasid_supported(iommu))
5341                 return -EINVAL;
5342
5343         ret = intel_iommu_enable_pasid(iommu, dev);
5344         if (ret)
5345                 return -ENODEV;
5346
5347         spin_lock_irqsave(&device_domain_lock, flags);
5348         info = get_domain_info(dev);
5349         info->auxd_enabled = 1;
5350         spin_unlock_irqrestore(&device_domain_lock, flags);
5351
5352         return 0;
5353 }
5354
5355 static int intel_iommu_disable_auxd(struct device *dev)
5356 {
5357         struct device_domain_info *info;
5358         unsigned long flags;
5359
5360         spin_lock_irqsave(&device_domain_lock, flags);
5361         info = get_domain_info(dev);
5362         if (!WARN_ON(!info))
5363                 info->auxd_enabled = 0;
5364         spin_unlock_irqrestore(&device_domain_lock, flags);
5365
5366         return 0;
5367 }
5368
5369 static int intel_iommu_enable_sva(struct device *dev)
5370 {
5371         struct device_domain_info *info = get_domain_info(dev);
5372         struct intel_iommu *iommu;
5373         int ret;
5374
5375         if (!info || dmar_disabled)
5376                 return -EINVAL;
5377
5378         iommu = info->iommu;
5379         if (!iommu)
5380                 return -EINVAL;
5381
5382         if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
5383                 return -ENODEV;
5384
5385         if (intel_iommu_enable_pasid(iommu, dev))
5386                 return -ENODEV;
5387
5388         if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
5389                 return -EINVAL;
5390
5391         ret = iopf_queue_add_device(iommu->iopf_queue, dev);
5392         if (!ret)
5393                 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
5394
5395         return ret;
5396 }
5397
5398 static int intel_iommu_disable_sva(struct device *dev)
5399 {
5400         struct device_domain_info *info = get_domain_info(dev);
5401         struct intel_iommu *iommu = info->iommu;
5402         int ret;
5403
5404         ret = iommu_unregister_device_fault_handler(dev);
5405         if (!ret)
5406                 ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
5407
5408         return ret;
5409 }
5410
5411 /*
5412  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5413  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5414  * spec for system software and tools to detect endpoint devices supporting
5415  * Intel Scalable I/O Virtualization without a host driver dependency.
5416  *
5417  * Returns the config space offset of the matching extended capability
5418  * structure within the device's PCI configuration space, or 0 if the
5419  * device does not support it.
5420  */
5421 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5422 {
5423         int pos;
5424         u16 vendor, id;
5425
5426         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
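        /*
         * Walk the DVSEC (extended capability ID 0x23) list looking for the
         * Intel entry with DVSEC ID 5, which identifies Scalable IOV support.
         */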
5427         while (pos) {
5428                 pci_read_config_word(pdev, pos + 4, &vendor);
5429                 pci_read_config_word(pdev, pos + 8, &id);
5430                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5431                         return pos;
5432
5433                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5434         }
5435
5436         return 0;
5437 }
5438
5439 static bool
5440 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5441 {
5442         struct device_domain_info *info = get_domain_info(dev);
5443
5444         if (feat == IOMMU_DEV_FEAT_AUX) {
5445                 int ret;
5446
5447                 if (!dev_is_pci(dev) || dmar_disabled ||
5448                     !scalable_mode_support() || !pasid_mode_support())
5449                         return false;
5450
5451                 ret = pci_pasid_features(to_pci_dev(dev));
5452                 if (ret < 0)
5453                         return false;
5454
5455                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5456         }
5457
5458         if (feat == IOMMU_DEV_FEAT_IOPF)
5459                 return info && info->pri_supported;
5460
5461         if (feat == IOMMU_DEV_FEAT_SVA)
5462                 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5463                         info->pasid_supported && info->pri_supported &&
5464                         info->ats_supported;
5465
5466         return false;
5467 }
5468
5469 static int
5470 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5471 {
5472         switch (feat) {
5473         case IOMMU_DEV_FEAT_AUX:
5474                 return intel_iommu_enable_auxd(dev);
5475
5476         case IOMMU_DEV_FEAT_IOPF:
5477                 return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV;
5478
5479         case IOMMU_DEV_FEAT_SVA:
5480                 return intel_iommu_enable_sva(dev);
5481
5482         default:
5483                 return -ENODEV;
5484         }
5485 }
5486
5487 static int
5488 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5489 {
5490         switch (feat) {
5491         case IOMMU_DEV_FEAT_AUX:
5492                 return intel_iommu_disable_auxd(dev);
5493
5494         case IOMMU_DEV_FEAT_IOPF:
5495                 return 0;
5496
5497         case IOMMU_DEV_FEAT_SVA:
5498                 return intel_iommu_disable_sva(dev);
5499
5500         default:
5501                 return -ENODEV;
5502         }
5503 }
5504
5505 static bool
5506 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5507 {
5508         struct device_domain_info *info = get_domain_info(dev);
5509
5510         if (feat == IOMMU_DEV_FEAT_AUX)
5511                 return scalable_mode_support() && info && info->auxd_enabled;
5512
5513         return false;
5514 }
5515
5516 static int
5517 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5518 {
5519         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5520
5521         return dmar_domain->default_pasid > 0 ?
5522                         dmar_domain->default_pasid : -EINVAL;
5523 }
5524
5525 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5526                                            struct device *dev)
5527 {
5528         return attach_deferred(dev);
5529 }
5530
5531 static int
5532 intel_iommu_enable_nesting(struct iommu_domain *domain)
5533 {
5534         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5535         unsigned long flags;
5536         int ret = -ENODEV;
5537
5538         spin_lock_irqsave(&device_domain_lock, flags);
5539         if (list_empty(&dmar_domain->devices)) {
5540                 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5541                 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5542                 ret = 0;
5543         }
5544         spin_unlock_irqrestore(&device_domain_lock, flags);
5545
5546         return ret;
5547 }
5548
5549 /*
5550  * Check that the device does not live on an external facing PCI port that is
5551  * marked as untrusted. Such devices should not be able to apply quirks and
5552  * thus not be able to bypass the IOMMU restrictions.
5553  */
5554 static bool risky_device(struct pci_dev *pdev)
5555 {
5556         if (pdev->untrusted) {
5557                 pci_info(pdev,
5558                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5559                          pdev->vendor, pdev->device);
5560                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5561                 return true;
5562         }
5563         return false;
5564 }
5565
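/*
 * Flush the CPU cache lines covering the page-table entries of a newly
 * mapped range so that a non-coherent IOMMU sees the updated PTEs.
 */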
5566 static void clflush_sync_map(struct dmar_domain *domain, unsigned long clf_pfn,
5567                              unsigned long clf_pages)
5568 {
5569         struct dma_pte *first_pte = NULL, *pte = NULL;
5570         unsigned long lvl_pages = 0;
5571         int level = 0;
5572
5573         while (clf_pages > 0) {
5574                 if (!pte) {
5575                         level = 0;
5576                         pte = pfn_to_dma_pte(domain, clf_pfn, &level);
5577                         if (WARN_ON(!pte))
5578                                 return;
5579                         first_pte = pte;
5580                         lvl_pages = lvl_to_nr_pages(level);
5581                 }
5582
5583                 if (WARN_ON(!lvl_pages || clf_pages < lvl_pages))
5584                         return;
5585
5586                 clf_pages -= lvl_pages;
5587                 clf_pfn += lvl_pages;
5588                 pte++;
5589
5590                 if (!clf_pages || first_pte_in_page(pte) ||
5591                     (level > 1 && clf_pages < lvl_pages)) {
5592                         domain_flush_cache(domain, first_pte,
5593                                            (void *)pte - (void *)first_pte);
5594                         pte = NULL;
5595                 }
5596         }
5597 }
5598
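/*
 * iotlb_sync_map() callback: make a freshly created mapping visible to the
 * hardware; flush the CPU cache for the PTEs when the IOMMU is not coherent
 * and run the per-IOMMU mapping notification for the affected range.
 */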
5599 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
5600                                        unsigned long iova, size_t size)
5601 {
5602         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5603         unsigned long pages = aligned_nrpages(iova, size);
5604         unsigned long pfn = iova >> VTD_PAGE_SHIFT;
5605         struct intel_iommu *iommu;
5606         int iommu_id;
5607
5608         if (!dmar_domain->iommu_coherency)
5609                 clflush_sync_map(dmar_domain, pfn, pages);
5610
5611         for_each_domain_iommu(iommu_id, dmar_domain) {
5612                 iommu = g_iommus[iommu_id];
5613                 __mapping_notify_one(iommu, dmar_domain, pfn, pages);
5614         }
5615 }
5616
5617 const struct iommu_ops intel_iommu_ops = {
5618         .capable                = intel_iommu_capable,
5619         .domain_alloc           = intel_iommu_domain_alloc,
5620         .domain_free            = intel_iommu_domain_free,
5621         .enable_nesting         = intel_iommu_enable_nesting,
5622         .attach_dev             = intel_iommu_attach_device,
5623         .detach_dev             = intel_iommu_detach_device,
5624         .aux_attach_dev         = intel_iommu_aux_attach_device,
5625         .aux_detach_dev         = intel_iommu_aux_detach_device,
5626         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5627         .map_pages              = intel_iommu_map_pages,
5628         .unmap_pages            = intel_iommu_unmap_pages,
5629         .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
5630         .flush_iotlb_all        = intel_flush_iotlb_all,
5631         .iotlb_sync             = intel_iommu_tlb_sync,
5632         .iova_to_phys           = intel_iommu_iova_to_phys,
5633         .probe_device           = intel_iommu_probe_device,
5634         .probe_finalize         = intel_iommu_probe_finalize,
5635         .release_device         = intel_iommu_release_device,
5636         .get_resv_regions       = intel_iommu_get_resv_regions,
5637         .put_resv_regions       = generic_iommu_put_resv_regions,
5638         .device_group           = intel_iommu_device_group,
5639         .dev_has_feat           = intel_iommu_dev_has_feat,
5640         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5641         .dev_enable_feat        = intel_iommu_dev_enable_feat,
5642         .dev_disable_feat       = intel_iommu_dev_disable_feat,
5643         .is_attach_deferred     = intel_iommu_is_attach_deferred,
5644         .def_domain_type        = device_def_domain_type,
5645         .pgsize_bitmap          = SZ_4K,
5646 #ifdef CONFIG_INTEL_IOMMU_SVM
5647         .cache_invalidate       = intel_iommu_sva_invalidate,
5648         .sva_bind_gpasid        = intel_svm_bind_gpasid,
5649         .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
5650         .sva_bind               = intel_svm_bind,
5651         .sva_unbind             = intel_svm_unbind,
5652         .sva_get_pasid          = intel_svm_get_pasid,
5653         .page_response          = intel_svm_page_response,
5654 #endif
5655 };
5656
5657 static void quirk_iommu_igfx(struct pci_dev *dev)
5658 {
5659         if (risky_device(dev))
5660                 return;
5661
5662         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5663         dmar_map_gfx = 0;
5664 }
5665
5666 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5667 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5668 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5669 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5670 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5671 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5672 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5673 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5674
5675 /* Broadwell igfx malfunctions with dmar */
5676 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5677 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5678 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5679 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5680 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5681 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5682 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5683 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5684 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5685 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5686 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5687 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5688 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5689 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5690 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5691 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5692 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5693 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5695 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5697 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5698 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5699 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5700
5701 static void quirk_iommu_rwbf(struct pci_dev *dev)
5702 {
5703         if (risky_device(dev))
5704                 return;
5705
5706         /*
5707          * Mobile 4 Series Chipset neglects to set RWBF capability,
5708          * but needs it. Same seems to hold for the desktop versions.
5709          */
5710         pci_info(dev, "Forcing write-buffer flush capability\n");
5711         rwbf_quirk = 1;
5712 }
5713
5714 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5715 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5716 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5717 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5718 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5719 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5720 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5721
5722 #define GGC 0x52
5723 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5724 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5725 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5726 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5727 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5728 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5729 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5730 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5731
5732 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5733 {
5734         unsigned short ggc;
5735
5736         if (risky_device(dev))
5737                 return;
5738
5739         if (pci_read_config_word(dev, GGC, &ggc))
5740                 return;
5741
5742         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5743                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5744                 dmar_map_gfx = 0;
5745         } else if (dmar_map_gfx) {
5746                 /* we have to ensure the gfx device is idle before we flush */
5747                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5748                 iommu_set_dma_strict();
5749         }
5750 }
5751 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5752 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5753 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5754 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5755
5756 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5757 {
5758         unsigned short ver;
5759
5760         if (!IS_GFX_DEVICE(dev))
5761                 return;
5762
5763         ver = (dev->device >> 8) & 0xff;
5764         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5765             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5766             ver != 0x9a)
5767                 return;
5768
5769         if (risky_device(dev))
5770                 return;
5771
5772         pci_info(dev, "Skip IOMMU disabling for graphics\n");
5773         iommu_skip_te_disable = 1;
5774 }
5775 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5776
5777 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5778    ISOCH DMAR unit for the Azalia sound device, but not give it any
5779    TLB entries, which causes it to deadlock. Check for that.  We do
5780    this in a function called from init_dmars(), instead of in a PCI
5781    quirk, because we don't want to print the obnoxious "BIOS broken"
5782    message if VT-d is actually disabled.
5783 */
5784 static void __init check_tylersburg_isoch(void)
5785 {
5786         struct pci_dev *pdev;
5787         uint32_t vtisochctrl;
5788
5789         /* If there's no Azalia in the system anyway, forget it. */
5790         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5791         if (!pdev)
5792                 return;
5793
5794         if (risky_device(pdev)) {
5795                 pci_dev_put(pdev);
5796                 return;
5797         }
5798
5799         pci_dev_put(pdev);
5800
5801         /* System Management Registers. Might be hidden, in which case
5802            we can't do the sanity check. But that's OK, because the
5803            known-broken BIOSes _don't_ actually hide it, so far. */
5804         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5805         if (!pdev)
5806                 return;
5807
5808         if (risky_device(pdev)) {
5809                 pci_dev_put(pdev);
5810                 return;
5811         }
5812
5813         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5814                 pci_dev_put(pdev);
5815                 return;
5816         }
5817
5818         pci_dev_put(pdev);
5819
5820         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5821         if (vtisochctrl & 1)
5822                 return;
5823
5824         /* Drop all bits other than the number of TLB entries */
5825         vtisochctrl &= 0x1c;
5826
5827         /* If we have the recommended number of TLB entries (16), fine. */
5828         if (vtisochctrl == 0x10)
5829                 return;
5830
5831         /* Zero TLB entries? You get to ride the short bus to school. */
5832         if (!vtisochctrl) {
5833                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5834                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5835                      dmi_get_system_info(DMI_BIOS_VENDOR),
5836                      dmi_get_system_info(DMI_BIOS_VERSION),
5837                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5838                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5839                 return;
5840         }
5841
5842         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5843                vtisochctrl);
5844 }