1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
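/*
 * Worked example (illustrative only): with VTD_PAGE_SHIFT == 12 and a
 * 48-bit guest address width,
 *
 *	__DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 == 0xfffffffff
 *	DOMAIN_MAX_ADDR(48)  == 0xfffffffff000
 *
 * On a 32-bit kernel DOMAIN_MAX_PFN(48) is clamped to ULONG_MAX so that
 * PFNs still fit in an unsigned long, as the comment above describes.
 */
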
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
105
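/*
 * Illustrative note: each set bit k in this bitmap advertises support for
 * a 2^k byte page size to the IOMMU core. ~0xFFFUL sets every bit from 12
 * upwards, i.e. 4KiB, 8KiB, 16KiB, ... so the core may hand us any
 * power-of-two sized, naturally aligned mapping of at least 4KiB.
 */
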
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline u64 level_mask(int level)
132 {
133         return -1ULL << level_to_offset_bits(level);
134 }
135
136 static inline u64 level_size(int level)
137 {
138         return 1ULL << level_to_offset_bits(level);
139 }
140
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150
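/*
 * Worked example (illustrative only): with LEVEL_STRIDE == 9 and
 * VTD_PAGE_SHIFT == 12, a 48-bit address width gives
 *
 *	width_to_agaw(48)        == DIV_ROUND_UP(48 - 30, 9) == 2
 *	agaw_to_level(2)         == 4	(four-level page table)
 *	agaw_to_width(2)         == min(30 + 2 * 9, 64) == 48
 *	level_size(2)            == 1ULL << 9 == 512	(4KiB pages per level-2 entry)
 *	pfn_level_offset(pfn, 2) == (pfn >> 9) & 0x1ff
 */
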
151 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
170
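/*
 * Illustrative example: on x86 PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so the
 * two PFN spaces are identical and these conversions are no-ops. With a
 * hypothetical 64KiB MM page size (PAGE_SHIFT == 16) one MM PFN would map
 * to 16 consecutive 4KiB VT-d PFNs, i.e. mm_to_dma_pfn(n) == n << 4.
 */
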
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
179  * (used when the kernel is launched with TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return (c->hi >> 8) & 0xffff;
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
289
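/*
 * Summary of the context-entry bit fields as encoded by the helpers above
 * (legacy-mode context entry):
 *
 *	lo[0]      present
 *	lo[1]      fault processing disable (cleared by context_set_fault_enable())
 *	lo[3:2]    translation type
 *	lo[11]     PASID enable
 *	lo[63:12]  page-table (address) root, VTD_PAGE_MASK aligned
 *	hi[2:0]    address width (AGAW)
 *	hi[3]      "copied" flag (software use, for tables inherited from a
 *	           previous kernel)
 *	hi[23:8]   domain id
 */
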
290 /*
291  * This domain is a static identity mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 #define for_each_domain_iommu(idx, domain)                      \
300         for (idx = 0; idx < g_num_of_iommus; idx++)             \
301                 if (domain->iommu_refcnt[idx])
302
303 struct dmar_rmrr_unit {
304         struct list_head list;          /* list of rmrr units   */
305         struct acpi_dmar_header *hdr;   /* ACPI header          */
306         u64     base_address;           /* reserved base address*/
307         u64     end_address;            /* reserved end address */
308         struct dmar_dev_scope *devices; /* target devices */
309         int     devices_cnt;            /* target device count */
310 };
311
312 struct dmar_atsr_unit {
313         struct list_head list;          /* list of ATSR units */
314         struct acpi_dmar_header *hdr;   /* ACPI header */
315         struct dmar_dev_scope *devices; /* target devices */
316         int devices_cnt;                /* target device count */
317         u8 include_all:1;               /* include all ports */
318 };
319
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322
323 #define for_each_rmrr_units(rmrr) \
324         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325
326 /* number of IOMMUs, used to size the g_iommus array */
327 static int g_num_of_iommus;
328
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334                                      struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336                                             dma_addr_t iova);
337
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
360
361 #define IDENTMAP_GFX            2
362 #define IDENTMAP_AZALIA         4
363
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
369 {
370         struct device_domain_info *info;
371
372         if (!dev)
373                 return NULL;
374
375         info = dev_iommu_priv_get(dev);
376         if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
377                 return NULL;
378
379         return info;
380 }
381
382 DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
384
385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
386                                 to_pci_dev(d)->untrusted)
387
388 /*
389  * Iterate over elements in device_domain_list and call the specified
390  * callback @fn against each element.
391  */
392 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393                                      void *data), void *data)
394 {
395         int ret = 0;
396         unsigned long flags;
397         struct device_domain_info *info;
398
399         spin_lock_irqsave(&device_domain_lock, flags);
400         list_for_each_entry(info, &device_domain_list, global) {
401                 ret = fn(info, data);
402                 if (ret) {
403                         spin_unlock_irqrestore(&device_domain_lock, flags);
404                         return ret;
405                 }
406         }
407         spin_unlock_irqrestore(&device_domain_lock, flags);
408
409         return 0;
410 }
411
412 const struct iommu_ops intel_iommu_ops;
413
414 static bool translation_pre_enabled(struct intel_iommu *iommu)
415 {
416         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 }
418
419 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
420 {
421         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 }
423
424 static void init_translation_status(struct intel_iommu *iommu)
425 {
426         u32 gsts;
427
428         gsts = readl(iommu->reg + DMAR_GSTS_REG);
429         if (gsts & DMA_GSTS_TES)
430                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 }
432
433 static int __init intel_iommu_setup(char *str)
434 {
435         if (!str)
436                 return -EINVAL;
437         while (*str) {
438                 if (!strncmp(str, "on", 2)) {
439                         dmar_disabled = 0;
440                         pr_info("IOMMU enabled\n");
441                 } else if (!strncmp(str, "off", 3)) {
442                         dmar_disabled = 1;
443                         no_platform_optin = 1;
444                         pr_info("IOMMU disabled\n");
445                 } else if (!strncmp(str, "igfx_off", 8)) {
446                         dmar_map_gfx = 0;
447                         pr_info("Disable GFX device mapping\n");
448                 } else if (!strncmp(str, "forcedac", 8)) {
449                         pr_info("Forcing DAC for PCI devices\n");
450                         dmar_forcedac = 1;
451                 } else if (!strncmp(str, "strict", 6)) {
452                         pr_info("Disable batched IOTLB flush\n");
453                         intel_iommu_strict = 1;
454                 } else if (!strncmp(str, "sp_off", 6)) {
455                         pr_info("Disable supported super page\n");
456                         intel_iommu_superpage = 0;
457                 } else if (!strncmp(str, "sm_on", 5)) {
458                         pr_info("Intel-IOMMU: scalable mode supported\n");
459                         intel_iommu_sm = 1;
460                 } else if (!strncmp(str, "tboot_noforce", 13)) {
461                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462                         intel_iommu_tboot_noforce = 1;
463                 } else if (!strncmp(str, "nobounce", 8)) {
464                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
465                         intel_no_bounce = 1;
466                 }
467
468                 str += strcspn(str, ",");
469                 while (*str == ',')
470                         str++;
471         }
472         return 0;
473 }
474 __setup("intel_iommu=", intel_iommu_setup);
475
476 static struct kmem_cache *iommu_domain_cache;
477 static struct kmem_cache *iommu_devinfo_cache;
478
479 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
480 {
481         struct dmar_domain **domains;
482         int idx = did >> 8;
483
484         domains = iommu->domains[idx];
485         if (!domains)
486                 return NULL;
487
488         return domains[did & 0xff];
489 }
490
491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492                              struct dmar_domain *domain)
493 {
494         struct dmar_domain **domains;
495         int idx = did >> 8;
496
497         if (!iommu->domains[idx]) {
498                 size_t size = 256 * sizeof(struct dmar_domain *);
499                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500         }
501
502         domains = iommu->domains[idx];
503         if (WARN_ON(!domains))
504                 return;
505         else
506                 domains[did & 0xff] = domain;
507 }
508
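/*
 * Illustrative example: the 16-bit domain id is used as a two-level index
 * into a lazily allocated table, so did 0x0123 resolves to
 * iommu->domains[0x01][0x23]. Each second-level array holds 256 pointers
 * and is only allocated the first time a did in that range is used.
 */
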
509 void *alloc_pgtable_page(int node)
510 {
511         struct page *page;
512         void *vaddr = NULL;
513
514         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515         if (page)
516                 vaddr = page_address(page);
517         return vaddr;
518 }
519
520 void free_pgtable_page(void *vaddr)
521 {
522         free_page((unsigned long)vaddr);
523 }
524
525 static inline void *alloc_domain_mem(void)
526 {
527         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528 }
529
530 static void free_domain_mem(void *vaddr)
531 {
532         kmem_cache_free(iommu_domain_cache, vaddr);
533 }
534
535 static inline void *alloc_devinfo_mem(void)
536 {
537         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538 }
539
540 static inline void free_devinfo_mem(void *vaddr)
541 {
542         kmem_cache_free(iommu_devinfo_cache, vaddr);
543 }
544
545 static inline int domain_type_is_si(struct dmar_domain *domain)
546 {
547         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
548 }
549
550 static inline bool domain_use_first_level(struct dmar_domain *domain)
551 {
552         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
553 }
554
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556                                        unsigned long pfn)
557 {
558         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559
560         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564 {
565         unsigned long sagaw;
566         int agaw = -1;
567
568         sagaw = cap_sagaw(iommu->cap);
569         for (agaw = width_to_agaw(max_gaw);
570              agaw >= 0; agaw--) {
571                 if (test_bit(agaw, &sagaw))
572                         break;
573         }
574
575         return agaw;
576 }
577
578 /*
579  * Calculate max SAGAW for each iommu.
580  */
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582 {
583         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584 }
585
586 /*
587  * calculate agaw for each iommu.
588  * "SAGAW" may be different across iommus, use a default agaw, and
589  * get a supported less agaw for iommus that don't support the default agaw.
590  */
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
592 {
593         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594 }
595
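/*
 * Worked example (illustrative only): if cap_sagaw(iommu->cap) == 0x4,
 * only bit 2 is set, i.e. only a 4-level (48-bit) table is supported.
 * With DEFAULT_DOMAIN_ADDRESS_WIDTH == 57, width_to_agaw(57) == 3, so the
 * loop in __iommu_calculate_agaw() tries agaw 3, then agaw 2, and
 * iommu_calculate_agaw() returns 2.
 */
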
596 /* This function only returns a single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598 {
599         int iommu_id;
600
601         /* si_domain and vm domain should not get here. */
602         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603                 return NULL;
604
605         for_each_domain_iommu(iommu_id, domain)
606                 break;
607
608         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609                 return NULL;
610
611         return g_iommus[iommu_id];
612 }
613
614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
615 {
616         return sm_supported(iommu) ?
617                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
618 }
619
620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
621 {
622         struct dmar_drhd_unit *drhd;
623         struct intel_iommu *iommu;
624         bool found = false;
625         int i;
626
627         domain->iommu_coherency = 1;
628
629         for_each_domain_iommu(i, domain) {
630                 found = true;
631                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
632                         domain->iommu_coherency = 0;
633                         break;
634                 }
635         }
636         if (found)
637                 return;
638
639         /* No hardware attached; use lowest common denominator */
640         rcu_read_lock();
641         for_each_active_iommu(iommu, drhd) {
642                 if (!iommu_paging_structure_coherency(iommu)) {
643                         domain->iommu_coherency = 0;
644                         break;
645                 }
646         }
647         rcu_read_unlock();
648 }
649
650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
651 {
652         struct dmar_drhd_unit *drhd;
653         struct intel_iommu *iommu;
654         int ret = 1;
655
656         rcu_read_lock();
657         for_each_active_iommu(iommu, drhd) {
658                 if (iommu != skip) {
659                         if (!ecap_sc_support(iommu->ecap)) {
660                                 ret = 0;
661                                 break;
662                         }
663                 }
664         }
665         rcu_read_unlock();
666
667         return ret;
668 }
669
670 static int domain_update_iommu_superpage(struct dmar_domain *domain,
671                                          struct intel_iommu *skip)
672 {
673         struct dmar_drhd_unit *drhd;
674         struct intel_iommu *iommu;
675         int mask = 0x3;
676
677         if (!intel_iommu_superpage) {
678                 return 0;
679         }
680
681         /* set iommu_superpage to the smallest common denominator */
682         rcu_read_lock();
683         for_each_active_iommu(iommu, drhd) {
684                 if (iommu != skip) {
685                         if (domain && domain_use_first_level(domain)) {
686                                 if (!cap_fl1gp_support(iommu->cap))
687                                         mask = 0x1;
688                         } else {
689                                 mask &= cap_super_page_val(iommu->cap);
690                         }
691
692                         if (!mask)
693                                 break;
694                 }
695         }
696         rcu_read_unlock();
697
698         return fls(mask);
699 }
700
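/*
 * Worked example (illustrative only): in the superpage capability mask
 * bit 0 stands for 2MiB and bit 1 for 1GiB pages. If every IOMMU reports
 * 0x3 the function returns fls(0x3) == 2 (both sizes usable); if one unit
 * only supports 2MiB the mask drops to 0x1 and the result is 1; a unit
 * with no superpage support zeroes the mask and the result is 0.
 */
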
701 static int domain_update_device_node(struct dmar_domain *domain)
702 {
703         struct device_domain_info *info;
704         int nid = NUMA_NO_NODE;
705
706         assert_spin_locked(&device_domain_lock);
707
708         if (list_empty(&domain->devices))
709                 return NUMA_NO_NODE;
710
711         list_for_each_entry(info, &domain->devices, link) {
712                 if (!info->dev)
713                         continue;
714
715                 /*
716                  * There could be multiple device NUMA nodes, as devices
717                  * within the same domain may sit behind different IOMMUs. There
718                  * is no perfect answer in such a situation, so we settle for a
719                  * first come, first served policy.
720                  */
721                 nid = dev_to_node(info->dev);
722                 if (nid != NUMA_NO_NODE)
723                         break;
724         }
725
726         return nid;
727 }
728
729 /* Some capabilities may be different across iommus */
730 static void domain_update_iommu_cap(struct dmar_domain *domain)
731 {
732         domain_update_iommu_coherency(domain);
733         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
734         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
735
736         /*
737          * If RHSA is missing, we should default to the device numa domain
738          * as fall back.
739          */
740         if (domain->nid == NUMA_NO_NODE)
741                 domain->nid = domain_update_device_node(domain);
742 }
743
744 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
745                                          u8 devfn, int alloc)
746 {
747         struct root_entry *root = &iommu->root_entry[bus];
748         struct context_entry *context;
749         u64 *entry;
750
751         entry = &root->lo;
752         if (sm_supported(iommu)) {
753                 if (devfn >= 0x80) {
754                         devfn -= 0x80;
755                         entry = &root->hi;
756                 }
757                 devfn *= 2;
758         }
759         if (*entry & 1)
760                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
761         else {
762                 unsigned long phy_addr;
763                 if (!alloc)
764                         return NULL;
765
766                 context = alloc_pgtable_page(iommu->node);
767                 if (!context)
768                         return NULL;
769
770                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
771                 phy_addr = virt_to_phys((void *)context);
772                 *entry = phy_addr | 1;
773                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
774         }
775         return &context[devfn];
776 }
777
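/*
 * Illustrative example: in scalable mode each root entry is split in two.
 * root->lo covers devfn 0x00-0x7f and root->hi covers devfn 0x80-0xff,
 * and every scalable-mode context entry occupies two legacy-sized slots
 * (hence devfn *= 2). Looking up devfn 0x85 therefore uses root->hi and
 * returns &context[(0x85 - 0x80) * 2] == &context[0x0a].
 */
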
778 static bool attach_deferred(struct device *dev)
779 {
780         return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
781 }
782
783 /**
784  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
785  *                               sub-hierarchy of a candidate PCI-PCI bridge
786  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
787  * @bridge: the candidate PCI-PCI bridge
788  *
789  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
790  */
791 static bool
792 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
793 {
794         struct pci_dev *pdev, *pbridge;
795
796         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
797                 return false;
798
799         pdev = to_pci_dev(dev);
800         pbridge = to_pci_dev(bridge);
801
802         if (pbridge->subordinate &&
803             pbridge->subordinate->number <= pdev->bus->number &&
804             pbridge->subordinate->busn_res.end >= pdev->bus->number)
805                 return true;
806
807         return false;
808 }
809
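/*
 * Illustrative example: if @bridge has a subordinate bus range of
 * [0x02, 0x04] (subordinate->number == 2, busn_res.end == 4), a device
 * sitting on bus 0x03 is reported as downstream of it, while a device on
 * bus 0x05 is not.
 */
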
810 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
811 {
812         struct dmar_drhd_unit *drhd;
813         u32 vtbar;
814         int rc;
815
816         /* We know that this device on this chipset has its own IOMMU.
817          * If we find it under a different IOMMU, then the BIOS is lying
818          * to us. Hope that the IOMMU for this device is actually
819          * disabled, and it needs no translation...
820          */
821         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
822         if (rc) {
823                 /* "can't" happen */
824                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
825                 return false;
826         }
827         vtbar &= 0xffff0000;
828
829         /* we know that this iommu should be at offset 0xa000 from vtbar */
830         drhd = dmar_find_matched_drhd_unit(pdev);
831         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
832                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
833                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
834                 return true;
835         }
836
837         return false;
838 }
839
840 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
841 {
842         if (!iommu || iommu->drhd->ignored)
843                 return true;
844
845         if (dev_is_pci(dev)) {
846                 struct pci_dev *pdev = to_pci_dev(dev);
847
848                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
849                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
850                     quirk_ioat_snb_local_iommu(pdev))
851                         return true;
852         }
853
854         return false;
855 }
856
857 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
858 {
859         struct dmar_drhd_unit *drhd = NULL;
860         struct pci_dev *pdev = NULL;
861         struct intel_iommu *iommu;
862         struct device *tmp;
863         u16 segment = 0;
864         int i;
865
866         if (!dev)
867                 return NULL;
868
869         if (dev_is_pci(dev)) {
870                 struct pci_dev *pf_pdev;
871
872                 pdev = pci_real_dma_dev(to_pci_dev(dev));
873
874                 /* VFs aren't listed in scope tables; we need to look up
875                  * the PF instead to find the IOMMU. */
876                 pf_pdev = pci_physfn(pdev);
877                 dev = &pf_pdev->dev;
878                 segment = pci_domain_nr(pdev->bus);
879         } else if (has_acpi_companion(dev))
880                 dev = &ACPI_COMPANION(dev)->dev;
881
882         rcu_read_lock();
883         for_each_iommu(iommu, drhd) {
884                 if (pdev && segment != drhd->segment)
885                         continue;
886
887                 for_each_active_dev_scope(drhd->devices,
888                                           drhd->devices_cnt, i, tmp) {
889                         if (tmp == dev) {
890                                 /* For a VF use its original BDF# not that of the PF
891                                  * which we used for the IOMMU lookup. Strictly speaking
892                                  * we could do this for all PCI devices; we only need to
893                                  * get the BDF# from the scope table for ACPI matches. */
894                                 if (pdev && pdev->is_virtfn)
895                                         goto got_pdev;
896
897                                 if (bus && devfn) {
898                                         *bus = drhd->devices[i].bus;
899                                         *devfn = drhd->devices[i].devfn;
900                                 }
901                                 goto out;
902                         }
903
904                         if (is_downstream_to_pci_bridge(dev, tmp))
905                                 goto got_pdev;
906                 }
907
908                 if (pdev && drhd->include_all) {
909                 got_pdev:
910                         if (bus && devfn) {
911                                 *bus = pdev->bus->number;
912                                 *devfn = pdev->devfn;
913                         }
914                         goto out;
915                 }
916         }
917         iommu = NULL;
918  out:
919         if (iommu_is_dummy(iommu, dev))
920                 iommu = NULL;
921
922         rcu_read_unlock();
923
924         return iommu;
925 }
926
927 static void domain_flush_cache(struct dmar_domain *domain,
928                                void *addr, int size)
929 {
930         if (!domain->iommu_coherency)
931                 clflush_cache_range(addr, size);
932 }
933
934 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
935 {
936         struct context_entry *context;
937         int ret = 0;
938         unsigned long flags;
939
940         spin_lock_irqsave(&iommu->lock, flags);
941         context = iommu_context_addr(iommu, bus, devfn, 0);
942         if (context)
943                 ret = context_present(context);
944         spin_unlock_irqrestore(&iommu->lock, flags);
945         return ret;
946 }
947
948 static void free_context_table(struct intel_iommu *iommu)
949 {
950         int i;
951         unsigned long flags;
952         struct context_entry *context;
953
954         spin_lock_irqsave(&iommu->lock, flags);
955         if (!iommu->root_entry) {
956                 goto out;
957         }
958         for (i = 0; i < ROOT_ENTRY_NR; i++) {
959                 context = iommu_context_addr(iommu, i, 0, 0);
960                 if (context)
961                         free_pgtable_page(context);
962
963                 if (!sm_supported(iommu))
964                         continue;
965
966                 context = iommu_context_addr(iommu, i, 0x80, 0);
967                 if (context)
968                         free_pgtable_page(context);
969
970         }
971         free_pgtable_page(iommu->root_entry);
972         iommu->root_entry = NULL;
973 out:
974         spin_unlock_irqrestore(&iommu->lock, flags);
975 }
976
977 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
978                                       unsigned long pfn, int *target_level)
979 {
980         struct dma_pte *parent, *pte;
981         int level = agaw_to_level(domain->agaw);
982         int offset;
983
984         BUG_ON(!domain->pgd);
985
986         if (!domain_pfn_supported(domain, pfn))
987                 /* Address beyond IOMMU's addressing capabilities. */
988                 return NULL;
989
990         parent = domain->pgd;
991
992         while (1) {
993                 void *tmp_page;
994
995                 offset = pfn_level_offset(pfn, level);
996                 pte = &parent[offset];
997                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
998                         break;
999                 if (level == *target_level)
1000                         break;
1001
1002                 if (!dma_pte_present(pte)) {
1003                         uint64_t pteval;
1004
1005                         tmp_page = alloc_pgtable_page(domain->nid);
1006
1007                         if (!tmp_page)
1008                                 return NULL;
1009
1010                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1011                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1012                         if (domain_use_first_level(domain))
1013                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1014                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1015                                 /* Someone else set it while we were thinking; use theirs. */
1016                                 free_pgtable_page(tmp_page);
1017                         else
1018                                 domain_flush_cache(domain, pte, sizeof(*pte));
1019                 }
1020                 if (level == 1)
1021                         break;
1022
1023                 parent = phys_to_virt(dma_pte_addr(pte));
1024                 level--;
1025         }
1026
1027         if (!*target_level)
1028                 *target_level = level;
1029
1030         return pte;
1031 }
1032
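/*
 * Worked example (illustrative only): for a 4-level table (agaw 2) and
 * pfn 0x12345, the walk above uses index 0 at levels 4 and 3, index
 * (0x12345 >> 9) & 0x1ff == 0x91 at level 2 and index
 * 0x12345 & 0x1ff == 0x145 at level 1. With *target_level == 1 it
 * allocates any missing intermediate tables and returns the level-1 PTE;
 * with *target_level == 0 it stops at the first superpage or non-present
 * entry it meets.
 */
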
1033 /* return address's pte at specific level */
1034 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1035                                          unsigned long pfn,
1036                                          int level, int *large_page)
1037 {
1038         struct dma_pte *parent, *pte;
1039         int total = agaw_to_level(domain->agaw);
1040         int offset;
1041
1042         parent = domain->pgd;
1043         while (level <= total) {
1044                 offset = pfn_level_offset(pfn, total);
1045                 pte = &parent[offset];
1046                 if (level == total)
1047                         return pte;
1048
1049                 if (!dma_pte_present(pte)) {
1050                         *large_page = total;
1051                         break;
1052                 }
1053
1054                 if (dma_pte_superpage(pte)) {
1055                         *large_page = total;
1056                         return pte;
1057                 }
1058
1059                 parent = phys_to_virt(dma_pte_addr(pte));
1060                 total--;
1061         }
1062         return NULL;
1063 }
1064
1065 /* clear last level pte, a tlb flush should be followed */
1066 static void dma_pte_clear_range(struct dmar_domain *domain,
1067                                 unsigned long start_pfn,
1068                                 unsigned long last_pfn)
1069 {
1070         unsigned int large_page;
1071         struct dma_pte *first_pte, *pte;
1072
1073         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1074         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1075         BUG_ON(start_pfn > last_pfn);
1076
1077         /* we don't need a lock here; nobody else touches the iova range */
1078         do {
1079                 large_page = 1;
1080                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1081                 if (!pte) {
1082                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1083                         continue;
1084                 }
1085                 do {
1086                         dma_clear_pte(pte);
1087                         start_pfn += lvl_to_nr_pages(large_page);
1088                         pte++;
1089                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1090
1091                 domain_flush_cache(domain, first_pte,
1092                                    (void *)pte - (void *)first_pte);
1093
1094         } while (start_pfn && start_pfn <= last_pfn);
1095 }
1096
1097 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1098                                int retain_level, struct dma_pte *pte,
1099                                unsigned long pfn, unsigned long start_pfn,
1100                                unsigned long last_pfn)
1101 {
1102         pfn = max(start_pfn, pfn);
1103         pte = &pte[pfn_level_offset(pfn, level)];
1104
1105         do {
1106                 unsigned long level_pfn;
1107                 struct dma_pte *level_pte;
1108
1109                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1110                         goto next;
1111
1112                 level_pfn = pfn & level_mask(level);
1113                 level_pte = phys_to_virt(dma_pte_addr(pte));
1114
1115                 if (level > 2) {
1116                         dma_pte_free_level(domain, level - 1, retain_level,
1117                                            level_pte, level_pfn, start_pfn,
1118                                            last_pfn);
1119                 }
1120
1121                 /*
1122                  * Free the page table if we're below the level we want to
1123                  * retain and the range covers the entire table.
1124                  */
1125                 if (level < retain_level && !(start_pfn > level_pfn ||
1126                       last_pfn < level_pfn + level_size(level) - 1)) {
1127                         dma_clear_pte(pte);
1128                         domain_flush_cache(domain, pte, sizeof(*pte));
1129                         free_pgtable_page(level_pte);
1130                 }
1131 next:
1132                 pfn += level_size(level);
1133         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1134 }
1135
1136 /*
1137  * clear last level (leaf) ptes and free page table pages below the
1138  * level we wish to keep intact.
1139  */
1140 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1141                                    unsigned long start_pfn,
1142                                    unsigned long last_pfn,
1143                                    int retain_level)
1144 {
1145         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1146         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1147         BUG_ON(start_pfn > last_pfn);
1148
1149         dma_pte_clear_range(domain, start_pfn, last_pfn);
1150
1151         /* We don't need a lock here; nobody else touches the iova range */
1152         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1153                            domain->pgd, 0, start_pfn, last_pfn);
1154
1155         /* free pgd */
1156         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1157                 free_pgtable_page(domain->pgd);
1158                 domain->pgd = NULL;
1159         }
1160 }
1161
1162 /* When a page at a given level is being unlinked from its parent, we don't
1163    need to *modify* it at all. All we need to do is make a list of all the
1164    pages which can be freed just as soon as we've flushed the IOTLB and we
1165    know the hardware page-walk will no longer touch them.
1166    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1167    be freed. */
1168 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1169                                             int level, struct dma_pte *pte,
1170                                             struct page *freelist)
1171 {
1172         struct page *pg;
1173
1174         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1175         pg->freelist = freelist;
1176         freelist = pg;
1177
1178         if (level == 1)
1179                 return freelist;
1180
1181         pte = page_address(pg);
1182         do {
1183                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1184                         freelist = dma_pte_list_pagetables(domain, level - 1,
1185                                                            pte, freelist);
1186                 pte++;
1187         } while (!first_pte_in_page(pte));
1188
1189         return freelist;
1190 }
1191
1192 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1193                                         struct dma_pte *pte, unsigned long pfn,
1194                                         unsigned long start_pfn,
1195                                         unsigned long last_pfn,
1196                                         struct page *freelist)
1197 {
1198         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1199
1200         pfn = max(start_pfn, pfn);
1201         pte = &pte[pfn_level_offset(pfn, level)];
1202
1203         do {
1204                 unsigned long level_pfn;
1205
1206                 if (!dma_pte_present(pte))
1207                         goto next;
1208
1209                 level_pfn = pfn & level_mask(level);
1210
1211                 /* If range covers entire pagetable, free it */
1212                 if (start_pfn <= level_pfn &&
1213                     last_pfn >= level_pfn + level_size(level) - 1) {
1214                         /* These subordinate page tables are going away entirely. Don't
1215                            bother to clear them; we're just going to *free* them. */
1216                         if (level > 1 && !dma_pte_superpage(pte))
1217                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1218
1219                         dma_clear_pte(pte);
1220                         if (!first_pte)
1221                                 first_pte = pte;
1222                         last_pte = pte;
1223                 } else if (level > 1) {
1224                         /* Recurse down into a level that isn't *entirely* obsolete */
1225                         freelist = dma_pte_clear_level(domain, level - 1,
1226                                                        phys_to_virt(dma_pte_addr(pte)),
1227                                                        level_pfn, start_pfn, last_pfn,
1228                                                        freelist);
1229                 }
1230 next:
1231                 pfn += level_size(level);
1232         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1233
1234         if (first_pte)
1235                 domain_flush_cache(domain, first_pte,
1236                                    (void *)++last_pte - (void *)first_pte);
1237
1238         return freelist;
1239 }
1240
1241 /* We can't just free the pages because the IOMMU may still be walking
1242    the page tables, and may have cached the intermediate levels. The
1243    pages can only be freed after the IOTLB flush has been done. */
1244 static struct page *domain_unmap(struct dmar_domain *domain,
1245                                  unsigned long start_pfn,
1246                                  unsigned long last_pfn)
1247 {
1248         struct page *freelist;
1249
1250         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1251         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1252         BUG_ON(start_pfn > last_pfn);
1253
1254         /* we don't need a lock here; nobody else touches the iova range */
1255         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1256                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1257
1258         /* free pgd */
1259         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1260                 struct page *pgd_page = virt_to_page(domain->pgd);
1261                 pgd_page->freelist = freelist;
1262                 freelist = pgd_page;
1263
1264                 domain->pgd = NULL;
1265         }
1266
1267         return freelist;
1268 }
1269
1270 static void dma_free_pagelist(struct page *freelist)
1271 {
1272         struct page *pg;
1273
1274         while ((pg = freelist)) {
1275                 freelist = pg->freelist;
1276                 free_pgtable_page(page_address(pg));
1277         }
1278 }
1279
1280 static void iova_entry_free(unsigned long data)
1281 {
1282         struct page *freelist = (struct page *)data;
1283
1284         dma_free_pagelist(freelist);
1285 }
1286
1287 /* iommu handling */
1288 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1289 {
1290         struct root_entry *root;
1291         unsigned long flags;
1292
1293         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1294         if (!root) {
1295                 pr_err("Allocating root entry for %s failed\n",
1296                         iommu->name);
1297                 return -ENOMEM;
1298         }
1299
1300         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1301
1302         spin_lock_irqsave(&iommu->lock, flags);
1303         iommu->root_entry = root;
1304         spin_unlock_irqrestore(&iommu->lock, flags);
1305
1306         return 0;
1307 }
1308
1309 static void iommu_set_root_entry(struct intel_iommu *iommu)
1310 {
1311         u64 addr;
1312         u32 sts;
1313         unsigned long flag;
1314
1315         addr = virt_to_phys(iommu->root_entry);
1316         if (sm_supported(iommu))
1317                 addr |= DMA_RTADDR_SMT;
1318
1319         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1320         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1321
1322         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1323
1324         /* Make sure the hardware completes it */
1325         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1326                       readl, (sts & DMA_GSTS_RTPS), sts);
1327
1328         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1329 }
1330
1331 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1332 {
1333         u32 val;
1334         unsigned long flag;
1335
1336         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1337                 return;
1338
1339         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1341
1342         /* Make sure the hardware completes it */
1343         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1344                       readl, (!(val & DMA_GSTS_WBFS)), val);
1345
1346         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1347 }
1348
1349 /* return value determines if we need a write buffer flush */
1350 static void __iommu_flush_context(struct intel_iommu *iommu,
1351                                   u16 did, u16 source_id, u8 function_mask,
1352                                   u64 type)
1353 {
1354         u64 val = 0;
1355         unsigned long flag;
1356
1357         switch (type) {
1358         case DMA_CCMD_GLOBAL_INVL:
1359                 val = DMA_CCMD_GLOBAL_INVL;
1360                 break;
1361         case DMA_CCMD_DOMAIN_INVL:
1362                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1363                 break;
1364         case DMA_CCMD_DEVICE_INVL:
1365                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1366                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1367                 break;
1368         default:
1369                 BUG();
1370         }
1371         val |= DMA_CCMD_ICC;
1372
1373         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1374         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1375
1376         /* Make sure the hardware completes it */
1377         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1378                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1379
1380         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1381 }
1382
1383 /* return value determines if we need a write buffer flush */
1384 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1385                                 u64 addr, unsigned int size_order, u64 type)
1386 {
1387         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1388         u64 val = 0, val_iva = 0;
1389         unsigned long flag;
1390
1391         switch (type) {
1392         case DMA_TLB_GLOBAL_FLUSH:
1393                 /* global flush doesn't need to set IVA_REG */
1394                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1395                 break;
1396         case DMA_TLB_DSI_FLUSH:
1397                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1398                 break;
1399         case DMA_TLB_PSI_FLUSH:
1400                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1401                 /* IH bit is passed in as part of address */
1402                 val_iva = size_order | addr;
1403                 break;
1404         default:
1405                 BUG();
1406         }
1407         /* Note: set drain read/write */
1408 #if 0
1409         /*
1410          * This is probably only here to be extra safe. It looks like we
1411          * can ignore it without any impact.
1412          */
1413         if (cap_read_drain(iommu->cap))
1414                 val |= DMA_TLB_READ_DRAIN;
1415 #endif
1416         if (cap_write_drain(iommu->cap))
1417                 val |= DMA_TLB_WRITE_DRAIN;
1418
1419         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1420         /* Note: Only uses first TLB reg currently */
1421         if (val_iva)
1422                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1423         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1424
1425         /* Make sure the hardware completes it */
1426         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1427                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1428
1429         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1430
1431         /* check IOTLB invalidation granularity */
1432         if (DMA_TLB_IAIG(val) == 0)
1433                 pr_err("Flush IOTLB failed\n");
1434         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1435                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1436                         (unsigned long long)DMA_TLB_IIRG(type),
1437                         (unsigned long long)DMA_TLB_IAIG(val));
1438 }
1439
1440 static struct device_domain_info *
1441 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1442                          u8 bus, u8 devfn)
1443 {
1444         struct device_domain_info *info;
1445
1446         assert_spin_locked(&device_domain_lock);
1447
1448         if (!iommu->qi)
1449                 return NULL;
1450
1451         list_for_each_entry(info, &domain->devices, link)
1452                 if (info->iommu == iommu && info->bus == bus &&
1453                     info->devfn == devfn) {
1454                         if (info->ats_supported && info->dev)
1455                                 return info;
1456                         break;
1457                 }
1458
1459         return NULL;
1460 }
1461
1462 static void domain_update_iotlb(struct dmar_domain *domain)
1463 {
1464         struct device_domain_info *info;
1465         bool has_iotlb_device = false;
1466
1467         assert_spin_locked(&device_domain_lock);
1468
1469         list_for_each_entry(info, &domain->devices, link) {
1470                 struct pci_dev *pdev;
1471
1472                 if (!info->dev || !dev_is_pci(info->dev))
1473                         continue;
1474
1475                 pdev = to_pci_dev(info->dev);
1476                 if (pdev->ats_enabled) {
1477                         has_iotlb_device = true;
1478                         break;
1479                 }
1480         }
1481
1482         domain->has_iotlb_device = has_iotlb_device;
1483 }
1484
1485 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1486 {
1487         struct pci_dev *pdev;
1488
1489         assert_spin_locked(&device_domain_lock);
1490
1491         if (!info || !dev_is_pci(info->dev))
1492                 return;
1493
1494         pdev = to_pci_dev(info->dev);
1495         /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1496          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1497          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1498          * reserved, which should be set to 0.
1499          */
1500         if (!ecap_dit(info->iommu->ecap))
1501                 info->pfsid = 0;
1502         else {
1503                 struct pci_dev *pf_pdev;
1504
1505                 /* pdev itself is returned if the device is not a VF */
1506                 pf_pdev = pci_physfn(pdev);
1507                 info->pfsid = pci_dev_id(pf_pdev);
1508         }
1509
1510 #ifdef CONFIG_INTEL_IOMMU_SVM
1511         /* The PCIe spec, in its wisdom, declares that the behaviour of
1512            the device if you enable PASID support after ATS support is
1513            undefined. So always enable PASID support on devices which
1514            have it, even if we can't yet know if we're ever going to
1515            use it. */
1516         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1517                 info->pasid_enabled = 1;
1518
1519         if (info->pri_supported &&
1520             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1521             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1522                 info->pri_enabled = 1;
1523 #endif
1524         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1525             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1526                 info->ats_enabled = 1;
1527                 domain_update_iotlb(info->domain);
1528                 info->ats_qdep = pci_ats_queue_depth(pdev);
1529         }
1530 }
1531
1532 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1533 {
1534         struct pci_dev *pdev;
1535
1536         assert_spin_locked(&device_domain_lock);
1537
1538         if (!dev_is_pci(info->dev))
1539                 return;
1540
1541         pdev = to_pci_dev(info->dev);
1542
1543         if (info->ats_enabled) {
1544                 pci_disable_ats(pdev);
1545                 info->ats_enabled = 0;
1546                 domain_update_iotlb(info->domain);
1547         }
1548 #ifdef CONFIG_INTEL_IOMMU_SVM
1549         if (info->pri_enabled) {
1550                 pci_disable_pri(pdev);
1551                 info->pri_enabled = 0;
1552         }
1553         if (info->pasid_enabled) {
1554                 pci_disable_pasid(pdev);
1555                 info->pasid_enabled = 0;
1556         }
1557 #endif
1558 }
1559
1560 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1561                                   u64 addr, unsigned mask)
1562 {
1563         u16 sid, qdep;
1564         unsigned long flags;
1565         struct device_domain_info *info;
1566
1567         if (!domain->has_iotlb_device)
1568                 return;
1569
1570         spin_lock_irqsave(&device_domain_lock, flags);
1571         list_for_each_entry(info, &domain->devices, link) {
1572                 if (!info->ats_enabled)
1573                         continue;
1574
1575                 sid = info->bus << 8 | info->devfn;
1576                 qdep = info->ats_qdep;
1577                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1578                                 qdep, addr, mask);
1579         }
1580         spin_unlock_irqrestore(&device_domain_lock, flags);
1581 }
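
     /*
      * Worked example (hypothetical values): for a device on bus 0x3a with
      * devfn 0x10 (slot 2, function 0), the source-id computed above is
      * sid = 0x3a << 8 | 0x10 = 0x3a10. The 'mask' argument names a
      * power-of-two range: mask = 4 asks the device to drop translations
      * for 2^4 = 16 VT-d pages (64KiB) starting at 'addr'.
      */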
1582
1583 static void domain_flush_piotlb(struct intel_iommu *iommu,
1584                                 struct dmar_domain *domain,
1585                                 u64 addr, unsigned long npages, bool ih)
1586 {
1587         u16 did = domain->iommu_did[iommu->seq_id];
1588
1589         if (domain->default_pasid)
1590                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1591                                 addr, npages, ih);
1592
1593         if (!list_empty(&domain->devices))
1594                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1595 }
1596
1597 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1598                                   struct dmar_domain *domain,
1599                                   unsigned long pfn, unsigned int pages,
1600                                   int ih, int map)
1601 {
1602         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1603         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1604         u16 did = domain->iommu_did[iommu->seq_id];
1605
1606         BUG_ON(pages == 0);
1607
1608         if (ih)
1609                 ih = 1 << 6;
1610
1611         if (domain_use_first_level(domain)) {
1612                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1613         } else {
1614                 /*
1615                  * Fallback to domain selective flush if no PSI support or
1616                  * the size is too big. PSI requires page size to be 2 ^ x,
1617                  * and the base address is naturally aligned to the size.
1618                  */
1619                 if (!cap_pgsel_inv(iommu->cap) ||
1620                     mask > cap_max_amask_val(iommu->cap))
1621                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1622                                                         DMA_TLB_DSI_FLUSH);
1623                 else
1624                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1625                                                         DMA_TLB_PSI_FLUSH);
1626         }
1627
1628         /*
1629          * In caching mode, changes of pages from non-present to present require
1630          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1631          */
1632         if (!cap_caching_mode(iommu->cap) || !map)
1633                 iommu_flush_dev_iotlb(domain, addr, mask);
1634 }
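
     /*
      * Worked example: flushing pages = 9 gives
      * mask = ilog2(__roundup_pow_of_two(9)) = ilog2(16) = 4, i.e. a PSI
      * invalidation covering 2^4 = 16 VT-d pages aligned to 64KiB. If that
      * mask exceeded cap_max_amask_val() the code above would fall back to
      * a domain-selective (DSI) flush instead.
      */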
1635
1636 /* Notification for newly created mappings */
1637 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1638                                         struct dmar_domain *domain,
1639                                         unsigned long pfn, unsigned int pages)
1640 {
1641         /*
1642          * It's a non-present to present mapping. Only flush if caching mode
1643          * is enabled and the domain uses second-level translation.
1644          */
1645         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1646                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1647         else
1648                 iommu_flush_write_buffer(iommu);
1649 }
1650
1651 static void iommu_flush_iova(struct iova_domain *iovad)
1652 {
1653         struct dmar_domain *domain;
1654         int idx;
1655
1656         domain = container_of(iovad, struct dmar_domain, iovad);
1657
1658         for_each_domain_iommu(idx, domain) {
1659                 struct intel_iommu *iommu = g_iommus[idx];
1660                 u16 did = domain->iommu_did[iommu->seq_id];
1661
1662                 if (domain_use_first_level(domain))
1663                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1664                 else
1665                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1666                                                  DMA_TLB_DSI_FLUSH);
1667
1668                 if (!cap_caching_mode(iommu->cap))
1669                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1670                                               0, MAX_AGAW_PFN_WIDTH);
1671         }
1672 }
1673
1674 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1675 {
1676         u32 pmen;
1677         unsigned long flags;
1678
1679         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1680                 return;
1681
1682         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1683         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1684         pmen &= ~DMA_PMEN_EPM;
1685         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1686
1687         /* wait for the protected region status bit to clear */
1688         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1689                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1690
1691         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1692 }
1693
1694 static void iommu_enable_translation(struct intel_iommu *iommu)
1695 {
1696         u32 sts;
1697         unsigned long flags;
1698
1699         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1700         iommu->gcmd |= DMA_GCMD_TE;
1701         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1702
1703         /* Make sure hardware complete it */
1704         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1705                       readl, (sts & DMA_GSTS_TES), sts);
1706
1707         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1708 }
1709
1710 static void iommu_disable_translation(struct intel_iommu *iommu)
1711 {
1712         u32 sts;
1713         unsigned long flag;
1714
1715         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1716             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1717                 return;
1718
1719         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1720         iommu->gcmd &= ~DMA_GCMD_TE;
1721         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1722
1723         /* Make sure hardware complete it */
1724         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1725                       readl, (!(sts & DMA_GSTS_TES)), sts);
1726
1727         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1728 }
1729
1730 static int iommu_init_domains(struct intel_iommu *iommu)
1731 {
1732         u32 ndomains, nlongs;
1733         size_t size;
1734
1735         ndomains = cap_ndoms(iommu->cap);
1736         pr_debug("%s: Number of Domains supported <%d>\n",
1737                  iommu->name, ndomains);
1738         nlongs = BITS_TO_LONGS(ndomains);
1739
1740         spin_lock_init(&iommu->lock);
1741
1742         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1743         if (!iommu->domain_ids) {
1744                 pr_err("%s: Allocating domain id array failed\n",
1745                        iommu->name);
1746                 return -ENOMEM;
1747         }
1748
1749         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1750         iommu->domains = kzalloc(size, GFP_KERNEL);
1751
1752         if (iommu->domains) {
1753                 size = 256 * sizeof(struct dmar_domain *);
1754                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1755         }
1756
1757         if (!iommu->domains || !iommu->domains[0]) {
1758                 pr_err("%s: Allocating domain array failed\n",
1759                        iommu->name);
1760                 kfree(iommu->domain_ids);
1761                 kfree(iommu->domains);
1762                 iommu->domain_ids = NULL;
1763                 iommu->domains    = NULL;
1764                 return -ENOMEM;
1765         }
1766
1767         /*
1768          * If Caching mode is set, then invalid translations are tagged
1769          * with domain-id 0, hence we need to pre-allocate it. We also
1770          * use domain-id 0 as a marker for non-allocated domain-id, so
1771          * make sure it is not used for a real domain.
1772          */
1773         set_bit(0, iommu->domain_ids);
1774
1775         /*
1776          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1777          * entry for first-level or pass-through translation modes should
1778          * be programmed with a domain id different from those used for
1779          * second-level or nested translation. We reserve a domain id for
1780          * this purpose.
1781          */
1782         if (sm_supported(iommu))
1783                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1784
1785         return 0;
1786 }
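
     /*
      * Illustrative example (assuming cap_ndoms() == 65536): iommu->domains
      * is kept as a two-level table - 65536 / 256 = 256 chunk pointers, each
      * chunk holding 256 struct dmar_domain pointers - and only chunk 0 is
      * allocated eagerly above; the remaining chunks are expected to be
      * allocated on demand as domain-ids are handed out. Domain-id 0 (and
      * FLPT_DEFAULT_DID in scalable mode) is reserved and never given to a
      * real domain.
      */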
1787
1788 static void disable_dmar_iommu(struct intel_iommu *iommu)
1789 {
1790         struct device_domain_info *info, *tmp;
1791         unsigned long flags;
1792
1793         if (!iommu->domains || !iommu->domain_ids)
1794                 return;
1795
1796         spin_lock_irqsave(&device_domain_lock, flags);
1797         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1798                 if (info->iommu != iommu)
1799                         continue;
1800
1801                 if (!info->dev || !info->domain)
1802                         continue;
1803
1804                 __dmar_remove_one_dev_info(info);
1805         }
1806         spin_unlock_irqrestore(&device_domain_lock, flags);
1807
1808         if (iommu->gcmd & DMA_GCMD_TE)
1809                 iommu_disable_translation(iommu);
1810 }
1811
1812 static void free_dmar_iommu(struct intel_iommu *iommu)
1813 {
1814         if ((iommu->domains) && (iommu->domain_ids)) {
1815                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1816                 int i;
1817
1818                 for (i = 0; i < elems; i++)
1819                         kfree(iommu->domains[i]);
1820                 kfree(iommu->domains);
1821                 kfree(iommu->domain_ids);
1822                 iommu->domains = NULL;
1823                 iommu->domain_ids = NULL;
1824         }
1825
1826         g_iommus[iommu->seq_id] = NULL;
1827
1828         /* free context mapping */
1829         free_context_table(iommu);
1830
1831 #ifdef CONFIG_INTEL_IOMMU_SVM
1832         if (pasid_supported(iommu)) {
1833                 if (ecap_prs(iommu->ecap))
1834                         intel_svm_finish_prq(iommu);
1835         }
1836         if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1837                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1838
1839 #endif
1840 }
1841
1842 /*
1843  * Check and return whether first level is used by default for
1844  * DMA translation.
1845  */
1846 static bool first_level_by_default(void)
1847 {
1848         struct dmar_drhd_unit *drhd;
1849         struct intel_iommu *iommu;
1850         static int first_level_support = -1;
1851
1852         if (likely(first_level_support != -1))
1853                 return first_level_support;
1854
1855         first_level_support = 1;
1856
1857         rcu_read_lock();
1858         for_each_active_iommu(iommu, drhd) {
1859                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1860                         first_level_support = 0;
1861                         break;
1862                 }
1863         }
1864         rcu_read_unlock();
1865
1866         return first_level_support;
1867 }
1868
1869 static struct dmar_domain *alloc_domain(int flags)
1870 {
1871         struct dmar_domain *domain;
1872
1873         domain = alloc_domain_mem();
1874         if (!domain)
1875                 return NULL;
1876
1877         memset(domain, 0, sizeof(*domain));
1878         domain->nid = NUMA_NO_NODE;
1879         domain->flags = flags;
1880         if (first_level_by_default())
1881                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1882         domain->has_iotlb_device = false;
1883         INIT_LIST_HEAD(&domain->devices);
1884
1885         return domain;
1886 }
1887
1888 /* Must be called with device_domain_lock and iommu->lock held */
1889 static int domain_attach_iommu(struct dmar_domain *domain,
1890                                struct intel_iommu *iommu)
1891 {
1892         unsigned long ndomains;
1893         int num;
1894
1895         assert_spin_locked(&device_domain_lock);
1896         assert_spin_locked(&iommu->lock);
1897
1898         domain->iommu_refcnt[iommu->seq_id] += 1;
1899         domain->iommu_count += 1;
1900         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1901                 ndomains = cap_ndoms(iommu->cap);
1902                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1903
1904                 if (num >= ndomains) {
1905                         pr_err("%s: No free domain ids\n", iommu->name);
1906                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1907                         domain->iommu_count -= 1;
1908                         return -ENOSPC;
1909                 }
1910
1911                 set_bit(num, iommu->domain_ids);
1912                 set_iommu_domain(iommu, num, domain);
1913
1914                 domain->iommu_did[iommu->seq_id] = num;
1915                 domain->nid                      = iommu->node;
1916
1917                 domain_update_iommu_cap(domain);
1918         }
1919
1920         return 0;
1921 }
1922
1923 static int domain_detach_iommu(struct dmar_domain *domain,
1924                                struct intel_iommu *iommu)
1925 {
1926         int num, count;
1927
1928         assert_spin_locked(&device_domain_lock);
1929         assert_spin_locked(&iommu->lock);
1930
1931         domain->iommu_refcnt[iommu->seq_id] -= 1;
1932         count = --domain->iommu_count;
1933         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1934                 num = domain->iommu_did[iommu->seq_id];
1935                 clear_bit(num, iommu->domain_ids);
1936                 set_iommu_domain(iommu, num, NULL);
1937
1938                 domain_update_iommu_cap(domain);
1939                 domain->iommu_did[iommu->seq_id] = 0;
1940         }
1941
1942         return count;
1943 }
1944
1945 static struct iova_domain reserved_iova_list;
1946 static struct lock_class_key reserved_rbtree_key;
1947
1948 static int dmar_init_reserved_ranges(void)
1949 {
1950         struct pci_dev *pdev = NULL;
1951         struct iova *iova;
1952         int i;
1953
1954         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1955
1956         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1957                 &reserved_rbtree_key);
1958
1959         /* IOAPIC ranges shouldn't be accessed by DMA */
1960         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1961                 IOVA_PFN(IOAPIC_RANGE_END));
1962         if (!iova) {
1963                 pr_err("Reserve IOAPIC range failed\n");
1964                 return -ENODEV;
1965         }
1966
1967         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1968         for_each_pci_dev(pdev) {
1969                 struct resource *r;
1970
1971                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1972                         r = &pdev->resource[i];
1973                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1974                                 continue;
1975                         iova = reserve_iova(&reserved_iova_list,
1976                                             IOVA_PFN(r->start),
1977                                             IOVA_PFN(r->end));
1978                         if (!iova) {
1979                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1980                                 return -ENODEV;
1981                         }
1982                 }
1983         }
1984         return 0;
1985 }
1986
1987 static inline int guestwidth_to_adjustwidth(int gaw)
1988 {
1989         int agaw;
1990         int r = (gaw - 12) % 9;
1991
1992         if (r == 0)
1993                 agaw = gaw;
1994         else
1995                 agaw = gaw + 9 - r;
1996         if (agaw > 64)
1997                 agaw = 64;
1998         return agaw;
1999 }
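
     /*
      * Worked examples: the adjusted width is the guest width rounded up so
      * that (agaw - 12) is a multiple of the 9-bit page-table stride.
      * guestwidth_to_adjustwidth(48) = 48 (already aligned), while
      * guestwidth_to_adjustwidth(40) = 48 because (40 - 12) % 9 = 1 and
      * 40 + 9 - 1 = 48; anything above 64 is clamped to 64.
      */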
2000
2001 static void domain_exit(struct dmar_domain *domain)
2002 {
2003
2004         /* Remove associated devices and clear attached or cached domains */
2005         domain_remove_dev_info(domain);
2006
2007         /* destroy iovas */
2008         if (domain->domain.type == IOMMU_DOMAIN_DMA)
2009                 put_iova_domain(&domain->iovad);
2010
2011         if (domain->pgd) {
2012                 struct page *freelist;
2013
2014                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2015                 dma_free_pagelist(freelist);
2016         }
2017
2018         free_domain_mem(domain);
2019 }
2020
2021 /*
2022  * Get the PASID directory size for a scalable-mode context entry.
2023  * A value of X in the PDTS field of a scalable-mode context entry
2024  * indicates a PASID directory with 2^(X + 7) entries.
2025  */
2026 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2027 {
2028         int pds, max_pde;
2029
2030         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2031         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2032         if (pds < 7)
2033                 return 0;
2034
2035         return pds - 7;
2036 }
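
     /*
      * Worked example (assuming PASID_PDE_SHIFT is 6, i.e. 64 PASIDs per
      * directory entry): for a table with max_pasid = 0x100000 (20-bit
      * PASIDs), max_pde = 0x100000 >> 6 = 0x4000, whose first set bit is
      * bit 14, so pds = 14 - 7 = 7 and the context entry advertises a
      * PASID directory with 2^(7 + 7) = 16384 entries.
      */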
2037
2038 /*
2039  * Set the RID_PASID field of a scalable mode context entry. The
2040  * IOMMU hardware will use the PASID value set in this field for
2041  * DMA translations of DMA requests without PASID.
2042  */
2043 static inline void
2044 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2045 {
2046         context->hi |= pasid & ((1 << 20) - 1);
2047 }
2048
2049 /*
2050  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2051  * entry.
2052  */
2053 static inline void context_set_sm_dte(struct context_entry *context)
2054 {
2055         context->lo |= (1 << 2);
2056 }
2057
2058 /*
2059  * Set the PRE(Page Request Enable) field of a scalable mode context
2060  * entry.
2061  */
2062 static inline void context_set_sm_pre(struct context_entry *context)
2063 {
2064         context->lo |= (1 << 4);
2065 }
2066
2067 /* Convert value to context PASID directory size field coding. */
2068 #define context_pdts(pds)       (((pds) & 0x7) << 9)
2069
2070 static int domain_context_mapping_one(struct dmar_domain *domain,
2071                                       struct intel_iommu *iommu,
2072                                       struct pasid_table *table,
2073                                       u8 bus, u8 devfn)
2074 {
2075         u16 did = domain->iommu_did[iommu->seq_id];
2076         int translation = CONTEXT_TT_MULTI_LEVEL;
2077         struct device_domain_info *info = NULL;
2078         struct context_entry *context;
2079         unsigned long flags;
2080         int ret;
2081
2082         WARN_ON(did == 0);
2083
2084         if (hw_pass_through && domain_type_is_si(domain))
2085                 translation = CONTEXT_TT_PASS_THROUGH;
2086
2087         pr_debug("Set context mapping for %02x:%02x.%d\n",
2088                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2089
2090         BUG_ON(!domain->pgd);
2091
2092         spin_lock_irqsave(&device_domain_lock, flags);
2093         spin_lock(&iommu->lock);
2094
2095         ret = -ENOMEM;
2096         context = iommu_context_addr(iommu, bus, devfn, 1);
2097         if (!context)
2098                 goto out_unlock;
2099
2100         ret = 0;
2101         if (context_present(context))
2102                 goto out_unlock;
2103
2104         /*
2105          * For kdump cases, old valid entries may be cached due to
2106          * in-flight DMA and the copied pgtable, but there is no unmapping
2107          * behaviour for them, so we need an explicit cache flush for the
2108          * newly-mapped device. At this point the device is supposed to
2109          * have finished its reset during driver probe, so no in-flight
2110          * DMA will exist and we don't need to worry about it from here
2111          * on.
2112          */
2113         if (context_copied(context)) {
2114                 u16 did_old = context_domain_id(context);
2115
2116                 if (did_old < cap_ndoms(iommu->cap)) {
2117                         iommu->flush.flush_context(iommu, did_old,
2118                                                    (((u16)bus) << 8) | devfn,
2119                                                    DMA_CCMD_MASK_NOBIT,
2120                                                    DMA_CCMD_DEVICE_INVL);
2121                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2122                                                  DMA_TLB_DSI_FLUSH);
2123                 }
2124         }
2125
2126         context_clear_entry(context);
2127
2128         if (sm_supported(iommu)) {
2129                 unsigned long pds;
2130
2131                 WARN_ON(!table);
2132
2133                 /* Setup the PASID DIR pointer: */
2134                 pds = context_get_sm_pds(table);
2135                 context->lo = (u64)virt_to_phys(table->table) |
2136                                 context_pdts(pds);
2137
2138                 /* Setup the RID_PASID field: */
2139                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2140
2141                 /*
2142                  * Setup the Device-TLB enable bit and Page request
2143                  * Enable bit:
2144                  */
2145                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2146                 if (info && info->ats_supported)
2147                         context_set_sm_dte(context);
2148                 if (info && info->pri_supported)
2149                         context_set_sm_pre(context);
2150         } else {
2151                 struct dma_pte *pgd = domain->pgd;
2152                 int agaw;
2153
2154                 context_set_domain_id(context, did);
2155
2156                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2157                         /*
2158                          * Skip top levels of page tables for an iommu whose
2159                          * agaw is less than the default. Unnecessary for PT mode.
2160                          */
2161                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2162                                 ret = -ENOMEM;
2163                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2164                                 if (!dma_pte_present(pgd))
2165                                         goto out_unlock;
2166                         }
2167
2168                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2169                         if (info && info->ats_supported)
2170                                 translation = CONTEXT_TT_DEV_IOTLB;
2171                         else
2172                                 translation = CONTEXT_TT_MULTI_LEVEL;
2173
2174                         context_set_address_root(context, virt_to_phys(pgd));
2175                         context_set_address_width(context, agaw);
2176                 } else {
2177                         /*
2178                          * In pass through mode, AW must be programmed to
2179                          * indicate the largest AGAW value supported by
2180                          * hardware. And ASR is ignored by hardware.
2181                          */
2182                         context_set_address_width(context, iommu->msagaw);
2183                 }
2184
2185                 context_set_translation_type(context, translation);
2186         }
2187
2188         context_set_fault_enable(context);
2189         context_set_present(context);
2190         if (!ecap_coherent(iommu->ecap))
2191                 clflush_cache_range(context, sizeof(*context));
2192
2193         /*
2194          * It's a non-present to present mapping. If the hardware doesn't cache
2195          * non-present entries we only need to flush the write-buffer. If it
2196          * _does_ cache non-present entries, then it does so in the special
2197          * domain #0, which we have to flush:
2198          */
2199         if (cap_caching_mode(iommu->cap)) {
2200                 iommu->flush.flush_context(iommu, 0,
2201                                            (((u16)bus) << 8) | devfn,
2202                                            DMA_CCMD_MASK_NOBIT,
2203                                            DMA_CCMD_DEVICE_INVL);
2204                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2205         } else {
2206                 iommu_flush_write_buffer(iommu);
2207         }
2208         iommu_enable_dev_iotlb(info);
2209
2210         ret = 0;
2211
2212 out_unlock:
2213         spin_unlock(&iommu->lock);
2214         spin_unlock_irqrestore(&device_domain_lock, flags);
2215
2216         return ret;
2217 }
2218
2219 struct domain_context_mapping_data {
2220         struct dmar_domain *domain;
2221         struct intel_iommu *iommu;
2222         struct pasid_table *table;
2223 };
2224
2225 static int domain_context_mapping_cb(struct pci_dev *pdev,
2226                                      u16 alias, void *opaque)
2227 {
2228         struct domain_context_mapping_data *data = opaque;
2229
2230         return domain_context_mapping_one(data->domain, data->iommu,
2231                                           data->table, PCI_BUS_NUM(alias),
2232                                           alias & 0xff);
2233 }
2234
2235 static int
2236 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2237 {
2238         struct domain_context_mapping_data data;
2239         struct pasid_table *table;
2240         struct intel_iommu *iommu;
2241         u8 bus, devfn;
2242
2243         iommu = device_to_iommu(dev, &bus, &devfn);
2244         if (!iommu)
2245                 return -ENODEV;
2246
2247         table = intel_pasid_get_table(dev);
2248
2249         if (!dev_is_pci(dev))
2250                 return domain_context_mapping_one(domain, iommu, table,
2251                                                   bus, devfn);
2252
2253         data.domain = domain;
2254         data.iommu = iommu;
2255         data.table = table;
2256
2257         return pci_for_each_dma_alias(to_pci_dev(dev),
2258                                       &domain_context_mapping_cb, &data);
2259 }
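
     /*
      * Illustrative example: pci_for_each_dma_alias() makes sure every
      * requester-id the device may appear as gets a context entry. A
      * conventional-PCI device behind a PCIe-to-PCI bridge, for instance,
      * can be seen upstream under the bridge's requester-id, so the
      * callback above runs for the device itself and for that alias too.
      */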
2260
2261 static int domain_context_mapped_cb(struct pci_dev *pdev,
2262                                     u16 alias, void *opaque)
2263 {
2264         struct intel_iommu *iommu = opaque;
2265
2266         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2267 }
2268
2269 static int domain_context_mapped(struct device *dev)
2270 {
2271         struct intel_iommu *iommu;
2272         u8 bus, devfn;
2273
2274         iommu = device_to_iommu(dev, &bus, &devfn);
2275         if (!iommu)
2276                 return -ENODEV;
2277
2278         if (!dev_is_pci(dev))
2279                 return device_context_mapped(iommu, bus, devfn);
2280
2281         return !pci_for_each_dma_alias(to_pci_dev(dev),
2282                                        domain_context_mapped_cb, iommu);
2283 }
2284
2285 /* Returns the number of VT-d pages, but aligned to the MM page size */
2286 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2287                                             size_t size)
2288 {
2289         host_addr &= ~PAGE_MASK;
2290         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2291 }
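
     /*
      * Worked example (assuming 4KiB MM pages, so PAGE_SHIFT ==
      * VTD_PAGE_SHIFT == 12): a buffer starting at page offset 0x800 with
      * size 0x2000 gives PAGE_ALIGN(0x800 + 0x2000) = 0x3000, i.e. 3 VT-d
      * pages, even though the raw length only spans two pages' worth of
      * bytes.
      */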
2292
2293 /* Return largest possible superpage level for a given mapping */
2294 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2295                                           unsigned long iov_pfn,
2296                                           unsigned long phy_pfn,
2297                                           unsigned long pages)
2298 {
2299         int support, level = 1;
2300         unsigned long pfnmerge;
2301
2302         support = domain->iommu_superpage;
2303
2304         /* To use a large page, the virtual *and* physical addresses
2305            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2306            of them will mean we have to use smaller pages. So just
2307            merge them and check both at once. */
2308         pfnmerge = iov_pfn | phy_pfn;
2309
2310         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2311                 pages >>= VTD_STRIDE_SHIFT;
2312                 if (!pages)
2313                         break;
2314                 pfnmerge >>= VTD_STRIDE_SHIFT;
2315                 level++;
2316                 support--;
2317         }
2318         return level;
2319 }
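
     /*
      * Worked example: with domain->iommu_superpage >= 1, an IOVA pfn and a
      * physical pfn that are both 2MiB aligned (low 9 bits clear) and a run
      * of at least 512 pages let the loop above advance once and return
      * level 2, i.e. a 2MiB superpage. Any misalignment in either pfn, or a
      * shorter run, keeps the mapping at level 1 (4KiB pages).
      */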
2320
2321 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2322                             struct scatterlist *sg, unsigned long phys_pfn,
2323                             unsigned long nr_pages, int prot)
2324 {
2325         struct dma_pte *first_pte = NULL, *pte = NULL;
2326         phys_addr_t pteval;
2327         unsigned long sg_res = 0;
2328         unsigned int largepage_lvl = 0;
2329         unsigned long lvl_pages = 0;
2330         u64 attr;
2331
2332         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2333
2334         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2335                 return -EINVAL;
2336
2337         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2338         if (domain_use_first_level(domain))
2339                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2340
2341         if (!sg) {
2342                 sg_res = nr_pages;
2343                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2344         }
2345
2346         while (nr_pages > 0) {
2347                 uint64_t tmp;
2348
2349                 if (!sg_res) {
2350                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2351
2352                         sg_res = aligned_nrpages(sg->offset, sg->length);
2353                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2354                         sg->dma_length = sg->length;
2355                         pteval = (sg_phys(sg) - pgoff) | attr;
2356                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2357                 }
2358
2359                 if (!pte) {
2360                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2361
2362                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2363                         if (!pte)
2364                                 return -ENOMEM;
2365                         /* It is a large page */
2366                         if (largepage_lvl > 1) {
2367                                 unsigned long nr_superpages, end_pfn;
2368
2369                                 pteval |= DMA_PTE_LARGE_PAGE;
2370                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2371
2372                                 nr_superpages = sg_res / lvl_pages;
2373                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2374
2375                                 /*
2376                                  * Ensure that old small page tables are
2377                                  * removed to make room for superpage(s).
2378                                  * We're adding new large pages, so make sure
2379                                  * we don't remove their parent tables.
2380                                  */
2381                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2382                                                        largepage_lvl + 1);
2383                         } else {
2384                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2385                         }
2386
2387                 }
2388                 /* We don't need a lock here; nobody else
2389                  * touches this IOVA range
2390                  */
2391                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2392                 if (tmp) {
2393                         static int dumps = 5;
2394                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2395                                 iov_pfn, tmp, (unsigned long long)pteval);
2396                         if (dumps) {
2397                                 dumps--;
2398                                 debug_dma_dump_mappings(NULL);
2399                         }
2400                         WARN_ON(1);
2401                 }
2402
2403                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2404
2405                 BUG_ON(nr_pages < lvl_pages);
2406                 BUG_ON(sg_res < lvl_pages);
2407
2408                 nr_pages -= lvl_pages;
2409                 iov_pfn += lvl_pages;
2410                 phys_pfn += lvl_pages;
2411                 pteval += lvl_pages * VTD_PAGE_SIZE;
2412                 sg_res -= lvl_pages;
2413
2414                 /* If the next PTE would be the first in a new page, then we
2415                    need to flush the cache on the entries we've just written.
2416                    And then we'll need to recalculate 'pte', so clear it and
2417                    let it get set again in the if (!pte) block above.
2418
2419                    If we're done (!nr_pages) we need to flush the cache too.
2420
2421                    Also if we've been setting superpages, we may need to
2422                    recalculate 'pte' and switch back to smaller pages for the
2423                    end of the mapping, if the trailing size is not enough to
2424                    use another superpage (i.e. sg_res < lvl_pages). */
2425                 pte++;
2426                 if (!nr_pages || first_pte_in_page(pte) ||
2427                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2428                         domain_flush_cache(domain, first_pte,
2429                                            (void *)pte - (void *)first_pte);
2430                         pte = NULL;
2431                 }
2432
2433                 if (!sg_res && nr_pages)
2434                         sg = sg_next(sg);
2435         }
2436         return 0;
2437 }
2438
2439 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2440                           struct scatterlist *sg, unsigned long phys_pfn,
2441                           unsigned long nr_pages, int prot)
2442 {
2443         int iommu_id, ret;
2444         struct intel_iommu *iommu;
2445
2446         /* Do the real mapping first */
2447         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2448         if (ret)
2449                 return ret;
2450
2451         for_each_domain_iommu(iommu_id, domain) {
2452                 iommu = g_iommus[iommu_id];
2453                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2454         }
2455
2456         return 0;
2457 }
2458
2459 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2460                                     struct scatterlist *sg, unsigned long nr_pages,
2461                                     int prot)
2462 {
2463         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2464 }
2465
2466 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2467                                      unsigned long phys_pfn, unsigned long nr_pages,
2468                                      int prot)
2469 {
2470         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2471 }
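
     /*
      * Usage sketch (hypothetical call): callers pick one of the two
      * wrappers above depending on what they have in hand. For a physically
      * contiguous buffer one might do, roughly,
      *
      *     domain_pfn_mapping(domain, iov_pfn, phys_addr >> VTD_PAGE_SHIFT,
      *                        nr_pages, DMA_PTE_READ | DMA_PTE_WRITE);
      *
      * whereas domain_sg_mapping() hands the scatterlist to
      * __domain_mapping(), which derives the physical pfns from the sg
      * entries itself.
      */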
2472
2473 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2474 {
2475         unsigned long flags;
2476         struct context_entry *context;
2477         u16 did_old;
2478
2479         if (!iommu)
2480                 return;
2481
2482         spin_lock_irqsave(&iommu->lock, flags);
2483         context = iommu_context_addr(iommu, bus, devfn, 0);
2484         if (!context) {
2485                 spin_unlock_irqrestore(&iommu->lock, flags);
2486                 return;
2487         }
2488         did_old = context_domain_id(context);
2489         context_clear_entry(context);
2490         __iommu_flush_cache(iommu, context, sizeof(*context));
2491         spin_unlock_irqrestore(&iommu->lock, flags);
2492         iommu->flush.flush_context(iommu,
2493                                    did_old,
2494                                    (((u16)bus) << 8) | devfn,
2495                                    DMA_CCMD_MASK_NOBIT,
2496                                    DMA_CCMD_DEVICE_INVL);
2497         iommu->flush.flush_iotlb(iommu,
2498                                  did_old,
2499                                  0,
2500                                  0,
2501                                  DMA_TLB_DSI_FLUSH);
2502 }
2503
2504 static inline void unlink_domain_info(struct device_domain_info *info)
2505 {
2506         assert_spin_locked(&device_domain_lock);
2507         list_del(&info->link);
2508         list_del(&info->global);
2509         if (info->dev)
2510                 dev_iommu_priv_set(info->dev, NULL);
2511 }
2512
2513 static void domain_remove_dev_info(struct dmar_domain *domain)
2514 {
2515         struct device_domain_info *info, *tmp;
2516         unsigned long flags;
2517
2518         spin_lock_irqsave(&device_domain_lock, flags);
2519         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2520                 __dmar_remove_one_dev_info(info);
2521         spin_unlock_irqrestore(&device_domain_lock, flags);
2522 }
2523
2524 struct dmar_domain *find_domain(struct device *dev)
2525 {
2526         struct device_domain_info *info;
2527
2528         if (unlikely(attach_deferred(dev)))
2529                 return NULL;
2530
2531         /* No lock taken here; we assume no domain exits in the normal case */
2532         info = get_domain_info(dev);
2533         if (likely(info))
2534                 return info->domain;
2535
2536         return NULL;
2537 }
2538
2539 static void do_deferred_attach(struct device *dev)
2540 {
2541         struct iommu_domain *domain;
2542
2543         dev_iommu_priv_set(dev, NULL);
2544         domain = iommu_get_domain_for_dev(dev);
2545         if (domain)
2546                 intel_iommu_attach_device(domain, dev);
2547 }
2548
2549 static inline struct device_domain_info *
2550 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2551 {
2552         struct device_domain_info *info;
2553
2554         list_for_each_entry(info, &device_domain_list, global)
2555                 if (info->segment == segment && info->bus == bus &&
2556                     info->devfn == devfn)
2557                         return info;
2558
2559         return NULL;
2560 }
2561
2562 static int domain_setup_first_level(struct intel_iommu *iommu,
2563                                     struct dmar_domain *domain,
2564                                     struct device *dev,
2565                                     u32 pasid)
2566 {
2567         int flags = PASID_FLAG_SUPERVISOR_MODE;
2568         struct dma_pte *pgd = domain->pgd;
2569         int agaw, level;
2570
2571         /*
2572          * Skip top levels of page tables for an iommu whose
2573          * agaw is less than the default. Unnecessary for PT mode.
2574          */
2575         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2576                 pgd = phys_to_virt(dma_pte_addr(pgd));
2577                 if (!dma_pte_present(pgd))
2578                         return -ENOMEM;
2579         }
2580
2581         level = agaw_to_level(agaw);
2582         if (level != 4 && level != 5)
2583                 return -EINVAL;
2584
2585         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2586
2587         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2588                                              domain->iommu_did[iommu->seq_id],
2589                                              flags);
2590 }
2591
2592 static bool dev_is_real_dma_subdevice(struct device *dev)
2593 {
2594         return dev && dev_is_pci(dev) &&
2595                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2596 }
2597
2598 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2599                                                     int bus, int devfn,
2600                                                     struct device *dev,
2601                                                     struct dmar_domain *domain)
2602 {
2603         struct dmar_domain *found = NULL;
2604         struct device_domain_info *info;
2605         unsigned long flags;
2606         int ret;
2607
2608         info = alloc_devinfo_mem();
2609         if (!info)
2610                 return NULL;
2611
2612         if (!dev_is_real_dma_subdevice(dev)) {
2613                 info->bus = bus;
2614                 info->devfn = devfn;
2615                 info->segment = iommu->segment;
2616         } else {
2617                 struct pci_dev *pdev = to_pci_dev(dev);
2618
2619                 info->bus = pdev->bus->number;
2620                 info->devfn = pdev->devfn;
2621                 info->segment = pci_domain_nr(pdev->bus);
2622         }
2623
2624         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2625         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2626         info->ats_qdep = 0;
2627         info->dev = dev;
2628         info->domain = domain;
2629         info->iommu = iommu;
2630         info->pasid_table = NULL;
2631         info->auxd_enabled = 0;
2632         INIT_LIST_HEAD(&info->auxiliary_domains);
2633
2634         if (dev && dev_is_pci(dev)) {
2635                 struct pci_dev *pdev = to_pci_dev(info->dev);
2636
2637                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2638                     pci_ats_supported(pdev) &&
2639                     dmar_find_matched_atsr_unit(pdev))
2640                         info->ats_supported = 1;
2641
2642                 if (sm_supported(iommu)) {
2643                         if (pasid_supported(iommu)) {
2644                                 int features = pci_pasid_features(pdev);
2645                                 if (features >= 0)
2646                                         info->pasid_supported = features | 1;
2647                         }
2648
2649                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2650                             pci_pri_supported(pdev))
2651                                 info->pri_supported = 1;
2652                 }
2653         }
2654
2655         spin_lock_irqsave(&device_domain_lock, flags);
2656         if (dev)
2657                 found = find_domain(dev);
2658
2659         if (!found) {
2660                 struct device_domain_info *info2;
2661                 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2662                                                        info->devfn);
2663                 if (info2) {
2664                         found      = info2->domain;
2665                         info2->dev = dev;
2666                 }
2667         }
2668
2669         if (found) {
2670                 spin_unlock_irqrestore(&device_domain_lock, flags);
2671                 free_devinfo_mem(info);
2672                 /* Caller must free the original domain */
2673                 return found;
2674         }
2675
2676         spin_lock(&iommu->lock);
2677         ret = domain_attach_iommu(domain, iommu);
2678         spin_unlock(&iommu->lock);
2679
2680         if (ret) {
2681                 spin_unlock_irqrestore(&device_domain_lock, flags);
2682                 free_devinfo_mem(info);
2683                 return NULL;
2684         }
2685
2686         list_add(&info->link, &domain->devices);
2687         list_add(&info->global, &device_domain_list);
2688         if (dev)
2689                 dev_iommu_priv_set(dev, info);
2690         spin_unlock_irqrestore(&device_domain_lock, flags);
2691
2692         /* PASID table is mandatory for a PCI device in scalable mode. */
2693         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2694                 ret = intel_pasid_alloc_table(dev);
2695                 if (ret) {
2696                         dev_err(dev, "PASID table allocation failed\n");
2697                         dmar_remove_one_dev_info(dev);
2698                         return NULL;
2699                 }
2700
2701                 /* Setup the PASID entry for requests without PASID: */
2702                 spin_lock_irqsave(&iommu->lock, flags);
2703                 if (hw_pass_through && domain_type_is_si(domain))
2704                         ret = intel_pasid_setup_pass_through(iommu, domain,
2705                                         dev, PASID_RID2PASID);
2706                 else if (domain_use_first_level(domain))
2707                         ret = domain_setup_first_level(iommu, domain, dev,
2708                                         PASID_RID2PASID);
2709                 else
2710                         ret = intel_pasid_setup_second_level(iommu, domain,
2711                                         dev, PASID_RID2PASID);
2712                 spin_unlock_irqrestore(&iommu->lock, flags);
2713                 if (ret) {
2714                         dev_err(dev, "Setup RID2PASID failed\n");
2715                         dmar_remove_one_dev_info(dev);
2716                         return NULL;
2717                 }
2718         }
2719
2720         if (dev && domain_context_mapping(domain, dev)) {
2721                 dev_err(dev, "Domain context map failed\n");
2722                 dmar_remove_one_dev_info(dev);
2723                 return NULL;
2724         }
2725
2726         return domain;
2727 }
2728
2729 static int iommu_domain_identity_map(struct dmar_domain *domain,
2730                                      unsigned long first_vpfn,
2731                                      unsigned long last_vpfn)
2732 {
2733         /*
2734          * The RMRR range might overlap with the physical memory range,
2735          * so clear it first.
2736          */
2737         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2738
2739         return __domain_mapping(domain, first_vpfn, NULL,
2740                                 first_vpfn, last_vpfn - first_vpfn + 1,
2741                                 DMA_PTE_READ|DMA_PTE_WRITE);
2742 }
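
     /*
      * Illustrative example: "identity map" here means the IOVA equals the
      * physical address - the same pfn is passed as both iov_pfn and
      * phys_pfn above - so mapping pfns 0x1000-0x1fff makes bus addresses
      * 0x1000000-0x1ffffff hit the identical host physical range.
      */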
2743
2744 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2745
2746 static int __init si_domain_init(int hw)
2747 {
2748         struct dmar_rmrr_unit *rmrr;
2749         struct device *dev;
2750         int i, nid, ret;
2751
2752         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2753         if (!si_domain)
2754                 return -EFAULT;
2755
2756         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2757                 domain_exit(si_domain);
2758                 return -EFAULT;
2759         }
2760
2761         if (hw)
2762                 return 0;
2763
2764         for_each_online_node(nid) {
2765                 unsigned long start_pfn, end_pfn;
2766                 int i;
2767
2768                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2769                         ret = iommu_domain_identity_map(si_domain,
2770                                         mm_to_dma_pfn(start_pfn),
2771                                         mm_to_dma_pfn(end_pfn));
2772                         if (ret)
2773                                 return ret;
2774                 }
2775         }
2776
2777         /*
2778          * Identity map the RMRRs so that devices with RMRRs can also use
2779          * the si_domain.
2780          */
2781         for_each_rmrr_units(rmrr) {
2782                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2783                                           i, dev) {
2784                         unsigned long long start = rmrr->base_address;
2785                         unsigned long long end = rmrr->end_address;
2786
2787                         if (WARN_ON(end < start ||
2788                                     end >> agaw_to_width(si_domain->agaw)))
2789                                 continue;
2790
2791                         ret = iommu_domain_identity_map(si_domain,
2792                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2793                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2794                         if (ret)
2795                                 return ret;
2796                 }
2797         }
2798
2799         return 0;
2800 }
2801
2802 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2803 {
2804         struct dmar_domain *ndomain;
2805         struct intel_iommu *iommu;
2806         u8 bus, devfn;
2807
2808         iommu = device_to_iommu(dev, &bus, &devfn);
2809         if (!iommu)
2810                 return -ENODEV;
2811
2812         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2813         if (ndomain != domain)
2814                 return -EBUSY;
2815
2816         return 0;
2817 }
2818
2819 static bool device_has_rmrr(struct device *dev)
2820 {
2821         struct dmar_rmrr_unit *rmrr;
2822         struct device *tmp;
2823         int i;
2824
2825         rcu_read_lock();
2826         for_each_rmrr_units(rmrr) {
2827                 /*
2828                  * Return TRUE if this RMRR contains the device that
2829                  * is passed in.
2830                  */
2831                 for_each_active_dev_scope(rmrr->devices,
2832                                           rmrr->devices_cnt, i, tmp)
2833                         if (tmp == dev ||
2834                             is_downstream_to_pci_bridge(dev, tmp)) {
2835                                 rcu_read_unlock();
2836                                 return true;
2837                         }
2838         }
2839         rcu_read_unlock();
2840         return false;
2841 }
2842
2843 /**
2844  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2845  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2846  * @dev: device handle
2847  *
2848  * We assume that PCI USB devices with RMRRs have them largely
2849  * for historical reasons and that the RMRR space is not actively used post
2850  * boot.  This exclusion may change if vendors begin to abuse it.
2851  *
2852  * The same exception is made for graphics devices, with the requirement that
2853  * any use of the RMRR regions will be torn down before assigning the device
2854  * to a guest.
2855  *
2856  * Return: true if the RMRR is relaxable, false otherwise
2857  */
2858 static bool device_rmrr_is_relaxable(struct device *dev)
2859 {
2860         struct pci_dev *pdev;
2861
2862         if (!dev_is_pci(dev))
2863                 return false;
2864
2865         pdev = to_pci_dev(dev);
2866         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2867                 return true;
2868         else
2869                 return false;
2870 }
2871
2872 /*
2873  * There are a couple of cases where we need to restrict the functionality of
2874  * devices associated with RMRRs.  The first is when evaluating a device for
2875  * identity mapping because problems exist when devices are moved in and out
2876  * of domains and their respective RMRR information is lost.  This means that
2877  * a device with associated RMRRs will never be in a "passthrough" domain.
2878  * The second is use of the device through the IOMMU API.  This interface
2879  * expects to have full control of the IOVA space for the device.  We cannot
2880  * satisfy both the requirement that RMRR access is maintained and have an
2881  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2882  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2883  * We therefore prevent devices associated with an RMRR from participating in
2884  * the IOMMU API, which eliminates them from device assignment.
2885  *
2886  * In both cases, devices which have relaxable RMRRs are not concerned by this
2887  * restriction. See device_rmrr_is_relaxable comment.
2888  */
2889 static bool device_is_rmrr_locked(struct device *dev)
2890 {
2891         if (!device_has_rmrr(dev))
2892                 return false;
2893
2894         if (device_rmrr_is_relaxable(dev))
2895                 return false;
2896
2897         return true;
2898 }
2899
2900 /*
2901  * Return the required default domain type for a specific device.
2902  *
2903  * @dev: the device in question
2905  *
2906  * Returns:
2907  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2908  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2909  *  - 0: both identity and dynamic domains work for this device
2910  */
2911 static int device_def_domain_type(struct device *dev)
2912 {
2913         if (dev_is_pci(dev)) {
2914                 struct pci_dev *pdev = to_pci_dev(dev);
2915
2916                 /*
2917                  * Prevent any device marked as untrusted from getting
2918                  * placed into the static identity mapping domain.
2919                  */
2920                 if (pdev->untrusted)
2921                         return IOMMU_DOMAIN_DMA;
2922
2923                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2924                         return IOMMU_DOMAIN_IDENTITY;
2925
2926                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2927                         return IOMMU_DOMAIN_IDENTITY;
2928         }
2929
2930         return 0;
2931 }
2932
2933 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2934 {
2935         /*
2936          * Start from a sane IOMMU hardware state.
2937          * If queued invalidation was already initialized by us
2938          * (for example, while enabling interrupt remapping), then
2939          * things are already rolling from a sane state.
2940          */
2941         if (!iommu->qi) {
2942                 /*
2943                  * Clear any previous faults.
2944                  */
2945                 dmar_fault(-1, iommu);
2946                 /*
2947                  * Disable queued invalidation if supported and already enabled
2948                  * before OS handover.
2949                  */
2950                 dmar_disable_qi(iommu);
2951         }
2952
2953         if (dmar_enable_qi(iommu)) {
2954                 /*
2955                  * Queued invalidation could not be enabled, use register-based invalidation
2956                  */
2957                 iommu->flush.flush_context = __iommu_flush_context;
2958                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2959                 pr_info("%s: Using Register based invalidation\n",
2960                         iommu->name);
2961         } else {
2962                 iommu->flush.flush_context = qi_flush_context;
2963                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2964                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2965         }
2966 }
2967
2968 static int copy_context_table(struct intel_iommu *iommu,
2969                               struct root_entry *old_re,
2970                               struct context_entry **tbl,
2971                               int bus, bool ext)
2972 {
2973         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2974         struct context_entry *new_ce = NULL, ce;
2975         struct context_entry *old_ce = NULL;
2976         struct root_entry re;
2977         phys_addr_t old_ce_phys;
2978
2979         tbl_idx = ext ? bus * 2 : bus;
2980         memcpy(&re, old_re, sizeof(re));
2981
2982         for (devfn = 0; devfn < 256; devfn++) {
2983                 /* First calculate the correct index */
2984                 idx = (ext ? devfn * 2 : devfn) % 256;
2985
2986                 if (idx == 0) {
2987                         /* First save what we may have and clean up */
2988                         if (new_ce) {
2989                                 tbl[tbl_idx] = new_ce;
2990                                 __iommu_flush_cache(iommu, new_ce,
2991                                                     VTD_PAGE_SIZE);
2992                                 pos = 1;
2993                         }
2994
2995                         if (old_ce)
2996                                 memunmap(old_ce);
2997
2998                         ret = 0;
2999                         if (devfn < 0x80)
3000                                 old_ce_phys = root_entry_lctp(&re);
3001                         else
3002                                 old_ce_phys = root_entry_uctp(&re);
3003
3004                         if (!old_ce_phys) {
3005                                 if (ext && devfn == 0) {
3006                                         /* No LCTP, try UCTP */
3007                                         devfn = 0x7f;
3008                                         continue;
3009                                 } else {
3010                                         goto out;
3011                                 }
3012                         }
3013
3014                         ret = -ENOMEM;
3015                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3016                                         MEMREMAP_WB);
3017                         if (!old_ce)
3018                                 goto out;
3019
3020                         new_ce = alloc_pgtable_page(iommu->node);
3021                         if (!new_ce)
3022                                 goto out_unmap;
3023
3024                         ret = 0;
3025                 }
3026
3027                 /* Now copy the context entry */
3028                 memcpy(&ce, old_ce + idx, sizeof(ce));
3029
3030                 if (!__context_present(&ce))
3031                         continue;
3032
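                     /*
                      * Reserve the domain-id the old kernel used so that this
                      * kernel does not hand it out again while the copied
                      * context entry still references it.
                      */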
3033                 did = context_domain_id(&ce);
3034                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3035                         set_bit(did, iommu->domain_ids);
3036
3037                 /*
3038                  * We need a marker for copied context entries. This
3039                  * marker needs to work for the old format as well as
3040                  * for extended context entries.
3041                  *
3042                  * Bit 67 of the context entry is used. In the old
3043                  * format this bit is available to software, in the
3044                  * extended format it is the PGE bit, but PGE is ignored
3045                  * by HW if PASIDs are disabled (and thus still
3046                  * available).
3047                  *
3048                  * So disable PASIDs first and then mark the entry
3049                  * copied. This means that we don't copy PASID
3050                  * translations from the old kernel, but this is fine as
3051                  * faults there are not fatal.
3052                  */
3053                 context_clear_pasid_enable(&ce);
3054                 context_set_copied(&ce);
3055
3056                 new_ce[idx] = ce;
3057         }
3058
3059         tbl[tbl_idx + pos] = new_ce;
3060
3061         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3062
3063 out_unmap:
3064         memunmap(old_ce);
3065
3066 out:
3067         return ret;
3068 }
3069
3070 static int copy_translation_tables(struct intel_iommu *iommu)
3071 {
3072         struct context_entry **ctxt_tbls;
3073         struct root_entry *old_rt;
3074         phys_addr_t old_rt_phys;
3075         int ctxt_table_entries;
3076         unsigned long flags;
3077         u64 rtaddr_reg;
3078         int bus, ret;
3079         bool new_ext, ext;
3080
3081         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3082         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3083         new_ext    = !!ecap_ecs(iommu->ecap);
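             /*
              * ext reflects how the previous kernel programmed the hardware
              * (the RTT bit in the root-table address register), while
              * new_ext is what this kernel would use based on the ECS
              * capability.
              */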
3084
3085         /*
3086          * The RTT bit can only be changed when translation is disabled,
3087          * but disabling translation would open a window for data
3088          * corruption. So bail out and don't copy anything if we would
3089          * have to change the bit.
3090          */
3091         if (new_ext != ext)
3092                 return -EINVAL;
3093
3094         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3095         if (!old_rt_phys)
3096                 return -EINVAL;
3097
3098         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3099         if (!old_rt)
3100                 return -ENOMEM;
3101
3102         /* This is too big for the stack - allocate it from slab */
3103         ctxt_table_entries = ext ? 512 : 256;
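             /* 256 buses, and in extended mode two context tables per bus. */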
3104         ret = -ENOMEM;
3105         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3106         if (!ctxt_tbls)
3107                 goto out_unmap;
3108
3109         for (bus = 0; bus < 256; bus++) {
3110                 ret = copy_context_table(iommu, &old_rt[bus],
3111                                          ctxt_tbls, bus, ext);
3112                 if (ret) {
3113                         pr_err("%s: Failed to copy context table for bus %d\n",
3114                                 iommu->name, bus);
3115                         continue;
3116                 }
3117         }
3118
3119         spin_lock_irqsave(&iommu->lock, flags);
3120
3121         /* Context tables are copied, now write them to the root_entry table */
3122         for (bus = 0; bus < 256; bus++) {
3123                 int idx = ext ? bus * 2 : bus;
3124                 u64 val;
3125
3126                 if (ctxt_tbls[idx]) {
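                             /* Bit 0 marks the context-table pointer present. */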
3127                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3128                         iommu->root_entry[bus].lo = val;
3129                 }
3130
3131                 if (!ext || !ctxt_tbls[idx + 1])
3132                         continue;
3133
3134                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3135                 iommu->root_entry[bus].hi = val;
3136         }
3137
3138         spin_unlock_irqrestore(&iommu->lock, flags);
3139
3140         kfree(ctxt_tbls);
3141
3142         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3143
3144         ret = 0;
3145
3146 out_unmap:
3147         memunmap(old_rt);
3148
3149         return ret;
3150 }
3151
3152 #ifdef CONFIG_INTEL_IOMMU_SVM
3153 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3154 {
3155         struct intel_iommu *iommu = data;
3156         ioasid_t ioasid;
3157
3158         if (!iommu)
3159                 return INVALID_IOASID;
3160         /*
3161          * The VT-d virtual command interface always uses the full 20-bit
3162          * PASID range. The host can partition the guest PASID range based
3163          * on policies, but that is out of the guest's control.
3164          */
3165         if (min < PASID_MIN || max > intel_pasid_max_id)
3166                 return INVALID_IOASID;
3167
3168         if (vcmd_alloc_pasid(iommu, &ioasid))
3169                 return INVALID_IOASID;
3170
3171         return ioasid;
3172 }
3173
3174 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3175 {
3176         struct intel_iommu *iommu = data;
3177
3178         if (!iommu)
3179                 return;
3180         /*
3181          * Sanity checking of the IOASID owner is done at the upper layer,
3182          * e.g. VFIO. We can only free the PASID when all devices are unbound.
3183          */
3184         if (ioasid_find(NULL, ioasid, NULL)) {
3185                 pr_alert("Cannot free active IOASID %d\n", ioasid);
3186                 return;
3187         }
3188         vcmd_free_pasid(iommu, ioasid);
3189 }
3190
3191 static void register_pasid_allocator(struct intel_iommu *iommu)
3192 {
3193         /*
3194          * If we are running in the host, there is no need for a custom
3195          * allocator since PASIDs are allocated system-wide by the host.
3196          */
3197         if (!cap_caching_mode(iommu->cap))
3198                 return;
3199
3200         if (!sm_supported(iommu)) {
3201                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3202                 return;
3203         }
3204
3205         /*
3206          * Register a custom PASID allocator if we are running in a guest;
3207          * guest PASIDs must be obtained via the virtual command interface.
3208          * There can be multiple vIOMMUs in each guest but only one
3209          * allocator is active. All vIOMMU allocators will eventually call
3210          * the same host allocator.
3211          */
3212         if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3213                 return;
3214
3215         pr_info("Register custom PASID allocator\n");
3216         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3217         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3218         iommu->pasid_allocator.pdata = (void *)iommu;
3219         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3220                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3221                 /*
3222                  * Disable scalable mode on this IOMMU if there is no
3223                  * custom allocator. Mixing SM-capable and non-SM vIOMMUs
3224                  * is not supported.
3225                  */
3226                 intel_iommu_sm = 0;
3227         }
3228 }
3229 #endif
3230
3231 static int __init init_dmars(void)
3232 {
3233         struct dmar_drhd_unit *drhd;
3234         struct intel_iommu *iommu;
3235         int ret;
3236
3237         /*
3238          * for each drhd
3239          *    allocate root
3240          *    initialize and program root entry to not present
3241          * endfor
3242          */
3243         for_each_drhd_unit(drhd) {
3244                 /*
3245                  * No lock needed as this is only incremented in the
3246                  * single-threaded kernel __init code path; all other
3247                  * accesses are read-only.
3248                  */
3249                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3250                         g_num_of_iommus++;
3251                         continue;
3252                 }
3253                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3254         }
3255
3256         /* Preallocate enough resources for IOMMU hot-addition */
3257         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3258                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3259
3260         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3261                         GFP_KERNEL);
3262         if (!g_iommus) {
3263                 pr_err("Allocating global iommu array failed\n");
3264                 ret = -ENOMEM;
3265                 goto error;
3266         }
3267
3268         for_each_iommu(iommu, drhd) {
3269                 if (drhd->ignored) {
3270                         iommu_disable_translation(iommu);
3271                         continue;
3272                 }
3273
3274                 /*
3275                  * Find the max PASID size supported by each IOMMU in the
3276                  * system. We need to ensure the system PASID table is no
3277                  * bigger than the smallest supported size.
3278                  */
3279                 if (pasid_supported(iommu)) {
3280                         u32 temp = 2 << ecap_pss(iommu->ecap);
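                             /*
                              * ecap_pss() reports the supported PASID width
                              * minus one, so e.g. PSS = 19 means 2 << 19 =
                              * 1 << 20 PASIDs, i.e. the full 20-bit space.
                              */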
3281
3282                         intel_pasid_max_id = min_t(u32, temp,
3283                                                    intel_pasid_max_id);
3284                 }
3285
3286                 g_iommus[iommu->seq_id] = iommu;
3287
3288                 intel_iommu_init_qi(iommu);
3289
3290                 ret = iommu_init_domains(iommu);
3291                 if (ret)
3292                         goto free_iommu;
3293
3294                 init_translation_status(iommu);
3295
3296                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3297                         iommu_disable_translation(iommu);
3298                         clear_translation_pre_enabled(iommu);
3299                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3300                                 iommu->name);
3301                 }
3302
3303                 /*
3304                  * TBD:
3305                  * We could share the same root & context tables
3306                  * among all IOMMUs. Need to split this out later.
3307                  */
3308                 ret = iommu_alloc_root_entry(iommu);
3309                 if (ret)
3310                         goto free_iommu;
3311
3312                 if (translation_pre_enabled(iommu)) {
3313                         pr_info("Translation already enabled - trying to copy translation structures\n");
3314
3315                         ret = copy_translation_tables(iommu);
3316                         if (ret) {
3317                                 /*
3318                                  * We found the IOMMU with translation
3319                                  * enabled - but failed to copy over the
3320                                  * old root-entry table. Try to proceed
3321                                  * by disabling translation now and
3322                                  * allocating a clean root-entry table.
3323                                  * This might cause DMAR faults, but
3324                                  * probably the dump will still succeed.
3325                                  */
3326                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3327                                        iommu->name);
3328                                 iommu_disable_translation(iommu);
3329                                 clear_translation_pre_enabled(iommu);
3330                         } else {
3331                                 pr_info("Copied translation tables from previous kernel for %s\n",
3332                                         iommu->name);
3333                         }
3334                 }
3335
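                     /*
                      * hw_pass_through stays set only if every IOMMU in the
                      * system supports pass-through translation.
                      */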
3336                 if (!ecap_pass_through(iommu->ecap))
3337                         hw_pass_through = 0;
3338                 intel_svm_check(iommu);
3339         }
3340
3341         /*
3342          * Now that qi is enabled on all iommus, set the root entry and flush
3343          * caches. This is required on some Intel X58 chipsets, otherwise the
3344          * flush_context function will loop forever and the boot hangs.
3345          */
3346         for_each_active_iommu(iommu, drhd) {
3347                 iommu_flush_write_buffer(iommu);
3348 #ifdef CONFIG_INTEL_IOMMU_SVM
3349                 register_pasid_allocator(iommu);
3350 #endif
3351                 iommu_set_root_entry(iommu);
3352                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3353                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3354         }
3355
3356 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3357         dmar_map_gfx = 0;
3358 #endif
3359
3360         if (!dmar_map_gfx)
3361                 iommu_identity_mapping |= IDENTMAP_GFX;
3362
3363         check_tylersburg_isoch();
3364
3365         ret = si_domain_init(hw_pass_through);
3366         if (ret)
3367                 goto free_iommu;
3368
3369         /*
3370          * for each drhd
3371          *   enable fault log
3372          *   global invalidate context cache
3373          *   global invalidate iotlb
3374          *   enable translation
3375          */
3376         for_each_iommu(iommu, drhd) {
3377                 if (drhd->ignored) {
3378                         /*
3379                          * we always have to disable PMRs or DMA may fail on
3380                          * this device
3381                          */
3382                         if (force_on)
3383                                 iommu_disable_protect_mem_regions(iommu);
3384                         continue;
3385                 }
3386
3387                 iommu_flush_write_buffer(iommu);
3388
3389 #ifdef CONFIG_INTEL_IOMMU_SVM
3390                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3391                         /*
3392                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3393                          * could cause a lock race condition; drop the lock here.
3394                          */
3395                         up_write(&dmar_global_lock);
3396                         ret = intel_svm_enable_prq(iommu);
3397                         down_write(&dmar_global_lock);
3398                         if (ret)
3399                                 goto free_iommu;
3400                 }
3401 #endif
3402                 ret = dmar_set_interrupt(iommu);
3403                 if (ret)
3404                         goto free_iommu;
3405         }
3406
3407         return 0;
3408
3409 free_iommu:
3410         for_each_active_iommu(iommu, drhd) {
3411                 disable_dmar_iommu(iommu);
3412                 free_dmar_iommu(iommu);
3413         }
3414
3415         kfree(g_iommus);
3416
3417 error:
3418         return ret;
3419 }
3420
3421 /* This takes a number of _MM_ pages, not VTD pages */
3422 static unsigned long intel_alloc_iova(struct device *dev,
3423                                      struct dmar_domain *domain,
3424                                      unsigned long nrpages, uint64_t dma_mask)
3425 {
3426         unsigned long iova_pfn;
3427
3428         /*
3429          * Restrict dma_mask to the width that the iommu can handle.
3430          * First-level translation restricts the input-address to a
3431          * canonical address (i.e., address bits 63:N have the same
3432          * value as address bit [N-1], where N is 48-bits with 4-level
3433          * paging and 57-bits with 5-level paging). Hence, skip bit
3434          * [N-1].
3435          */
3436         if (domain_use_first_level(domain))
3437                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3438                                  dma_mask);
3439         else
3440                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3441                                  dma_mask);
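             /*
              * E.g. with 4-level first-level paging (gaw == 48) the mask is
              * capped at DOMAIN_MAX_ADDR(47), keeping bit 47 clear so the
              * IOVA stays a canonical address.
              */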
3442
3443         /* Ensure we reserve the whole size-aligned region */
3444         nrpages = __roundup_pow_of_two(nrpages);
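             /* E.g. a 5-page request is rounded up to 8 pages here. */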
3445
3446         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3447                 /*
3448                  * First try to allocate an IO virtual address within
3449                  * DMA_BIT_MASK(32); if that fails, try allocating from
3450                  * the higher range.
3451                  */
3452                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3453                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3454                 if (iova_pfn)
3455                         return iova_pfn;
3456         }
3457         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3458                                    IOVA_PFN(dma_mask), true);
3459         if (unlikely(!iova_pfn)) {
3460                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3461                              nrpages);
3462                 return 0;
3463         }
3464
3465         return iova_pfn;
3466 }
3467
3468 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3469                                      size_t size, int dir, u64 dma_mask)
3470 {
3471         struct dmar_domain *domain;
3472         phys_addr_t start_paddr;
3473         unsigned long iova_pfn;
3474         int prot = 0;
3475         int ret;
3476         struct intel_iommu *iommu;
3477         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3478
3479         BUG_ON(dir == DMA_NONE);
3480
3481         if (unlikely(attach_deferred(dev)))
3482                 do_deferred_attach(dev);
3483
3484         domain = find_domain(dev);
3485         if (!domain)
3486                 return DMA_MAPPING_ERROR;
3487
3488         iommu = domain_get_iommu(domain);
3489         size = aligned_nrpages(paddr, size);
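             /*
              * aligned_nrpages() returns the length in VTD pages;
              * intel_alloc_iova() below expects MM pages, hence the
              * dma_to_mm_pfn() conversion.
              */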
3490
3491         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3492         if (!iova_pfn)
3493                 goto error;
3494
3495         /*
3496          * Check if DMAR supports zero-length reads on write-only
3497          * mappings.
3498          */
3499         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3500                         !cap_zlr(iommu->cap))
3501                 prot |= DMA_PTE_READ;
3502         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3503                 prot |= DMA_PTE_WRITE;
3504         /*
3505          * paddr ~ (paddr + size) might span a partial page, so we should
3506          * map the whole page.  Note: if two parts of one page are mapped
3507          * separately, we might have two guest_addr values mapping to the
3508          * same host paddr, but this is not a big problem.
3509          */
3510         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3511                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3512         if (ret)
3513                 goto error;
3514
3515         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3516         start_paddr += paddr & ~PAGE_MASK;
3517
3518         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3519
3520         return start_paddr;
3521
3522 error:
3523         if (iova_pfn)
3524                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3525         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3526                 size, (unsigned long long)paddr, dir);
3527         return DMA_MAPPING_ERROR;
3528 }
3529
3530 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3531                                  unsigned long offset, size_t size,
3532                                  enum dma_data_direction dir,
3533                                  unsigned long attrs)
3534 {
3535         return __intel_map_single(dev, page_to_phys(page) + offset,
3536                                   size, dir, *dev->dma_mask);
3537 }
3538
3539 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3540                                      size_t size, enum dma_data_direction dir,
3541                                      unsigned long attrs)
3542 {
3543         return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3544 }
3545
3546 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3547 {
3548         struct dmar_domain *domain;
3549         unsigned long start_pfn, last_pfn;
3550         unsigned long nrpages;
3551         unsigned long iova_pfn;
3552         struct intel_iommu *iommu;
3553         struct page *freelist;
3554         struct pci_dev *pdev = NULL;
3555
3556         domain = find_domain(dev);
3557         BUG_ON(!domain);
3558
3559         iommu = domain_get_iommu(domain);
3560
3561         iova_pfn = IOVA_PFN(dev_addr);
3562
3563         nrpages = aligned_nrpages(dev_addr, size);
3564         start_pfn = mm_to_dma_pfn(iova_pfn);
3565         last_pfn = start_pfn + nrpages - 1;
3566
3567         if (dev_is_pci(dev))
3568                 pdev = to_pci_dev(dev);
3569
3570         freelist = domain_unmap(domain, start_pfn, last_pfn);
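             /*
              * Flush synchronously for strict mode, untrusted devices and
              * domains without a flush queue; otherwise defer the IOTLB
              * flush and page freeing to the IOVA flush queue.
              */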
3571         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3572                         !has_iova_flush_queue(&domain->iovad)) {
3573                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3574                                       nrpages, !freelist, 0);
3575                 /* free iova */
3576                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3577                 dma_free_pagelist(freelist);
3578         } else {
3579                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3580                            (unsigned long)freelist);
3581                 /*
3582                  * Queue up the release of the unmap to save the ~1/6th of
3583                  * the CPU time used up by the IOTLB flush operation...
3584                  */
3585         }
3586
3587         trace_unmap_single(dev, dev_addr, size);
3588 }
3589
3590 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3591                              size_t size, enum dma_data_direction dir,
3592                              unsigned long attrs)
3593 {
3594         intel_unmap(dev, dev_addr, size);
3595 }
3596
3597 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3598                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3599 {
3600         intel_unmap(dev, dev_addr, size);
3601 }
3602
3603 static void *intel_alloc_coherent(struct device *dev, size_t size,
3604                                   dma_addr_t *dma_handle, gfp_t flags,
3605                                   unsigned long attrs)
3606 {
3607         struct page *page = NULL;
3608         int order;
3609
3610         if (unlikely(attach_deferred(dev)))
3611                 do_deferred_attach(dev);
3612
3613         size = PAGE_ALIGN(size);
3614         order = get_order(size);
3615
3616         if (gfpflags_allow_blocking(flags)) {
3617                 unsigned int count = size >> PAGE_SHIFT;
3618
3619                 page = dma_alloc_from_contiguous(dev, count, order,
3620                                                  flags & __GFP_NOWARN);
3621         }
3622
3623         if (!page)
3624                 page = alloc_pages(flags, order);
3625         if (!page)
3626                 return NULL;
3627         memset(page_address(page), 0, size);
3628
3629         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3630                                          DMA_BIDIRECTIONAL,
3631                                          dev->coherent_dma_mask);
3632         if (*dma_handle != DMA_MAPPING_ERROR)
3633                 return page_address(page);
3634         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3635                 __free_pages(page, order);
3636
3637         return NULL;
3638 }
3639
3640 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3641                                 dma_addr_t dma_handle, unsigned long attrs)
3642 {
3643         int order;
3644         struct page *page = virt_to_page(vaddr);
3645
3646         size = PAGE_ALIGN(size);
3647         order = get_order(size);
3648
3649         intel_unmap(dev, dma_handle, size);
3650         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3651                 __free_pages(page, order);
3652 }
3653
3654 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3655                            int nelems, enum dma_data_direction dir,
3656                            unsigned long attrs)
3657 {
3658         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3659         unsigned long nrpages = 0;
3660         struct scatterlist *sg;
3661         int i;
3662
3663         for_each_sg(sglist, sg, nelems, i) {
3664                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3665         }
3666
3667         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3668
3669         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3670 }
3671
3672 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3673                         enum dma_data_direction dir, unsigned long attrs)
3674 {
3675         int i;
3676         struct dmar_domain *domain;
3677         size_t size = 0;
3678         int prot = 0;
3679         unsigned long iova_pfn;
3680         int ret;
3681         struct scatterlist *sg;
3682         unsigned long start_vpfn;
3683         struct intel_iommu *iommu;
3684
3685         BUG_ON(dir == DMA_NONE);
3686
3687         if (unlikely(attach_deferred(dev)))
3688                 do_deferred_attach(dev);
3689
3690         domain = find_domain(dev);
3691         if (!domain)
3692                 return 0;
3693
3694         iommu = domain_get_iommu(domain);
3695
3696         for_each_sg(sglist, sg, nelems, i)
3697                 size += aligned_nrpages(sg->offset, sg->length);
3698
3699         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3700                                 *dev->dma_mask);
3701         if (!iova_pfn) {
3702                 sglist->dma_length = 0;
3703                 return 0;
3704         }
3705
3706         /*
3707          * Check if DMAR supports zero-length reads on write-only
3708          * mappings.
3709          */
3710         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3711                         !cap_zlr(iommu->cap))
3712                 prot |= DMA_PTE_READ;
3713         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3714                 prot |= DMA_PTE_WRITE;
3715
3716         start_vpfn = mm_to_dma_pfn(iova_pfn);
3717
3718         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3719         if (unlikely(ret)) {
3720                 dma_pte_free_pagetable(domain, start_vpfn,
3721                                        start_vpfn + size - 1,
3722                                        agaw_to_level(domain->agaw) + 1);
3723                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3724                 return 0;
3725         }
3726
3727         for_each_sg(sglist, sg, nelems, i)
3728                 trace_map_sg(dev, i + 1, nelems, sg);
3729
3730         return nelems;
3731 }
3732
3733 static u64 intel_get_required_mask(struct device *dev)
3734 {
3735         return DMA_BIT_MASK(32);
3736 }
3737
3738 static const struct dma_map_ops intel_dma_ops = {
3739         .alloc = intel_alloc_coherent,
3740         .free = intel_free_coherent,
3741         .map_sg = intel_map_sg,
3742         .unmap_sg = intel_unmap_sg,
3743         .map_page = intel_map_page,
3744         .unmap_page = intel_unmap_page,
3745         .map_resource = intel_map_resource,
3746         .unmap_resource = intel_unmap_resource,
3747         .dma_supported = dma_direct_supported,
3748         .mmap = dma_common_mmap,
3749         .get_sgtable = dma_common_get_sgtable,
3750         .get_required_mask = intel_get_required_mask,
3751 };
3752
3753 static void
3754 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3755                    enum dma_data_direction dir, enum dma_sync_target target)
3756 {
3757         struct dmar_domain *domain;
3758         phys_addr_t tlb_addr;
3759
3760         domain = find_domain(dev);
3761         if (WARN_ON(!domain))
3762                 return;
3763
3764         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3765         if (is_swiotlb_buffer(tlb_addr))
3766                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3767 }
3768
3769 static dma_addr_t
3770 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3771                   enum dma_data_direction dir, unsigned long attrs,
3772                   u64 dma_mask)
3773 {
3774         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3775         struct dmar_domain *domain;
3776         struct intel_iommu *iommu;
3777         unsigned long iova_pfn;
3778         unsigned long nrpages;
3779         phys_addr_t tlb_addr;
3780         int prot = 0;
3781         int ret;
3782
3783         if (unlikely(attach_deferred(dev)))
3784                 do_deferred_attach(dev);
3785
3786         domain = find_domain(dev);
3787
3788         if (WARN_ON(dir == DMA_NONE || !domain))
3789                 return DMA_MAPPING_ERROR;
3790
3791         iommu = domain_get_iommu(domain);
3792         if (WARN_ON(!iommu))
3793                 return DMA_MAPPING_ERROR;
3794
3795         nrpages = aligned_nrpages(0, size);
3796         iova_pfn = intel_alloc_iova(dev, domain,
3797                                     dma_to_mm_pfn(nrpages), dma_mask);
3798         if (!iova_pfn)
3799                 return DMA_MAPPING_ERROR;
3800
3801         /*
3802          * Check if DMAR supports zero-length reads on write-only
3803          * mappings.
3804          */
3805         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3806                         !cap_zlr(iommu->cap))
3807                 prot |= DMA_PTE_READ;
3808         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3809                 prot |= DMA_PTE_WRITE;
3810
3811         /*
3812          * If both the physical buffer start address and size are
3813          * page aligned, we don't need to use a bounce page.
3814          */
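             /* (paddr | size) is page-aligned only if both paddr and size are. */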
3815         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3816                 tlb_addr = swiotlb_tbl_map_single(dev,
3817                                 __phys_to_dma(dev, io_tlb_start),
3818                                 paddr, size, aligned_size, dir, attrs);
3819                 if (tlb_addr == DMA_MAPPING_ERROR) {
3820                         goto swiotlb_error;
3821                 } else {
3822                         /* Cleanup the padding area. */
3823                         void *padding_start = phys_to_virt(tlb_addr);
3824                         size_t padding_size = aligned_size;
3825
3826                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3827                             (dir == DMA_TO_DEVICE ||
3828                              dir == DMA_BIDIRECTIONAL)) {
3829                                 padding_start += size;
3830                                 padding_size -= size;
3831                         }
3832
3833                         memset(padding_start, 0, padding_size);
3834                 }
3835         } else {
3836                 tlb_addr = paddr;
3837         }
3838
3839         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3840                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3841         if (ret)
3842                 goto mapping_error;
3843
3844         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3845
3846         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3847
3848 mapping_error:
3849         if (is_swiotlb_buffer(tlb_addr))
3850                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3851                                          aligned_size, dir, attrs);
3852 swiotlb_error:
3853         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3854         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3855                 size, (unsigned long long)paddr, dir);
3856
3857         return DMA_MAPPING_ERROR;
3858 }
3859
3860 static void
3861 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3862                     enum dma_data_direction dir, unsigned long attrs)
3863 {
3864         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3865         struct dmar_domain *domain;
3866         phys_addr_t tlb_addr;
3867
3868         domain = find_domain(dev);
3869         if (WARN_ON(!domain))
3870                 return;
3871
3872         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3873         if (WARN_ON(!tlb_addr))
3874                 return;
3875
3876         intel_unmap(dev, dev_addr, size);
3877         if (is_swiotlb_buffer(tlb_addr))
3878                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3879                                          aligned_size, dir, attrs);
3880
3881         trace_bounce_unmap_single(dev, dev_addr, size);
3882 }
3883
3884 static dma_addr_t
3885 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3886                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3887 {
3888         return bounce_map_single(dev, page_to_phys(page) + offset,
3889                                  size, dir, attrs, *dev->dma_mask);
3890 }
3891
3892 static dma_addr_t
3893 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3894                     enum dma_data_direction dir, unsigned long attrs)
3895 {
3896         return bounce_map_single(dev, phys_addr, size,
3897                                  dir, attrs, *dev->dma_mask);
3898 }
3899
3900 static void
3901 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3902                   enum dma_data_direction dir, unsigned long attrs)
3903 {
3904         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3905 }
3906
3907 static void
3908 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3909                       enum dma_data_direction dir, unsigned long attrs)
3910 {
3911         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3912 }
3913
3914 static void
3915 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3916                 enum dma_data_direction dir, unsigned long attrs)
3917 {
3918         struct scatterlist *sg;
3919         int i;
3920
3921         for_each_sg(sglist, sg, nelems, i)
3922                 bounce_unmap_page(dev, sg->dma_address,
3923                                   sg_dma_len(sg), dir, attrs);
3924 }
3925
3926 static int
3927 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3928               enum dma_data_direction dir, unsigned long attrs)
3929 {
3930         int i;
3931         struct scatterlist *sg;
3932
3933         for_each_sg(sglist, sg, nelems, i) {
3934                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3935                                                   sg->offset, sg->length,
3936                                                   dir, attrs);
3937                 if (sg->dma_address == DMA_MAPPING_ERROR)
3938                         goto out_unmap;
3939                 sg_dma_len(sg) = sg->length;
3940         }
3941
3942         for_each_sg(sglist, sg, nelems, i)
3943                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3944
3945         return nelems;
3946
3947 out_unmap:
3948         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3949         return 0;
3950 }
3951
3952 static void
3953 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3954                            size_t size, enum dma_data_direction dir)
3955 {
3956         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3957 }
3958
3959 static void
3960 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3961                               size_t size, enum dma_data_direction dir)
3962 {
3963         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3964 }
3965
3966 static void
3967 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3968                        int nelems, enum dma_data_direction dir)
3969 {
3970         struct scatterlist *sg;
3971         int i;
3972
3973         for_each_sg(sglist, sg, nelems, i)
3974                 bounce_sync_single(dev, sg_dma_address(sg),
3975                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
3976 }
3977
3978 static void
3979 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3980                           int nelems, enum dma_data_direction dir)
3981 {
3982         struct scatterlist *sg;
3983         int i;
3984
3985         for_each_sg(sglist, sg, nelems, i)
3986                 bounce_sync_single(dev, sg_dma_address(sg),
3987                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3988 }
3989
3990 static const struct dma_map_ops bounce_dma_ops = {
3991         .alloc                  = intel_alloc_coherent,
3992         .free                   = intel_free_coherent,
3993         .map_sg                 = bounce_map_sg,
3994         .unmap_sg               = bounce_unmap_sg,
3995         .map_page               = bounce_map_page,
3996         .unmap_page             = bounce_unmap_page,
3997         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
3998         .sync_single_for_device = bounce_sync_single_for_device,
3999         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4000         .sync_sg_for_device     = bounce_sync_sg_for_device,
4001         .map_resource           = bounce_map_resource,
4002         .unmap_resource         = bounce_unmap_resource,
4003         .dma_supported          = dma_direct_supported,
4004 };
4005
4006 static inline int iommu_domain_cache_init(void)
4007 {
4008         int ret = 0;
4009
4010         iommu_domain_cache = kmem_cache_create("iommu_domain",
4011                                          sizeof(struct dmar_domain),
4012                                          0,
4013                                          SLAB_HWCACHE_ALIGN,
4015                                          NULL);
4016         if (!iommu_domain_cache) {
4017                 pr_err("Couldn't create iommu_domain cache\n");
4018                 ret = -ENOMEM;
4019         }
4020
4021         return ret;
4022 }
4023
4024 static inline int iommu_devinfo_cache_init(void)
4025 {
4026         int ret = 0;
4027
4028         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4029                                          sizeof(struct device_domain_info),
4030                                          0,
4031                                          SLAB_HWCACHE_ALIGN,
4032                                          NULL);
4033         if (!iommu_devinfo_cache) {
4034                 pr_err("Couldn't create devinfo cache\n");
4035                 ret = -ENOMEM;
4036         }
4037
4038         return ret;
4039 }
4040
4041 static int __init iommu_init_mempool(void)
4042 {
4043         int ret;
4044         ret = iova_cache_get();
4045         if (ret)
4046                 return ret;
4047
4048         ret = iommu_domain_cache_init();
4049         if (ret)
4050                 goto domain_error;
4051
4052         ret = iommu_devinfo_cache_init();
4053         if (!ret)
4054                 return ret;
4055
4056         kmem_cache_destroy(iommu_domain_cache);
4057 domain_error:
4058         iova_cache_put();
4059
4060         return -ENOMEM;
4061 }
4062
4063 static void __init iommu_exit_mempool(void)
4064 {
4065         kmem_cache_destroy(iommu_devinfo_cache);
4066         kmem_cache_destroy(iommu_domain_cache);
4067         iova_cache_put();
4068 }
4069
4070 static void __init init_no_remapping_devices(void)
4071 {
4072         struct dmar_drhd_unit *drhd;
4073         struct device *dev;
4074         int i;
4075
4076         for_each_drhd_unit(drhd) {
4077                 if (!drhd->include_all) {
4078                         for_each_active_dev_scope(drhd->devices,
4079                                                   drhd->devices_cnt, i, dev)
4080                                 break;
4081                         /* ignore DMAR unit if no devices exist */
4082                         if (i == drhd->devices_cnt)
4083                                 drhd->ignored = 1;
4084                 }
4085         }
4086
4087         for_each_active_drhd_unit(drhd) {
4088                 if (drhd->include_all)
4089                         continue;
4090
4091                 for_each_active_dev_scope(drhd->devices,
4092                                           drhd->devices_cnt, i, dev)
4093                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4094                                 break;
4095                 if (i < drhd->devices_cnt)
4096                         continue;
4097
4098                 /* This IOMMU has *only* gfx devices. Either bypass it or
4099                    mark it as gfx-dedicated, as appropriate */
4100                 drhd->gfx_dedicated = 1;
4101                 if (!dmar_map_gfx)
4102                         drhd->ignored = 1;
4103         }
4104 }
4105
4106 #ifdef CONFIG_SUSPEND
4107 static int init_iommu_hw(void)
4108 {
4109         struct dmar_drhd_unit *drhd;
4110         struct intel_iommu *iommu = NULL;
4111
4112         for_each_active_iommu(iommu, drhd)
4113                 if (iommu->qi)
4114                         dmar_reenable_qi(iommu);
4115
4116         for_each_iommu(iommu, drhd) {
4117                 if (drhd->ignored) {
4118                         /*
4119                          * we always have to disable PMRs or DMA may fail on
4120                          * this device
4121                          */
4122                         if (force_on)
4123                                 iommu_disable_protect_mem_regions(iommu);
4124                         continue;
4125                 }
4126
4127                 iommu_flush_write_buffer(iommu);
4128
4129                 iommu_set_root_entry(iommu);
4130
4131                 iommu->flush.flush_context(iommu, 0, 0, 0,
4132                                            DMA_CCMD_GLOBAL_INVL);
4133                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4134                 iommu_enable_translation(iommu);
4135                 iommu_disable_protect_mem_regions(iommu);
4136         }
4137
4138         return 0;
4139 }
4140
4141 static void iommu_flush_all(void)
4142 {
4143         struct dmar_drhd_unit *drhd;
4144         struct intel_iommu *iommu;
4145
4146         for_each_active_iommu(iommu, drhd) {
4147                 iommu->flush.flush_context(iommu, 0, 0, 0,
4148                                            DMA_CCMD_GLOBAL_INVL);
4149                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4150                                          DMA_TLB_GLOBAL_FLUSH);
4151         }
4152 }
4153
4154 static int iommu_suspend(void)
4155 {
4156         struct dmar_drhd_unit *drhd;
4157         struct intel_iommu *iommu = NULL;
4158         unsigned long flag;
4159
4160         for_each_active_iommu(iommu, drhd) {
4161                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4162                                                  GFP_ATOMIC);
4163                 if (!iommu->iommu_state)
4164                         goto nomem;
4165         }
4166
4167         iommu_flush_all();
4168
4169         for_each_active_iommu(iommu, drhd) {
4170                 iommu_disable_translation(iommu);
4171
4172                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4173
4174                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4175                         readl(iommu->reg + DMAR_FECTL_REG);
4176                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4177                         readl(iommu->reg + DMAR_FEDATA_REG);
4178                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4179                         readl(iommu->reg + DMAR_FEADDR_REG);
4180                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4181                         readl(iommu->reg + DMAR_FEUADDR_REG);
4182
4183                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4184         }
4185         return 0;
4186
4187 nomem:
4188         for_each_active_iommu(iommu, drhd)
4189                 kfree(iommu->iommu_state);
4190
4191         return -ENOMEM;
4192 }
4193
4194 static void iommu_resume(void)
4195 {
4196         struct dmar_drhd_unit *drhd;
4197         struct intel_iommu *iommu = NULL;
4198         unsigned long flag;
4199
4200         if (init_iommu_hw()) {
4201                 if (force_on)
4202                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4203                 else
4204                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4205                 return;
4206         }
4207
4208         for_each_active_iommu(iommu, drhd) {
4209
4210                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4211
4212                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4213                         iommu->reg + DMAR_FECTL_REG);
4214                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4215                         iommu->reg + DMAR_FEDATA_REG);
4216                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4217                         iommu->reg + DMAR_FEADDR_REG);
4218                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4219                         iommu->reg + DMAR_FEUADDR_REG);
4220
4221                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4222         }
4223
4224         for_each_active_iommu(iommu, drhd)
4225                 kfree(iommu->iommu_state);
4226 }
4227
4228 static struct syscore_ops iommu_syscore_ops = {
4229         .resume         = iommu_resume,
4230         .suspend        = iommu_suspend,
4231 };
4232
4233 static void __init init_iommu_pm_ops(void)
4234 {
4235         register_syscore_ops(&iommu_syscore_ops);
4236 }
4237
4238 #else
4239 static inline void init_iommu_pm_ops(void) {}
4240 #endif  /* CONFIG_SUSPEND */
4241
4242 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4243 {
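             /*
              * For example, an RMRR of [0x000d8000, 0x000dffff] satisfies the
              * alignment and ordering checks: base and end + 1 are both
              * 4KiB-aligned and the end lies above the base. (Addresses here
              * are purely illustrative.)
              */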
4244         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4245             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4246             rmrr->end_address <= rmrr->base_address ||
4247             arch_rmrr_sanity_check(rmrr))
4248                 return -EINVAL;
4249
4250         return 0;
4251 }
4252
4253 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4254 {
4255         struct acpi_dmar_reserved_memory *rmrr;
4256         struct dmar_rmrr_unit *rmrru;
4257
4258         rmrr = (struct acpi_dmar_reserved_memory *)header;
4259         if (rmrr_sanity_check(rmrr)) {
4260                 pr_warn(FW_BUG
4261                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4262                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4263                            rmrr->base_address, rmrr->end_address,
4264                            dmi_get_system_info(DMI_BIOS_VENDOR),
4265                            dmi_get_system_info(DMI_BIOS_VERSION),
4266                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4267                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4268         }
4269
4270         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4271         if (!rmrru)
4272                 goto out;
4273
4274         rmrru->hdr = header;
4275
4276         rmrru->base_address = rmrr->base_address;
4277         rmrru->end_address = rmrr->end_address;
4278
4279         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4280                                 ((void *)rmrr) + rmrr->header.length,
4281                                 &rmrru->devices_cnt);
4282         if (rmrru->devices_cnt && rmrru->devices == NULL)
4283                 goto free_rmrru;
4284
4285         list_add(&rmrru->list, &dmar_rmrr_units);
4286
4287         return 0;
4288 free_rmrru:
4289         kfree(rmrru);
4290 out:
4291         return -ENOMEM;
4292 }
4293
4294 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4295 {
4296         struct dmar_atsr_unit *atsru;
4297         struct acpi_dmar_atsr *tmp;
4298
4299         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4300                                 dmar_rcu_check()) {
4301                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4302                 if (atsr->segment != tmp->segment)
4303                         continue;
4304                 if (atsr->header.length != tmp->header.length)
4305                         continue;
4306                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4307                         return atsru;
4308         }
4309
4310         return NULL;
4311 }
4312
4313 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4314 {
4315         struct acpi_dmar_atsr *atsr;
4316         struct dmar_atsr_unit *atsru;
4317
4318         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4319                 return 0;
4320
4321         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4322         atsru = dmar_find_atsr(atsr);
4323         if (atsru)
4324                 return 0;
4325
4326         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4327         if (!atsru)
4328                 return -ENOMEM;
4329
4330         /*
4331          * If memory is allocated from slab by ACPI _DSM method, we need to
4332          * copy the memory content because the memory buffer will be freed
4333          * on return.
4334          */
4335         atsru->hdr = (void *)(atsru + 1);
4336         memcpy(atsru->hdr, hdr, hdr->length);
4337         atsru->include_all = atsr->flags & 0x1;
4338         if (!atsru->include_all) {
4339                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4340                                 (void *)atsr + atsr->header.length,
4341                                 &atsru->devices_cnt);
4342                 if (atsru->devices_cnt && atsru->devices == NULL) {
4343                         kfree(atsru);
4344                         return -ENOMEM;
4345                 }
4346         }
4347
4348         list_add_rcu(&atsru->list, &dmar_atsr_units);
4349
4350         return 0;
4351 }
4352
4353 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4354 {
4355         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4356         kfree(atsru);
4357 }
4358
4359 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4360 {
4361         struct acpi_dmar_atsr *atsr;
4362         struct dmar_atsr_unit *atsru;
4363
4364         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4365         atsru = dmar_find_atsr(atsr);
4366         if (atsru) {
4367                 list_del_rcu(&atsru->list);
4368                 synchronize_rcu();
4369                 intel_iommu_free_atsr(atsru);
4370         }
4371
4372         return 0;
4373 }
4374
4375 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4376 {
4377         int i;
4378         struct device *dev;
4379         struct acpi_dmar_atsr *atsr;
4380         struct dmar_atsr_unit *atsru;
4381
4382         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4383         atsru = dmar_find_atsr(atsr);
4384         if (!atsru)
4385                 return 0;
4386
4387         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4388                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4389                                           i, dev)
4390                         return -EBUSY;
4391         }
4392
4393         return 0;
4394 }
4395
4396 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4397 {
4398         int sp, ret;
4399         struct intel_iommu *iommu = dmaru->iommu;
4400
4401         if (g_iommus[iommu->seq_id])
4402                 return 0;
4403
4404         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4405                 pr_warn("%s: Doesn't support hardware pass through.\n",
4406                         iommu->name);
4407                 return -ENXIO;
4408         }
4409         if (!ecap_sc_support(iommu->ecap) &&
4410             domain_update_iommu_snooping(iommu)) {
4411                 pr_warn("%s: Doesn't support snooping.\n",
4412                         iommu->name);
4413                 return -ENXIO;
4414         }
4415         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4416         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4417                 pr_warn("%s: Doesn't support large page.\n",
4418                         iommu->name);
4419                 return -ENXIO;
4420         }
4421
4422         /*
4423          * Disable translation if already enabled prior to OS handover.
4424          */
4425         if (iommu->gcmd & DMA_GCMD_TE)
4426                 iommu_disable_translation(iommu);
4427
4428         g_iommus[iommu->seq_id] = iommu;
4429         ret = iommu_init_domains(iommu);
4430         if (ret == 0)
4431                 ret = iommu_alloc_root_entry(iommu);
4432         if (ret)
4433                 goto out;
4434
4435         intel_svm_check(iommu);
4436
4437         if (dmaru->ignored) {
4438                 /*
4439                  * we always have to disable PMRs or DMA may fail on this device
4440                  */
4441                 if (force_on)
4442                         iommu_disable_protect_mem_regions(iommu);
4443                 return 0;
4444         }
4445
4446         intel_iommu_init_qi(iommu);
4447         iommu_flush_write_buffer(iommu);
4448
4449 #ifdef CONFIG_INTEL_IOMMU_SVM
4450         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4451                 ret = intel_svm_enable_prq(iommu);
4452                 if (ret)
4453                         goto disable_iommu;
4454         }
4455 #endif
4456         ret = dmar_set_interrupt(iommu);
4457         if (ret)
4458                 goto disable_iommu;
4459
4460         iommu_set_root_entry(iommu);
4461         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4462         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4463         iommu_enable_translation(iommu);
4464
4465         iommu_disable_protect_mem_regions(iommu);
4466         return 0;
4467
4468 disable_iommu:
4469         disable_dmar_iommu(iommu);
4470 out:
4471         free_dmar_iommu(iommu);
4472         return ret;
4473 }
4474
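/*
 * Handle IOMMU hot-plug notifications from the DMAR code: on insertion the
 * unit is initialized and enabled via intel_iommu_add(); on removal its
 * domains and per-IOMMU state are torn down.
 */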
4475 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4476 {
4477         int ret = 0;
4478         struct intel_iommu *iommu = dmaru->iommu;
4479
4480         if (!intel_iommu_enabled)
4481                 return 0;
4482         if (iommu == NULL)
4483                 return -EINVAL;
4484
4485         if (insert) {
4486                 ret = intel_iommu_add(dmaru);
4487         } else {
4488                 disable_dmar_iommu(iommu);
4489                 free_dmar_iommu(iommu);
4490         }
4491
4492         return ret;
4493 }
4494
4495 static void intel_iommu_free_dmars(void)
4496 {
4497         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4498         struct dmar_atsr_unit *atsru, *atsr_n;
4499
4500         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4501                 list_del(&rmrru->list);
4502                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4503                 kfree(rmrru);
4504         }
4505
4506         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4507                 list_del(&atsru->list);
4508                 intel_iommu_free_atsr(atsru);
4509         }
4510 }
4511
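/*
 * Decide whether ATS may be used for @dev: walk up to the PCIe root port
 * and check whether that port is covered by an ATSR unit for the device's
 * segment. Returns 1 if ATS is allowed, 0 otherwise.
 */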
4512 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4513 {
4514         int i, ret = 1;
4515         struct pci_bus *bus;
4516         struct pci_dev *bridge = NULL;
4517         struct device *tmp;
4518         struct acpi_dmar_atsr *atsr;
4519         struct dmar_atsr_unit *atsru;
4520
4521         dev = pci_physfn(dev);
4522         for (bus = dev->bus; bus; bus = bus->parent) {
4523                 bridge = bus->self;
4524                 /* If it's an integrated device, allow ATS */
4525                 if (!bridge)
4526                         return 1;
4527                 /* Connected via non-PCIe: no ATS */
4528                 if (!pci_is_pcie(bridge) ||
4529                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4530                         return 0;
4531                 /* If we found the root port, look it up in the ATSR */
4532                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4533                         break;
4534         }
4535
4536         rcu_read_lock();
4537         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4538                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4539                 if (atsr->segment != pci_domain_nr(dev->bus))
4540                         continue;
4541
4542                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4543                         if (tmp == &bridge->dev)
4544                                 goto out;
4545
4546                 if (atsru->include_all)
4547                         goto out;
4548         }
4549         ret = 0;
4550 out:
4551         rcu_read_unlock();
4552
4553         return ret;
4554 }
4555
4556 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4557 {
4558         int ret;
4559         struct dmar_rmrr_unit *rmrru;
4560         struct dmar_atsr_unit *atsru;
4561         struct acpi_dmar_atsr *atsr;
4562         struct acpi_dmar_reserved_memory *rmrr;
4563
4564         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4565                 return 0;
4566
4567         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4568                 rmrr = container_of(rmrru->hdr,
4569                                     struct acpi_dmar_reserved_memory, header);
4570                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4571                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4572                                 ((void *)rmrr) + rmrr->header.length,
4573                                 rmrr->segment, rmrru->devices,
4574                                 rmrru->devices_cnt);
4575                         if (ret < 0)
4576                                 return ret;
4577                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4578                         dmar_remove_dev_scope(info, rmrr->segment,
4579                                 rmrru->devices, rmrru->devices_cnt);
4580                 }
4581         }
4582
4583         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4584                 if (atsru->include_all)
4585                         continue;
4586
4587                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4588                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4589                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4590                                         (void *)atsr + atsr->header.length,
4591                                         atsr->segment, atsru->devices,
4592                                         atsru->devices_cnt);
4593                         if (ret > 0)
4594                                 break;
4595                         else if (ret < 0)
4596                                 return ret;
4597                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4598                         if (dmar_remove_dev_scope(info, atsr->segment,
4599                                         atsru->devices, atsru->devices_cnt))
4600                                 break;
4601                 }
4602         }
4603
4604         return 0;
4605 }
4606
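/*
 * Memory hotplug notifier: keep the static identity mapping (si_domain) in
 * sync by adding an identity map for memory going online and tearing it
 * down (including IOTLB invalidation) when memory goes offline.
 */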
4607 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4608                                        unsigned long val, void *v)
4609 {
4610         struct memory_notify *mhp = v;
4611         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4612         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4613                         mhp->nr_pages - 1);
4614
4615         switch (val) {
4616         case MEM_GOING_ONLINE:
4617                 if (iommu_domain_identity_map(si_domain,
4618                                               start_vpfn, last_vpfn)) {
4619                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
4620                                 start_vpfn, last_vpfn);
4621                         return NOTIFY_BAD;
4622                 }
4623                 break;
4624
4625         case MEM_OFFLINE:
4626         case MEM_CANCEL_ONLINE:
4627                 {
4628                         struct dmar_drhd_unit *drhd;
4629                         struct intel_iommu *iommu;
4630                         struct page *freelist;
4631
4632                         freelist = domain_unmap(si_domain,
4633                                                 start_vpfn, last_vpfn);
4634
4635                         rcu_read_lock();
4636                         for_each_active_iommu(iommu, drhd)
4637                                 iommu_flush_iotlb_psi(iommu, si_domain,
4638                                         start_vpfn, mhp->nr_pages,
4639                                         !freelist, 0);
4640                         rcu_read_unlock();
4641                         dma_free_pagelist(freelist);
4642                 }
4643                 break;
4644         }
4645
4646         return NOTIFY_OK;
4647 }
4648
4649 static struct notifier_block intel_iommu_memory_nb = {
4650         .notifier_call = intel_iommu_memory_notifier,
4651         .priority = 0
4652 };
4653
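/*
 * Drop the per-CPU cached IOVAs of every DMA API domain on all IOMMUs;
 * called when a CPU goes away so that its cached ranges are not leaked.
 */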
4654 static void free_all_cpu_cached_iovas(unsigned int cpu)
4655 {
4656         int i;
4657
4658         for (i = 0; i < g_num_of_iommus; i++) {
4659                 struct intel_iommu *iommu = g_iommus[i];
4660                 struct dmar_domain *domain;
4661                 int did;
4662
4663                 if (!iommu)
4664                         continue;
4665
4666                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4667                         domain = get_iommu_domain(iommu, (u16)did);
4668
4669                         if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4670                                 continue;
4671
4672                         free_cpu_cached_iovas(cpu, &domain->iovad);
4673                 }
4674         }
4675 }
4676
4677 static int intel_iommu_cpu_dead(unsigned int cpu)
4678 {
4679         free_all_cpu_cached_iovas(cpu);
4680         return 0;
4681 }
4682
4683 static void intel_disable_iommus(void)
4684 {
4685         struct intel_iommu *iommu = NULL;
4686         struct dmar_drhd_unit *drhd;
4687
4688         for_each_iommu(iommu, drhd)
4689                 iommu_disable_translation(iommu);
4690 }
4691
4692 void intel_iommu_shutdown(void)
4693 {
4694         struct dmar_drhd_unit *drhd;
4695         struct intel_iommu *iommu = NULL;
4696
4697         if (no_iommu || dmar_disabled)
4698                 return;
4699
4700         down_write(&dmar_global_lock);
4701
4702         /* Disable PMRs explicitly here. */
4703         for_each_iommu(iommu, drhd)
4704                 iommu_disable_protect_mem_regions(iommu);
4705
4706         /* Make sure the IOMMUs are switched off */
4707         intel_disable_iommus();
4708
4709         up_write(&dmar_global_lock);
4710 }
4711
4712 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4713 {
4714         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4715
4716         return container_of(iommu_dev, struct intel_iommu, iommu);
4717 }
4718
4719 static ssize_t intel_iommu_show_version(struct device *dev,
4720                                         struct device_attribute *attr,
4721                                         char *buf)
4722 {
4723         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4724         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4725         return sprintf(buf, "%d:%d\n",
4726                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4727 }
4728 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4729
4730 static ssize_t intel_iommu_show_address(struct device *dev,
4731                                         struct device_attribute *attr,
4732                                         char *buf)
4733 {
4734         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4735         return sprintf(buf, "%llx\n", iommu->reg_phys);
4736 }
4737 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4738
4739 static ssize_t intel_iommu_show_cap(struct device *dev,
4740                                     struct device_attribute *attr,
4741                                     char *buf)
4742 {
4743         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4744         return sprintf(buf, "%llx\n", iommu->cap);
4745 }
4746 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4747
4748 static ssize_t intel_iommu_show_ecap(struct device *dev,
4749                                     struct device_attribute *attr,
4750                                     char *buf)
4751 {
4752         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4753         return sprintf(buf, "%llx\n", iommu->ecap);
4754 }
4755 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4756
4757 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4758                                       struct device_attribute *attr,
4759                                       char *buf)
4760 {
4761         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4762         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4763 }
4764 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4765
4766 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4767                                            struct device_attribute *attr,
4768                                            char *buf)
4769 {
4770         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4771         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4772                                                   cap_ndoms(iommu->cap)));
4773 }
4774 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4775
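/*
 * The attributes below are exported through the "intel-iommu" sysfs group
 * of each registered IOMMU device. On a typical system they show up as,
 * for example (path given for illustration only):
 *
 *   /sys/class/iommu/dmar0/intel-iommu/{version,address,cap,ecap,
 *                                       domains_supported,domains_used}
 */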
4776 static struct attribute *intel_iommu_attrs[] = {
4777         &dev_attr_version.attr,
4778         &dev_attr_address.attr,
4779         &dev_attr_cap.attr,
4780         &dev_attr_ecap.attr,
4781         &dev_attr_domains_supported.attr,
4782         &dev_attr_domains_used.attr,
4783         NULL,
4784 };
4785
4786 static struct attribute_group intel_iommu_group = {
4787         .name = "intel-iommu",
4788         .attrs = intel_iommu_attrs,
4789 };
4790
4791 const struct attribute_group *intel_iommu_groups[] = {
4792         &intel_iommu_group,
4793         NULL,
4794 };
4795
4796 static inline bool has_external_pci(void)
4797 {
4798         struct pci_dev *pdev = NULL;
4799
4800         for_each_pci_dev(pdev)
4801                 if (pdev->external_facing)
4802                         return true;
4803
4804         return false;
4805 }
4806
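/*
 * If the platform opted in to DMA protection via the DMAR table and
 * external-facing PCI devices are present, force the IOMMU on even if it
 * was disabled on the command line, defaulting to passthrough mode when
 * the user had disabled it.
 */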
4807 static int __init platform_optin_force_iommu(void)
4808 {
4809         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4810                 return 0;
4811
4812         if (no_iommu || dmar_disabled)
4813                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4814
4815         /*
4816          * If Intel-IOMMU is disabled by default, we will apply an identity
4817          * map for all devices except those marked as untrusted.
4818          */
4819         if (dmar_disabled)
4820                 iommu_set_default_passthrough(false);
4821
4822         dmar_disabled = 0;
4823         no_iommu = 0;
4824
4825         return 1;
4826 }
4827
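/*
 * Probe the physical companions of ACPI namespace devices listed in the
 * DRHD device scopes so that they are attached to the IOMMU core like
 * ordinary PCI devices.
 */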
4828 static int __init probe_acpi_namespace_devices(void)
4829 {
4830         struct dmar_drhd_unit *drhd;
4831         /* To avoid a -Wunused-but-set-variable warning. */
4832         struct intel_iommu *iommu __maybe_unused;
4833         struct device *dev;
4834         int i, ret = 0;
4835
4836         for_each_active_iommu(iommu, drhd) {
4837                 for_each_active_dev_scope(drhd->devices,
4838                                           drhd->devices_cnt, i, dev) {
4839                         struct acpi_device_physical_node *pn;
4840                         struct iommu_group *group;
4841                         struct acpi_device *adev;
4842
4843                         if (dev->bus != &acpi_bus_type)
4844                                 continue;
4845
4846                         adev = to_acpi_device(dev);
4847                         mutex_lock(&adev->physical_node_lock);
4848                         list_for_each_entry(pn,
4849                                             &adev->physical_node_list, node) {
4850                                 group = iommu_group_get(pn->dev);
4851                                 if (group) {
4852                                         iommu_group_put(group);
4853                                         continue;
4854                                 }
4855
4856                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4857                                 ret = iommu_probe_device(pn->dev);
4858                                 if (ret)
4859                                         break;
4860                         }
4861                         mutex_unlock(&adev->physical_node_lock);
4862
4863                         if (ret)
4864                                 return ret;
4865                 }
4866         }
4867
4868         return 0;
4869 }
4870
4871 int __init intel_iommu_init(void)
4872 {
4873         int ret = -ENODEV;
4874         struct dmar_drhd_unit *drhd;
4875         struct intel_iommu *iommu;
4876
4877         /*
4878          * Intel IOMMU is required for a TXT/tboot launch or platform
4879          * opt in, so enforce that.
4880          */
4881         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4882
4883         if (iommu_init_mempool()) {
4884                 if (force_on)
4885                         panic("tboot: Failed to initialize iommu memory\n");
4886                 return -ENOMEM;
4887         }
4888
4889         down_write(&dmar_global_lock);
4890         if (dmar_table_init()) {
4891                 if (force_on)
4892                         panic("tboot: Failed to initialize DMAR table\n");
4893                 goto out_free_dmar;
4894         }
4895
4896         if (dmar_dev_scope_init() < 0) {
4897                 if (force_on)
4898                         panic("tboot: Failed to initialize DMAR device scope\n");
4899                 goto out_free_dmar;
4900         }
4901
4902         up_write(&dmar_global_lock);
4903
4904         /*
4905          * The bus notifier takes the dmar_global_lock, so lockdep will
4906          * complain later when we register it under the lock.
4907          */
4908         dmar_register_bus_notifier();
4909
4910         down_write(&dmar_global_lock);
4911
4912         if (!no_iommu)
4913                 intel_iommu_debugfs_init();
4914
4915         if (no_iommu || dmar_disabled) {
4916                 /*
4917                  * We exit the function here to ensure the IOMMU's remapping and
4918                  * mempool aren't set up, which means that the IOMMU's PMRs
4919                  * won't be disabled via the call to init_dmars(). So disable
4920                  * them explicitly here. The PMRs were set up by tboot prior to
4921                  * calling SENTER, but the kernel is expected to reset/tear
4922                  * down the PMRs.
4923                  */
4924                 if (intel_iommu_tboot_noforce) {
4925                         for_each_iommu(iommu, drhd)
4926                                 iommu_disable_protect_mem_regions(iommu);
4927                 }
4928
4929                 /*
4930                  * Make sure the IOMMUs are switched off, even when we
4931                  * boot into a kexec kernel and the previous kernel left
4932                  * them enabled
4933                  */
4934                 intel_disable_iommus();
4935                 goto out_free_dmar;
4936         }
4937
4938         if (list_empty(&dmar_rmrr_units))
4939                 pr_info("No RMRR found\n");
4940
4941         if (list_empty(&dmar_atsr_units))
4942                 pr_info("No ATSR found\n");
4943
4944         if (dmar_init_reserved_ranges()) {
4945                 if (force_on)
4946                         panic("tboot: Failed to reserve iommu ranges\n");
4947                 goto out_free_reserved_range;
4948         }
4949
4950         if (dmar_map_gfx)
4951                 intel_iommu_gfx_mapped = 1;
4952
4953         init_no_remapping_devices();
4954
4955         ret = init_dmars();
4956         if (ret) {
4957                 if (force_on)
4958                         panic("tboot: Failed to initialize DMARs\n");
4959                 pr_err("Initialization failed\n");
4960                 goto out_free_reserved_range;
4961         }
4962         up_write(&dmar_global_lock);
4963
4964         init_iommu_pm_ops();
4965
4966         down_read(&dmar_global_lock);
4967         for_each_active_iommu(iommu, drhd) {
4968                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4969                                        intel_iommu_groups,
4970                                        "%s", iommu->name);
4971                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4972                 iommu_device_register(&iommu->iommu);
4973         }
4974         up_read(&dmar_global_lock);
4975
4976         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4977         if (si_domain && !hw_pass_through)
4978                 register_memory_notifier(&intel_iommu_memory_nb);
4979         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4980                           intel_iommu_cpu_dead);
4981
4982         down_read(&dmar_global_lock);
4983         if (probe_acpi_namespace_devices())
4984                 pr_warn("ACPI name space devices didn't probe correctly\n");
4985
4986         /* Finally, we enable the DMA remapping hardware. */
4987         for_each_iommu(iommu, drhd) {
4988                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4989                         iommu_enable_translation(iommu);
4990
4991                 iommu_disable_protect_mem_regions(iommu);
4992         }
4993         up_read(&dmar_global_lock);
4994
4995         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4996
4997         intel_iommu_enabled = 1;
4998
4999         return 0;
5000
5001 out_free_reserved_range:
5002         put_iova_domain(&reserved_iova_list);
5003 out_free_dmar:
5004         intel_iommu_free_dmars();
5005         up_write(&dmar_global_lock);
5006         iommu_exit_mempool();
5007         return ret;
5008 }
5009
5010 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5011 {
5012         struct intel_iommu *iommu = opaque;
5013
5014         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5015         return 0;
5016 }
5017
5018 /*
5019  * NB - intel-iommu lacks any sort of reference counting for the users of
5020  * dependent devices.  If multiple endpoints have intersecting dependent
5021  * devices, unbinding the driver from any one of them will possibly leave
5022  * the others unable to operate.
5023  */
5024 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5025 {
5026         if (!iommu || !dev || !dev_is_pci(dev))
5027                 return;
5028
5029         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5030 }
5031
5032 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5033 {
5034         struct dmar_domain *domain;
5035         struct intel_iommu *iommu;
5036         unsigned long flags;
5037
5038         assert_spin_locked(&device_domain_lock);
5039
5040         if (WARN_ON(!info))
5041                 return;
5042
5043         iommu = info->iommu;
5044         domain = info->domain;
5045
5046         if (info->dev) {
5047                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5048                         intel_pasid_tear_down_entry(iommu, info->dev,
5049                                         PASID_RID2PASID, false);
5050
5051                 iommu_disable_dev_iotlb(info);
5052                 if (!dev_is_real_dma_subdevice(info->dev))
5053                         domain_context_clear(iommu, info->dev);
5054                 intel_pasid_free_table(info->dev);
5055         }
5056
5057         unlink_domain_info(info);
5058
5059         spin_lock_irqsave(&iommu->lock, flags);
5060         domain_detach_iommu(domain, iommu);
5061         spin_unlock_irqrestore(&iommu->lock, flags);
5062
5063         free_devinfo_mem(info);
5064 }
5065
5066 static void dmar_remove_one_dev_info(struct device *dev)
5067 {
5068         struct device_domain_info *info;
5069         unsigned long flags;
5070
5071         spin_lock_irqsave(&device_domain_lock, flags);
5072         info = get_domain_info(dev);
5073         if (info)
5074                 __dmar_remove_one_dev_info(info);
5075         spin_unlock_irqrestore(&device_domain_lock, flags);
5076 }
5077
5078 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5079 {
5080         int adjust_width;
5081
5082         /* calculate AGAW */
5083         domain->gaw = guest_width;
5084         adjust_width = guestwidth_to_adjustwidth(guest_width);
5085         domain->agaw = width_to_agaw(adjust_width);
5086
5087         domain->iommu_coherency = 0;
5088         domain->iommu_snooping = 0;
5089         domain->iommu_superpage = 0;
5090         domain->max_addr = 0;
5091
5092         /* always allocate the top pgd */
5093         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5094         if (!domain->pgd)
5095                 return -ENOMEM;
5096         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5097         return 0;
5098 }
5099
5100 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5101 {
5102         init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5103         copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5104
5105         if (!intel_iommu_strict &&
5106             init_iova_flush_queue(&dmar_domain->iovad,
5107                                   iommu_flush_iova, iova_entry_free))
5108                 pr_info("iova flush queue initialization failed\n");
5109 }
5110
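/*
 * Allocate an IOMMU API domain: DMA and unmanaged domains get a fresh
 * dmar_domain (with an IOVA allocator for the DMA case), while identity
 * requests simply return the shared si_domain.
 */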
5111 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5112 {
5113         struct dmar_domain *dmar_domain;
5114         struct iommu_domain *domain;
5115
5116         switch (type) {
5117         case IOMMU_DOMAIN_DMA:
5118         case IOMMU_DOMAIN_UNMANAGED:
5119                 dmar_domain = alloc_domain(0);
5120                 if (!dmar_domain) {
5121                         pr_err("Can't allocate dmar_domain\n");
5122                         return NULL;
5123                 }
5124                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5125                         pr_err("Domain initialization failed\n");
5126                         domain_exit(dmar_domain);
5127                         return NULL;
5128                 }
5129
5130                 if (type == IOMMU_DOMAIN_DMA)
5131                         intel_init_iova_domain(dmar_domain);
5132
5133                 domain = &dmar_domain->domain;
5134                 domain->geometry.aperture_start = 0;
5135                 domain->geometry.aperture_end   =
5136                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5137                 domain->geometry.force_aperture = true;
5138
5139                 return domain;
5140         case IOMMU_DOMAIN_IDENTITY:
5141                 return &si_domain->domain;
5142         default:
5143                 return NULL;
5144         }
5145
5146         return NULL;
5147 }
5148
5149 static void intel_iommu_domain_free(struct iommu_domain *domain)
5150 {
5151         if (domain != &si_domain->domain)
5152                 domain_exit(to_dmar_domain(domain));
5153 }
5154
5155 /*
5156  * Check whether a @domain could be attached to the @dev through the
5157  * aux-domain attach/detach APIs.
5158  */
5159 static inline bool
5160 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5161 {
5162         struct device_domain_info *info = get_domain_info(dev);
5163
5164         return info && info->auxd_enabled &&
5165                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5166 }
5167
5168 static void auxiliary_link_device(struct dmar_domain *domain,
5169                                   struct device *dev)
5170 {
5171         struct device_domain_info *info = get_domain_info(dev);
5172
5173         assert_spin_locked(&device_domain_lock);
5174         if (WARN_ON(!info))
5175                 return;
5176
5177         domain->auxd_refcnt++;
5178         list_add(&domain->auxd, &info->auxiliary_domains);
5179 }
5180
5181 static void auxiliary_unlink_device(struct dmar_domain *domain,
5182                                     struct device *dev)
5183 {
5184         struct device_domain_info *info = get_domain_info(dev);
5185
5186         assert_spin_locked(&device_domain_lock);
5187         if (WARN_ON(!info))
5188                 return;
5189
5190         list_del(&domain->auxd);
5191         domain->auxd_refcnt--;
5192
5193         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5194                 ioasid_free(domain->default_pasid);
5195 }
5196
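/*
 * Attach @dev to an auxiliary domain: allocate the domain's default PASID
 * on first use, attach the domain to the device's IOMMU and install a
 * first- or second-level PASID table entry for it.
 */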
5197 static int aux_domain_add_dev(struct dmar_domain *domain,
5198                               struct device *dev)
5199 {
5200         int ret;
5201         unsigned long flags;
5202         struct intel_iommu *iommu;
5203
5204         iommu = device_to_iommu(dev, NULL, NULL);
5205         if (!iommu)
5206                 return -ENODEV;
5207
5208         if (domain->default_pasid <= 0) {
5209                 u32 pasid;
5210
5211                 /* No private data needed for the default pasid */
5212                 pasid = ioasid_alloc(NULL, PASID_MIN,
5213                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5214                                      NULL);
5215                 if (pasid == INVALID_IOASID) {
5216                         pr_err("Can't allocate default pasid\n");
5217                         return -ENODEV;
5218                 }
5219                 domain->default_pasid = pasid;
5220         }
5221
5222         spin_lock_irqsave(&device_domain_lock, flags);
5223         /*
5224          * iommu->lock must be held to attach the domain to the iommu and to
5225          * set up the PASID entry for second level translation.
5226          */
5227         spin_lock(&iommu->lock);
5228         ret = domain_attach_iommu(domain, iommu);
5229         if (ret)
5230                 goto attach_failed;
5231
5232         /* Set up the PASID entry for mediated devices: */
5233         if (domain_use_first_level(domain))
5234                 ret = domain_setup_first_level(iommu, domain, dev,
5235                                                domain->default_pasid);
5236         else
5237                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5238                                                      domain->default_pasid);
5239         if (ret)
5240                 goto table_failed;
5241         spin_unlock(&iommu->lock);
5242
5243         auxiliary_link_device(domain, dev);
5244
5245         spin_unlock_irqrestore(&device_domain_lock, flags);
5246
5247         return 0;
5248
5249 table_failed:
5250         domain_detach_iommu(domain, iommu);
5251 attach_failed:
5252         spin_unlock(&iommu->lock);
5253         spin_unlock_irqrestore(&device_domain_lock, flags);
5254         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5255                 ioasid_free(domain->default_pasid);
5256
5257         return ret;
5258 }
5259
5260 static void aux_domain_remove_dev(struct dmar_domain *domain,
5261                                   struct device *dev)
5262 {
5263         struct device_domain_info *info;
5264         struct intel_iommu *iommu;
5265         unsigned long flags;
5266
5267         if (!is_aux_domain(dev, &domain->domain))
5268                 return;
5269
5270         spin_lock_irqsave(&device_domain_lock, flags);
5271         info = get_domain_info(dev);
5272         iommu = info->iommu;
5273
5274         auxiliary_unlink_device(domain, dev);
5275
5276         spin_lock(&iommu->lock);
5277         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5278         domain_detach_iommu(domain, iommu);
5279         spin_unlock(&iommu->lock);
5280
5281         spin_unlock_irqrestore(&device_domain_lock, flags);
5282 }
5283
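/*
 * Before attaching, make sure the IOMMU behind @dev can address the whole
 * domain: clamp the guest address width to what the hardware supports and
 * drop extra top-level page tables the IOMMU cannot walk.
 */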
5284 static int prepare_domain_attach_device(struct iommu_domain *domain,
5285                                         struct device *dev)
5286 {
5287         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5288         struct intel_iommu *iommu;
5289         int addr_width;
5290
5291         iommu = device_to_iommu(dev, NULL, NULL);
5292         if (!iommu)
5293                 return -ENODEV;
5294
5295         /* check if this iommu agaw is sufficient for max mapped address */
5296         addr_width = agaw_to_width(iommu->agaw);
5297         if (addr_width > cap_mgaw(iommu->cap))
5298                 addr_width = cap_mgaw(iommu->cap);
5299
5300         if (dmar_domain->max_addr > (1LL << addr_width)) {
5301                 dev_err(dev, "%s: iommu width (%d) is not "
5302                         "sufficient for the mapped address (%llx)\n",
5303                         __func__, addr_width, dmar_domain->max_addr);
5304                 return -EFAULT;
5305         }
5306         dmar_domain->gaw = addr_width;
5307
5308         /*
5309          * Knock out extra levels of page tables if necessary
5310          */
5311         while (iommu->agaw < dmar_domain->agaw) {
5312                 struct dma_pte *pte;
5313
5314                 pte = dmar_domain->pgd;
5315                 if (dma_pte_present(pte)) {
5316                         dmar_domain->pgd = (struct dma_pte *)
5317                                 phys_to_virt(dma_pte_addr(pte));
5318                         free_pgtable_page(pte);
5319                 }
5320                 dmar_domain->agaw--;
5321         }
5322
5323         return 0;
5324 }
5325
5326 static int intel_iommu_attach_device(struct iommu_domain *domain,
5327                                      struct device *dev)
5328 {
5329         int ret;
5330
5331         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5332             device_is_rmrr_locked(dev)) {
5333                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5334                 return -EPERM;
5335         }
5336
5337         if (is_aux_domain(dev, domain))
5338                 return -EPERM;
5339
5340         /* normally dev is not mapped */
5341         if (unlikely(domain_context_mapped(dev))) {
5342                 struct dmar_domain *old_domain;
5343
5344                 old_domain = find_domain(dev);
5345                 if (old_domain)
5346                         dmar_remove_one_dev_info(dev);
5347         }
5348
5349         ret = prepare_domain_attach_device(domain, dev);
5350         if (ret)
5351                 return ret;
5352
5353         return domain_add_dev_info(to_dmar_domain(domain), dev);
5354 }
5355
5356 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5357                                          struct device *dev)
5358 {
5359         int ret;
5360
5361         if (!is_aux_domain(dev, domain))
5362                 return -EPERM;
5363
5364         ret = prepare_domain_attach_device(domain, dev);
5365         if (ret)
5366                 return ret;
5367
5368         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5369 }
5370
5371 static void intel_iommu_detach_device(struct iommu_domain *domain,
5372                                       struct device *dev)
5373 {
5374         dmar_remove_one_dev_info(dev);
5375 }
5376
5377 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5378                                           struct device *dev)
5379 {
5380         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5381 }
5382
5383 /*
5384  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5385  * VT-d granularity. Invalidation is typically included in the unmap operation
5386  * as a result of DMA or VFIO unmap. However, for assigned devices the guest
5387  * owns the first level page tables. Invalidations of translation caches in the
5388  * guest are trapped and passed down to the host.
5389  *
5390  * vIOMMU in the guest will only expose first level page tables, therefore
5391  * we do not support IOTLB granularity for requests without PASID (second level).
5392  *
5393  * For example, to find the VT-d granularity encoding for IOTLB
5394  * type and page selective granularity within PASID:
5395  * X: indexed by iommu cache type
5396  * Y: indexed by enum iommu_inv_granularity
5397  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5398  */
5399
5400 static const int
5401 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5402         /*
5403          * PASID based IOTLB invalidation: PASID selective (per PASID),
5404          * page selective (address granularity)
5405          */
5406         {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5407         /* PASID based dev TLBs */
5408         {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5409         /* PASID cache */
5410         {-EINVAL, -EINVAL, -EINVAL}
5411 };
5412
5413 static inline int to_vtd_granularity(int type, int granu)
5414 {
5415         return inv_type_granu_table[type][granu];
5416 }
5417
5418 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5419 {
5420         u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5421
5422         /* VT-d size is encoded as 2^size of 4K pages, 0 for 4K, 9 for 2MB, etc.
5423          * The IOMMU cache invalidate API passes granu_size in bytes and the
5424          * number of granules of that size that are contiguous in memory.
5425          */
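        /*
         * Example (illustrative): granu_size = 4KB and nr_granules = 512
         * describe a contiguous 2MB range, i.e. nr_pages = 512, so the
         * returned size order is 9.
         */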
5426         return order_base_2(nr_pages);
5427 }
5428
5429 #ifdef CONFIG_INTEL_IOMMU_SVM
5430 static int
5431 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5432                            struct iommu_cache_invalidate_info *inv_info)
5433 {
5434         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5435         struct device_domain_info *info;
5436         struct intel_iommu *iommu;
5437         unsigned long flags;
5438         int cache_type;
5439         u8 bus, devfn;
5440         u16 did, sid;
5441         int ret = 0;
5442         u64 size = 0;
5443
5444         if (!inv_info || !dmar_domain)
5445                 return -EINVAL;
5446
5447         if (!dev || !dev_is_pci(dev))
5448                 return -ENODEV;
5449
5450         iommu = device_to_iommu(dev, &bus, &devfn);
5451         if (!iommu)
5452                 return -ENODEV;
5453
5454         if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5455                 return -EINVAL;
5456
5457         spin_lock_irqsave(&device_domain_lock, flags);
5458         spin_lock(&iommu->lock);
5459         info = get_domain_info(dev);
5460         if (!info) {
5461                 ret = -EINVAL;
5462                 goto out_unlock;
5463         }
5464         did = dmar_domain->iommu_did[iommu->seq_id];
5465         sid = PCI_DEVID(bus, devfn);
5466
5467         /* Size is only valid in address selective invalidation */
5468         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5469                 size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5470                                    inv_info->granu.addr_info.nb_granules);
5471
5472         for_each_set_bit(cache_type,
5473                          (unsigned long *)&inv_info->cache,
5474                          IOMMU_CACHE_INV_TYPE_NR) {
5475                 int granu = 0;
5476                 u64 pasid = 0;
5477                 u64 addr = 0;
5478
5479                 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5480                 if (granu == -EINVAL) {
5481                         pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5482                                            cache_type, inv_info->granularity);
5483                         break;
5484                 }
5485
5486                 /*
5487                  * PASID is stored in different locations based on the
5488                  * granularity.
5489                  */
5490                 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5491                     (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5492                         pasid = inv_info->granu.pasid_info.pasid;
5493                 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5494                          (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5495                         pasid = inv_info->granu.addr_info.pasid;
5496
5497                 switch (BIT(cache_type)) {
5498                 case IOMMU_CACHE_INV_TYPE_IOTLB:
5499                         /* HW will ignore LSB bits based on address mask */
5500                         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5501                             size &&
5502                             (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5503                                 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5504                                                    inv_info->granu.addr_info.addr, size);
5505                         }
5506
5507                         /*
5508                          * If granu is PASID-selective, address is ignored.
5509                          * We use npages = -1 to indicate that.
5510                          */
5511                         qi_flush_piotlb(iommu, did, pasid,
5512                                         mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5513                                         (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5514                                         inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5515
5516                         if (!info->ats_enabled)
5517                                 break;
5518                         /*
5519                          * Always flush device IOTLB if ATS is enabled. vIOMMU
5520                          * in the guest may assume IOTLB flush is inclusive,
5521                          * which is more efficient.
5522                          */
5523                         fallthrough;
5524                 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5525                         /*
5526                          * PASID based device TLB invalidation does not support
5527                          * IOMMU_INV_GRANU_PASID granularity; it only supports
5528                          * IOMMU_INV_GRANU_ADDR.
5529                          * The equivalent is to set the size to cover the entire
5530                          * 64-bit range. The user only provides PASID info
5531                          * without address info, so we set addr to 0.
5532                          */
5533                         if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5534                                 size = 64 - VTD_PAGE_SHIFT;
5535                                 addr = 0;
5536                         } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5537                                 addr = inv_info->granu.addr_info.addr;
5538                         }
5539
5540                         if (info->ats_enabled)
5541                                 qi_flush_dev_iotlb_pasid(iommu, sid,
5542                                                 info->pfsid, pasid,
5543                                                 info->ats_qdep, addr,
5544                                                 size);
5545                         else
5546                                 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5547                         break;
5548                 default:
5549                         dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5550                                             cache_type);
5551                         ret = -EINVAL;
5552                 }
5553         }
5554 out_unlock:
5555         spin_unlock(&iommu->lock);
5556         spin_unlock_irqrestore(&device_domain_lock, flags);
5557
5558         return ret;
5559 }
5560 #endif
5561
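/*
 * IOMMU API map callback: translate IOMMU_READ/WRITE/CACHE into VT-d PTE
 * bits, verify that the mapped range still fits below the domain's address
 * width and install the page-table entries.
 */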
5562 static int intel_iommu_map(struct iommu_domain *domain,
5563                            unsigned long iova, phys_addr_t hpa,
5564                            size_t size, int iommu_prot, gfp_t gfp)
5565 {
5566         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5567         u64 max_addr;
5568         int prot = 0;
5569         int ret;
5570
5571         if (iommu_prot & IOMMU_READ)
5572                 prot |= DMA_PTE_READ;
5573         if (iommu_prot & IOMMU_WRITE)
5574                 prot |= DMA_PTE_WRITE;
5575         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5576                 prot |= DMA_PTE_SNP;
5577
5578         max_addr = iova + size;
5579         if (dmar_domain->max_addr < max_addr) {
5580                 u64 end;
5581
5582                 /* check if minimum agaw is sufficient for mapped address */
5583                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5584                 if (end < max_addr) {
5585                         pr_err("%s: iommu width (%d) is not "
5586                                "sufficient for the mapped address (%llx)\n",
5587                                __func__, dmar_domain->gaw, max_addr);
5588                         return -EFAULT;
5589                 }
5590                 dmar_domain->max_addr = max_addr;
5591         }
5592         /* Round up size to next multiple of PAGE_SIZE, if it and
5593            the low bits of hpa would take us onto the next page */
5594         size = aligned_nrpages(hpa, size);
5595         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5596                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5597         return ret;
5598 }
5599
5600 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5601                                 unsigned long iova, size_t size,
5602                                 struct iommu_iotlb_gather *gather)
5603 {
5604         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5605         struct page *freelist = NULL;
5606         unsigned long start_pfn, last_pfn;
5607         unsigned int npages;
5608         int iommu_id, level = 0;
5609
5610         /* Cope with horrid API which requires us to unmap more than the
5611            size argument if it happens to be a large-page mapping. */
5612         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5613
5614         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5615                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5616
5617         start_pfn = iova >> VTD_PAGE_SHIFT;
5618         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5619
5620         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5621
5622         npages = last_pfn - start_pfn + 1;
5623
5624         for_each_domain_iommu(iommu_id, dmar_domain)
5625                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5626                                       start_pfn, npages, !freelist, 0);
5627
5628         dma_free_pagelist(freelist);
5629
5630         if (dmar_domain->max_addr == iova + size)
5631                 dmar_domain->max_addr = iova;
5632
5633         return size;
5634 }
5635
5636 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5637                                             dma_addr_t iova)
5638 {
5639         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5640         struct dma_pte *pte;
5641         int level = 0;
5642         u64 phys = 0;
5643
5644         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5645         if (pte && dma_pte_present(pte))
5646                 phys = dma_pte_addr(pte) +
5647                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5648                                                 VTD_PAGE_SHIFT) - 1));
5649
5650         return phys;
5651 }
5652
5653 static inline bool scalable_mode_support(void)
5654 {
5655         struct dmar_drhd_unit *drhd;
5656         struct intel_iommu *iommu;
5657         bool ret = true;
5658
5659         rcu_read_lock();
5660         for_each_active_iommu(iommu, drhd) {
5661                 if (!sm_supported(iommu)) {
5662                         ret = false;
5663                         break;
5664                 }
5665         }
5666         rcu_read_unlock();
5667
5668         return ret;
5669 }
5670
5671 static inline bool iommu_pasid_support(void)
5672 {
5673         struct dmar_drhd_unit *drhd;
5674         struct intel_iommu *iommu;
5675         bool ret = true;
5676
5677         rcu_read_lock();
5678         for_each_active_iommu(iommu, drhd) {
5679                 if (!pasid_supported(iommu)) {
5680                         ret = false;
5681                         break;
5682                 }
5683         }
5684         rcu_read_unlock();
5685
5686         return ret;
5687 }
5688
5689 static inline bool nested_mode_support(void)
5690 {
5691         struct dmar_drhd_unit *drhd;
5692         struct intel_iommu *iommu;
5693         bool ret = true;
5694
5695         rcu_read_lock();
5696         for_each_active_iommu(iommu, drhd) {
5697                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5698                         ret = false;
5699                         break;
5700                 }
5701         }
5702         rcu_read_unlock();
5703
5704         return ret;
5705 }
5706
5707 static bool intel_iommu_capable(enum iommu_cap cap)
5708 {
5709         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5710                 return domain_update_iommu_snooping(NULL) == 1;
5711         if (cap == IOMMU_CAP_INTR_REMAP)
5712                 return irq_remapping_enabled == 1;
5713
5714         return false;
5715 }
5716
5717 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5718 {
5719         struct intel_iommu *iommu;
5720
5721         iommu = device_to_iommu(dev, NULL, NULL);
5722         if (!iommu)
5723                 return ERR_PTR(-ENODEV);
5724
5725         if (translation_pre_enabled(iommu))
5726                 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5727
5728         return &iommu->iommu;
5729 }
5730
5731 static void intel_iommu_release_device(struct device *dev)
5732 {
5733         struct intel_iommu *iommu;
5734
5735         iommu = device_to_iommu(dev, NULL, NULL);
5736         if (!iommu)
5737                 return;
5738
5739         dmar_remove_one_dev_info(dev);
5740
5741         set_dma_ops(dev, NULL);
5742 }
5743
5744 static void intel_iommu_probe_finalize(struct device *dev)
5745 {
5746         struct iommu_domain *domain;
5747
5748         domain = iommu_get_domain_for_dev(dev);
5749         if (device_needs_bounce(dev))
5750                 set_dma_ops(dev, &bounce_dma_ops);
5751         else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5752                 set_dma_ops(dev, &intel_dma_ops);
5753         else
5754                 set_dma_ops(dev, NULL);
5755 }
5756
5757 static void intel_iommu_get_resv_regions(struct device *device,
5758                                          struct list_head *head)
5759 {
5760         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5761         struct iommu_resv_region *reg;
5762         struct dmar_rmrr_unit *rmrr;
5763         struct device *i_dev;
5764         int i;
5765
5766         down_read(&dmar_global_lock);
5767         for_each_rmrr_units(rmrr) {
5768                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5769                                           i, i_dev) {
5770                         struct iommu_resv_region *resv;
5771                         enum iommu_resv_type type;
5772                         size_t length;
5773
5774                         if (i_dev != device &&
5775                             !is_downstream_to_pci_bridge(device, i_dev))
5776                                 continue;
5777
5778                         length = rmrr->end_address - rmrr->base_address + 1;
5779
5780                         type = device_rmrr_is_relaxable(device) ?
5781                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5782
5783                         resv = iommu_alloc_resv_region(rmrr->base_address,
5784                                                        length, prot, type);
5785                         if (!resv)
5786                                 break;
5787
5788                         list_add_tail(&resv->list, head);
5789                 }
5790         }
5791         up_read(&dmar_global_lock);
5792
5793 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5794         if (dev_is_pci(device)) {
5795                 struct pci_dev *pdev = to_pci_dev(device);
5796
5797                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5798                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5799                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5800                         if (reg)
5801                                 list_add_tail(&reg->list, head);
5802                 }
5803         }
5804 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5805
5806         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5807                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5808                                       0, IOMMU_RESV_MSI);
5809         if (!reg)
5810                 return;
5811         list_add_tail(&reg->list, head);
5812 }
5813
5814 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5815 {
5816         struct device_domain_info *info;
5817         struct context_entry *context;
5818         struct dmar_domain *domain;
5819         unsigned long flags;
5820         u64 ctx_lo;
5821         int ret;
5822
5823         domain = find_domain(dev);
5824         if (!domain)
5825                 return -EINVAL;
5826
5827         spin_lock_irqsave(&device_domain_lock, flags);
5828         spin_lock(&iommu->lock);
5829
5830         ret = -EINVAL;
5831         info = get_domain_info(dev);
5832         if (!info || !info->pasid_supported)
5833                 goto out;
5834
5835         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5836         if (WARN_ON(!context))
5837                 goto out;
5838
5839         ctx_lo = context[0].lo;
5840
5841         if (!(ctx_lo & CONTEXT_PASIDE)) {
5842                 ctx_lo |= CONTEXT_PASIDE;
5843                 context[0].lo = ctx_lo;
5844                 wmb();
5845                 iommu->flush.flush_context(iommu,
5846                                            domain->iommu_did[iommu->seq_id],
5847                                            PCI_DEVID(info->bus, info->devfn),
5848                                            DMA_CCMD_MASK_NOBIT,
5849                                            DMA_CCMD_DEVICE_INVL);
5850         }
5851
5852         /* Enable PASID support in the device, if it wasn't already */
5853         if (!info->pasid_enabled)
5854                 iommu_enable_dev_iotlb(info);
5855
5856         ret = 0;
5857
5858  out:
5859         spin_unlock(&iommu->lock);
5860         spin_unlock_irqrestore(&device_domain_lock, flags);
5861
5862         return ret;
5863 }
5864
5865 static void intel_iommu_apply_resv_region(struct device *dev,
5866                                           struct iommu_domain *domain,
5867                                           struct iommu_resv_region *region)
5868 {
5869         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5870         unsigned long start, end;
5871
5872         start = IOVA_PFN(region->start);
5873         end   = IOVA_PFN(region->start + region->length - 1);
5874
5875         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5876 }
5877
5878 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5879 {
5880         if (dev_is_pci(dev))
5881                 return pci_device_group(dev);
5882         return generic_device_group(dev);
5883 }
5884
5885 static int intel_iommu_enable_auxd(struct device *dev)
5886 {
5887         struct device_domain_info *info;
5888         struct intel_iommu *iommu;
5889         unsigned long flags;
5890         int ret;
5891
5892         iommu = device_to_iommu(dev, NULL, NULL);
5893         if (!iommu || dmar_disabled)
5894                 return -EINVAL;
5895
5896         if (!sm_supported(iommu) || !pasid_supported(iommu))
5897                 return -EINVAL;
5898
5899         ret = intel_iommu_enable_pasid(iommu, dev);
5900         if (ret)
5901                 return -ENODEV;
5902
5903         spin_lock_irqsave(&device_domain_lock, flags);
5904         info = get_domain_info(dev);
5905         info->auxd_enabled = 1;
5906         spin_unlock_irqrestore(&device_domain_lock, flags);
5907
5908         return 0;
5909 }
5910
5911 static int intel_iommu_disable_auxd(struct device *dev)
5912 {
5913         struct device_domain_info *info;
5914         unsigned long flags;
5915
5916         spin_lock_irqsave(&device_domain_lock, flags);
5917         info = get_domain_info(dev);
5918         if (!WARN_ON(!info))
5919                 info->auxd_enabled = 0;
5920         spin_unlock_irqrestore(&device_domain_lock, flags);
5921
5922         return 0;
5923 }
5924
5925 /*
5926  * A PCI Express designated vendor specific extended capability is defined
5927  * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
5928  * so that system software and tools can detect endpoint devices supporting
5929  * Intel Scalable I/O Virtualization without a host driver dependency.
5930  *
5931  * Returns the offset of the matching extended capability structure within
5932  * the device's PCI configuration space, or 0 if the device does not
5933  * support it.
5934  */
5935 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5936 {
5937         int pos;
5938         u16 vendor, id;
5939
5940         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5941         while (pos) {
5942                 pci_read_config_word(pdev, pos + 4, &vendor);
5943                 pci_read_config_word(pdev, pos + 8, &id);
5944                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5945                         return pos;
5946
5947                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5948         }
5949
5950         return 0;
5951 }
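
/*
 * Layout assumed above, following the generic PCIe DVSEC header: a 16-bit
 * vendor ID at offset 4 and a 16-bit DVSEC ID at offset 8 from the
 * capability position, with 0x23 being the DVSEC extended capability ID and,
 * per the check above, 5 the DVSEC ID used for Scalable IOV. Illustrative
 * use, roughly as in intel_iommu_dev_has_feat() below:
 *
 *	if (dev_is_pci(dev) && siov_find_pci_dvsec(to_pci_dev(dev)))
 *		... the endpoint advertises Scalable IOV support ...
 */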
5952
5953 static bool
5954 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5955 {
5956         if (feat == IOMMU_DEV_FEAT_AUX) {
5957                 int ret;
5958
5959                 if (!dev_is_pci(dev) || dmar_disabled ||
5960                     !scalable_mode_support() || !iommu_pasid_support())
5961                         return false;
5962
5963                 ret = pci_pasid_features(to_pci_dev(dev));
5964                 if (ret < 0)
5965                         return false;
5966
5967                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5968         }
5969
5970         if (feat == IOMMU_DEV_FEAT_SVA) {
5971                 struct device_domain_info *info = get_domain_info(dev);
5972
5973                 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5974                         info->pasid_supported && info->pri_supported &&
5975                         info->ats_supported;
5976         }
5977
5978         return false;
5979 }
5980
5981 static int
5982 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5983 {
5984         if (feat == IOMMU_DEV_FEAT_AUX)
5985                 return intel_iommu_enable_auxd(dev);
5986
5987         if (feat == IOMMU_DEV_FEAT_SVA) {
5988                 struct device_domain_info *info = get_domain_info(dev);
5989
5990                 if (!info)
5991                         return -EINVAL;
5992
5993                 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5994                         return 0;
5995         }
5996
5997         return -ENODEV;
5998 }
5999
6000 static int
6001 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6002 {
6003         if (feat == IOMMU_DEV_FEAT_AUX)
6004                 return intel_iommu_disable_auxd(dev);
6005
6006         return -ENODEV;
6007 }
6008
6009 static bool
6010 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6011 {
6012         struct device_domain_info *info = get_domain_info(dev);
6013
6014         if (feat == IOMMU_DEV_FEAT_AUX)
6015                 return scalable_mode_support() && info && info->auxd_enabled;
6016
6017         return false;
6018 }
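
/*
 * Illustrative sketch only: a driver normally reaches the four feature ops
 * above through the IOMMU core API rather than calling them directly.
 * Assuming the core wrappers available in this kernel version, an
 * aux-domain user would do roughly:
 *
 *	if (iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX) &&
 *	    !iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX)) {
 *		dom = iommu_domain_alloc(dev->bus);
 *		if (dom && !iommu_aux_attach_device(dom, dev))
 *			pasid = iommu_aux_get_pasid(dom, dev);
 *	}
 *
 * Teardown (iommu_aux_detach_device(), iommu_dev_disable_feature()) and
 * error handling are omitted.
 */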
6019
6020 static int
6021 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6022 {
6023         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6024
6025         return dmar_domain->default_pasid > 0 ?
6026                         dmar_domain->default_pasid : -EINVAL;
6027 }
6028
6029 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6030                                            struct device *dev)
6031 {
6032         return attach_deferred(dev);
6033 }
6034
6035 static int
6036 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6037                             enum iommu_attr attr, void *data)
6038 {
6039         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6040         unsigned long flags;
6041         int ret = 0;
6042
6043         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6044                 return -EINVAL;
6045
6046         switch (attr) {
6047         case DOMAIN_ATTR_NESTING:
6048                 spin_lock_irqsave(&device_domain_lock, flags);
6049                 if (nested_mode_support() &&
6050                     list_empty(&dmar_domain->devices)) {
6051                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6052                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6053                 } else {
6054                         ret = -ENODEV;
6055                 }
6056                 spin_unlock_irqrestore(&device_domain_lock, flags);
6057                 break;
6058         default:
6059                 ret = -EINVAL;
6060                 break;
6061         }
6062
6063         return ret;
6064 }
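
/*
 * Illustrative sketch only: a caller such as VFIO would typically request
 * nesting right after allocating an unmanaged domain and before attaching
 * any device, since the handler above rejects the attribute once the domain
 * has devices attached, e.g.:
 *
 *	int nesting = 1;
 *
 *	domain = iommu_domain_alloc(&pci_bus_type);
 *	if (domain &&
 *	    iommu_domain_set_attr(domain, DOMAIN_ATTR_NESTING, &nesting))
 *		... fall back to a non-nested configuration ...
 */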
6065
6066 /*
6067  * Check whether the device sits on an external-facing PCI port that is
6068  * marked as untrusted. Quirks are skipped for such devices so that they
6069  * cannot use them to bypass the IOMMU restrictions.
6070  */
6071 static bool risky_device(struct pci_dev *pdev)
6072 {
6073         if (pdev->untrusted) {
6074                 pci_info(pdev,
6075                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6076                          pdev->vendor, pdev->device);
6077                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6078                 return true;
6079         }
6080         return false;
6081 }
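
/*
 * The quirk handlers below all check risky_device() and bail out when it
 * returns true, so none of these quirks are applied for a device sitting on
 * an untrusted (external-facing) PCI link.
 */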
6082
6083 const struct iommu_ops intel_iommu_ops = {
6084         .capable                = intel_iommu_capable,
6085         .domain_alloc           = intel_iommu_domain_alloc,
6086         .domain_free            = intel_iommu_domain_free,
6087         .domain_set_attr        = intel_iommu_domain_set_attr,
6088         .attach_dev             = intel_iommu_attach_device,
6089         .detach_dev             = intel_iommu_detach_device,
6090         .aux_attach_dev         = intel_iommu_aux_attach_device,
6091         .aux_detach_dev         = intel_iommu_aux_detach_device,
6092         .aux_get_pasid          = intel_iommu_aux_get_pasid,
6093         .map                    = intel_iommu_map,
6094         .unmap                  = intel_iommu_unmap,
6095         .iova_to_phys           = intel_iommu_iova_to_phys,
6096         .probe_device           = intel_iommu_probe_device,
6097         .probe_finalize         = intel_iommu_probe_finalize,
6098         .release_device         = intel_iommu_release_device,
6099         .get_resv_regions       = intel_iommu_get_resv_regions,
6100         .put_resv_regions       = generic_iommu_put_resv_regions,
6101         .apply_resv_region      = intel_iommu_apply_resv_region,
6102         .device_group           = intel_iommu_device_group,
6103         .dev_has_feat           = intel_iommu_dev_has_feat,
6104         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6105         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6106         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6107         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6108         .def_domain_type        = device_def_domain_type,
6109         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6110 #ifdef CONFIG_INTEL_IOMMU_SVM
6111         .cache_invalidate       = intel_iommu_sva_invalidate,
6112         .sva_bind_gpasid        = intel_svm_bind_gpasid,
6113         .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
6114         .sva_bind               = intel_svm_bind,
6115         .sva_unbind             = intel_svm_unbind,
6116         .sva_get_pasid          = intel_svm_get_pasid,
6117         .page_response          = intel_svm_page_response,
6118 #endif
6119 };
6120
6121 static void quirk_iommu_igfx(struct pci_dev *dev)
6122 {
6123         if (risky_device(dev))
6124                 return;
6125
6126         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6127         dmar_map_gfx = 0;
6128 }
6129
6130 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6131 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6132 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6133 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6134 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6135 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6136 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6137 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6138
6139 /* Broadwell igfx malfunctions with dmar */
6140 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6141 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6142 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6143 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6144 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6145 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6150 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6151 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6152 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6153 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6154 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6155 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6156 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6157 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6158 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6159 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6160 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6161 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6162 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6163 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6164
6165 static void quirk_iommu_rwbf(struct pci_dev *dev)
6166 {
6167         if (risky_device(dev))
6168                 return;
6169
6170         /*
6171          * Mobile 4 Series Chipset neglects to set RWBF capability,
6172          * but needs it. Same seems to hold for the desktop versions.
6173          */
6174         pci_info(dev, "Forcing write-buffer flush capability\n");
6175         rwbf_quirk = 1;
6176 }
6177
6178 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6179 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6180 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6181 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6182 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6183 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6184 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6185
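/*
 * GGC is the graphics control register read from the host bridge config
 * space by quirk_calpella_no_shadow_gtt() below. The GGC_MEMORY_* values
 * describe how much GTT graphics memory the BIOS set aside and whether any
 * of it was reserved for VT-d (shadow GTT) use.
 */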
6186 #define GGC 0x52
6187 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6188 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6189 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6190 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6191 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6192 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6193 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6194 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6195
6196 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6197 {
6198         unsigned short ggc;
6199
6200         if (risky_device(dev))
6201                 return;
6202
6203         if (pci_read_config_word(dev, GGC, &ggc))
6204                 return;
6205
6206         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6207                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6208                 dmar_map_gfx = 0;
6209         } else if (dmar_map_gfx) {
6210                 /* we have to ensure the gfx device is idle before we flush */
6211                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6212                 intel_iommu_strict = 1;
6213         }
6214 }
6215 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6216 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6217 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6218 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6219
6220 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6221 {
6222         unsigned short ver;
6223
6224         if (!IS_GFX_DEVICE(dev))
6225                 return;
6226
6227         ver = (dev->device >> 8) & 0xff;
6228         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6229             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6230             ver != 0x9a)
6231                 return;
6232
6233         if (risky_device(dev))
6234                 return;
6235
6236         pci_info(dev, "Skip IOMMU disabling for graphics\n");
6237         iommu_skip_te_disable = 1;
6238 }
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6240
6241 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6242    ISOCH DMAR unit for the Azalia sound device, but not give it any
6243    TLB entries, which causes it to deadlock. Check for that.  We do
6244    this in a function called from init_dmars(), instead of in a PCI
6245    quirk, because we don't want to print the obnoxious "BIOS broken"
6246    message if VT-d is actually disabled.
6247 */
6248 static void __init check_tylersburg_isoch(void)
6249 {
6250         struct pci_dev *pdev;
6251         uint32_t vtisochctrl;
6252
6253         /* If there's no Azalia in the system anyway, forget it. */
6254         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6255         if (!pdev)
6256                 return;
6257
6258         if (risky_device(pdev)) {
6259                 pci_dev_put(pdev);
6260                 return;
6261         }
6262
6263         pci_dev_put(pdev);
6264
6265         /* System Management Registers. Might be hidden, in which case
6266            we can't do the sanity check. But that's OK, because the
6267            known-broken BIOSes _don't_ actually hide it, so far. */
6268         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6269         if (!pdev)
6270                 return;
6271
6272         if (risky_device(pdev)) {
6273                 pci_dev_put(pdev);
6274                 return;
6275         }
6276
6277         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6278                 pci_dev_put(pdev);
6279                 return;
6280         }
6281
6282         pci_dev_put(pdev);
6283
6284         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6285         if (vtisochctrl & 1)
6286                 return;
6287
6288         /* Drop all bits other than the number of TLB entries */
6289         vtisochctrl &= 0x1c;
6290
6291         /* If we have the recommended number of TLB entries (16), fine. */
6292         if (vtisochctrl == 0x10)
6293                 return;
6294
6295         /* Zero TLB entries? You get to ride the short bus to school. */
6296         if (!vtisochctrl) {
6297                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6298                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6299                      dmi_get_system_info(DMI_BIOS_VENDOR),
6300                      dmi_get_system_info(DMI_BIOS_VERSION),
6301                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6302                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6303                 return;
6304         }
6305
6306         pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
6307                 vtisochctrl);
6308 }