// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */
#define pr_fmt(fmt)	"DMAR: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)
#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-contiguous.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <linux/numa.h>
#include <linux/swiotlb.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>
#include <trace/events/intel_iommu.h>

#include "irq_remapping.h"
#include "intel-pasid.h"
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size was a power-of-two multiple of 4KiB and
 * that the mapping had natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are a power-of-two multiple of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
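
/*
 * Illustrative worked example (not part of the driver logic): ~0xFFFUL
 * has every bit above bit 11 set, so the advertised set is 4KiB, 8KiB,
 * 16KiB, ... i.e. every power-of-two size >= 4KiB. Bit 12 (0x1000)
 * advertises 4KiB pages and bit 21 (0x200000) advertises 2MiB pages.
 */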
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}
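
/*
 * Illustrative worked example (not part of the driver logic): a 4-level
 * page table corresponds to agaw 2, since agaw_to_level(2) == 4 and
 * agaw_to_width(2) == 30 + 2 * 9 == 48 bits. A 5-level table is agaw 3:
 * agaw_to_width(3) == 57. Going the other way, width_to_agaw(48) == 2.
 * For pfn_level_offset(), level 1 selects pfn bits 0-8, level 2 selects
 * bits 9-17, and so on, nine bits per level.
 */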
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
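
/*
 * Illustrative note (a hypothetical example, not a configuration this
 * driver runs on): with PAGE_SHIFT == VTD_PAGE_SHIFT == 12 these
 * conversions are identity operations. If MM pages were 64KiB
 * (PAGE_SHIFT == 16), one MM pfn would span 16 VT-d pfns, e.g.
 * mm_to_dma_pfn(3) == 3 << 4 == 48.
 */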
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic the kernel if VT-d cannot be enabled
 * successfully (used when the kernel is launched with TXT)
 */
static int force_on = 0;
int intel_iommu_tboot_noforce;
static int no_platform_optin;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}
static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return (c->hi >> 8) & 0xffff;
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
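
/*
 * Illustrative worked example (not part of the driver logic): the
 * helpers above encode the fields of a legacy context entry, e.g. bit 0
 * of the low qword is Present, bits 3:2 are the translation type, and
 * bits 23:8 of the high qword hold the domain id. So after
 * context_set_domain_id(c, 0x2a), context_domain_id(c) returns 0x2a by
 * extracting (c->hi >> 8) & 0xffff.
 */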
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if (domain->iommu_refcnt[idx])
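
/*
 * Usage sketch (illustrative, not part of the driver logic): the macro
 * expands to a guarded loop, so a body follows it like any for
 * statement, e.g.:
 *
 *	int i;
 *	for_each_domain_iommu(i, domain)
 *		pr_debug("domain attached to iommu %d\n", i);
 *
 * Only sequence ids with a non-zero refcount in this domain are visited.
 */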
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
/* number of IOMMUs; sizes and indexes the g_iommus array */
static int g_num_of_iommus;

static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev);
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova);

#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
int intel_iommu_sm = 1;
#else
int intel_iommu_sm;
#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);
static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int intel_no_bounce;

#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
struct device_domain_info *get_domain_info(struct device *dev)
{
	struct device_domain_info *info;

	if (!dev)
		return NULL;

	info = dev->archdata.iommu;
	if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO ||
		     info == DEFER_DEVICE_DOMAIN_INFO))
		return NULL;

	return info;
}
DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&	\
				to_pci_dev(d)->untrusted)

/*
 * Iterate over elements in device_domain_list and call the specified
 * callback @fn against each element.
 */
int for_each_device_domain(int (*fn)(struct device_domain_info *info,
				     void *data), void *data)
{
	int ret = 0;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &device_domain_list, global) {
		ret = fn(info, data);
		if (ret) {
			spin_unlock_irqrestore(&device_domain_lock, flags);
			return ret;
		}
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
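
/*
 * Usage sketch (illustrative; the callback name is hypothetical): a
 * caller supplies a visitor that returns non-zero to stop the walk
 * early, e.g.:
 *
 *	static int match_seq_id(struct device_domain_info *info, void *data)
 *	{
 *		return info->iommu->seq_id == *(int *)data;
 *	}
 *
 *	int seq_id = 0;
 *	int found = for_each_device_domain(match_seq_id, &seq_id);
 *
 * device_domain_lock is held across every callback invocation, so @fn
 * must not sleep or take that lock itself.
 */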
const struct iommu_ops intel_iommu_ops;

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_info("Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			pr_info("Disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Intel-IOMMU: scalable mode supported\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		} else if (!strncmp(str, "nobounce", 8)) {
			pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
			intel_no_bounce = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
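
/*
 * Usage sketch (illustrative, not part of the driver logic): options
 * are comma separated on the kernel command line and parsed in one
 * pass, e.g.
 *
 *	intel_iommu=on,sm_on,strict
 *
 * enables the IOMMU, turns on scalable mode and disables batched IOTLB
 * flushing.
 */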
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	domains = iommu->domains[idx];
	if (!domains)
		return NULL;

	return domains[did & 0xff];
}

static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
			     struct dmar_domain *domain)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	if (!iommu->domains[idx]) {
		size_t size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
	}

	domains = iommu->domains[idx];
	if (WARN_ON(!domains))
		return;
	else
		domains[did & 0xff] = domain;
}
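
/*
 * Illustrative worked example (not part of the driver logic): the
 * domain array is a two-level table of 256-entry pages, indexed by the
 * 16-bit domain id. For did 0x1234, idx = 0x1234 >> 8 = 0x12 selects
 * the page and 0x1234 & 0xff = 0x34 selects the slot within it, so
 * pages are only allocated for domain-id ranges actually in use.
 */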
void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}
static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline bool domain_use_first_level(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
				       unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}

static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * Calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, so use a default agaw and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
/* This function only returns a single iommu in a domain */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
		return NULL;

	for_each_domain_iommu(iommu_id, domain)
		break;

	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}
static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool found = false;
	int i;

	domain->iommu_coherency = 1;

	for_each_domain_iommu(i, domain) {
		found = true;
		if (!ecap_coherent(g_iommus[i]->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!ecap_coherent(iommu->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	rcu_read_unlock();
}

static int domain_update_iommu_snooping(struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret = 1;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (!ecap_sc_support(iommu->ecap)) {
				ret = 0;
				break;
			}
		}
	}
	rcu_read_unlock();

	return ret;
}

static int domain_update_iommu_superpage(struct dmar_domain *domain,
					 struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int mask = 0x3;

	if (!intel_iommu_superpage) {
		return 0;
	}

	/* set iommu_superpage to the smallest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (domain && domain_use_first_level(domain)) {
				if (!cap_fl1gp_support(iommu->cap))
					mask = 0x1;
			} else {
				mask &= cap_super_page_val(iommu->cap);
			}

			if (!mask)
				break;
		}
	}
	rcu_read_unlock();

	return fls(mask);
}

/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
}
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;
	}
	if (*entry & 1)
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;

		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node);
		if (!context)
			return NULL;

		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}
static int iommu_dummy(struct device *dev)
{
	return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}

static bool attach_deferred(struct device *dev)
{
	return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
}
/**
 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 *				 sub-hierarchy of a candidate PCI-PCI bridge
 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 * @bridge: the candidate PCI-PCI bridge
 *
 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 */
static bool
is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
{
	struct pci_dev *pdev, *pbridge;

	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
		return false;

	pdev = to_pci_dev(dev);
	pbridge = to_pci_dev(bridge);

	if (pbridge->subordinate &&
	    pbridge->subordinate->number <= pdev->bus->number &&
	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
		return true;

	return false;
}
static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	struct pci_dev *pdev = NULL;
	u16 segment = 0;
	int i;

	if (iommu_dummy(dev))
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				*bus = drhd->devices[i].bus;
				*devfn = drhd->devices[i].devfn;
				goto out;
			}

			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
		got_pdev:
			*bus = pdev->bus->number;
			*devfn = pdev->devfn;
			goto out;
		}
	}
	iommu = NULL;
 out:
	rcu_read_unlock();

	return iommu;
}
static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}

static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct context_entry *context;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (context)
		ret = context_present(context);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}
static void free_context_table(struct intel_iommu *iommu)
{
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry) {
		goto out;
	}
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!sm_supported(iommu))
			continue;

		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);
	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain_use_first_level(domain))
				pteval |= DMA_FL_PTE_XD;
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}
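
/*
 * Illustrative worked example (not part of the driver logic): with a
 * 4-level table (agaw 2) and *target_level == 1, looking up dma pfn
 * 0x12345 walks levels 4..1 using pfn_level_offset(): offsets 0, 0,
 * 0x91 (0x12345 >> 9 & 0x1ff) and 0x145 (0x12345 & 0x1ff), allocating
 * any missing intermediate tables on the way down.
 */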
/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (dma_pte_superpage(pte)) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}
/* clear last level pte; must be followed by a TLB flush */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
					    int level, struct dma_pte *pte,
					    struct page *freelist)
{
	struct page *pg;

	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	pg->freelist = freelist;
	freelist = pg;

	if (level == 1)
		return freelist;

	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			freelist = dma_pte_list_pagetables(domain, level - 1,
							   pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));

	return freelist;
}
static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
					struct dma_pte *pte, unsigned long pfn,
					unsigned long start_pfn,
					unsigned long last_pfn,
					struct page *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;

		if (!dma_pte_present(pte))
			goto next;

		level_pfn = pfn & level_mask(level);

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			freelist = dma_pte_clear_level(domain, level - 1,
						       phys_to_virt(dma_pte_addr(pte)),
						       level_pfn, start_pfn, last_pfn,
						       freelist);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);

	return freelist;
}
/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static struct page *domain_unmap(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long last_pfn)
{
	struct page *freelist;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
				       domain->pgd, 0, start_pfn, last_pfn, NULL);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		pgd_page->freelist = freelist;
		freelist = pgd_page;

		domain->pgd = NULL;
	}

	return freelist;
}
static void dma_free_pagelist(struct page *freelist)
{
	struct page *pg;

	while ((pg = freelist)) {
		freelist = pg->freelist;
		free_pgtable_page(page_address(pg));
	}
}

static void iova_entry_free(unsigned long data)
{
	struct page *freelist = (struct page *)data;

	dma_free_pagelist(freelist);
}
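
/*
 * Design note (illustrative, not part of the driver logic): the
 * page->freelist field is reused here to chain page-table pages into a
 * singly linked list, so no allocation is needed on the unmap path.
 * domain_unmap() builds the chain, the caller flushes the IOTLB, and
 * only then does dma_free_pagelist() walk the chain and return the
 * pages to the allocator.
 */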
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
	if (!root) {
		pr_err("Allocating root entry for %s failed\n",
			iommu->name);
		return -ENOMEM;
	}

	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* Invalidate the context cache at the requested granularity. */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		      dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* Invalidate the IOTLB at the requested granularity. */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably meant to be extra safe. It looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		      dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
			u8 bus, u8 devfn)
{
	struct device_domain_info *info;

	assert_spin_locked(&device_domain_lock);

	if (!iommu->qi)
		return NULL;

	list_for_each_entry(info, &domain->devices, link)
		if (info->iommu == iommu && info->bus == bus &&
		    info->devfn == devfn) {
			if (info->ats_supported && info->dev)
				return info;
			break;
		}

	return NULL;
}

static void domain_update_iotlb(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	bool has_iotlb_device = false;

	assert_spin_locked(&device_domain_lock);

	list_for_each_entry(info, &domain->devices, link) {
		struct pci_dev *pdev;

		if (!info->dev || !dev_is_pci(info->dev))
			continue;

		pdev = to_pci_dev(info->dev);
		if (pdev->ats_enabled) {
			has_iotlb_device = true;
			break;
		}
	}

	domain->has_iotlb_device = has_iotlb_device;
}
static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!info || !dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);
	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
	 * reserved, which should be set to 0.
	 */
	if (!ecap_dit(info->iommu->ecap))
		info->pfsid = 0;
	else {
		struct pci_dev *pf_pdev;

		/* pdev will be returned if device is not a vf */
		pf_pdev = pci_physfn(pdev);
		info->pfsid = pci_dev_id(pf_pdev);
	}

#ifdef CONFIG_INTEL_IOMMU_SVM
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to
	   use it. */
	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
		info->pasid_enabled = 1;

	if (info->pri_supported &&
	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
		info->pri_enabled = 1;
#endif
	if (!pdev->untrusted && info->ats_supported &&
	    pci_ats_page_aligned(pdev) &&
	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
		info->ats_enabled = 1;
		domain_update_iotlb(info->domain);
		info->ats_qdep = pci_ats_queue_depth(pdev);
	}
}
static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);

	if (info->ats_enabled) {
		pci_disable_ats(pdev);
		info->ats_enabled = 0;
		domain_update_iotlb(info->domain);
	}
#ifdef CONFIG_INTEL_IOMMU_SVM
	if (info->pri_enabled) {
		pci_disable_pri(pdev);
		info->pri_enabled = 0;
	}
	if (info->pasid_enabled) {
		pci_disable_pasid(pdev);
		info->pasid_enabled = 0;
	}
#endif
}
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	u16 sid, qdep;
	unsigned long flags;
	struct device_domain_info *info;

	if (!domain->has_iotlb_device)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		if (!info->ats_enabled)
			continue;

		sid = info->bus << 8 | info->devfn;
		qdep = info->ats_qdep;
		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
				qdep, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static void domain_flush_piotlb(struct intel_iommu *iommu,
				struct dmar_domain *domain,
				u64 addr, unsigned long npages, bool ih)
{
	u16 did = domain->iommu_did[iommu->seq_id];

	if (domain->default_pasid)
		qi_flush_piotlb(iommu, did, domain->default_pasid,
				addr, npages, ih);

	if (!list_empty(&domain->devices))
		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
}
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
{
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
	u16 did = domain->iommu_did[iommu->seq_id];

	BUG_ON(pages == 0);

	if (ih)
		ih = 1 << 6;

	if (domain_use_first_level(domain)) {
		domain_flush_piotlb(iommu, domain, addr, pages, ih);
	} else {
		/*
		 * Fallback to domain selective flush if no PSI support or
		 * the size is too big. PSI requires page size to be 2 ^ x,
		 * and the base address is naturally aligned to the size.
		 */
		if (!cap_pgsel_inv(iommu->cap) ||
		    mask > cap_max_amask_val(iommu->cap))
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
							DMA_TLB_DSI_FLUSH);
		else
			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
							DMA_TLB_PSI_FLUSH);
	}

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(domain, addr, mask);
}
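
/*
 * Illustrative worked example (not part of the driver logic): a flush
 * of 5 pages rounds up to 8, so mask = ilog2(8) = 3 and the request
 * covers a naturally aligned 8-page (32KiB) window around addr. If the
 * IOMMU's max address mask value is smaller than 3, the code above
 * falls back to a domain-selective flush instead.
 */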
/* Notification for newly created mappings */
static inline void __mapping_notify_one(struct intel_iommu *iommu,
					struct dmar_domain *domain,
					unsigned long pfn, unsigned int pages)
{
	/*
	 * It's a non-present to present mapping. Only flush if caching mode
	 * and second level.
	 */
	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
	else
		iommu_flush_write_buffer(iommu);
}
static void iommu_flush_iova(struct iova_domain *iovad)
{
	struct dmar_domain *domain;
	int idx;

	domain = container_of(iovad, struct dmar_domain, iovad);

	for_each_domain_iommu(idx, domain) {
		struct intel_iommu *iommu = g_iommus[idx];
		u16 did = domain->iommu_did[iommu->seq_id];

		if (domain_use_first_level(domain))
			domain_flush_piotlb(iommu, domain, 0, -1, 0);
		else
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);

		if (!cap_caching_mode(iommu->cap))
			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
					      0, MAX_AGAW_PFN_WIDTH);
	}
}
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static int iommu_init_domains(struct intel_iommu *iommu)
{
	u32 ndomains, nlongs;
	size_t size;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("%s: Number of Domains supported <%d>\n",
		 iommu->name, ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	spin_lock_init(&iommu->lock);

	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		pr_err("%s: Allocating domain id array failed\n",
		       iommu->name);
		return -ENOMEM;
	}

	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
	iommu->domains = kzalloc(size, GFP_KERNEL);

	if (iommu->domains) {
		size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
	}

	if (!iommu->domains || !iommu->domains[0]) {
		pr_err("%s: Allocating domain array failed\n",
		       iommu->name);
		kfree(iommu->domain_ids);
		kfree(iommu->domains);
		iommu->domain_ids = NULL;
		iommu->domains = NULL;
		return -ENOMEM;
	}

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain-id 0, hence we need to pre-allocate it. We also
	 * use domain-id 0 as a marker for non-allocated domain-id, so
	 * make sure it is not used for a real domain.
	 */
	set_bit(0, iommu->domain_ids);

	/*
	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
	if (sm_supported(iommu))
		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);

	return 0;
}
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	if (!iommu->domains || !iommu->domain_ids)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
		if (info->iommu != iommu)
			continue;

		if (!info->dev || !info->domain)
			continue;

		__dmar_remove_one_dev_info(info);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}
static void free_dmar_iommu(struct intel_iommu *iommu)
{
	if ((iommu->domains) && (iommu->domain_ids)) {
		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
		int i;

		for (i = 0; i < elems; i++)
			kfree(iommu->domains[i]);
		kfree(iommu->domains);
		kfree(iommu->domain_ids);
		iommu->domains = NULL;
		iommu->domain_ids = NULL;
	}

	g_iommus[iommu->seq_id] = NULL;

	/* free context mapping */
	free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu)) {
		if (ecap_prs(iommu->ecap))
			intel_svm_finish_prq(iommu);

		if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
			ioasid_unregister_allocator(&iommu->pasid_allocator);
	}
#endif
}
/*
 * Check and return whether first level is used by default for
 * DMA translation.
 */
static bool first_level_by_default(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	static int first_level_support = -1;

	if (likely(first_level_support != -1))
		return first_level_support;

	first_level_support = 1;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
			first_level_support = 0;
			break;
		}
	}
	rcu_read_unlock();

	return first_level_support;
}
static struct dmar_domain *alloc_domain(int flags)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	memset(domain, 0, sizeof(*domain));
	domain->nid = NUMA_NO_NODE;
	domain->flags = flags;
	if (first_level_by_default())
		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
	domain->has_iotlb_device = false;
	INIT_LIST_HEAD(&domain->devices);

	return domain;
}
/* Must be called with iommu->lock */
static int domain_attach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	unsigned long ndomains;
	int num;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] += 1;
	domain->iommu_count += 1;
	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
		ndomains = cap_ndoms(iommu->cap);
		num = find_first_zero_bit(iommu->domain_ids, ndomains);

		if (num >= ndomains) {
			pr_err("%s: No free domain ids\n", iommu->name);
			domain->iommu_refcnt[iommu->seq_id] -= 1;
			domain->iommu_count -= 1;
			return -ENOSPC;
		}

		set_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, domain);

		domain->iommu_did[iommu->seq_id] = num;
		domain->nid = iommu->node;

		domain_update_iommu_cap(domain);
	}

	return 0;
}

static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num, count;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] -= 1;
	count = --domain->iommu_count;
	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
		num = domain->iommu_did[iommu->seq_id];
		clear_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, NULL);

		domain_update_iommu_cap(domain);
		domain->iommu_did[iommu->seq_id] = 0;
	}

	return count;
}
static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_rbtree_key;

static int dmar_init_reserved_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *iova;
	int i;

	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);

	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
		&reserved_rbtree_key);

	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
		IOVA_PFN(IOAPIC_RANGE_END));
	if (!iova) {
		pr_err("Reserve IOAPIC range failed\n");
		return -ENODEV;
	}

	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		struct resource *r;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
				continue;
			iova = reserve_iova(&reserved_iova_list,
					    IOVA_PFN(r->start),
					    IOVA_PFN(r->end));
			if (!iova) {
				pci_err(pdev, "Reserve iova for %pR failed\n", r);
				return -ENODEV;
			}
		}
	}
	return 0;
}
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}

static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}
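
/*
 * Illustrative worked example (not part of the driver logic): the
 * adjusted width rounds gaw up so that (agaw - 12) is a multiple of 9,
 * matching whole page-table levels. E.g. gaw 48 is already aligned
 * ((48 - 12) % 9 == 0) and stays 48, while gaw 50 becomes
 * 50 + 9 - ((50 - 12) % 9) = 50 + 9 - 2 = 57.
 */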
static void domain_exit(struct dmar_domain *domain)
{
	/* Remove associated devices and clear attached or cached domains */
	domain_remove_dev_info(domain);

	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	if (domain->pgd) {
		struct page *freelist;

		freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
		dma_free_pagelist(freelist);
	}

	free_domain_mem(domain);
}
/*
 * Get the PASID directory size for scalable mode context entry.
 * Value of X in the PDTS field of a scalable mode context entry
 * indicates PASID directory with 2^(X + 7) entries.
 */
static inline unsigned long context_get_sm_pds(struct pasid_table *table)
{
	int pds, max_pde;

	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
	if (pds < 7)
		return 0;

	return pds - 7;
}
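
/*
 * Illustrative worked example (not part of the driver logic), assuming
 * PASID_PDE_SHIFT == 6: with max_pasid = 0x100000 (2^20 PASIDs),
 * max_pde = 0x4000, the first set bit is bit 14, so the function
 * returns 14 - 7 = 7 and the PDTS field encodes a directory of
 * 2^(7 + 7) = 16384 entries.
 */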
/*
 * Set the RID_PASID field of a scalable mode context entry. The
 * IOMMU hardware will use the PASID value set in this field for
 * DMA translations of DMA requests without PASID.
 */
static inline void
context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
{
	context->hi |= pasid & ((1 << 20) - 1);
	context->hi |= (1 << 20);
}

/*
 * Set the DTE(Device-TLB Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_dte(struct context_entry *context)
{
	context->lo |= (1 << 2);
}

/*
 * Set the PRE(Page Request Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_pre(struct context_entry *context)
{
	context->lo |= (1 << 4);
}

/* Convert value to context PASID directory size field coding. */
#define context_pdts(pds)	(((pds) & 0x7) << 9)
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      struct pasid_table *table,
				      u8 bus, u8 devfn)
{
	u16 did = domain->iommu_did[iommu->seq_id];
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct device_domain_info *info = NULL;
	struct context_entry *context;
	unsigned long flags;
	int ret;

	WARN_ON(did == 0);

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -ENOMEM;
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	ret = 0;
	if (context_present(context))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
	if (context_copied(context)) {
		u16 did_old = context_domain_id(context);

		if (did_old < cap_ndoms(iommu->cap)) {
			iommu->flush.flush_context(iommu, did_old,
						   (((u16)bus) << 8) | devfn,
						   DMA_CCMD_MASK_NOBIT,
						   DMA_CCMD_DEVICE_INVL);
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}
	}

	context_clear_entry(context);

	if (sm_supported(iommu)) {
		unsigned long pds;

		WARN_ON(!table);

		/* Setup the PASID DIR pointer: */
		pds = context_get_sm_pds(table);
		context->lo = (u64)virt_to_phys(table->table) |
				context_pdts(pds);

		/* Setup the RID_PASID field: */
		context_set_sm_rid2pasid(context, PASID_RID2PASID);

		/*
		 * Setup the Device-TLB enable bit and Page request
		 * Enable bit:
		 */
		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
		if (info && info->ats_supported)
			context_set_sm_dte(context);
		if (info && info->pri_supported)
			context_set_sm_pre(context);
	} else {
		struct dma_pte *pgd = domain->pgd;
		int agaw;

		context_set_domain_id(context, did);

		if (translation != CONTEXT_TT_PASS_THROUGH) {
			/*
			 * Skip top levels of page tables for iommu which has
			 * less agaw than default. Unnecessary for PT mode.
			 */
			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
				ret = -ENOMEM;
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd))
					goto out_unlock;
			}

			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
			if (info && info->ats_supported)
				translation = CONTEXT_TT_DEV_IOTLB;
			else
				translation = CONTEXT_TT_MULTI_LEVEL;

			context_set_address_root(context, virt_to_phys(pgd));
			context_set_address_width(context, agaw);
		} else {
			/*
			 * In pass through mode, AW must be programmed to
			 * indicate the largest AGAW value supported by
			 * hardware. And ASR is ignored by hardware.
			 */
			context_set_address_width(context, iommu->msagaw);
		}

		context_set_translation_type(context, translation);
	}

	context_set_fault_enable(context);
	context_set_present(context);
	domain_flush_cache(domain, context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries, we only need to flush the write-buffer. If
	 * it _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	iommu_enable_dev_iotlb(info);

	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
struct domain_context_mapping_data {
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct pasid_table *table;
};

static int domain_context_mapping_cb(struct pci_dev *pdev,
				     u16 alias, void *opaque)
{
	struct domain_context_mapping_data *data = opaque;

	return domain_context_mapping_one(data->domain, data->iommu,
					  data->table, PCI_BUS_NUM(alias),
					  alias & 0xff);
}

static int
domain_context_mapping(struct dmar_domain *domain, struct device *dev)
{
	struct domain_context_mapping_data data;
	struct pasid_table *table;
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	table = intel_pasid_get_table(dev);

	if (!dev_is_pci(dev))
		return domain_context_mapping_one(domain, iommu, table,
						  bus, devfn);

	data.domain = domain;
	data.iommu = iommu;
	data.table = table;

	return pci_for_each_dma_alias(to_pci_dev(dev),
				      &domain_context_mapping_cb, &data);
}
static int domain_context_mapped_cb(struct pci_dev *pdev,
				    u16 alias, void *opaque)
{
	struct intel_iommu *iommu = opaque;

	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
}

static int domain_context_mapped(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!dev_is_pci(dev))
		return device_context_mapped(iommu, bus, devfn);

	return !pci_for_each_dma_alias(to_pci_dev(dev),
				       domain_context_mapped_cb, iommu);
}
/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
{
	host_addr &= ~PAGE_MASK;
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
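
/*
 * Illustrative worked example (not part of the driver logic): with
 * 4KiB pages, host_addr = 0x1234 and size = 0x2000 give
 * PAGE_ALIGN(0x234 + 0x2000) = 0x3000, i.e. 3 VT-d pages, because the
 * unaligned start pushes the mapping across an extra page boundary.
 */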
/* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
					  unsigned long iov_pfn,
					  unsigned long phy_pfn,
					  unsigned long pages)
{
	int support, level = 1;
	unsigned long pfnmerge;

	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}
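
/*
 * Illustrative worked example (not part of the driver logic): mapping
 * 0x400 pages (4MiB) with iov_pfn and phy_pfn both 2MiB-aligned (low 9
 * bits zero) and iommu_superpage >= 1 yields level 2, i.e. a 2MiB
 * superpage. If either pfn has a low bit set, the OR in pfnmerge stops
 * the loop immediately and level stays 1 (4KiB pages).
 */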
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	unsigned long sg_res = 0;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;
	u64 attr;

	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
	if (domain_use_first_level(domain))
		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;

	if (!sg) {
		sg_res = nr_pages;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!sg_res) {
			unsigned int pgoff = sg->offset & ~PAGE_MASK;

			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
			sg->dma_length = sg->length;
			pteval = (sg_phys(sg) - pgoff) | attr;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is large page*/
			if (largepage_lvl > 1) {
				unsigned long nr_superpages, end_pfn;

				pteval |= DMA_PTE_LARGE_PAGE;
				lvl_pages = lvl_to_nr_pages(largepage_lvl);

				nr_superpages = sg_res / lvl_pages;
				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;

				/*
				 * Ensure that old small page tables are
				 * removed to make room for superpage(s).
				 * We're adding new large pages, so make sure
				 * we don't remove their parent tables.
				 */
				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
						       largepage_lvl + 1);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			  struct scatterlist *sg, unsigned long phys_pfn,
			  unsigned long nr_pages, int prot)
{
	int iommu_id, ret;
	struct intel_iommu *iommu;

	/* Do the real mapping first */
	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
	if (ret)
		return ret;

	for_each_domain_iommu(iommu_id, domain) {
		iommu = g_iommus[iommu_id];
		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
	}

	return 0;
}

static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}

static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				     unsigned long phys_pfn, unsigned long nr_pages,
				     int prot)
{
	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
2385 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2387 unsigned long flags;
2388 struct context_entry *context;
2394 spin_lock_irqsave(&iommu->lock, flags);
2395 context = iommu_context_addr(iommu, bus, devfn, 0);
2397 spin_unlock_irqrestore(&iommu->lock, flags);
2400 did_old = context_domain_id(context);
2401 context_clear_entry(context);
2402 __iommu_flush_cache(iommu, context, sizeof(*context));
2403 spin_unlock_irqrestore(&iommu->lock, flags);
2404 iommu->flush.flush_context(iommu,
2406 (((u16)bus) << 8) | devfn,
2407 DMA_CCMD_MASK_NOBIT,
2408 DMA_CCMD_DEVICE_INVL);
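/*
 * The source-id passed to flush_context() above is the PCI
 * requester ID: e.g. bus 0x3a, device 2, function 0 (devfn 0x10)
 * encodes as (0x3a << 8) | 0x10 == 0x3a10.
 */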
2409 iommu->flush.flush_iotlb(iommu,
2416 static inline void unlink_domain_info(struct device_domain_info *info)
2418 assert_spin_locked(&device_domain_lock);
2419 list_del(&info->link);
2420 list_del(&info->global);
2422 info->dev->archdata.iommu = NULL;
2425 static void domain_remove_dev_info(struct dmar_domain *domain)
2427 struct device_domain_info *info, *tmp;
2428 unsigned long flags;
2430 spin_lock_irqsave(&device_domain_lock, flags);
2431 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2432 __dmar_remove_one_dev_info(info);
2433 spin_unlock_irqrestore(&device_domain_lock, flags);
2436 struct dmar_domain *find_domain(struct device *dev)
2438 struct device_domain_info *info;
2440 if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2443 if (dev_is_pci(dev))
2444 dev = &pci_real_dma_dev(to_pci_dev(dev))->dev;
2446 /* No lock here, assumes no domain exit in normal case */
2447 info = get_domain_info(dev);
2449 return info->domain;
2454 static void do_deferred_attach(struct device *dev)
2456 struct iommu_domain *domain;
2458 dev->archdata.iommu = NULL;
2459 domain = iommu_get_domain_for_dev(dev);
2461 intel_iommu_attach_device(domain, dev);
2464 static inline struct device_domain_info *
2465 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2467 struct device_domain_info *info;
2469 list_for_each_entry(info, &device_domain_list, global)
2470 if (info->iommu->segment == segment && info->bus == bus &&
2471 info->devfn == devfn)
2477 static int domain_setup_first_level(struct intel_iommu *iommu,
2478 struct dmar_domain *domain,
2482 int flags = PASID_FLAG_SUPERVISOR_MODE;
2483 struct dma_pte *pgd = domain->pgd;
2487 * Skip the top levels of the page tables for IOMMUs whose
2488 * agaw is less than the default. Unnecessary for PT mode.
*/
2490 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2491 pgd = phys_to_virt(dma_pte_addr(pgd));
2492 if (!dma_pte_present(pgd))
2496 level = agaw_to_level(agaw);
2497 if (level != 4 && level != 5)
2500 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
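/*
 * Example for the agaw handling above (a sketch, assuming the usual
 * encoding where agaw_to_level(agaw) == agaw + 2): a domain built
 * with agaw == 3 (57-bit, 5-level) attached to an IOMMU whose
 * iommu->agaw == 2 (48-bit, 4-level) walks down one table level, so
 * the pgd passed to intel_pasid_setup_first_level() is a 4-level
 * table and PASID_FLAG_FL5LP is left clear.
 */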
2502 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2503 domain->iommu_did[iommu->seq_id],
2507 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2510 struct dmar_domain *domain)
2512 struct dmar_domain *found = NULL;
2513 struct device_domain_info *info;
2514 unsigned long flags;
2517 info = alloc_devinfo_mem();
2522 info->devfn = devfn;
2523 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2524 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2527 info->domain = domain;
2528 info->iommu = iommu;
2529 info->pasid_table = NULL;
2530 info->auxd_enabled = 0;
2531 INIT_LIST_HEAD(&info->auxiliary_domains);
2533 if (dev && dev_is_pci(dev)) {
2534 struct pci_dev *pdev = to_pci_dev(info->dev);
2536 if (!pdev->untrusted &&
2537 !pci_ats_disabled() &&
2538 ecap_dev_iotlb_support(iommu->ecap) &&
2539 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2540 dmar_find_matched_atsr_unit(pdev))
2541 info->ats_supported = 1;
2543 if (sm_supported(iommu)) {
2544 if (pasid_supported(iommu)) {
2545 int features = pci_pasid_features(pdev);
2547 info->pasid_supported = features | 1;
2550 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2551 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2552 info->pri_supported = 1;
2556 spin_lock_irqsave(&device_domain_lock, flags);
2558 found = find_domain(dev);
2561 struct device_domain_info *info2;
2562 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2564 found = info2->domain;
2570 spin_unlock_irqrestore(&device_domain_lock, flags);
2571 free_devinfo_mem(info);
2572 /* Caller must free the original domain */
2576 spin_lock(&iommu->lock);
2577 ret = domain_attach_iommu(domain, iommu);
2578 spin_unlock(&iommu->lock);
2581 spin_unlock_irqrestore(&device_domain_lock, flags);
2582 free_devinfo_mem(info);
2586 list_add(&info->link, &domain->devices);
2587 list_add(&info->global, &device_domain_list);
2589 dev->archdata.iommu = info;
2590 spin_unlock_irqrestore(&device_domain_lock, flags);
2592 /* PASID table is mandatory for a PCI device in scalable mode. */
2593 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2594 ret = intel_pasid_alloc_table(dev);
2596 dev_err(dev, "PASID table allocation failed\n");
2597 dmar_remove_one_dev_info(dev);
2601 /* Setup the PASID entry for requests without PASID: */
2602 spin_lock(&iommu->lock);
2603 if (hw_pass_through && domain_type_is_si(domain))
2604 ret = intel_pasid_setup_pass_through(iommu, domain,
2605 dev, PASID_RID2PASID);
2606 else if (domain_use_first_level(domain))
2607 ret = domain_setup_first_level(iommu, domain, dev,
2610 ret = intel_pasid_setup_second_level(iommu, domain,
2611 dev, PASID_RID2PASID);
2612 spin_unlock(&iommu->lock);
2614 dev_err(dev, "Setup RID2PASID failed\n");
2615 dmar_remove_one_dev_info(dev);
2620 if (dev && domain_context_mapping(domain, dev)) {
2621 dev_err(dev, "Domain context map failed\n");
2622 dmar_remove_one_dev_info(dev);
2629 static int iommu_domain_identity_map(struct dmar_domain *domain,
2630 unsigned long long start,
2631 unsigned long long end)
2633 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2634 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2636 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2637 dma_to_mm_pfn(last_vpfn))) {
2638 pr_err("Reserving iova failed\n");
2642 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2644 * RMRR range might have overlap with physical memory range,
* clear it first.
*/
2647 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2649 return __domain_mapping(domain, first_vpfn, NULL,
2650 first_vpfn, last_vpfn - first_vpfn + 1,
2651 DMA_PTE_READ|DMA_PTE_WRITE);
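/*
 * Worked example (illustrative): identity-mapping the range
 * [0x1000000, 0x1ffffff] gives first_vpfn == 0x1000 and
 * last_vpfn == 0x1fff, so 0x1000 pages are reserved in the IOVA
 * allocator and mapped 1:1 (vPFN n -> pPFN n) with read/write
 * permission.
 */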
2654 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2656 static int __init si_domain_init(int hw)
2658 struct dmar_rmrr_unit *rmrr;
2662 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2666 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2667 domain_exit(si_domain);
2674 for_each_online_node(nid) {
2675 unsigned long start_pfn, end_pfn;
2678 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2679 ret = iommu_domain_identity_map(si_domain,
2680 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2687 * Identity map the RMRRs so that devices with RMRRs could also use
* the si_domain.
*/
2690 for_each_rmrr_units(rmrr) {
2691 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2693 unsigned long long start = rmrr->base_address;
2694 unsigned long long end = rmrr->end_address;
2696 if (WARN_ON(end < start ||
2697 end >> agaw_to_width(si_domain->agaw)))
2700 ret = iommu_domain_identity_map(si_domain, start, end);
2709 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2711 struct dmar_domain *ndomain;
2712 struct intel_iommu *iommu;
2715 iommu = device_to_iommu(dev, &bus, &devfn);
2719 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2720 if (ndomain != domain)
2726 static bool device_has_rmrr(struct device *dev)
2728 struct dmar_rmrr_unit *rmrr;
2733 for_each_rmrr_units(rmrr) {
2735 * Return TRUE if this RMRR contains the device that
* is passed in.
*/
2738 for_each_active_dev_scope(rmrr->devices,
2739 rmrr->devices_cnt, i, tmp)
2741 is_downstream_to_pci_bridge(dev, tmp)) {
2751 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2752 * is relaxable (i.e. allowed to go unenforced under some conditions)
2753 * @dev: device handle
2755 * We assume that PCI USB devices with RMRRs have them largely
2756 * for historical reasons and that the RMRR space is not actively used post
2757 * boot. This exclusion may change if vendors begin to abuse it.
2759 * The same exception is made for graphics devices, with the requirement that
2760 * any use of the RMRR regions will be torn down before assigning the device
* to a guest.
2763 * Return: true if the RMRR is relaxable, false otherwise
2765 static bool device_rmrr_is_relaxable(struct device *dev)
2767 struct pci_dev *pdev;
2769 if (!dev_is_pci(dev))
2772 pdev = to_pci_dev(dev);
2773 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2780 * There are a couple cases where we need to restrict the functionality of
2781 * devices associated with RMRRs. The first is when evaluating a device for
2782 * identity mapping because problems exist when devices are moved in and out
2783 * of domains and their respective RMRR information is lost. This means that
2784 * a device with associated RMRRs will never be in a "passthrough" domain.
2785 * The second is use of the device through the IOMMU API. This interface
2786 * expects to have full control of the IOVA space for the device. We cannot
2787 * satisfy both the requirement that RMRR access is maintained and have an
2788 * unencumbered IOVA space. We also have no ability to quiesce the device's
2789 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2790 * We therefore prevent devices associated with an RMRR from participating in
2791 * the IOMMU API, which eliminates them from device assignment.
2793 * In both cases, devices which have relaxable RMRRs are not concerned by this
2794 * restriction. See device_rmrr_is_relaxable comment.
2796 static bool device_is_rmrr_locked(struct device *dev)
2798 if (!device_has_rmrr(dev))
2801 if (device_rmrr_is_relaxable(dev))
2808 * Return the required default domain type for a specific device.
2810 * @dev: the device in question
2814 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2815 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2816 * - 0: both identity and dynamic domains work for this device
2818 static int device_def_domain_type(struct device *dev)
2820 if (dev_is_pci(dev)) {
2821 struct pci_dev *pdev = to_pci_dev(dev);
2824 * Prevent any device marked as untrusted from getting
2825 * placed into the static identity mapping domain.
*/
2827 if (pdev->untrusted)
2828 return IOMMU_DOMAIN_DMA;
2830 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2831 return IOMMU_DOMAIN_IDENTITY;
2833 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2834 return IOMMU_DOMAIN_IDENTITY;
2840 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2843 * Start from a sane IOMMU hardware state.
2844 * If queued invalidation was already initialized by us
2845 * (for example, while enabling interrupt remapping) then
2846 * things are already rolling from a sane state.
*/
2850 * Clear any previous faults.
2852 dmar_fault(-1, iommu);
2854 * Disable queued invalidation if supported and already enabled
2855 * before OS handover.
2857 dmar_disable_qi(iommu);
2860 if (dmar_enable_qi(iommu)) {
2862 * Queued invalidation not enabled; fall back to register-based
* invalidation.
*/
2864 iommu->flush.flush_context = __iommu_flush_context;
2865 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2866 pr_info("%s: Using Register based invalidation\n",
2869 iommu->flush.flush_context = qi_flush_context;
2870 iommu->flush.flush_iotlb = qi_flush_iotlb;
2871 pr_info("%s: Using Queued invalidation\n", iommu->name);
2875 static int copy_context_table(struct intel_iommu *iommu,
2876 struct root_entry *old_re,
2877 struct context_entry **tbl,
2880 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2881 struct context_entry *new_ce = NULL, ce;
2882 struct context_entry *old_ce = NULL;
2883 struct root_entry re;
2884 phys_addr_t old_ce_phys;
2886 tbl_idx = ext ? bus * 2 : bus;
2887 memcpy(&re, old_re, sizeof(re));
2889 for (devfn = 0; devfn < 256; devfn++) {
2890 /* First calculate the correct index */
2891 idx = (ext ? devfn * 2 : devfn) % 256;
2894 /* First save what we may have and clean up */
2896 tbl[tbl_idx] = new_ce;
2897 __iommu_flush_cache(iommu, new_ce,
2907 old_ce_phys = root_entry_lctp(&re);
2909 old_ce_phys = root_entry_uctp(&re);
2912 if (ext && devfn == 0) {
2913 /* No LCTP, try UCTP */
2922 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2927 new_ce = alloc_pgtable_page(iommu->node);
2934 /* Now copy the context entry */
2935 memcpy(&ce, old_ce + idx, sizeof(ce));
2937 if (!__context_present(&ce))
2940 did = context_domain_id(&ce);
2941 if (did >= 0 && did < cap_ndoms(iommu->cap))
2942 set_bit(did, iommu->domain_ids);
2945 * We need a marker for copied context entries. This
2946 * marker needs to work for the old format as well as
2947 * for extended context entries.
2949 * Bit 67 of the context entry is used. In the old
2950 * format this bit is available to software, in the
2951 * extended format it is the PGE bit, but PGE is ignored
2952 * by HW if PASIDs are disabled (and thus still
* available).
2955 * So disable PASIDs first and then mark the entry
2956 * copied. This means that we don't copy PASID
2957 * translations from the old kernel, but this is fine as
2958 * faults there are not fatal.
2960 context_clear_pasid_enable(&ce);
2961 context_set_copied(&ce);
2966 tbl[tbl_idx + pos] = new_ce;
2968 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2977 static int copy_translation_tables(struct intel_iommu *iommu)
2979 struct context_entry **ctxt_tbls;
2980 struct root_entry *old_rt;
2981 phys_addr_t old_rt_phys;
2982 int ctxt_table_entries;
2983 unsigned long flags;
2988 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2989 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
2990 new_ext = !!ecap_ecs(iommu->ecap);
2993 * The RTT bit can only be changed when translation is disabled,
2994 * but disabling translation would open a window for data
2995 * corruption. So bail out and don't copy anything if we would
2996 * have to change the bit.
*/
3001 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3005 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3009 /* This is too big for the stack - allocate it from slab */
3010 ctxt_table_entries = ext ? 512 : 256;
3012 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3016 for (bus = 0; bus < 256; bus++) {
3017 ret = copy_context_table(iommu, &old_rt[bus],
3018 ctxt_tbls, bus, ext);
3020 pr_err("%s: Failed to copy context table for bus %d\n",
3026 spin_lock_irqsave(&iommu->lock, flags);
3028 /* Context tables are copied, now write them to the root_entry table */
3029 for (bus = 0; bus < 256; bus++) {
3030 int idx = ext ? bus * 2 : bus;
3033 if (ctxt_tbls[idx]) {
3034 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3035 iommu->root_entry[bus].lo = val;
3038 if (!ext || !ctxt_tbls[idx + 1])
3041 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3042 iommu->root_entry[bus].hi = val;
3045 spin_unlock_irqrestore(&iommu->lock, flags);
3049 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
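/*
 * Layout note for the copy above (sketch): in the extended root
 * table format (ext != 0) each bus owns two context tables. For
 * bus 1, copy_context_table() fills ctxt_tbls[2] (devfn 0-127) and
 * ctxt_tbls[3] (devfn 128-255), which the loop above writes back as
 * root_entry[1].lo and root_entry[1].hi respectively.
 */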
3059 #ifdef CONFIG_INTEL_IOMMU_SVM
3060 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3062 struct intel_iommu *iommu = data;
3066 return INVALID_IOASID;
3068 * The VT-d virtual command interface always uses the full 20-bit
3069 * PASID range. The host can partition the guest PASID range based
3070 * on policies, but this is out of the guest's control.
*/
3072 if (min < PASID_MIN || max > intel_pasid_max_id)
3073 return INVALID_IOASID;
3075 if (vcmd_alloc_pasid(iommu, &ioasid))
3076 return INVALID_IOASID;
3081 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3083 struct intel_iommu *iommu = data;
3088 * The sanity check of the IOASID owner is done at an upper layer,
3089 * e.g. VFIO. We can only free the PASID when all devices are unbound.
*/
3091 if (ioasid_find(NULL, ioasid, NULL)) {
3092 pr_alert("Cannot free active IOASID %d\n", ioasid);
3095 vcmd_free_pasid(iommu, ioasid);
3098 static void register_pasid_allocator(struct intel_iommu *iommu)
3101 * If we are running in the host, there is no need for a custom
3102 * allocator since PASIDs are allocated host system-wide.
*/
3104 if (!cap_caching_mode(iommu->cap))
3107 if (!sm_supported(iommu)) {
3108 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3113 * Register a custom PASID allocator if we are running in a guest;
3114 * guest PASIDs must be obtained via the virtual command interface.
3115 * There can be multiple vIOMMUs in each guest but only one allocator
3116 * is active. All vIOMMU allocators will eventually call the same
* host allocator.
*/
3119 if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3122 pr_info("Register custom PASID allocator\n");
3123 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3124 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3125 iommu->pasid_allocator.pdata = (void *)iommu;
3126 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3127 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3129 * Disable scalable mode on this IOMMU if there
3130 * is no custom allocator. Mixing SM-capable and
3131 * non-SM vIOMMUs is not supported.
*/
3138 static int __init init_dmars(void)
3140 struct dmar_drhd_unit *drhd;
3141 struct intel_iommu *iommu;
3147 * initialize and program root entry to not present
3150 for_each_drhd_unit(drhd) {
3152 * lock not needed as this is only incremented in the
3153 * single-threaded kernel __init code path; all other access is
* read-only.
*/
3156 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3160 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3163 /* Preallocate enough resources for IOMMU hot-addition */
3164 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3165 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3167 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3170 pr_err("Allocating global iommu array failed\n");
3175 for_each_iommu(iommu, drhd) {
3176 if (drhd->ignored) {
3177 iommu_disable_translation(iommu);
3182 * Find the max PASID size of all IOMMUs in the system.
3183 * We need to ensure the system PASID table is no bigger
3184 * than the smallest supported size.
*/
3186 if (pasid_supported(iommu)) {
3187 u32 temp = 2 << ecap_pss(iommu->ecap);
3189 intel_pasid_max_id = min_t(u32, temp,
3190 intel_pasid_max_id);
3193 g_iommus[iommu->seq_id] = iommu;
3195 intel_iommu_init_qi(iommu);
3197 ret = iommu_init_domains(iommu);
3201 init_translation_status(iommu);
3203 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3204 iommu_disable_translation(iommu);
3205 clear_translation_pre_enabled(iommu);
3206 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3212 * we could share the same root & context tables
3213 * among all IOMMUs; this needs to be split later.
*/
3215 ret = iommu_alloc_root_entry(iommu);
3219 if (translation_pre_enabled(iommu)) {
3220 pr_info("Translation already enabled - trying to copy translation structures\n");
3222 ret = copy_translation_tables(iommu);
3225 * We found the IOMMU with translation
3226 * enabled - but failed to copy over the
3227 * old root-entry table. Try to proceed
3228 * by disabling translation now and
3229 * allocating a clean root-entry table.
3230 * This might cause DMAR faults, but
3231 * probably the dump will still succeed.
3233 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3235 iommu_disable_translation(iommu);
3236 clear_translation_pre_enabled(iommu);
3238 pr_info("Copied translation tables from previous kernel for %s\n",
3243 if (!ecap_pass_through(iommu->ecap))
3244 hw_pass_through = 0;
3245 intel_svm_check(iommu);
3249 * Now that qi is enabled on all iommus, set the root entry and flush
3250 * caches. This is required on some Intel X58 chipsets, otherwise the
3251 * flush_context function will loop forever and the boot hangs.
3253 for_each_active_iommu(iommu, drhd) {
3254 iommu_flush_write_buffer(iommu);
3255 #ifdef CONFIG_INTEL_IOMMU_SVM
3256 register_pasid_allocator(iommu);
3258 iommu_set_root_entry(iommu);
3259 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3260 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3263 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3268 iommu_identity_mapping |= IDENTMAP_GFX;
3270 check_tylersburg_isoch();
3272 ret = si_domain_init(hw_pass_through);
3279 * global invalidate context cache
3280 * global invalidate iotlb
3281 * enable translation
3283 for_each_iommu(iommu, drhd) {
3284 if (drhd->ignored) {
3286 * we always have to disable PMRs or DMA may fail on
* this device.
*/
3290 iommu_disable_protect_mem_regions(iommu);
3294 iommu_flush_write_buffer(iommu);
3296 #ifdef CONFIG_INTEL_IOMMU_SVM
3297 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3299 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3300 * could cause a lock race, so drop the lock around the call.
*/
3302 up_write(&dmar_global_lock);
3303 ret = intel_svm_enable_prq(iommu);
3304 down_write(&dmar_global_lock);
3309 ret = dmar_set_interrupt(iommu);
3317 for_each_active_iommu(iommu, drhd) {
3318 disable_dmar_iommu(iommu);
3319 free_dmar_iommu(iommu);
3328 /* This takes a number of _MM_ pages, not VTD pages */
3329 static unsigned long intel_alloc_iova(struct device *dev,
3330 struct dmar_domain *domain,
3331 unsigned long nrpages, uint64_t dma_mask)
3333 unsigned long iova_pfn;
3336 * Restrict dma_mask to the width that the iommu can handle.
3337 * First-level translation restricts the input-address to a
3338 * canonical address (i.e., address bits 63:N have the same
3339 * value as address bit [N-1], where N is 48-bits with 4-level
3340 * paging and 57-bits with 5-level paging). Hence, skip bit
3343 if (domain_use_first_level(domain))
3344 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3347 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3350 /* Ensure we reserve the whole size-aligned region */
3351 nrpages = __roundup_pow_of_two(nrpages);
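/*
 * e.g. a 5-page request is rounded up to 8 pages here, so the IOVA
 * handed out below is aligned to the rounded size, per the comment
 * above.
 */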
3353 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3355 * First try to allocate an IO virtual address in
3356 * DMA_BIT_MASK(32) and if that fails then try allocating
* from the higher range.
*/
3359 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3360 IOVA_PFN(DMA_BIT_MASK(32)), false);
3364 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3365 IOVA_PFN(dma_mask), true);
3366 if (unlikely(!iova_pfn)) {
3367 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3375 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3376 size_t size, int dir, u64 dma_mask)
3378 struct dmar_domain *domain;
3379 phys_addr_t start_paddr;
3380 unsigned long iova_pfn;
3383 struct intel_iommu *iommu;
3384 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3386 BUG_ON(dir == DMA_NONE);
3388 if (unlikely(attach_deferred(dev)))
3389 do_deferred_attach(dev);
3391 domain = find_domain(dev);
3393 return DMA_MAPPING_ERROR;
3395 iommu = domain_get_iommu(domain);
3396 size = aligned_nrpages(paddr, size);
3398 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3403 * Check if DMAR supports zero-length reads on write only
* mappings.
*/
3406 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3407 !cap_zlr(iommu->cap))
3408 prot |= DMA_PTE_READ;
3409 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3410 prot |= DMA_PTE_WRITE;
3412 * paddr .. (paddr + size) might span partial pages, so map whole
3413 * pages. Note: if two parts of one page are mapped separately, we
3414 * might have two guest_addrs mapping to the same host paddr, but this
3415 * is not a big problem.
*/
3417 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3418 mm_to_dma_pfn(paddr_pfn), size, prot);
3422 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3423 start_paddr += paddr & ~PAGE_MASK;
3425 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3431 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3432 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3433 size, (unsigned long long)paddr, dir);
3434 return DMA_MAPPING_ERROR;
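/*
 * Worked example for __intel_map_single() (illustrative): mapping
 * paddr 0x12345678 with size 0x100 gives paddr_pfn 0x12345 and a
 * single-page mapping; the returned handle is the allocated IOVA
 * page plus the sub-page offset, i.e.
 * ((phys_addr_t)iova_pfn << PAGE_SHIFT) + 0x678.
 */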
3437 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3438 unsigned long offset, size_t size,
3439 enum dma_data_direction dir,
3440 unsigned long attrs)
3442 return __intel_map_single(dev, page_to_phys(page) + offset,
3443 size, dir, *dev->dma_mask);
3446 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3447 size_t size, enum dma_data_direction dir,
3448 unsigned long attrs)
3450 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3453 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3455 struct dmar_domain *domain;
3456 unsigned long start_pfn, last_pfn;
3457 unsigned long nrpages;
3458 unsigned long iova_pfn;
3459 struct intel_iommu *iommu;
3460 struct page *freelist;
3461 struct pci_dev *pdev = NULL;
3463 domain = find_domain(dev);
3466 iommu = domain_get_iommu(domain);
3468 iova_pfn = IOVA_PFN(dev_addr);
3470 nrpages = aligned_nrpages(dev_addr, size);
3471 start_pfn = mm_to_dma_pfn(iova_pfn);
3472 last_pfn = start_pfn + nrpages - 1;
3474 if (dev_is_pci(dev))
3475 pdev = to_pci_dev(dev);
3477 freelist = domain_unmap(domain, start_pfn, last_pfn);
3478 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3479 !has_iova_flush_queue(&domain->iovad)) {
3480 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3481 nrpages, !freelist, 0);
3483 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3484 dma_free_pagelist(freelist);
3486 queue_iova(&domain->iovad, iova_pfn, nrpages,
3487 (unsigned long)freelist);
3489 * queue up the release of the unmap to save roughly 1/6th of
3490 * the CPU time consumed by the IOTLB flush operation...
*/
3494 trace_unmap_single(dev, dev_addr, size);
3497 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3498 size_t size, enum dma_data_direction dir,
3499 unsigned long attrs)
3501 intel_unmap(dev, dev_addr, size);
3504 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3505 size_t size, enum dma_data_direction dir, unsigned long attrs)
3507 intel_unmap(dev, dev_addr, size);
3510 static void *intel_alloc_coherent(struct device *dev, size_t size,
3511 dma_addr_t *dma_handle, gfp_t flags,
3512 unsigned long attrs)
3514 struct page *page = NULL;
3517 if (unlikely(attach_deferred(dev)))
3518 do_deferred_attach(dev);
3520 size = PAGE_ALIGN(size);
3521 order = get_order(size);
3523 if (gfpflags_allow_blocking(flags)) {
3524 unsigned int count = size >> PAGE_SHIFT;
3526 page = dma_alloc_from_contiguous(dev, count, order,
3527 flags & __GFP_NOWARN);
3531 page = alloc_pages(flags, order);
3534 memset(page_address(page), 0, size);
3536 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3538 dev->coherent_dma_mask);
3539 if (*dma_handle != DMA_MAPPING_ERROR)
3540 return page_address(page);
3541 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3542 __free_pages(page, order);
3547 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3548 dma_addr_t dma_handle, unsigned long attrs)
3551 struct page *page = virt_to_page(vaddr);
3553 size = PAGE_ALIGN(size);
3554 order = get_order(size);
3556 intel_unmap(dev, dma_handle, size);
3557 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3558 __free_pages(page, order);
3561 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3562 int nelems, enum dma_data_direction dir,
3563 unsigned long attrs)
3565 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3566 unsigned long nrpages = 0;
3567 struct scatterlist *sg;
3570 for_each_sg(sglist, sg, nelems, i) {
3571 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3574 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3576 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
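/*
 * Only the start address and the summed page count are needed above
 * because intel_map_sg() below allocates one contiguous IOVA range
 * for the whole scatterlist, so a single intel_unmap() call covers
 * every element.
 */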
3579 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3580 enum dma_data_direction dir, unsigned long attrs)
3583 struct dmar_domain *domain;
3586 unsigned long iova_pfn;
3588 struct scatterlist *sg;
3589 unsigned long start_vpfn;
3590 struct intel_iommu *iommu;
3592 BUG_ON(dir == DMA_NONE);
3594 if (unlikely(attach_deferred(dev)))
3595 do_deferred_attach(dev);
3597 domain = find_domain(dev);
3601 iommu = domain_get_iommu(domain);
3603 for_each_sg(sglist, sg, nelems, i)
3604 size += aligned_nrpages(sg->offset, sg->length);
3606 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3609 sglist->dma_length = 0;
3614 * Check if DMAR supports zero-length reads on write only
* mappings.
*/
3617 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3618 !cap_zlr(iommu->cap))
3619 prot |= DMA_PTE_READ;
3620 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3621 prot |= DMA_PTE_WRITE;
3623 start_vpfn = mm_to_dma_pfn(iova_pfn);
3625 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3626 if (unlikely(ret)) {
3627 dma_pte_free_pagetable(domain, start_vpfn,
3628 start_vpfn + size - 1,
3629 agaw_to_level(domain->agaw) + 1);
3630 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3634 for_each_sg(sglist, sg, nelems, i)
3635 trace_map_sg(dev, i + 1, nelems, sg);
3640 static u64 intel_get_required_mask(struct device *dev)
3642 return DMA_BIT_MASK(32);
3645 static const struct dma_map_ops intel_dma_ops = {
3646 .alloc = intel_alloc_coherent,
3647 .free = intel_free_coherent,
3648 .map_sg = intel_map_sg,
3649 .unmap_sg = intel_unmap_sg,
3650 .map_page = intel_map_page,
3651 .unmap_page = intel_unmap_page,
3652 .map_resource = intel_map_resource,
3653 .unmap_resource = intel_unmap_resource,
3654 .dma_supported = dma_direct_supported,
3655 .mmap = dma_common_mmap,
3656 .get_sgtable = dma_common_get_sgtable,
3657 .get_required_mask = intel_get_required_mask,
3661 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3662 enum dma_data_direction dir, enum dma_sync_target target)
3664 struct dmar_domain *domain;
3665 phys_addr_t tlb_addr;
3667 domain = find_domain(dev);
3668 if (WARN_ON(!domain))
3671 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3672 if (is_swiotlb_buffer(tlb_addr))
3673 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3677 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3678 enum dma_data_direction dir, unsigned long attrs,
3681 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3682 struct dmar_domain *domain;
3683 struct intel_iommu *iommu;
3684 unsigned long iova_pfn;
3685 unsigned long nrpages;
3686 phys_addr_t tlb_addr;
3690 if (unlikely(attach_deferred(dev)))
3691 do_deferred_attach(dev);
3693 domain = find_domain(dev);
3695 if (WARN_ON(dir == DMA_NONE || !domain))
3696 return DMA_MAPPING_ERROR;
3698 iommu = domain_get_iommu(domain);
3699 if (WARN_ON(!iommu))
3700 return DMA_MAPPING_ERROR;
3702 nrpages = aligned_nrpages(0, size);
3703 iova_pfn = intel_alloc_iova(dev, domain,
3704 dma_to_mm_pfn(nrpages), dma_mask);
3706 return DMA_MAPPING_ERROR;
3709 * Check if DMAR supports zero-length reads on write only
* mappings.
*/
3712 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3713 !cap_zlr(iommu->cap))
3714 prot |= DMA_PTE_READ;
3715 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3716 prot |= DMA_PTE_WRITE;
3719 * If both the physical buffer start address and size are
3720 * page aligned, we don't need to use a bounce page.
3722 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3723 tlb_addr = swiotlb_tbl_map_single(dev,
3724 __phys_to_dma(dev, io_tlb_start),
3725 paddr, size, aligned_size, dir, attrs);
3726 if (tlb_addr == DMA_MAPPING_ERROR) {
3729 /* Cleanup the padding area. */
3730 void *padding_start = phys_to_virt(tlb_addr);
3731 size_t padding_size = aligned_size;
3733 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3734 (dir == DMA_TO_DEVICE ||
3735 dir == DMA_BIDIRECTIONAL)) {
3736 padding_start += size;
3737 padding_size -= size;
3740 memset(padding_start, 0, padding_size);
3746 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3747 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3751 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3753 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3756 if (is_swiotlb_buffer(tlb_addr))
3757 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3758 aligned_size, dir, attrs);
3760 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3761 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3762 size, (unsigned long long)paddr, dir);
3764 return DMA_MAPPING_ERROR;
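/*
 * Worked example for the bounce path above (illustrative): a 0x100
 * byte buffer that is not VTD_PAGE_SIZE-aligned is bounced through
 * a swiotlb slot of aligned_size == 0x1000. For DMA_TO_DEVICE only
 * the 0xf00 padding bytes beyond the data are zeroed (swiotlb has
 * already copied the data in); for DMA_FROM_DEVICE the whole slot
 * is zeroed, so stale swiotlb contents are never exposed to the
 * device.
 */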
3768 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3769 enum dma_data_direction dir, unsigned long attrs)
3771 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3772 struct dmar_domain *domain;
3773 phys_addr_t tlb_addr;
3775 domain = find_domain(dev);
3776 if (WARN_ON(!domain))
3779 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3780 if (WARN_ON(!tlb_addr))
3783 intel_unmap(dev, dev_addr, size);
3784 if (is_swiotlb_buffer(tlb_addr))
3785 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3786 aligned_size, dir, attrs);
3788 trace_bounce_unmap_single(dev, dev_addr, size);
3792 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3793 size_t size, enum dma_data_direction dir, unsigned long attrs)
3795 return bounce_map_single(dev, page_to_phys(page) + offset,
3796 size, dir, attrs, *dev->dma_mask);
3800 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3801 enum dma_data_direction dir, unsigned long attrs)
3803 return bounce_map_single(dev, phys_addr, size,
3804 dir, attrs, *dev->dma_mask);
3808 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3809 enum dma_data_direction dir, unsigned long attrs)
3811 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3815 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3816 enum dma_data_direction dir, unsigned long attrs)
3818 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3822 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3823 enum dma_data_direction dir, unsigned long attrs)
3825 struct scatterlist *sg;
3828 for_each_sg(sglist, sg, nelems, i)
3829 bounce_unmap_page(dev, sg->dma_address,
3830 sg_dma_len(sg), dir, attrs);
3834 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3835 enum dma_data_direction dir, unsigned long attrs)
3838 struct scatterlist *sg;
3840 for_each_sg(sglist, sg, nelems, i) {
3841 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3842 sg->offset, sg->length,
3844 if (sg->dma_address == DMA_MAPPING_ERROR)
3846 sg_dma_len(sg) = sg->length;
3849 for_each_sg(sglist, sg, nelems, i)
3850 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3855 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3860 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3861 size_t size, enum dma_data_direction dir)
3863 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3867 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3868 size_t size, enum dma_data_direction dir)
3870 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3874 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3875 int nelems, enum dma_data_direction dir)
3877 struct scatterlist *sg;
3880 for_each_sg(sglist, sg, nelems, i)
3881 bounce_sync_single(dev, sg_dma_address(sg),
3882 sg_dma_len(sg), dir, SYNC_FOR_CPU);
3886 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3887 int nelems, enum dma_data_direction dir)
3889 struct scatterlist *sg;
3892 for_each_sg(sglist, sg, nelems, i)
3893 bounce_sync_single(dev, sg_dma_address(sg),
3894 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3897 static const struct dma_map_ops bounce_dma_ops = {
3898 .alloc = intel_alloc_coherent,
3899 .free = intel_free_coherent,
3900 .map_sg = bounce_map_sg,
3901 .unmap_sg = bounce_unmap_sg,
3902 .map_page = bounce_map_page,
3903 .unmap_page = bounce_unmap_page,
3904 .sync_single_for_cpu = bounce_sync_single_for_cpu,
3905 .sync_single_for_device = bounce_sync_single_for_device,
3906 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
3907 .sync_sg_for_device = bounce_sync_sg_for_device,
3908 .map_resource = bounce_map_resource,
3909 .unmap_resource = bounce_unmap_resource,
3910 .dma_supported = dma_direct_supported,
3913 static inline int iommu_domain_cache_init(void)
3917 iommu_domain_cache = kmem_cache_create("iommu_domain",
3918 sizeof(struct dmar_domain),
3923 if (!iommu_domain_cache) {
3924 pr_err("Couldn't create iommu_domain cache\n");
3931 static inline int iommu_devinfo_cache_init(void)
3935 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3936 sizeof(struct device_domain_info),
3940 if (!iommu_devinfo_cache) {
3941 pr_err("Couldn't create devinfo cache\n");
3948 static int __init iommu_init_mempool(void)
3951 ret = iova_cache_get();
3955 ret = iommu_domain_cache_init();
3959 ret = iommu_devinfo_cache_init();
3963 kmem_cache_destroy(iommu_domain_cache);
3970 static void __init iommu_exit_mempool(void)
3972 kmem_cache_destroy(iommu_devinfo_cache);
3973 kmem_cache_destroy(iommu_domain_cache);
3977 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3979 struct dmar_drhd_unit *drhd;
3983 /* We know that this device on this chipset has its own IOMMU.
3984 * If we find it under a different IOMMU, then the BIOS is lying
3985 * to us. Hope that the IOMMU for this device is actually
3986 * disabled, and it needs no translation...
3988 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3990 /* "can't" happen */
3991 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3994 vtbar &= 0xffff0000;
3996 /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3997 drhd = dmar_find_matched_drhd_unit(pdev);
3998 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
3999 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4000 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4001 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4004 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4006 static void __init init_no_remapping_devices(void)
4008 struct dmar_drhd_unit *drhd;
4012 for_each_drhd_unit(drhd) {
4013 if (!drhd->include_all) {
4014 for_each_active_dev_scope(drhd->devices,
4015 drhd->devices_cnt, i, dev)
4017 /* ignore DMAR unit if no devices exist */
4018 if (i == drhd->devices_cnt)
4023 for_each_active_drhd_unit(drhd) {
4024 if (drhd->include_all)
4027 for_each_active_dev_scope(drhd->devices,
4028 drhd->devices_cnt, i, dev)
4029 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4031 if (i < drhd->devices_cnt)
4034 /* This IOMMU has *only* gfx devices. Either bypass it or
4035 set the gfx_mapped flag, as appropriate */
4036 if (!dmar_map_gfx) {
4038 for_each_active_dev_scope(drhd->devices,
4039 drhd->devices_cnt, i, dev)
4040 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4045 #ifdef CONFIG_SUSPEND
4046 static int init_iommu_hw(void)
4048 struct dmar_drhd_unit *drhd;
4049 struct intel_iommu *iommu = NULL;
4051 for_each_active_iommu(iommu, drhd)
4053 dmar_reenable_qi(iommu);
4055 for_each_iommu(iommu, drhd) {
4056 if (drhd->ignored) {
4058 * we always have to disable PMRs or DMA may fail on
* this device.
*/
4062 iommu_disable_protect_mem_regions(iommu);
4066 iommu_flush_write_buffer(iommu);
4068 iommu_set_root_entry(iommu);
4070 iommu->flush.flush_context(iommu, 0, 0, 0,
4071 DMA_CCMD_GLOBAL_INVL);
4072 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4073 iommu_enable_translation(iommu);
4074 iommu_disable_protect_mem_regions(iommu);
4080 static void iommu_flush_all(void)
4082 struct dmar_drhd_unit *drhd;
4083 struct intel_iommu *iommu;
4085 for_each_active_iommu(iommu, drhd) {
4086 iommu->flush.flush_context(iommu, 0, 0, 0,
4087 DMA_CCMD_GLOBAL_INVL);
4088 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4089 DMA_TLB_GLOBAL_FLUSH);
4093 static int iommu_suspend(void)
4095 struct dmar_drhd_unit *drhd;
4096 struct intel_iommu *iommu = NULL;
4099 for_each_active_iommu(iommu, drhd) {
4100 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4102 if (!iommu->iommu_state)
4108 for_each_active_iommu(iommu, drhd) {
4109 iommu_disable_translation(iommu);
4111 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4113 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4114 readl(iommu->reg + DMAR_FECTL_REG);
4115 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4116 readl(iommu->reg + DMAR_FEDATA_REG);
4117 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4118 readl(iommu->reg + DMAR_FEADDR_REG);
4119 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4120 readl(iommu->reg + DMAR_FEUADDR_REG);
4122 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4127 for_each_active_iommu(iommu, drhd)
4128 kfree(iommu->iommu_state);
4133 static void iommu_resume(void)
4135 struct dmar_drhd_unit *drhd;
4136 struct intel_iommu *iommu = NULL;
4139 if (init_iommu_hw()) {
4141 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4143 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4147 for_each_active_iommu(iommu, drhd) {
4149 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4151 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4152 iommu->reg + DMAR_FECTL_REG);
4153 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4154 iommu->reg + DMAR_FEDATA_REG);
4155 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4156 iommu->reg + DMAR_FEADDR_REG);
4157 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4158 iommu->reg + DMAR_FEUADDR_REG);
4160 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4163 for_each_active_iommu(iommu, drhd)
4164 kfree(iommu->iommu_state);
4167 static struct syscore_ops iommu_syscore_ops = {
4168 .resume = iommu_resume,
4169 .suspend = iommu_suspend,
4172 static void __init init_iommu_pm_ops(void)
4174 register_syscore_ops(&iommu_syscore_ops);
4178 static inline void init_iommu_pm_ops(void) {}
4179 #endif /* CONFIG_PM */
4181 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4183 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4184 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4185 rmrr->end_address <= rmrr->base_address ||
4186 arch_rmrr_sanity_check(rmrr))
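/*
 * Example for the checks above (illustrative): an RMRR covering
 * [0x000e0000, 0x000effff] passes (0xe0000 and 0xf0000 are both
 * page-aligned and end > base), while [0x000e0100, 0x000effff] is
 * rejected because its base address is not page-aligned.
 */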
4192 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4194 struct acpi_dmar_reserved_memory *rmrr;
4195 struct dmar_rmrr_unit *rmrru;
4197 rmrr = (struct acpi_dmar_reserved_memory *)header;
4198 if (rmrr_sanity_check(rmrr)) {
4200 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4201 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4202 rmrr->base_address, rmrr->end_address,
4203 dmi_get_system_info(DMI_BIOS_VENDOR),
4204 dmi_get_system_info(DMI_BIOS_VERSION),
4205 dmi_get_system_info(DMI_PRODUCT_VERSION));
4206 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4209 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4213 rmrru->hdr = header;
4215 rmrru->base_address = rmrr->base_address;
4216 rmrru->end_address = rmrr->end_address;
4218 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4219 ((void *)rmrr) + rmrr->header.length,
4220 &rmrru->devices_cnt);
4221 if (rmrru->devices_cnt && rmrru->devices == NULL)
4224 list_add(&rmrru->list, &dmar_rmrr_units);
4233 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4235 struct dmar_atsr_unit *atsru;
4236 struct acpi_dmar_atsr *tmp;
4238 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4240 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4241 if (atsr->segment != tmp->segment)
4243 if (atsr->header.length != tmp->header.length)
4245 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4252 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4254 struct acpi_dmar_atsr *atsr;
4255 struct dmar_atsr_unit *atsru;
4257 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4260 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4261 atsru = dmar_find_atsr(atsr);
4265 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4270 * If memory is allocated from slab by ACPI _DSM method, we need to
4271 * copy the memory content because the memory buffer will be freed
* on exit.
*/
4274 atsru->hdr = (void *)(atsru + 1);
4275 memcpy(atsru->hdr, hdr, hdr->length);
4276 atsru->include_all = atsr->flags & 0x1;
4277 if (!atsru->include_all) {
4278 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4279 (void *)atsr + atsr->header.length,
4280 &atsru->devices_cnt);
4281 if (atsru->devices_cnt && atsru->devices == NULL) {
4287 list_add_rcu(&atsru->list, &dmar_atsr_units);
4292 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4294 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4298 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4300 struct acpi_dmar_atsr *atsr;
4301 struct dmar_atsr_unit *atsru;
4303 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4304 atsru = dmar_find_atsr(atsr);
4306 list_del_rcu(&atsru->list);
4308 intel_iommu_free_atsr(atsru);
4314 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4318 struct acpi_dmar_atsr *atsr;
4319 struct dmar_atsr_unit *atsru;
4321 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4322 atsru = dmar_find_atsr(atsr);
4326 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4327 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4335 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4338 struct intel_iommu *iommu = dmaru->iommu;
4340 if (g_iommus[iommu->seq_id])
4343 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4344 pr_warn("%s: Doesn't support hardware pass through.\n",
4348 if (!ecap_sc_support(iommu->ecap) &&
4349 domain_update_iommu_snooping(iommu)) {
4350 pr_warn("%s: Doesn't support snooping.\n",
4354 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4355 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4356 pr_warn("%s: Doesn't support large page.\n",
4362 * Disable translation if already enabled prior to OS handover.
4364 if (iommu->gcmd & DMA_GCMD_TE)
4365 iommu_disable_translation(iommu);
4367 g_iommus[iommu->seq_id] = iommu;
4368 ret = iommu_init_domains(iommu);
4370 ret = iommu_alloc_root_entry(iommu);
4374 intel_svm_check(iommu);
4376 if (dmaru->ignored) {
4378 * we always have to disable PMRs or DMA may fail on this device
4381 iommu_disable_protect_mem_regions(iommu);
4385 intel_iommu_init_qi(iommu);
4386 iommu_flush_write_buffer(iommu);
4388 #ifdef CONFIG_INTEL_IOMMU_SVM
4389 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4390 ret = intel_svm_enable_prq(iommu);
4395 ret = dmar_set_interrupt(iommu);
4399 iommu_set_root_entry(iommu);
4400 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4401 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4402 iommu_enable_translation(iommu);
4404 iommu_disable_protect_mem_regions(iommu);
4408 disable_dmar_iommu(iommu);
4410 free_dmar_iommu(iommu);
4414 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4417 struct intel_iommu *iommu = dmaru->iommu;
4419 if (!intel_iommu_enabled)
4425 ret = intel_iommu_add(dmaru);
4427 disable_dmar_iommu(iommu);
4428 free_dmar_iommu(iommu);
4434 static void intel_iommu_free_dmars(void)
4436 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4437 struct dmar_atsr_unit *atsru, *atsr_n;
4439 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4440 list_del(&rmrru->list);
4441 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4445 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4446 list_del(&atsru->list);
4447 intel_iommu_free_atsr(atsru);
4451 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4454 struct pci_bus *bus;
4455 struct pci_dev *bridge = NULL;
4457 struct acpi_dmar_atsr *atsr;
4458 struct dmar_atsr_unit *atsru;
4460 dev = pci_physfn(dev);
4461 for (bus = dev->bus; bus; bus = bus->parent) {
4463 /* If it's an integrated device, allow ATS */
4466 /* Connected via non-PCIe: no ATS */
4467 if (!pci_is_pcie(bridge) ||
4468 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4470 /* If we found the root port, look it up in the ATSR */
4471 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4476 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4477 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4478 if (atsr->segment != pci_domain_nr(dev->bus))
4481 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4482 if (tmp == &bridge->dev)
4485 if (atsru->include_all)
4495 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4498 struct dmar_rmrr_unit *rmrru;
4499 struct dmar_atsr_unit *atsru;
4500 struct acpi_dmar_atsr *atsr;
4501 struct acpi_dmar_reserved_memory *rmrr;
4503 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4506 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4507 rmrr = container_of(rmrru->hdr,
4508 struct acpi_dmar_reserved_memory, header);
4509 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4510 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4511 ((void *)rmrr) + rmrr->header.length,
4512 rmrr->segment, rmrru->devices,
4513 rmrru->devices_cnt);
4516 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4517 dmar_remove_dev_scope(info, rmrr->segment,
4518 rmrru->devices, rmrru->devices_cnt);
4522 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4523 if (atsru->include_all)
4526 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4527 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4528 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4529 (void *)atsr + atsr->header.length,
4530 atsr->segment, atsru->devices,
4531 atsru->devices_cnt);
4536 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4537 if (dmar_remove_dev_scope(info, atsr->segment,
4538 atsru->devices, atsru->devices_cnt))
4546 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4547 unsigned long val, void *v)
4549 struct memory_notify *mhp = v;
4550 unsigned long long start, end;
4551 unsigned long start_vpfn, last_vpfn;
4554 case MEM_GOING_ONLINE:
4555 start = mhp->start_pfn << PAGE_SHIFT;
4556 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4557 if (iommu_domain_identity_map(si_domain, start, end)) {
4558 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4565 case MEM_CANCEL_ONLINE:
4566 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4567 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4568 while (start_vpfn <= last_vpfn) {
4570 struct dmar_drhd_unit *drhd;
4571 struct intel_iommu *iommu;
4572 struct page *freelist;
4574 iova = find_iova(&si_domain->iovad, start_vpfn);
4576 pr_debug("Failed get IOVA for PFN %lx\n",
4581 iova = split_and_remove_iova(&si_domain->iovad, iova,
4582 start_vpfn, last_vpfn);
4584 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4585 start_vpfn, last_vpfn);
4589 freelist = domain_unmap(si_domain, iova->pfn_lo,
4593 for_each_active_iommu(iommu, drhd)
4594 iommu_flush_iotlb_psi(iommu, si_domain,
4595 iova->pfn_lo, iova_size(iova),
4598 dma_free_pagelist(freelist);
4600 start_vpfn = iova->pfn_hi + 1;
4601 free_iova_mem(iova);
4609 static struct notifier_block intel_iommu_memory_nb = {
4610 .notifier_call = intel_iommu_memory_notifier,
4614 static void free_all_cpu_cached_iovas(unsigned int cpu)
4618 for (i = 0; i < g_num_of_iommus; i++) {
4619 struct intel_iommu *iommu = g_iommus[i];
4620 struct dmar_domain *domain;
4626 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4627 domain = get_iommu_domain(iommu, (u16)did);
4631 free_cpu_cached_iovas(cpu, &domain->iovad);
4636 static int intel_iommu_cpu_dead(unsigned int cpu)
4638 free_all_cpu_cached_iovas(cpu);
4642 static void intel_disable_iommus(void)
4644 struct intel_iommu *iommu = NULL;
4645 struct dmar_drhd_unit *drhd;
4647 for_each_iommu(iommu, drhd)
4648 iommu_disable_translation(iommu);
4651 void intel_iommu_shutdown(void)
4653 struct dmar_drhd_unit *drhd;
4654 struct intel_iommu *iommu = NULL;
4656 if (no_iommu || dmar_disabled)
4659 down_write(&dmar_global_lock);
4661 /* Disable PMRs explicitly here. */
4662 for_each_iommu(iommu, drhd)
4663 iommu_disable_protect_mem_regions(iommu);
4665 /* Make sure the IOMMUs are switched off */
4666 intel_disable_iommus();
4668 up_write(&dmar_global_lock);
4671 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4673 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4675 return container_of(iommu_dev, struct intel_iommu, iommu);
4678 static ssize_t intel_iommu_show_version(struct device *dev,
4679 struct device_attribute *attr,
4682 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4683 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4684 return sprintf(buf, "%d:%d\n",
4685 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4687 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4689 static ssize_t intel_iommu_show_address(struct device *dev,
4690 struct device_attribute *attr,
4693 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4694 return sprintf(buf, "%llx\n", iommu->reg_phys);
4696 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4698 static ssize_t intel_iommu_show_cap(struct device *dev,
4699 struct device_attribute *attr,
4702 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4703 return sprintf(buf, "%llx\n", iommu->cap);
4705 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4707 static ssize_t intel_iommu_show_ecap(struct device *dev,
4708 struct device_attribute *attr,
4711 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4712 return sprintf(buf, "%llx\n", iommu->ecap);
4714 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4716 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4717 struct device_attribute *attr,
4720 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4721 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4723 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4725 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4726 struct device_attribute *attr,
4729 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4730 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4731 cap_ndoms(iommu->cap)));
4733 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4735 static struct attribute *intel_iommu_attrs[] = {
4736 &dev_attr_version.attr,
4737 &dev_attr_address.attr,
4739 &dev_attr_ecap.attr,
4740 &dev_attr_domains_supported.attr,
4741 &dev_attr_domains_used.attr,
4745 static struct attribute_group intel_iommu_group = {
4746 .name = "intel-iommu",
4747 .attrs = intel_iommu_attrs,
4750 const struct attribute_group *intel_iommu_groups[] = {
4755 static inline bool has_untrusted_dev(void)
4757 struct pci_dev *pdev = NULL;
4759 for_each_pci_dev(pdev)
4760 if (pdev->untrusted)
4766 static int __init platform_optin_force_iommu(void)
4768 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4771 if (no_iommu || dmar_disabled)
4772 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4775 * If Intel-IOMMU is disabled by default, we will apply identity
4776 * map for all devices except those marked as being untrusted.
4779 iommu_set_default_passthrough(false);
4787 static int __init probe_acpi_namespace_devices(void)
4789 struct dmar_drhd_unit *drhd;
4790 /* To avoid a -Wunused-but-set-variable warning. */
4791 struct intel_iommu *iommu __maybe_unused;
4795 for_each_active_iommu(iommu, drhd) {
4796 for_each_active_dev_scope(drhd->devices,
4797 drhd->devices_cnt, i, dev) {
4798 struct acpi_device_physical_node *pn;
4799 struct iommu_group *group;
4800 struct acpi_device *adev;
4802 if (dev->bus != &acpi_bus_type)
4805 adev = to_acpi_device(dev);
4806 mutex_lock(&adev->physical_node_lock);
4807 list_for_each_entry(pn,
4808 &adev->physical_node_list, node) {
4809 group = iommu_group_get(pn->dev);
4811 iommu_group_put(group);
4815 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4816 ret = iommu_probe_device(pn->dev);
4820 mutex_unlock(&adev->physical_node_lock);
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = tboot_force_iommu() || platform_optin_force_iommu();

	if (iommu_init_mempool()) {
		if (force_on)
			panic("tboot: Failed to initialize iommu memory\n");
		return -ENOMEM;
	}

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (dmar_init_reserved_ranges()) {
		if (force_on)
			panic("tboot: Failed to reserve iommu ranges\n");
		goto out_free_reserved_range;
	}

	if (dmar_map_gfx)
		intel_iommu_gfx_mapped = 1;

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_reserved_range;
	}
	up_write(&dmar_global_lock);

	init_iommu_pm_ops();

	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
		iommu_device_register(&iommu->iommu);
	}
	up_read(&dmar_global_lock);

	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);
	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
			  intel_iommu_cpu_dead);

	down_read(&dmar_global_lock);
	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

out_free_reserved_range:
	put_iova_domain(&reserved_iova_list);
out_free_dmar:
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	iommu_exit_mempool();
	return ret;
}
static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
{
	struct intel_iommu *iommu = opaque;

	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
	return 0;
}

/*
 * NB - intel-iommu lacks any sort of reference counting for the users of
 * dependent devices. If multiple endpoints have intersecting dependent
 * devices, unbinding the driver from any one of them will possibly leave
 * the others unable to operate.
 */
static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
{
	if (!iommu || !dev || !dev_is_pci(dev))
		return;

	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
}
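
/*
 * Clarifying note (not in the original source): pci_for_each_dma_alias()
 * runs the callback for the device's own requester ID and for every alias
 * it may use on the fabric -- for example, a device behind a PCIe-to-PCI
 * bridge can issue DMA with the bridge's ID, so the context entries of
 * those aliases must be torn down as well. That is also why the NB above
 * matters: two endpoints that share an alias share those context entries.
 */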
static void __dmar_remove_one_dev_info(struct device_domain_info *info)
{
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	unsigned long flags;

	assert_spin_locked(&device_domain_lock);

	if (WARN_ON(!info))
		return;

	iommu = info->iommu;
	domain = info->domain;

	if (info->dev) {
		if (dev_is_pci(info->dev) && sm_supported(iommu))
			intel_pasid_tear_down_entry(iommu, info->dev,
					PASID_RID2PASID, false);

		iommu_disable_dev_iotlb(info);
		domain_context_clear(iommu, info->dev);
		intel_pasid_free_table(info->dev);
	}

	unlink_domain_info(info);

	spin_lock_irqsave(&iommu->lock, flags);
	domain_detach_iommu(domain, iommu);
	spin_unlock_irqrestore(&iommu->lock, flags);

	free_devinfo_mem(info);
}
static void dmar_remove_one_dev_info(struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	if (info)
		__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	domain->iommu_coherency = 0;
	domain->iommu_snooping = 0;
	domain->iommu_superpage = 0;
	domain->max_addr = 0;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
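
/*
 * Worked example (illustrative, assuming the width_to_agaw() and
 * agaw_to_level() helpers defined earlier in this file): guest_width = 48
 * is already a supported width, so adjust_width = 48 and
 * agaw = (48 - 30) / 9 = 2, i.e. a 4-level page table; the
 * DEFAULT_DOMAIN_ADDRESS_WIDTH of 57 likewise yields agaw = 3, a
 * 5-level table.
 */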
static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
{
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;
	int ret;

	switch (type) {
	case IOMMU_DOMAIN_DMA:
	/* fallthrough */
	case IOMMU_DOMAIN_UNMANAGED:
		dmar_domain = alloc_domain(0);
		if (!dmar_domain) {
			pr_err("Can't allocate dmar_domain\n");
			return NULL;
		}
		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
			pr_err("Domain initialization failed\n");
			domain_exit(dmar_domain);
			return NULL;
		}

		if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
			ret = init_iova_flush_queue(&dmar_domain->iovad,
						    iommu_flush_iova,
						    iova_entry_free);
			if (ret)
				pr_info("iova flush queue initialization failed\n");
		}

		domain_update_iommu_cap(dmar_domain);

		domain = &dmar_domain->domain;
		domain->geometry.aperture_start = 0;
		domain->geometry.aperture_end   =
				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
		domain->geometry.force_aperture = true;

		return domain;
	case IOMMU_DOMAIN_IDENTITY:
		return &si_domain->domain;
	default:
		return NULL;
	}

	return NULL;
}

static void intel_iommu_domain_free(struct iommu_domain *domain)
{
	if (domain != &si_domain->domain)
		domain_exit(to_dmar_domain(domain));
}
/*
 * Check whether a @domain could be attached to the @dev through the
 * aux-domain attach/detach APIs.
 */
static inline bool
is_aux_domain(struct device *dev, struct iommu_domain *domain)
{
	struct device_domain_info *info = get_domain_info(dev);

	return info && info->auxd_enabled &&
			domain->type == IOMMU_DOMAIN_UNMANAGED;
}

static void auxiliary_link_device(struct dmar_domain *domain,
				  struct device *dev)
{
	struct device_domain_info *info = get_domain_info(dev);

	assert_spin_locked(&device_domain_lock);
	if (WARN_ON(!info))
		return;

	domain->auxd_refcnt++;
	list_add(&domain->auxd, &info->auxiliary_domains);
}

static void auxiliary_unlink_device(struct dmar_domain *domain,
				    struct device *dev)
{
	struct device_domain_info *info = get_domain_info(dev);

	assert_spin_locked(&device_domain_lock);
	if (WARN_ON(!info))
		return;

	list_del(&domain->auxd);
	domain->auxd_refcnt--;

	if (!domain->auxd_refcnt && domain->default_pasid > 0)
		ioasid_free(domain->default_pasid);
}
static int aux_domain_add_dev(struct dmar_domain *domain,
			      struct device *dev)
{
	int ret;
	u8 bus, devfn;
	unsigned long flags;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (domain->default_pasid <= 0) {
		int pasid;

		/* No private data needed for the default pasid */
		pasid = ioasid_alloc(NULL, PASID_MIN,
				     pci_max_pasids(to_pci_dev(dev)) - 1,
				     NULL);
		if (pasid == INVALID_IOASID) {
			pr_err("Can't allocate default pasid\n");
			return -ENODEV;
		}
		domain->default_pasid = pasid;
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	/*
	 * iommu->lock must be held to attach domain to iommu and setup the
	 * pasid entry for second level translation.
	 */
	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	if (ret)
		goto attach_failed;

	/* Setup the PASID entry for mediated devices: */
	if (domain_use_first_level(domain))
		ret = domain_setup_first_level(iommu, domain, dev,
					       domain->default_pasid);
	else
		ret = intel_pasid_setup_second_level(iommu, domain, dev,
						     domain->default_pasid);
	if (ret)
		goto table_failed;
	spin_unlock(&iommu->lock);

	auxiliary_link_device(domain, dev);

	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;

table_failed:
	domain_detach_iommu(domain, iommu);
attach_failed:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);
	if (!domain->auxd_refcnt && domain->default_pasid > 0)
		ioasid_free(domain->default_pasid);

	return ret;
}
static void aux_domain_remove_dev(struct dmar_domain *domain,
				  struct device *dev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;

	if (!is_aux_domain(dev, &domain->domain))
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	iommu = info->iommu;

	auxiliary_unlink_device(domain, dev);

	spin_lock(&iommu->lock);
	intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
	domain_detach_iommu(domain, iommu);
	spin_unlock(&iommu->lock);

	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static int prepare_domain_attach_device(struct iommu_domain *domain,
					struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		dev_err(dev, "%s: iommu width (%d) is not "
			"sufficient for the mapped address (%llx)\n",
			__func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return 0;
}
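
/*
 * Illustrative example (not from the original source): a domain created
 * with DEFAULT_DOMAIN_ADDRESS_WIDTH (57 bits, agaw 3) that is attached to
 * hardware whose iommu->agaw is 2 (48 bits, 4-level tables) first has its
 * gaw clamped via cap_mgaw(), then the while loop above pops the unused
 * top-level page table once so dmar_domain->agaw matches the hardware.
 */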
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	int ret;

	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
	    device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
		return -EPERM;
	}

	if (is_aux_domain(dev, domain))
		return -EPERM;

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain)
			dmar_remove_one_dev_info(dev);
	}

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	return domain_add_dev_info(to_dmar_domain(domain), dev);
}

static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
					 struct device *dev)
{
	int ret;

	if (!is_aux_domain(dev, domain))
		return -EPERM;

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	return aux_domain_add_dev(to_dmar_domain(domain), dev);
}

static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(dev);
}

static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
					  struct device *dev)
{
	aux_domain_remove_dev(to_dmar_domain(domain), dev);
}
/*
 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
 * VT-d granularity. Invalidation is typically included in the unmap operation
 * as a result of DMA or VFIO unmap. However, for assigned devices the guest
 * owns the first level page tables. Invalidations of translation caches in
 * the guest are trapped and passed down to the host.
 *
 * vIOMMU in the guest will only expose first level page tables, therefore
 * we do not support IOTLB granularity for request without PASID (second level).
 *
 * For example, to find the VT-d granularity encoding for IOTLB
 * type and page selective granularity within PASID:
 * X: indexed by iommu cache type
 * Y: indexed by enum iommu_inv_granularity
 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
 */

static const int
inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
	/*
	 * PASID based IOTLB invalidation: PASID selective (per PASID),
	 * page selective (address granularity)
	 */
	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
	/* PASID based dev TLBs */
	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
	/* PASID cache */
	{-EINVAL, -EINVAL, -EINVAL}
};

static inline int to_vtd_granularity(int type, int granu)
{
	return inv_type_granu_table[type][granu];
}
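
/*
 * Worked lookup (illustrative): for an IOTLB invalidation (cache type
 * bit 0) at address granularity, to_vtd_granularity() reads row 0, column
 * IOMMU_INV_GRANU_ADDR of the table above and returns QI_GRAN_PSI_PASID;
 * a domain-selective request on the same row returns -EINVAL, since
 * second-level (no-PASID) IOTLB granularity is not supported here.
 */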
static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
{
	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;

	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
	 * granu size in contiguous memory.
	 */
	return order_base_2(nr_pages);
}
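
/*
 * Worked example (illustrative): granu_size = 4KiB and nr_granules = 512
 * cover 2MiB, so nr_pages = 512 and order_base_2(512) = 9, matching the
 * "9 for 2MB" encoding above. Note that order_base_2() rounds up, so a
 * non-power-of-two range (e.g. 5 pages) is widened to the next order
 * (8 pages, order 3).
 */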
#ifdef CONFIG_INTEL_IOMMU_SVM
static int
intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
			   struct iommu_cache_invalidate_info *inv_info)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	int cache_type;
	u8 bus, devfn;
	u16 did, sid;
	int ret = 0;
	u64 size = 0;

	if (!inv_info || !dmar_domain ||
	    inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
		return -EINVAL;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);
	info = get_domain_info(dev);
	if (!info) {
		ret = -EINVAL;
		goto out_unlock;
	}
	did = dmar_domain->iommu_did[iommu->seq_id];
	sid = PCI_DEVID(bus, devfn);

	/* Size is only valid in address selective invalidation */
	if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
		size = to_vtd_size(inv_info->addr_info.granule_size,
				   inv_info->addr_info.nb_granules);

	for_each_set_bit(cache_type,
			 (unsigned long *)&inv_info->cache,
			 IOMMU_CACHE_INV_TYPE_NR) {
		int granu = 0;
		u64 pasid = 0;

		granu = to_vtd_granularity(cache_type, inv_info->granularity);
		if (granu == -EINVAL) {
			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
					   cache_type, inv_info->granularity);
			break;
		}

		/*
		 * PASID is stored in different locations based on the
		 * granularity.
		 */
		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
		    (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
			pasid = inv_info->pasid_info.pasid;
		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
			 (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
			pasid = inv_info->addr_info.pasid;

		switch (BIT(cache_type)) {
		case IOMMU_CACHE_INV_TYPE_IOTLB:
			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
			    size &&
			    (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
				pr_err_ratelimited("Address out of range, 0x%llx, size order %llu\n",
						   inv_info->addr_info.addr, size);
				ret = -ERANGE;
				goto out_unlock;
			}

			/*
			 * If granu is PASID-selective, address is ignored.
			 * We use npages = -1 to indicate that.
			 */
			qi_flush_piotlb(iommu, did, pasid,
					mm_to_dma_pfn(inv_info->addr_info.addr),
					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
					inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);

			/*
			 * Always flush device IOTLB if ATS is enabled. vIOMMU
			 * in the guest may assume IOTLB flush is inclusive,
			 * which is more efficient.
			 */
			if (info->ats_enabled)
				qi_flush_dev_iotlb_pasid(iommu, sid,
						info->pfsid, pasid,
						info->ats_qdep,
						inv_info->addr_info.addr,
						size, granu);
			break;
		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
			if (info->ats_enabled)
				qi_flush_dev_iotlb_pasid(iommu, sid,
						info->pfsid, pasid,
						info->ats_qdep,
						inv_info->addr_info.addr,
						size, granu);
			else
				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
			break;
		default:
			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
					    cache_type);
			ret = -EINVAL;
		}
	}
out_unlock:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
#endif
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot, gfp_t gfp)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
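
/*
 * Illustrative example (not in the original source): hpa = 0x1800 with
 * size = 0x1000 touches bytes 0x1800-0x27ff, which straddles two 4KiB
 * pages, so aligned_nrpages() returns 2 here even though the raw size is
 * only one page's worth.
 */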
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size,
				struct iommu_iotlb_gather *gather)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, npages, !freelist, 0);

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
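
/*
 * Illustrative example (not in the original source): if the PTE found
 * above sits at level 2 (a 2MiB superpage), level_to_offset_bits(2) is 9,
 * so a 4KiB unmap request is widened to VTD_PAGE_SIZE << 9 = 2MiB and the
 * whole superpage is unmapped and flushed.
 */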
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte && dma_pte_present(pte))
		phys = dma_pte_addr(pte) +
			(iova & (BIT_MASK(level_to_offset_bits(level) +
						VTD_PAGE_SHIFT) - 1));

	return phys;
}
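
/*
 * Worked example (illustrative): with a 2MiB superpage mapping
 * iova 0x200000 -> phys 0x40000000, a lookup of iova 0x203000 finds the
 * level-2 PTE, and phys = 0x40000000 + (0x203000 & (BIT_MASK(9 + 12) - 1))
 * = 0x40003000.
 */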
static inline bool scalable_mode_support(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool ret = true;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!sm_supported(iommu)) {
			ret = false;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}

static inline bool iommu_pasid_support(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool ret = true;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!pasid_supported(iommu)) {
			ret = false;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}

static inline bool nested_mode_support(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool ret = true;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
			ret = false;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}
static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}

static struct iommu_device *intel_iommu_probe_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return ERR_PTR(-ENODEV);

	if (translation_pre_enabled(iommu))
		dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;

	return &iommu->iommu;
}
static void intel_iommu_release_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	dmar_remove_one_dev_info(dev);

	set_dma_ops(dev, NULL);
}

static void intel_iommu_probe_finalize(struct device *dev)
{
	struct iommu_domain *domain;

	domain = iommu_get_domain_for_dev(dev);
	if (device_needs_bounce(dev))
		set_dma_ops(dev, &bounce_dma_ops);
	else if (domain && domain->type == IOMMU_DOMAIN_DMA)
		set_dma_ops(dev, &intel_dma_ops);
	else
		set_dma_ops(dev, NULL);
}
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	down_read(&dmar_global_lock);
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			struct iommu_resv_region *resv;
			enum iommu_resv_type type;
			size_t length;

			if (i_dev != device &&
			    !is_downstream_to_pci_bridge(device, i_dev))
				continue;

			length = rmrr->end_address - rmrr->base_address + 1;

			type = device_rmrr_is_relaxable(device) ?
				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;

			resv = iommu_alloc_resv_region(rmrr->base_address,
						       length, prot, type);
			if (!resv)
				break;

			list_add_tail(&resv->list, head);
		}
	}
	up_read(&dmar_global_lock);

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
	if (dev_is_pci(device)) {
		struct pci_dev *pdev = to_pci_dev(device);

		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
						IOMMU_RESV_DIRECT_RELAXABLE);
			if (reg)
				list_add_tail(&reg->list, head);
		}
	}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = find_domain(dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = get_domain_info(dev);
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		ctx_lo |= CONTEXT_PASIDE;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu,
					   domain->iommu_did[iommu->seq_id],
					   PCI_DEVID(info->bus, info->devfn),
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	ret = 0;

 out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
static void intel_iommu_apply_resv_region(struct device *dev,
					  struct iommu_domain *domain,
					  struct iommu_resv_region *region)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long start, end;

	start = IOVA_PFN(region->start);
	end   = IOVA_PFN(region->start + region->length - 1);

	WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
}

static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	if (dev_is_pci(dev))
		return pci_device_group(dev);
	return generic_device_group(dev);
}
#ifdef CONFIG_INTEL_IOMMU_SVM
struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */
static int intel_iommu_enable_auxd(struct device *dev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	u8 bus, devfn;
	int ret;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!sm_supported(iommu) || !pasid_supported(iommu))
		return -EINVAL;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret)
		return -ENODEV;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	info->auxd_enabled = 1;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}

static int intel_iommu_disable_auxd(struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	if (!WARN_ON(!info))
		info->auxd_enabled = 0;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
/*
 * A PCI express designated vendor specific extended capability is defined
 * in the section 3.7 of Intel scalable I/O virtualization technical spec
 * for system software and tools to detect endpoint devices supporting the
 * Intel scalable IO virtualization without host driver dependency.
 *
 * Returns the address of the matching extended capability structure within
 * the device's PCI configuration space or 0 if the device does not support
 * it.
 */
static int siov_find_pci_dvsec(struct pci_dev *pdev)
{
	int pos;
	u16 vendor, id;

	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
	while (pos) {
		pci_read_config_word(pdev, pos + 4, &vendor);
		pci_read_config_word(pdev, pos + 8, &id);
		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
			return pos;

		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
	}

	return 0;
}
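
/*
 * Clarifying note (not in the original source): within each Designated
 * Vendor Specific Extended Capability (extended capability ID 0x23), the
 * DVSEC vendor ID lives in the 16 bits at offset 4 and the DVSEC ID in the
 * 16 bits at offset 8, which is what the two config reads above fetch; a
 * match here is vendor 0x8086 (PCI_VENDOR_ID_INTEL) with DVSEC ID 5, per
 * the Intel Scalable IOV spec referenced above.
 */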
static bool
intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX) {
		int ret;

		if (!dev_is_pci(dev) || dmar_disabled ||
		    !scalable_mode_support() || !iommu_pasid_support())
			return false;

		ret = pci_pasid_features(to_pci_dev(dev));
		if (ret < 0)
			return false;

		return !!siov_find_pci_dvsec(to_pci_dev(dev));
	}

	if (feat == IOMMU_DEV_FEAT_SVA) {
		struct device_domain_info *info = get_domain_info(dev);

		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
			info->pasid_supported && info->pri_supported &&
			info->ats_supported;
	}

	return false;
}

static int
intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX)
		return intel_iommu_enable_auxd(dev);

	if (feat == IOMMU_DEV_FEAT_SVA) {
		struct device_domain_info *info = get_domain_info(dev);

		if (!info)
			return -EINVAL;

		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
			return 0;
	}

	return -ENODEV;
}

static int
intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX)
		return intel_iommu_disable_auxd(dev);

	return -ENODEV;
}

static bool
intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
{
	struct device_domain_info *info = get_domain_info(dev);

	if (feat == IOMMU_DEV_FEAT_AUX)
		return scalable_mode_support() && info && info->auxd_enabled;

	return false;
}
static int
intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	return dmar_domain->default_pasid > 0 ?
			dmar_domain->default_pasid : -EINVAL;
}

static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
					   struct device *dev)
{
	return attach_deferred(dev);
}
static int
intel_iommu_domain_set_attr(struct iommu_domain *domain,
			    enum iommu_attr attr, void *data)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long flags;
	int ret = 0;

	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
		return -EINVAL;

	switch (attr) {
	case DOMAIN_ATTR_NESTING:
		spin_lock_irqsave(&device_domain_lock, flags);
		if (nested_mode_support() &&
		    list_empty(&dmar_domain->devices)) {
			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
		} else {
			ret = -ENODEV;
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.domain_set_attr	= intel_iommu_domain_set_attr,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.aux_attach_dev		= intel_iommu_aux_attach_device,
	.aux_detach_dev		= intel_iommu_aux_detach_device,
	.aux_get_pasid		= intel_iommu_aux_get_pasid,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= generic_iommu_put_resv_regions,
	.apply_resv_region	= intel_iommu_apply_resv_region,
	.device_group		= intel_iommu_device_group,
	.dev_has_feat		= intel_iommu_dev_has_feat,
	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.cache_invalidate	= intel_iommu_sva_invalidate,
	.sva_bind_gpasid	= intel_svm_bind_gpasid,
	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
	.sva_bind		= intel_svm_bind,
	.sva_unbind		= intel_svm_unbind,
	.sva_get_pasid		= intel_svm_get_pasid,
#endif
};
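
/*
 * Note (added for clarity, not in the original source): the IOMMU core
 * dispatches through this table once intel_iommu_init() has called
 * bus_set_iommu(&pci_bus_type, &intel_iommu_ops); e.g. a VFIO or DMA-API
 * iommu_map() call lands in intel_iommu_map() above, and hot-added PCI
 * devices are routed to intel_iommu_probe_device().
 */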
static void quirk_iommu_igfx(struct pci_dev *dev)
{
	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}